hivemeta 0.0.1

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the respective public registry.
data/CHANGELOG ADDED
@@ -0,0 +1,2 @@
+ * 2011-05-01 - fsf
+   - initial import
data/README ADDED
@@ -0,0 +1,71 @@
+ hivemeta
+
+ A Ruby API for access to the Hive metastore. Useful for querying columns
+ in map/reduce applications. Includes a demo application to spit out
+ table information from the command line via table name search or
+ by the table's location in HDFS.
+
+ streaming map/reduce code snippet:
+
+   require 'hivemeta'
+
+   h = HiveMeta::Connection.new(...) # see sample-mapper.rb for detail
+   inventory = h.table 'sample_inventory'
+
+   STDIN.each_line do |line|
+     begin
+       row = inventory.process_row line
+     rescue HiveMeta::FieldCountError
+       STDERR.puts "reporter:counter:bad_data:row_size,1"
+       next
+     end
+     item_id = row.item_id # can access by method or [:sym] or ['str']
+     count = row.inv_cnt.to_i
+     puts "#{item_id}\t#{count}" if count >= 1000
+   end
+
+
+ sample usage for the demo app:
+
+   # query by table name
+   $ hivemeta_query.rb join_test_name
+   join_test_name
+   hdfs://namenode/tmp/join_test_name
+   0   userid   # userid
+   1   name     # username
+
+   # query by table name wildcards
+   $ hivemeta_query.rb join_test%
+   join_test_address
+   hdfs://namenode/tmp/join_test_address
+   0   userid   # uid
+   1   address
+   2   city
+   3   state
+
+   join_test_name
+   hdfs://namenode/tmp/join_test_name
+   0   userid   # userid
+   1   name     # username
+
+   # list the tables using /tmp in HDFS
+   $ hivemeta_query.rb -l /tmp
+   join_test_address
+   join_test_work
+   my_test_table
+
+   # view usage information
+   $ hivemeta_query.rb -h
+   usage: ./hivemeta_query.rb [options] table_name|hdfs_path
+   -h, --help
+   -c, --comments       # display comments along with field detail (default)
+   -C, --no-comments    # do not display comments with the field detail
+   -l, --list-tables    # list matching tables but no detail
+   -f, --list-file-path # list the table HDFS file locations
+   -w, --fit-width      # fit the text to the width of the screen (default)
+   -W, --no-fit-width   # do not fit the text to the width of the screen
+   -u, --db-user=arg    # hive metastore db user (requires read access)
+   -p, --db-pass=arg    # hive metastore db password
+   -H, --db-host=arg    # host running the hive meta db (default: localhost)
+   -d, --db-name=arg    # hive meta db name (default: hivemeta)
+
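The README's usage listing above also covers the metastore connection flags. As a purely illustrative example (host, credentials, and table pattern are placeholders, not values from the gem), the demo app can be pointed at a remote metastore like so:

  $ hivemeta_query.rb -H metadb.example.com -u hive -p secret -d hivemeta 'web_log%'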
data/examples/hivemeta_query.rb ADDED
@@ -0,0 +1,118 @@
+ #!/usr/bin/env ruby
+
+ require 'hivemeta'
+ require 'getoptlong'
+
+ db_user = 'hive'
+ db_pass = 'hivepasshere'
+ db_host = 'localhost'
+ db_name = 'hivemeta'
+
+ def usage
+   puts <<-EOF
+ usage: #$0 [options] table_name|hdfs_path
+ -h, --help
+ -c, --comments       # display comments along with field detail (default)
+ -C, --no-comments    # do not display comments with the field detail
+ -l, --list-tables    # list matching tables but no detail
+ -f, --list-file-path # list the table HDFS file locations
+ -w, --fit-width      # fit the text to the width of the screen (default)
+ -W, --no-fit-width   # do not fit the text to the width of the screen
+ -u, --db-user=arg    # hive metastore db user (requires read access)
+ -p, --db-pass=arg    # hive metastore db password
+ -H, --db-host=arg    # host running the hive meta db (default: localhost)
+ -d, --db-name=arg    # hive meta db name (default: hivemeta)
+   EOF
+ end
+
+ # main
+
+ opts = GetoptLong.new(
+   [ '--comments',       '-c', GetoptLong::NO_ARGUMENT ],
+   [ '--no-comments',    '-C', GetoptLong::NO_ARGUMENT ],
+   [ '--list-tables',    '-l', GetoptLong::NO_ARGUMENT ],
+   [ '--list-file-path', '-f', GetoptLong::NO_ARGUMENT ],
+   [ '--fit-width',      '-w', GetoptLong::NO_ARGUMENT ],
+   [ '--no-fit-width',   '-W', GetoptLong::NO_ARGUMENT ],
+   [ '--db-user',        '-u', GetoptLong::REQUIRED_ARGUMENT ],
+   [ '--db-pass',        '-p', GetoptLong::REQUIRED_ARGUMENT ],
+   [ '--db-name',        '-d', GetoptLong::REQUIRED_ARGUMENT ],
+   [ '--db-host',        '-H', GetoptLong::REQUIRED_ARGUMENT ],
+   [ '--help',           '-h', GetoptLong::NO_ARGUMENT ]
+ )
+
+ show_comments = true
+ list_tables   = false
+ list_paths    = false
+ fit_width     = true
+ opts.each do |opt, arg|
+   case opt
+   when '--comments'
+     show_comments = true
+   when '--no-comments'
+     show_comments = false
+   when '--list-tables'
+     list_tables = true
+   when '--list-file-path'
+     list_paths = true
+   when '--fit-width'
+     fit_width = true
+   when '--no-fit-width'
+     fit_width = false
+   when '--db-host'
+     db_host = arg
+   when '--db-user'
+     db_user = arg
+   when '--db-pass'
+     db_pass = arg
+   when '--db-name'
+     db_name = arg
+   when '--help'
+     usage
+     exit
+   end end
+
+ dbi_string = "DBI:Mysql:#{db_name}:#{db_host}"
+ h = HiveMeta::Connection.new(dbi_string, db_user, db_pass)
+
+ tables = []
+ max_col_width = 8
+
+ ARGV.each do |arg|
+   if arg =~ %r|/| # arguments containing a slash are treated as HDFS paths
+     h.tables(filter_path: arg).each {|t| tables << t}
+   else
+     h.tables(filter_name: arg).each {|t| tables << t}
+   end
+ end
+
+ tables.uniq.sort.each do |table|
+   table.each_col do |col_name|
+     max_col_width = col_name.size if col_name.size > max_col_width
+   end
+ end
+
+ first_table = true
+ tables.each do |table|
+   puts if not first_table and not list_tables
+   puts table
+   first_table = false
+   next if list_tables
+   puts table.path
+   next if list_paths
+   tput_cols = (`tput cols`.chomp.to_i rescue 0) # terminal width, 0 if unknown
+
+   table.each_with_index do |col_name, i|
+     print "%-3d %-#{max_col_width}s" % [i, col_name]
+     if show_comments and table.comments[i]
+       if fit_width and tput_cols > 0
+         width = tput_cols - 3 - 1 - max_col_width - 1
+         width = 0 if width < 0
+         print "%-#{width}.#{width}s" % " \# #{table.comments[i]}"
+       else
+         print " \# #{table.comments[i]}"
+       end
+     end
+     puts
+   end
+ end
data/examples/hivemeta_testrec.rb ADDED
@@ -0,0 +1,74 @@
+ #!/usr/bin/env ruby
+
+ require 'hivemeta'
+ require 'getoptlong'
+
+ db_user = 'hive'
+ db_pass = 'hivepasshere'
+ db_host = 'localhost'
+ db_name = 'hivemeta'
+
+ def usage
+   puts <<-EOF
+ usage: #$0 [options] table_name|hdfs_path
+ -h, --help
+ -u, --db-user=arg # hive metastore db user (requires read access)
+ -p, --db-pass=arg # hive metastore db password
+ -H, --db-host=arg # host running the hive meta db (default: localhost)
+ -d, --db-name=arg # hive meta db name (default: hivemeta)
+   EOF
+ end
+
+ # main
+
+ opts = GetoptLong.new(
+   [ '--db-user', '-u', GetoptLong::REQUIRED_ARGUMENT ],
+   [ '--db-pass', '-p', GetoptLong::REQUIRED_ARGUMENT ],
+   [ '--db-name', '-d', GetoptLong::REQUIRED_ARGUMENT ],
+   [ '--db-host', '-H', GetoptLong::REQUIRED_ARGUMENT ],
+   [ '--help',    '-h', GetoptLong::NO_ARGUMENT ]
+ )
+
+ opts.each do |opt, arg|
+   case opt
+   when '--db-host'
+     db_host = arg
+   when '--db-user'
+     db_user = arg
+   when '--db-pass'
+     db_pass = arg
+   when '--db-name'
+     db_name = arg
+   when '--help'
+     usage
+     exit
+   end end
+
+ dbi_string = "DBI:Mysql:#{db_name}:#{db_host}"
+ h = HiveMeta::Connection.new(dbi_string, db_user, db_pass)
+
+ # test table has the following schema
+ # i col_name
+ # 0 foo
+ # 1 bar
+
+ test_table_name = 'testhive'
+
+ test_table = h.table test_table_name
+
+ begin
+   test_data = "data0\tdata1"
+   row = test_table.process_row test_data
+   p row
+   puts "access via method (best): #{row.foo} | #{row.bar}"
+   puts "access via symbol lookup: #{row[:foo]} | #{row[:bar]}"
+   puts "access via string lookup: #{row['foo']} | #{row['bar']}"
+
+   # this will raise HiveMeta::FieldCountError (too many fields)
+   test_data = "data0\tdata1\tdata2"
+   row = test_table.process_row test_data
+   p row
+ rescue HiveMeta::FieldCountError => e
+   puts e
+   puts "bad data: #{test_data}"
+ end
data/examples/sample-mapper.rb ADDED
@@ -0,0 +1,28 @@
+ # a sample streaming mapper
+ # - reads a fictitious sample_inventory table that has a number of
+ #   fields, one of which is item_id and another is inv_cnt
+ # - outputs the inventory count for all items that have 1000 or more
+
+ require 'hivemeta'
+
+ db_user = 'hive'
+ db_pass = 'hivepasshere'
+ db_host = 'localhost'
+ db_name = 'hivemeta'
+
+ dbi_string = "DBI:Mysql:#{db_name}:#{db_host}"
+ h = HiveMeta::Connection.new(dbi_string, db_user, db_pass)
+
+ inventory = h.table 'sample_inventory'
+
+ STDIN.each_line do |line|
+   begin
+     row = inventory.process_row line
+   rescue HiveMeta::FieldCountError
+     STDERR.puts "reporter:counter:bad_data:row_size,1"
+     next
+   end
+   item_id = row.item_id # can access by method or [:sym] or ['str']
+   count = row.inv_cnt.to_i
+   puts "#{item_id}\t#{count}" if count >= 1000
+ end
data/lib/hivemeta/connection.rb ADDED
@@ -0,0 +1,107 @@
+ require 'dbi'
+ require 'hivemeta/table'
+ require 'hivemeta/record'
+
+ module HiveMeta
+
+ class Connection
+   def initialize(dbi_string = nil, db_user = nil, db_pass = nil)
+     @dbi_string = dbi_string
+     @db_user = db_user
+     @db_pass = db_pass
+
+     begin
+       @dbh = DBI.connect(dbi_string, db_user, db_pass)
+     rescue DBI::DatabaseError => e
+       STDERR.puts "cannot connect to metastore %s:\n error (%s) %s" %
+         [dbi_string, e.err, e.errstr]
+       raise
+     end
+   end
+
+   def query(sql, *args)
+     results = nil
+
+     #puts "sql: #{sql}"
+     #puts "args: #{args}"
+     sth = @dbh.prepare(sql)
+     sth.execute(*args)
+     if block_given?
+       sth.fetch {|row| yield row}
+     else
+       results = []
+       sth.fetch {|row| results << row.dup}
+     end
+     sth.finish
+
+     results # returns nil if a block is given
+   end
+
+   def tables(opts = {})
+     args = nil
+     if opts[:filter_path]
+       sql = "select t.TBL_NAME from TBLS t, SDS s
+              where t.SD_ID = s.SD_ID
+              and s.LOCATION like ?"
+       args = "%#{opts[:filter_path]}%"
+     elsif opts[:filter_name]
+       sql = "select TBL_NAME from TBLS
+              where TBL_NAME like ?"
+       args = opts[:filter_name]
+     else
+       sql = "select TBL_NAME from TBLS"
+     end
+
+     results = query sql, *args
+     table_names = results.map {|result| result[0]}
+
+     #puts "TABLE_NAMES:"
+     #p table_names
+
+     tables = []
+     table_names.each do |name|
+       #puts "NAME: "
+       #p name
+       table = Table.new(name)
+
+       sql = "select c.INTEGER_IDX, c.COLUMN_NAME, c.COMMENT, s.LOCATION
+              from TBLS t, COLUMNS c, SDS s
+              where t.SD_ID = c.SD_ID and t.SD_ID = s.SD_ID and t.TBL_NAME = ?"
+       query sql, name do |rec|
+         #puts "REC:"
+         #p rec
+         col_idx  = rec[0].to_i
+         col_name = rec[1]
+         col_cmt  = rec[2]
+         tbl_loc  = rec[3]
+         table.columns[col_idx]  = col_name
+         table.comments[col_idx] = col_cmt
+         table.path = tbl_loc
+       end
+
+       tables << table
+     end
+     tables
+   end
+
+   def table(name)
+     t = tables(:filter_name => name) # appeasing the old skool 1.8 users
+     t[0] # if it comes back with multiple tables, return the first
+   end
+ end
+
+ end
+
+ # fix for broken row dup in 1.9
+ # http://rubyforge.org/tracker/index.php?func=detail&aid=28624&group_id=234&atid=967
+ module DBI
+   class Row
+     if RUBY_VERSION =~ /^1\.9/
+       def dup
+         row = super
+         row.instance_variable_set :@arr, @arr.dup
+         row
+       end
+     end
+   end
+ end
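A minimal usage sketch for the Connection class above (not part of the gem; the DBI string, credentials, and web_log% names are placeholders): query returns an array of DBI rows when called without a block and yields rows one at a time when given one, while tables filters either by a table-name LIKE pattern or by an HDFS location substring.

  require 'hivemeta'

  h = HiveMeta::Connection.new('DBI:Mysql:hivemeta:localhost', 'hive', 'secret')

  # without a block, query collects and returns the result rows
  h.query("select TBL_NAME from TBLS").each {|r| puts r[0]}

  # with a block, rows are yielded as they are fetched and nil is returned
  h.query("select TBL_NAME from TBLS where TBL_NAME like ?", 'web_log%') do |r|
    puts r[0]
  end

  # :filter_name is a SQL LIKE pattern, :filter_path matches the HDFS location
  h.tables(:filter_path => '/user/hive/warehouse').each {|t| puts "#{t}\t#{t.path}"}
  logs = h.table 'web_logs' # first table whose name matches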
data/lib/hivemeta/record.rb ADDED
@@ -0,0 +1,29 @@
+ module HiveMeta
+
+ class FieldCountError < StandardError ; end
+
+ class Record
+   def initialize(line, table)
+     fields = line.chomp.split(table.delimiter, -1)
+     if fields.size != table.columns.size
+       raise FieldCountError
+     end
+
+     @columns = {}
+     table.each_col_with_index do |col_name, i|
+       @columns[col_name] = fields[i]
+       @columns[col_name.to_sym] = fields[i]
+     end
+   end
+
+   def [](index)
+     "#{@columns[index.to_sym]}"
+   end
+
+   def method_missing(id, *args)
+     return @columns[id] if @columns[id]
+     raise NoMethodError
+   end
+ end
+
+ end
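As a quick, self-contained illustration of the Record behavior above (not part of the gem), a Table can be built by hand with the Table class from lib/hivemeta/table.rb below, so no metastore connection is needed; the column names and sample data here are made up.

  require 'hivemeta/table'
  require 'hivemeta/record'

  # hand-built table: two columns, default tab delimiter
  t = HiveMeta::Table.new('demo')
  t.columns = %w(userid name)

  row = t.process_row "42\tfrank"
  puts row.userid  # "42"    via method_missing
  puts row[:name]  # "frank" via [] with a symbol or string

  begin
    t.process_row "42\tfrank\textra" # three fields against a two-column table
  rescue HiveMeta::FieldCountError
    puts "field count does not match the table schema"
  end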
data/lib/hivemeta/table.rb ADDED
@@ -0,0 +1,53 @@
+ module HiveMeta
+
+ class Table
+   include Comparable
+   include Enumerable
+
+   attr_accessor :path, :columns, :comments, :delimiter
+
+   def initialize(name)
+     @name      = name
+     @path      = nil
+     @columns   = []
+     @comments  = []
+     @delimiter = "\t"
+   end
+
+   def to_s
+     "#{@name}"
+   end
+
+   def each
+     @columns.each_with_index do |column_name, index|
+       yield column_name if column_name
+     end
+   end
+
+   alias :each_col :each
+
+   def each_with_index
+     @columns.each_with_index do |column_name, index|
+       yield column_name, index if column_name
+     end
+   end
+
+   alias :each_col_with_index :each_with_index
+
+   def <=>(other)
+     self.to_s <=> other.to_s
+   end
+
+   # process a row and return a record that can be queried
+   # by column name in a variety of ways
+   def process_row(line)
+     return nil if not line
+     if block_given?
+       yield Record.new(line, self)
+     else
+       return Record.new(line, self)
+     end
+   end
+ end
+
+ end
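A short sketch of the Table conveniences above (illustrative only; the connection details and the sample_inventory / item_id names are the same placeholders the README uses): process_row also accepts a block, and because Table mixes in Comparable with <=> on the name, arrays of tables sort alphabetically.

  require 'hivemeta'

  h = HiveMeta::Connection.new('DBI:Mysql:hivemeta:localhost', 'hive', 'secret')
  inventory = h.table 'sample_inventory'

  STDIN.each_line do |line|
    begin
      inventory.process_row(line) {|row| puts row.item_id} # block form
    rescue HiveMeta::FieldCountError
      next # skip malformed rows
    end
  end

  # Comparable + <=> means tables sort by name
  h.tables.sort.each {|t| puts "#{t}\t#{t.path}"}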
data/lib/hivemeta.rb ADDED
@@ -0,0 +1,5 @@
+ require 'hivemeta/connection'
+
+ module HiveMeta
+   VERSION = '0.0.1'
+ end
metadata ADDED
@@ -0,0 +1,72 @@
+ --- !ruby/object:Gem::Specification
+ name: hivemeta
+ version: !ruby/object:Gem::Version
+   prerelease: false
+   segments:
+   - 0
+   - 0
+   - 1
+   version: 0.0.1
+ platform: ruby
+ authors:
+ - Frank Fejes
+ autorequire:
+ bindir: bin
+ cert_chain: []
+
+ date: 2011-05-01 00:00:00 -05:00
+ default_executable:
+ dependencies: []
+
+ description: Use the hive metadb to write map/reduce and easily query table info.
+ email: frank@fejes.net
+ executables: []
+
+ extensions: []
+
+ extra_rdoc_files: []
+
+ files:
+ - README
+ - CHANGELOG
+ - lib/hivemeta.rb
+ - lib/hivemeta/connection.rb
+ - lib/hivemeta/record.rb
+ - lib/hivemeta/table.rb
+ - examples/hivemeta_query.rb
+ - examples/hivemeta_testrec.rb
+ - examples/sample-mapper.rb
+ has_rdoc: true
+ homepage: https://github.com/fsfiii/hivemeta
+ licenses: []
+
+ post_install_message:
+ rdoc_options: []
+
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       segments:
+       - 0
+       version: "0"
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       segments:
+       - 0
+       version: "0"
+ requirements: []
+
+ rubyforge_project: hivemeta
+ rubygems_version: 1.3.7
+ signing_key:
+ specification_version: 3
+ summary: Use the hive metadb to write map/reduce and query table info.
+ test_files: []
+