hivemeta 0.0.1

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the respective public registry.
data/CHANGELOG ADDED
@@ -0,0 +1,2 @@
+ * 2011-05-01 - fsf
+   - initial import
data/README ADDED
@@ -0,0 +1,71 @@
+ hivemeta
+
+ A Ruby API for access to the Hive metastore. Useful for querying columns
+ in map/reduce applications. Includes a demo application to spit out
+ table information from the command line via table name search or
+ by the table's location in HDFS.
+
+ streaming map/reduce code snippet:
+
+   require 'hivemeta'
+
+   h = HiveMeta::Connection.new(...) # see sample-mapper.rb for detail
+   inventory = h.table 'sample_inventory'
+
+   STDIN.each_line do |line|
+     begin
+       row = inventory.process_row line
+     rescue HiveMeta::FieldCountError
+       STDERR.puts "reporter:counter:bad_data:row_size,1"
+       next
+     end
+     item_id = row.item_id # can access by method or [:sym] or ['str']
+     count = row.inv_cnt.to_i
+     puts "#{item_id}\t#{count}" if count >= 1000
+   end
+
+
+ sample usage for the demo app:
+
+   # query by table name
+   $ hivemeta_query.rb join_test_name
+   join_test_name
+   hdfs://namenode/tmp/join_test_name
+   0   userid   # userid
+   1   name     # username
+
+   # query by table name wildcards
+   $ hivemeta_query.rb join_test%
+   join_test_address
+   hdfs://namenode/tmp/join_test_address
+   0   userid   # uid
+   1   address
+   2   city
+   3   state
+
+   join_test_name
+   hdfs://namenode/tmp/join_test_name
+   0   userid   # userid
+   1   name     # username
+
+   # list the tables using /tmp in HDFS
+   $ hivemeta_query.rb -l /tmp
+   join_test_address
+   join_test_work
+   my_test_table
+
+   # view usage information
+   $ hivemeta_query.rb -h
+   usage: ./hivemeta_query.rb [options] table_name|hdfs_path
+   -h, --help
+   -c, --comments       # display comments along with field detail (default)
+   -C, --no-comments    # do not display comments with the field detail
+   -l, --list-tables    # list matching tables but no detail
+   -f, --list-file-path # list the table HDFS file locations
+   -w, --fit-width      # fit the text to the width of the screen (default)
+   -W, --no-fit-width   # do not fit the text to the width of the screen
+   -u, --db-user=arg    # hive metastore db user (requires read access)
+   -p, --db-pass=arg    # hive metastore db password
+   -H, --db-host=arg    # host running the hive meta db (default: localhost)
+   -d, --db-name=arg    # hive meta db name (default: hivemeta)
+
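The README's usage listing above also covers the metastore connection flags. As a purely illustrative example (host, credentials, and table pattern are placeholders, not values from the gem), the demo app can be pointed at a remote metastore like so:

  $ hivemeta_query.rb -H metadb.example.com -u hive -p secret -d hivemeta 'web_log%'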
data/examples/hivemeta_query.rb ADDED
@@ -0,0 +1,118 @@
+ #!/usr/bin/env ruby
+
+ require 'hivemeta'
+ require 'getoptlong'
+
+ db_user = 'hive'
+ db_pass = 'hivepasshere'
+ db_host = 'localhost'
+ db_name = 'hivemeta'
+
+ def usage
+   puts <<-EOF
+ usage: #$0 [options] table_name|hdfs_path
+ -h, --help
+ -c, --comments       # display comments along with field detail (default)
+ -C, --no-comments    # do not display comments with the field detail
+ -l, --list-tables    # list matching tables but no detail
+ -f, --list-file-path # list the table HDFS file locations
+ -w, --fit-width      # fit the text to the width of the screen (default)
+ -W, --no-fit-width   # do not fit the text to the width of the screen
+ -u, --db-user=arg    # hive metastore db user (requires read access)
+ -p, --db-pass=arg    # hive metastore db password
+ -H, --db-host=arg    # host running the hive meta db (default: localhost)
+ -d, --db-name=arg    # hive meta db name (default: hivemeta)
+   EOF
+ end
+
+ # main
+
+ opts = GetoptLong.new(
+   [ '--comments',       '-c', GetoptLong::NO_ARGUMENT ],
+   [ '--no-comments',    '-C', GetoptLong::NO_ARGUMENT ],
+   [ '--list-tables',    '-l', GetoptLong::NO_ARGUMENT ],
+   [ '--list-file-path', '-f', GetoptLong::NO_ARGUMENT ],
+   [ '--fit-width',      '-w', GetoptLong::NO_ARGUMENT ],
+   [ '--no-fit-width',   '-W', GetoptLong::NO_ARGUMENT ],
+   [ '--db-user',        '-u', GetoptLong::REQUIRED_ARGUMENT ],
+   [ '--db-pass',        '-p', GetoptLong::REQUIRED_ARGUMENT ],
+   [ '--db-name',        '-d', GetoptLong::REQUIRED_ARGUMENT ],
+   [ '--db-host',        '-H', GetoptLong::REQUIRED_ARGUMENT ],
+   [ '--help',           '-h', GetoptLong::NO_ARGUMENT ]
+ )
+
+ show_comments = true
+ list_tables   = false
+ list_paths    = false
+ fit_width     = true
+ opts.each do |opt, arg|
+   case opt
+   when '--comments'
+     show_comments = true
+   when '--no-comments'
+     show_comments = false
+   when '--list-tables'
+     list_tables = true
+   when '--list-file-path'
+     list_paths = true
+   when '--fit-width'
+     fit_width = true
+   when '--no-fit-width'
+     fit_width = false
+   when '--db-host'
+     db_host = arg
+   when '--db-user'
+     db_user = arg
+   when '--db-pass'
+     db_pass = arg
+   when '--db-name'
+     db_name = arg
+   when '--help'
+     usage
+     exit
+   end end
+
+ dbi_string = "DBI:Mysql:#{db_name}:#{db_host}"
+ h = HiveMeta::Connection.new(dbi_string, db_user, db_pass)
+
+ tables = []
+ max_col_width = 8
+
+ ARGV.each do |arg|
+   if arg =~ %r|/| # arguments containing a slash are treated as HDFS paths
+     h.tables(filter_path: arg).each {|t| tables << t}
+   else
+     h.tables(filter_name: arg).each {|t| tables << t}
+   end
+ end
+
+ tables.uniq.sort.each do |table|
+   table.each_col do |col_name|
+     max_col_width = col_name.size if col_name.size > max_col_width
+   end
+ end
+
+ first_table = true
+ tables.each do |table|
+   puts if not first_table and not list_tables
+   puts table
+   first_table = false
+   next if list_tables
+   puts table.path
+   next if list_paths
+   tput_cols = (`tput cols`.chomp.to_i rescue 0) # terminal width, 0 if unknown
+
+   table.each_with_index do |col_name, i|
+     print "%-3d %-#{max_col_width}s" % [i, col_name]
+     if show_comments and table.comments[i]
+       if fit_width and tput_cols > 0
+         width = tput_cols - 3 - 1 - max_col_width - 1
+         width = 0 if width < 0
+         print "%-#{width}.#{width}s" % " \# #{table.comments[i]}"
+       else
+         print " \# #{table.comments[i]}"
+       end
+     end
+     puts
+   end
+ end
data/examples/hivemeta_testrec.rb ADDED
@@ -0,0 +1,74 @@
+ #!/usr/bin/env ruby
+
+ require 'hivemeta'
+ require 'getoptlong'
+
+ db_user = 'hive'
+ db_pass = 'hivepasshere'
+ db_host = 'localhost'
+ db_name = 'hivemeta'
+
+ def usage
+   puts <<-EOF
+ usage: #$0 [options] table_name|hdfs_path
+ -h, --help
+ -u, --db-user=arg # hive metastore db user (requires read access)
+ -p, --db-pass=arg # hive metastore db password
+ -H, --db-host=arg # host running the hive meta db (default: localhost)
+ -d, --db-name=arg # hive meta db name (default: hivemeta)
+   EOF
+ end
+
+ # main
+
+ opts = GetoptLong.new(
+   [ '--db-user', '-u', GetoptLong::REQUIRED_ARGUMENT ],
+   [ '--db-pass', '-p', GetoptLong::REQUIRED_ARGUMENT ],
+   [ '--db-name', '-d', GetoptLong::REQUIRED_ARGUMENT ],
+   [ '--db-host', '-H', GetoptLong::REQUIRED_ARGUMENT ],
+   [ '--help',    '-h', GetoptLong::NO_ARGUMENT ]
+ )
+
+ opts.each do |opt, arg|
+   case opt
+   when '--db-host'
+     db_host = arg
+   when '--db-user'
+     db_user = arg
+   when '--db-pass'
+     db_pass = arg
+   when '--db-name'
+     db_name = arg
+   when '--help'
+     usage
+     exit
+   end end
+
+ dbi_string = "DBI:Mysql:#{db_name}:#{db_host}"
+ h = HiveMeta::Connection.new(dbi_string, db_user, db_pass)
+
+ # test table has the following schema
+ # i col_name
+ # 0 foo
+ # 1 bar
+
+ test_table_name = 'testhive'
+
+ test_table = h.table test_table_name
+
+ begin
+   test_data = "data0\tdata1"
+   row = test_table.process_row test_data
+   p row
+   puts "access via method (best): #{row.foo} | #{row.bar}"
+   puts "access via symbol lookup: #{row[:foo]} | #{row[:bar]}"
+   puts "access via string lookup: #{row['foo']} | #{row['bar']}"
+
+   # this will raise HiveMeta::FieldCountError (too many fields)
+   test_data = "data0\tdata1\tdata2"
+   row = test_table.process_row test_data
+   p row
+ rescue HiveMeta::FieldCountError => e
+   puts e
+   puts "bad data: #{test_data}"
+ end
data/examples/sample-mapper.rb ADDED
@@ -0,0 +1,28 @@
+ # a sample streaming mapper
+ # - reads a fictitious sample_inventory table that has a number of
+ #   fields, one of which is item_id and another is inv_cnt
+ # - outputs the inventory count for all items that have 1000 or more
+
+ require 'hivemeta'
+
+ db_user = 'hive'
+ db_pass = 'hivepasshere'
+ db_host = 'localhost'
+ db_name = 'hivemeta'
+
+ dbi_string = "DBI:Mysql:#{db_name}:#{db_host}"
+ h = HiveMeta::Connection.new(dbi_string, db_user, db_pass)
+
+ inventory = h.table 'sample_inventory'
+
+ STDIN.each_line do |line|
+   begin
+     row = inventory.process_row line
+   rescue HiveMeta::FieldCountError
+     STDERR.puts "reporter:counter:bad_data:row_size,1"
+     next
+   end
+   item_id = row.item_id # can access by method or [:sym] or ['str']
+   count = row.inv_cnt.to_i
+   puts "#{item_id}\t#{count}" if count >= 1000
+ end
data/lib/hivemeta/connection.rb ADDED
@@ -0,0 +1,107 @@
+ require 'dbi'
+ require 'hivemeta/table'
+ require 'hivemeta/record'
+
+ module HiveMeta
+
+ class Connection
+   def initialize(dbi_string = nil, db_user = nil, db_pass = nil)
+     @dbi_string = dbi_string
+     @db_user = db_user
+     @db_pass = db_pass
+
+     begin
+       @dbh = DBI.connect(dbi_string, db_user, db_pass)
+     rescue DBI::DatabaseError => e
+       STDERR.puts "cannot connect to metastore %s:\n error (%s) %s" %
+         [dbi_string, e.err, e.errstr]
+       raise
+     end
+   end
+
+   def query(sql, *args)
+     results = nil
+
+     #puts "sql: #{sql}"
+     #puts "args: #{args}"
+     sth = @dbh.prepare(sql)
+     sth.execute(*args)
+     if block_given?
+       sth.fetch {|row| yield row}
+     else
+       results = []
+       sth.fetch {|row| results << row.dup}
+     end
+     sth.finish
+
+     results # returns nil if a block is given
+   end
+
+   def tables(opts = {})
+     args = nil
+     if opts[:filter_path]
+       sql = "select t.TBL_NAME from TBLS t, SDS s
+              where t.SD_ID = s.SD_ID
+              and s.LOCATION like ?"
+       args = "%#{opts[:filter_path]}%"
+     elsif opts[:filter_name]
+       sql = "select TBL_NAME from TBLS
+              where TBL_NAME like ?"
+       args = opts[:filter_name]
+     else
+       sql = "select TBL_NAME from TBLS"
+     end
+
+     results = query sql, *args
+     table_names = results.map {|result| result[0]}
+
+     #puts "TABLE_NAMES:"
+     #p table_names
+
+     tables = []
+     table_names.each do |name|
+       #puts "NAME: "
+       #p name
+       table = Table.new(name)
+
+       sql = "select c.INTEGER_IDX, c.COLUMN_NAME, c.COMMENT, s.LOCATION
+              from TBLS t, COLUMNS c, SDS s
+              where t.SD_ID = c.SD_ID and t.SD_ID = s.SD_ID and t.TBL_NAME = ?"
+       query sql, name do |rec|
+         #puts "REC:"
+         #p rec
+         col_idx  = rec[0].to_i
+         col_name = rec[1]
+         col_cmt  = rec[2]
+         tbl_loc  = rec[3]
+         table.columns[col_idx]  = col_name
+         table.comments[col_idx] = col_cmt
+         table.path = tbl_loc
+       end
+
+       tables << table
+     end
+     tables
+   end
+
+   def table(name)
+     t = tables(:filter_name => name) # appeasing the old skool 1.8 users
+     t[0] # if it comes back with multiple tables, return the first
+   end
+ end
+
+ end
+
+ # fix for broken row dup in 1.9
+ # http://rubyforge.org/tracker/index.php?func=detail&aid=28624&group_id=234&atid=967
+ module DBI
+   class Row
+     if RUBY_VERSION =~ /^1\.9/
+       def dup
+         row = super
+         row.instance_variable_set :@arr, @arr.dup
+         row
+       end
+     end
+   end
+ end
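A minimal usage sketch for the Connection class above (not part of the gem; the DBI string, credentials, and web_log% names are placeholders): query returns an array of DBI rows when called without a block and yields rows one at a time when given one, while tables filters either by a table-name LIKE pattern or by an HDFS location substring.

  require 'hivemeta'

  h = HiveMeta::Connection.new('DBI:Mysql:hivemeta:localhost', 'hive', 'secret')

  # without a block, query collects and returns the result rows
  h.query("select TBL_NAME from TBLS").each {|r| puts r[0]}

  # with a block, rows are yielded as they are fetched and nil is returned
  h.query("select TBL_NAME from TBLS where TBL_NAME like ?", 'web_log%') do |r|
    puts r[0]
  end

  # :filter_name is a SQL LIKE pattern, :filter_path matches the HDFS location
  h.tables(:filter_path => '/user/hive/warehouse').each {|t| puts "#{t}\t#{t.path}"}
  logs = h.table 'web_logs' # first table whose name matches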
data/lib/hivemeta/record.rb ADDED
@@ -0,0 +1,29 @@
+ module HiveMeta
+
+ class FieldCountError < StandardError ; end
+
+ class Record
+   def initialize(line, table)
+     fields = line.chomp.split(table.delimiter, -1)
+     if fields.size != table.columns.size
+       raise FieldCountError
+     end
+
+     @columns = {}
+     table.each_col_with_index do |col_name, i|
+       @columns[col_name] = fields[i]
+       @columns[col_name.to_sym] = fields[i]
+     end
+   end
+
+   def [](index)
+     "#{@columns[index.to_sym]}"
+   end
+
+   def method_missing(id, *args)
+     return @columns[id] if @columns[id]
+     raise NoMethodError
+   end
+ end
+
+ end
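As a quick, self-contained illustration of the Record behavior above (not part of the gem), a Table can be built by hand with the Table class from lib/hivemeta/table.rb below, so no metastore connection is needed; the column names and sample data here are made up.

  require 'hivemeta/table'
  require 'hivemeta/record'

  # hand-built table: two columns, default tab delimiter
  t = HiveMeta::Table.new('demo')
  t.columns = %w(userid name)

  row = t.process_row "42\tfrank"
  puts row.userid  # "42"    via method_missing
  puts row[:name]  # "frank" via [] with a symbol or string

  begin
    t.process_row "42\tfrank\textra" # three fields against a two-column table
  rescue HiveMeta::FieldCountError
    puts "field count does not match the table schema"
  end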
data/lib/hivemeta/table.rb ADDED
@@ -0,0 +1,53 @@
+ module HiveMeta
+
+ class Table
+   include Comparable
+   include Enumerable
+
+   attr_accessor :path, :columns, :comments, :delimiter
+
+   def initialize(name)
+     @name      = name
+     @path      = nil
+     @columns   = []
+     @comments  = []
+     @delimiter = "\t"
+   end
+
+   def to_s
+     "#{@name}"
+   end
+
+   def each
+     @columns.each_with_index do |column_name, index|
+       yield column_name if column_name
+     end
+   end
+
+   alias :each_col :each
+
+   def each_with_index
+     @columns.each_with_index do |column_name, index|
+       yield column_name, index if column_name
+     end
+   end
+
+   alias :each_col_with_index :each_with_index
+
+   def <=>(other)
+     self.to_s <=> other.to_s
+   end
+
+   # process a row and return a record that can be queried
+   # by column name in a variety of ways
+   def process_row(line)
+     return nil if not line
+     if block_given?
+       yield Record.new(line, self)
+     else
+       return Record.new(line, self)
+     end
+   end
+ end
+
+ end
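A short sketch of the Table conveniences above (illustrative only; the connection details and the sample_inventory / item_id names are the same placeholders the README uses): process_row also accepts a block, and because Table mixes in Comparable with <=> on the name, arrays of tables sort alphabetically.

  require 'hivemeta'

  h = HiveMeta::Connection.new('DBI:Mysql:hivemeta:localhost', 'hive', 'secret')
  inventory = h.table 'sample_inventory'

  STDIN.each_line do |line|
    begin
      inventory.process_row(line) {|row| puts row.item_id} # block form
    rescue HiveMeta::FieldCountError
      next # skip malformed rows
    end
  end

  # Comparable + <=> means tables sort by name
  h.tables.sort.each {|t| puts "#{t}\t#{t.path}"}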
data/lib/hivemeta.rb ADDED
@@ -0,0 +1,5 @@
+ require 'hivemeta/connection'
+
+ module HiveMeta
+   VERSION = '0.0.1'
+ end
metadata ADDED
@@ -0,0 +1,72 @@
+ --- !ruby/object:Gem::Specification
+ name: hivemeta
+ version: !ruby/object:Gem::Version
+   prerelease: false
+   segments:
+   - 0
+   - 0
+   - 1
+   version: 0.0.1
+ platform: ruby
+ authors:
+ - Frank Fejes
+ autorequire:
+ bindir: bin
+ cert_chain: []
+
+ date: 2011-05-01 00:00:00 -05:00
+ default_executable:
+ dependencies: []
+
+ description: Use the hive metadb to write map/reduce and easily query table info.
+ email: frank@fejes.net
+ executables: []
+
+ extensions: []
+
+ extra_rdoc_files: []
+
+ files:
+ - README
+ - CHANGELOG
+ - lib/hivemeta.rb
+ - lib/hivemeta/connection.rb
+ - lib/hivemeta/record.rb
+ - lib/hivemeta/table.rb
+ - examples/hivemeta_query.rb
+ - examples/hivemeta_testrec.rb
+ - examples/sample-mapper.rb
+ has_rdoc: true
+ homepage: https://github.com/fsfiii/hivemeta
+ licenses: []
+
+ post_install_message:
+ rdoc_options: []
+
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       segments:
+       - 0
+       version: "0"
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       segments:
+       - 0
+       version: "0"
+ requirements: []
+
+ rubyforge_project: hivemeta
+ rubygems_version: 1.3.7
+ signing_key:
+ specification_version: 3
+ summary: Use the hive metadb to write map/reduce and query table info.
+ test_files: []
+