hivemeta 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGELOG ADDED
@@ -0,0 +1,2 @@
1
+ * 2011-05-01 - fsf
2
+ - initial import
data/README ADDED
@@ -0,0 +1,71 @@
1
+ hivemeta
2
+
3
+ A ruby API for access to the Hive metastore. Useful for querying columns
4
+ in map/reduce applications. Includes a demo application to spit out
5
+ table information from the command-line via table name search or
6
+ by the table's location in HDFS.
7
+
8
+ streaming map/reduce code snippet:
9
+
10
+ require 'hivemeta'
11
+
12
+ h = HiveMeta::Connection.new(...) # see sample-mapper.rb for detail
13
+ inventory = h.table 'sample_inventory'
14
+
15
+ STDIN.each_line do |line|
16
+ begin
17
+ row = inventory.process_row line
18
+ rescue HiveMeta::FieldCountError
19
+ STDERR.puts "reporter:counter:bad_data:row_size,1"
20
+ next
21
+ end
22
+ item_id = row.item_id # can access by method or [:sym] or ['str']
23
+ count = row.inv_cnt.to_i
24
+ puts "#{item_id}\t#{count}" if count >= 1000
25
+ end
26
+
27
+
28
+ sample usage for the demo app:
29
+
30
+ # query by table names
31
+ $ hivemeta_query.rb join_test_name
32
+ join_test_name
33
+ hdfs://namenode/tmp/join_test_name
34
+ 0 userid # userid
35
+ 1 name # username
36
+
37
+ # query by table name wildcards
38
+ $ hivemeta_query.rb join_test%
39
+ join_test_address
40
+ hdfs://namenode/tmp/join_test_address
41
+ 0 userid # uid
42
+ 1 address
43
+ 2 city
44
+ 3 state
45
+
46
+ join_test_name
47
+ hdfs://namenode/tmp/join_test_name
48
+ 0 userid # userid
49
+ 1 name # username
50
+
51
+ # list the tables using /tmp in HDFS
52
+ $ hivemeta_query.rb -l /tmp
53
+ join_test_address
54
+ join_test_work
55
+ my_test_table
56
+
57
+ # view usage information
58
+ $ hivemeta_query.rb -h
59
+ usage: ./hivemeta_query.rb [options] table_name|hdfs_path
60
+ -h, --help
61
+ -c, --comments # display comments along with field detail (default)
62
+ -C, --no-comments # do not display comments with the field detail
63
+ -l, --list-tables # list matching tables but no detail
64
+ -f, --list-file-path # list the table HDFS file locations
65
+ -w, --fit-width # fit the text to the width of the screen (default)
66
+ -W, --no-fit-width # do not fit the text to the width of the screen
67
+ -u, --db-user=arg # hive metastore db user (requires read access)
68
+ -p, --db-pass=arg # hive metastore db password
69
+ -H, --db-host=arg # host running the hive meta db (default: localhost)
70
+ -d, --db-name=arg # hive meta db name (default: hivemeta)
71
+
#!/usr/bin/env ruby
#
# hivemeta_query.rb - display Hive table schema information from the
# metastore, searching by table name (SQL wildcards allowed) or by a
# fragment of the table's HDFS path.

require 'hivemeta'
require 'getoptlong'

db_user = 'hive'
db_pass = 'hivepasshere'
db_host = 'localhost'
db_name = 'hivemeta'

# Print command usage to stdout.
def usage
  puts <<-EOF
usage: #$0 [options] table_name|hdfs_path
-h, --help
-c, --comments # display comments along with field detail (default)
-C, --no-comments # do not display comments with the field detail
-l, --list-tables # list matching tables but no detail
-f, --list-file-path # list the table HDFS file locations
-w, --fit-width # fit the text to the width of the screen (default)
-W, --no-fit-width # do not fit the text to the width of the screen
-u, --db-user=arg # hive metastore db user (requires read access)
-p, --db-pass=arg # hive metastore db password
-H, --db-host=arg # host running the hive meta db (default: localhost)
-d, --db-name=arg # hive meta db name (default: hivemeta)
  EOF
end

# main

opts = GetoptLong.new(
  [ '--comments',       '-c', GetoptLong::NO_ARGUMENT ],
  [ '--no-comments',    '-C', GetoptLong::NO_ARGUMENT ],
  [ '--list-tables',    '-l', GetoptLong::NO_ARGUMENT ],
  [ '--list-file-path', '-f', GetoptLong::NO_ARGUMENT ],
  [ '--fit-width',      '-w', GetoptLong::NO_ARGUMENT ],
  [ '--no-fit-width',   '-W', GetoptLong::NO_ARGUMENT ],
  [ '--db-user',        '-u', GetoptLong::REQUIRED_ARGUMENT ],
  [ '--db-pass',        '-p', GetoptLong::REQUIRED_ARGUMENT ],
  [ '--db-name',        '-d', GetoptLong::REQUIRED_ARGUMENT ],
  [ '--db-host',        '-H', GetoptLong::REQUIRED_ARGUMENT ],
  [ '--help',           '-h', GetoptLong::NO_ARGUMENT ]
)

show_comments = true
list_tables   = false
list_paths    = false
fit_width     = true

opts.each do |opt, arg|
  case opt
  when '--comments'       then show_comments = true
  when '--no-comments'    then show_comments = false
  when '--list-tables'    then list_tables = true
  when '--list-file-path' then list_paths = true
  when '--fit-width'      then fit_width = true
  when '--no-fit-width'   then fit_width = false
  when '--db-host'        then db_host = arg
  when '--db-user'        then db_user = arg
  when '--db-pass'        then db_pass = arg
  when '--db-name'        then db_name = arg
  when '--help'
    usage
    exit
  end
end

dbi_string = "DBI:Mysql:#{db_name}:#{db_host}"
h = HiveMeta::Connection.new(dbi_string, db_user, db_pass)

# collect tables matching each command-line argument: args containing a
# slash are treated as HDFS path fragments, anything else as a table
# name pattern (SQL wildcards such as % are passed through)
tables = []
ARGV.each do |arg|
  if arg =~ %r|/|
    h.tables(filter_path: arg).each {|t| tables << t}
  else
    h.tables(filter_name: arg).each {|t| tables << t}
  end
end

# fix: dedupe/sort once and use the same list everywhere; the original
# measured widths over tables.uniq.sort but printed the raw list, so
# duplicate arguments produced duplicate, unsorted output
tables = tables.uniq.sort

# find the widest column name so the output columns line up
max_col_width = 8
tables.each do |table|
  table.each_col do |col_name|
    max_col_width = col_name.size if col_name.size > max_col_width
  end
end

# terminal width for comment truncation; 0 disables fitting (hoisted out
# of the loop -- shelling out to tput per table was loop-invariant work)
tput_cols = `tput cols`.chomp.to_i rescue 0

first_table = true
tables.each do |table|
  puts if not first_table and not list_tables
  puts table
  first_table = false
  next if list_tables
  puts table.path
  next if list_paths

  table.each_with_index do |col_name, i|
    print "%-3d %-#{max_col_width}s" % [i, col_name]
    if show_comments and table.comments[i]
      if fit_width and tput_cols > 0
        # leave room for the index (3), separators, and the column name
        width = tput_cols - 3 - 1 - max_col_width - 1
        width = 0 if width < 0
        print "%-#{width}.#{width}s" % " \# #{table.comments[i]}"
      else
        print " \# #{table.comments[i]}"
      end
    end
    puts
  end
end
#!/usr/bin/env ruby
#
# hivemeta_testrec.rb - exercises HiveMeta record processing against a
# small test table, demonstrating the three field-access styles and the
# FieldCountError raised for malformed rows.

require 'hivemeta'
require 'getoptlong'

db_user = 'hive'
db_pass = 'hivepasshere'
db_host = 'localhost'
db_name = 'hivemeta'

# Print command usage to stdout.
def usage
  puts <<-EOF
usage: #$0 [options] table_name|hdfs_path
-h, --help
-u, --db-user=arg # hive metastore db user (requires read access)
-p, --db-pass=arg # hive metastore db password
-H, --db-host=arg # host running the hive meta db (default: localhost)
-d, --db-name=arg # hive meta db name (default: hivemeta)
  EOF
end

# main

opts = GetoptLong.new(
  [ '--db-user', '-u', GetoptLong::REQUIRED_ARGUMENT ],
  [ '--db-pass', '-p', GetoptLong::REQUIRED_ARGUMENT ],
  [ '--db-name', '-d', GetoptLong::REQUIRED_ARGUMENT ],
  [ '--db-host', '-H', GetoptLong::REQUIRED_ARGUMENT ],
  [ '--help',    '-h', GetoptLong::NO_ARGUMENT ]
)

opts.each do |opt, arg|
  case opt
  when '--db-host' then db_host = arg
  when '--db-user' then db_user = arg
  when '--db-pass' then db_pass = arg
  when '--db-name' then db_name = arg
  when '--help'
    usage
    exit
  end
end

dbi_string = "DBI:Mysql:#{db_name}:#{db_host}"
h = HiveMeta::Connection.new(dbi_string, db_user, db_pass)

# test table has the following schema
#   i  col_name
#   0  foo
#   1  bar

test_table_name = 'testhive'

test_table = h.table test_table_name

begin
  # a well-formed two-field row
  test_data = "data0\tdata1"
  row = test_table.process_row test_data
  p row
  puts "access via method (best): #{row.foo} | #{row.bar}"
  puts "access via symbol lookup: #{row[:foo]} | #{row[:bar]}"
  puts "access via string lookup: #{row['foo']} | #{row['bar']}"

  # this will bomb: three fields against a two-column schema
  test_data = "data0\tdata1\tdata2"
  row = test_table.process_row test_data
  p row
rescue HiveMeta::FieldCountError => e
  puts e
  puts "bad data: #{test_data}"
end
# a sample streaming mapper
# - reads a fictitious sample_inventory table that has a number of
#   fields, one of which is item_id and another is inv_cnt
# - outputs the inventory count for all items that have 1000 or more

require 'hivemeta'

db_user = 'hive'
db_pass = 'hivepasshere'
db_host = 'localhost'
db_name = 'hivemeta'

dbi_string = "DBI:Mysql:#{db_name}:#{db_host}"
h = HiveMeta::Connection.new(dbi_string, db_user, db_pass)

inventory = h.table 'sample_inventory'

STDIN.each_line do |line|
  begin
    # fix: process the row via the table looked up above; the original
    # referenced an undefined inv_table variable and raised NameError
    row = inventory.process_row line
  rescue HiveMeta::FieldCountError
    # report malformed rows to Hadoop via a streaming counter, skip them
    STDERR.puts "reporter:counter:bad_data:row_size,1"
    next
  end
  item_id = row.item_id # can access by method or [:sym] or ['str']
  count   = row.inv_cnt.to_i
  puts "#{item_id}\t#{count}" if count >= 1000
end
require 'dbi'
require 'hivemeta/table'
require 'hivemeta/record'

module HiveMeta

  # Read-only connection to the Hive metastore database (via DBI),
  # used to look up table schema information.
  class Connection
    # dbi_string - DBI connect string, e.g. "DBI:Mysql:hivemeta:localhost"
    # db_user    - metastore db user (needs read access to the meta tables)
    # db_pass    - metastore db password
    #
    # Connects immediately; on failure logs a diagnostic to stderr and
    # re-raises the DBI::DatabaseError.
    def initialize(dbi_string = nil, db_user = nil, db_pass = nil)
      @dbi_string = dbi_string
      @db_user = db_user
      @db_pass = db_pass

      begin
        @dbh = DBI.connect(dbi_string, db_user, db_pass)
      rescue DBI::DatabaseError => e
        STDERR.puts "cannot connect to metastore %s:\n error (%s) %s" %
          [dbi_string, e.err, e.errstr]
        raise
      end
    end

    # Run a parameterized SQL query.  With a block, yields each result
    # row and returns nil; without one, returns an array of row copies.
    def query(sql, *args)
      results = nil

      sth = @dbh.prepare(sql)
      begin
        sth.execute(*args)
        if block_given?
          sth.fetch {|row| yield row}
        else
          results = []
          sth.fetch {|row| results << row.dup}
        end
      ensure
        # fix: always release the statement handle; the original leaked
        # it whenever execute or the caller's block raised
        sth.finish
      end

      results # returns nil if a block is given
    end

    # Return an array of Table objects, optionally filtered:
    #   :filter_path - substring match against the table's HDFS location
    #   :filter_name - SQL LIKE pattern against the table name
    def tables(opts = {})
      args = nil
      if opts[:filter_path]
        sql = "select t.TBL_NAME from TBLS t, SDS s
               where t.SD_ID = s.SD_ID
               and s.LOCATION like ?"
        args = "%#{opts[:filter_path]}%"
      elsif opts[:filter_name]
        sql = "select TBL_NAME from TBLS
               where TBL_NAME like ?"
        args = opts[:filter_name]
      else
        sql = "select TBL_NAME from TBLS"
      end

      results = query sql, *args
      table_names = results.map {|result| result[0]}

      tables = []
      table_names.each do |name|
        table = Table.new(name)

        # pull every column (index, name, comment) plus the HDFS
        # location for this table in a single query
        sql = "select c.INTEGER_IDX, c.column_name, c.COMMENT, s.LOCATION
               from TBLS t, COLUMNS c, SDS s
               where t.SD_ID = c.SD_ID and t.SD_ID = s.SD_ID and t.TBL_NAME = ?"
        query sql, name do |rec|
          col_idx = rec[0].to_i
          table.columns[col_idx]  = rec[1]
          table.comments[col_idx] = rec[2]
          table.path = rec[3] # identical on every row of this table
        end

        tables << table
      end
      tables
    end

    # Look up a single table by name (SQL wildcards allowed); returns
    # the first match, or nil if there is none.
    def table(name)
      t = tables(:filter_name => name) # appeasing the old skool 1.8 users
      t[0] # if it comes back with multiple tables, return the first
    end
  end

end
# fix for broken row dup in 1.9
# http://rubyforge.org/tracker/index.php?func=detail&aid=28624&group_id=234&atid=967
module DBI
  class Row
    # Only Ruby 1.9 needs the patch: Row#dup there fails to copy the
    # underlying @arr, so duplicated rows share mutable state.
    if RUBY_VERSION =~ /^1\.9/
      def dup
        copy = super
        copy.instance_variable_set :@arr, @arr.dup
        copy
      end
    end
  end
end
module HiveMeta

  # Raised when a data row's field count does not match the table schema.
  class FieldCountError < StandardError ; end

  # Wraps one delimited data row, allowing field access by method name,
  # symbol, or string column name.
  class Record
    # line  - raw input line (a trailing newline is stripped)
    # table - schema object responding to #delimiter, #columns and
    #         #each_col_with_index (see HiveMeta::Table)
    #
    # Raises FieldCountError when the number of fields in the line
    # differs from the table's column count.
    def initialize(line, table)
      # -1 keeps trailing empty fields so the count check is accurate
      fields = line.chomp.split(table.delimiter, -1)
      if fields.size != table.columns.size
        raise FieldCountError,
              "expected #{table.columns.size} fields, got #{fields.size}"
      end

      @columns = {}
      table.each_col_with_index do |col_name, i|
        @columns[col_name] = fields[i]        # string key
        @columns[col_name.to_sym] = fields[i] # symbol key
      end
    end

    # Field lookup by string or symbol column name.  Unknown columns
    # yield "" (kept for backward compatibility with prior behavior).
    def [] index
      "#{@columns[index.to_sym]}"
    end

    # Field access by column name as a method call, e.g. row.user_id.
    # fix: defer to super for unknown names so the NoMethodError carries
    # the usual message/receiver info, and test membership with key?
    # rather than value truthiness.
    def method_missing(id, *args)
      return @columns[id] if @columns.key?(id)
      super
    end

    # Keep respond_to? consistent with method_missing.
    def respond_to_missing?(id, include_private = false)
      @columns.key?(id) || super
    end
  end

end
module HiveMeta

  # Schema information for a single Hive table: its HDFS path, ordered
  # column names, per-column comments and the field delimiter.
  class Table
    include Comparable
    include Enumerable

    attr_accessor :path, :columns, :comments, :delimiter

    def initialize(name)
      @name = name
      @path = nil
      @columns = []     # column names indexed by column position
      @comments = []    # comment strings parallel to @columns
      @delimiter = "\t" # default hive field separator
    end

    def to_s
      "#{@name}"
    end

    # Yields each defined column name in positional order; sparse (nil)
    # slots are skipped.  fix: returns an Enumerator when no block is
    # given, as Enumerable consumers expect (the original raised
    # LocalJumpError).
    def each
      return enum_for(:each) unless block_given?
      @columns.each do |column_name|
        yield column_name if column_name
      end
    end

    alias :each_col :each

    # Yields [column_name, position] pairs; positions are the actual
    # column indexes, so gaps in a sparse schema are preserved.
    # NOTE: intentionally shadows Enumerable#each_with_index, which
    # would otherwise renumber the yielded names sequentially.
    def each_with_index
      return enum_for(:each_with_index) unless block_given?
      @columns.each_with_index do |column_name, index|
        yield column_name, index if column_name
      end
    end

    alias :each_col_with_index :each_with_index

    # Tables sort and compare by name.
    def <=>(other)
      self.to_s <=> other.to_s
    end

    # process a row and return a record that can be queried
    # by column name in a variety of ways
    def process_row(line)
      return nil if not line
      if block_given?
        yield Record.new(line, self)
      else
        return Record.new(line, self)
      end
    end
  end

end
data/lib/hivemeta.rb ADDED
require 'hivemeta/connection'

# Top-level namespace for the hivemeta gem; requiring the connection
# pulls in the table and record classes as well.
module HiveMeta
  # Gem release version.
  VERSION = '0.0.1'
end
metadata ADDED
@@ -0,0 +1,72 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: hivemeta
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 0
8
+ - 1
9
+ version: 0.0.1
10
+ platform: ruby
11
+ authors:
12
+ - Frank Fejes
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2011-05-01 00:00:00 -05:00
18
+ default_executable:
19
+ dependencies: []
20
+
21
+ description: Use the hive metadb to write map/reduce and easily query table info.
22
+ email: frank@fejes.net
23
+ executables: []
24
+
25
+ extensions: []
26
+
27
+ extra_rdoc_files: []
28
+
29
+ files:
30
+ - README
31
+ - CHANGELOG
32
+ - lib/hivemeta.rb
33
+ - lib/hivemeta/connection.rb
34
+ - lib/hivemeta/record.rb
35
+ - lib/hivemeta/table.rb
36
+ - examples/hivemeta_query.rb
37
+ - examples/hivemeta_testrec.rb
38
+ - examples/sample-mapper.rb
39
+ has_rdoc: true
40
+ homepage: https://github.com/fsfiii/hivemeta
41
+ licenses: []
42
+
43
+ post_install_message:
44
+ rdoc_options: []
45
+
46
+ require_paths:
47
+ - lib
48
+ required_ruby_version: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ">="
52
+ - !ruby/object:Gem::Version
53
+ segments:
54
+ - 0
55
+ version: "0"
56
+ required_rubygems_version: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ segments:
62
+ - 0
63
+ version: "0"
64
+ requirements: []
65
+
66
+ rubyforge_project: hivemeta
67
+ rubygems_version: 1.3.7
68
+ signing_key:
69
+ specification_version: 3
70
+ summary: Use the hive metadb to write map/reduce and query table info.
71
+ test_files: []
72
+