hivemeta 0.0.2 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- data/README +43 -6
- data/lib/hivemeta/connection.rb +0 -1
- data/lib/hivemeta/record.rb +12 -4
- data/lib/hivemeta/table.rb +1 -1
- metadata +3 -3
data/README
CHANGED
@@ -1,15 +1,47 @@
|
|
1
1
|
hivemeta
|
2
2
|
|
3
|
-
A ruby API for access to
|
4
|
-
|
5
|
-
|
6
|
-
|
3
|
+
A ruby API for access to a Hive metastore running under MySQL.
|
4
|
+
|
5
|
+
Useful for querying columns in Hadoop map/reduce applications. Normally,
|
6
|
+
a developer needs to handle both the splitting of incoming data and the
|
7
|
+
assignment of numerically indexed fields to friendly variables like so:
|
8
|
+
|
9
|
+
fields = line.chomp.split /\t/
|
10
|
+
item_id = fields[0]
|
11
|
+
inv_cnt = fields[7].to_i
|
12
|
+
puts "#{item_id}\t#{inv_cnt}"
|
13
|
+
|
14
|
+
This is not overly traumatic; however, it's susceptible to errors creeping
|
15
|
+
in from file format changes. Ongoing maintenance can easily become a burden
|
16
|
+
if there are many map/reduce programs reading the same changed data files.
|
17
|
+
Code size increases as the column count increases.
|
18
|
+
|
19
|
+
With hivemeta, the process is streamlined. That same task is now:
|
20
|
+
|
21
|
+
row = inv_table.process_row line
|
22
|
+
puts "#{row.item_id}\t#{row.inv_cnt.to_i}"
|
23
|
+
|
24
|
+
The row object automagically knows its column names and they can be
|
25
|
+
referenced in one of the following ways (in order of best to worst
|
26
|
+
performance and coolness):
|
27
|
+
|
28
|
+
row.col_name
|
29
|
+
row[:col_name]
|
30
|
+
row['col_name']
|
31
|
+
|
32
|
+
Also included is a demo application, hivemeta_query.rb, to spit out table
|
33
|
+
information from the command-line via table name search or by the table's
|
34
|
+
location in HDFS.
|
35
|
+
|
36
|
+
---
|
7
37
|
|
8
38
|
Installation
|
9
39
|
|
10
40
|
gem install hivemeta
|
11
41
|
|
12
|
-
|
42
|
+
---
|
43
|
+
|
44
|
+
API Usage
|
13
45
|
|
14
46
|
streaming map/reduce code snippet:
|
15
47
|
|
@@ -30,12 +62,15 @@ STDIN.each_line do |line|
|
|
30
62
|
puts "#{item_id}\t#{count}" if count >= 1000
|
31
63
|
end
|
32
64
|
|
33
|
-
|
65
|
+
---
|
66
|
+
|
67
|
+
hivemeta_query.rb Usage
|
34
68
|
|
35
69
|
# query by table names
|
36
70
|
$ hivemeta_query.rb join_test_name
|
37
71
|
join_test_name
|
38
72
|
hdfs://namenode/tmp/join_test_name
|
73
|
+
delimiter: "\t" (ASCII 9)
|
39
74
|
0 userid # userid
|
40
75
|
1 name # username
|
41
76
|
|
@@ -43,6 +78,7 @@ hdfs://namenode/tmp/join_test_name
|
|
43
78
|
$ hivemeta_query.rb join_test%
|
44
79
|
join_test_address
|
45
80
|
hdfs://namenode/tmp/join_test_address
|
81
|
+
delimiter: "," (ASCII 44)
|
46
82
|
0 userid # uid
|
47
83
|
1 address
|
48
84
|
2 city
|
@@ -50,6 +86,7 @@ hdfs://namenode/tmp/join_test_address
|
|
50
86
|
|
51
87
|
join_test_name
|
52
88
|
hdfs://namenode/tmp/join_test_name
|
89
|
+
delimiter: "\t" (ASCII 9)
|
53
90
|
0 userid # userid
|
54
91
|
1 name # username
|
55
92
|
|
data/lib/hivemeta/connection.rb
CHANGED
data/lib/hivemeta/record.rb
CHANGED
@@ -4,22 +4,30 @@ module HiveMeta
|
|
4
4
|
|
5
5
|
class Record
|
6
6
|
def initialize(line, table)
|
7
|
-
fields = line.chomp.split(table.delimiter, -1)
|
8
|
-
if fields.size != table.columns.size
|
7
|
+
@fields = line.chomp.split(table.delimiter, -1)
|
8
|
+
if @fields.size != table.columns.size
|
9
9
|
raise FieldCountError
|
10
10
|
end
|
11
11
|
|
12
12
|
@columns = {}
|
13
13
|
table.each_col_with_index do |col_name, i|
|
14
|
-
@columns[col_name] = fields[i]
|
15
|
-
@columns[col_name.to_sym] = fields[i]
|
14
|
+
@columns[col_name] = @fields[i]
|
15
|
+
@columns[col_name.to_sym] = @fields[i]
|
16
16
|
end
|
17
17
|
end
|
18
18
|
|
19
|
+
# allow for column access via column name as an index
|
20
|
+
# example: rec[:col_name]
|
21
|
+
# or: rec['col_name']
|
22
|
+
# can also use the numeric index as stored in the file
|
23
|
+
# example: rec[7]
|
19
24
|
def [] index
|
25
|
+
return "#{@fields[index]}" if index.is_a? Integer
|
20
26
|
"#{@columns[index.to_sym]}"
|
21
27
|
end
|
22
28
|
|
29
|
+
# allow for column access via column name as a method
|
30
|
+
# example: rec.col_name
|
23
31
|
def method_missing(id, *args)
|
24
32
|
return @columns[id] if @columns[id]
|
25
33
|
raise NoMethodError
|
data/lib/hivemeta/table.rb
CHANGED
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 0
|
8
|
-
- 2
|
9
|
-
version: 0.0.2
|
8
|
+
- 3
|
9
|
+
version: 0.0.3
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Frank Fejes
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-05-
|
17
|
+
date: 2011-05-17 00:00:00 -05:00
|
18
18
|
default_executable:
|
19
19
|
dependencies: []
|
20
20
|
|