hivemeta 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +43 -6
- data/lib/hivemeta/connection.rb +0 -1
- data/lib/hivemeta/record.rb +12 -4
- data/lib/hivemeta/table.rb +1 -1
- metadata +3 -3
data/README
CHANGED
@@ -1,15 +1,47 @@
|
|
1
1
|
hivemeta
|
2
2
|
|
3
|
-
A ruby API for access to
|
4
|
-
|
5
|
-
|
6
|
-
|
3
|
+
A ruby API for access to a Hive metastore running under MySQL.
|
4
|
+
|
5
|
+
Useful for querying columns in Hadoop map/reduce applications. Normally,
|
6
|
+
a developer needs to handle both the splitting of incoming data and the
|
7
|
+
assignment of numerically indexed fields to friendly variables like so:
|
8
|
+
|
9
|
+
fields = line.chomp.split /\t/
|
10
|
+
item_id = fields[0]
|
11
|
+
inv_cnt = fields[7].to_i
|
12
|
+
puts "#{item_id}\t#{inv_cnt}"
|
13
|
+
|
14
|
+
This is not overly traumatic, however it's susceptible to errors creeping
|
15
|
+
in from file format changes. Ongoing maintenance can easily become a burden
|
16
|
+
if there are many map/reduce programs reading the same changed data files.
|
17
|
+
Code size increases as the column count increases.
|
18
|
+
|
19
|
+
With hivemeta, the process is streamlined. That same task is now:
|
20
|
+
|
21
|
+
row = inv_table.process_row line
|
22
|
+
puts "#{row.item_id}\t#{row.inv_cnt.to_i}"
|
23
|
+
|
24
|
+
The row object automagically knows its column names and they can be
|
25
|
+
referenced in one of the following ways (in order of best to worst
|
26
|
+
performance and coolness):
|
27
|
+
|
28
|
+
row.col_name
|
29
|
+
row[:col_name]
|
30
|
+
row['col_name']
|
31
|
+
|
32
|
+
Also included is a demo application, hivemeta_query.rb, to spit out table
|
33
|
+
information from the command-line via table name search or by the table's
|
34
|
+
location in HDFS.
|
35
|
+
|
36
|
+
---
|
7
37
|
|
8
38
|
Installation
|
9
39
|
|
10
40
|
gem install hivemeta
|
11
41
|
|
12
|
-
|
42
|
+
---
|
43
|
+
|
44
|
+
API Usage
|
13
45
|
|
14
46
|
streaming map/reduce code snippet:
|
15
47
|
|
@@ -30,12 +62,15 @@ STDIN.each_line do |line|
|
|
30
62
|
puts "#{item_id}\t#{count}" if count >= 1000
|
31
63
|
end
|
32
64
|
|
33
|
-
|
65
|
+
---
|
66
|
+
|
67
|
+
hivemeta_query.rb Usage
|
34
68
|
|
35
69
|
# query by table names
|
36
70
|
$ hivemeta_query.rb join_test_name
|
37
71
|
join_test_name
|
38
72
|
hdfs://namenode/tmp/join_test_name
|
73
|
+
delimiter: "\t" (ASCII 9)
|
39
74
|
0 userid # userid
|
40
75
|
1 name # username
|
41
76
|
|
@@ -43,6 +78,7 @@ hdfs://namenode/tmp/join_test_name
|
|
43
78
|
$ hivemeta_query.rb join_test%
|
44
79
|
join_test_address
|
45
80
|
hdfs://namenode/tmp/join_test_address
|
81
|
+
delimiter: "," (ASCII 44)
|
46
82
|
0 userid # uid
|
47
83
|
1 address
|
48
84
|
2 city
|
@@ -50,6 +86,7 @@ hdfs://namenode/tmp/join_test_address
|
|
50
86
|
|
51
87
|
join_test_name
|
52
88
|
hdfs://namenode/tmp/join_test_name
|
89
|
+
delimiter: "\t" (ASCII 9)
|
53
90
|
0 userid # userid
|
54
91
|
1 name # username
|
55
92
|
|
data/lib/hivemeta/connection.rb
CHANGED
data/lib/hivemeta/record.rb
CHANGED
@@ -4,22 +4,30 @@ module HiveMeta
|
|
4
4
|
|
5
5
|
class Record
|
6
6
|
def initialize(line, table)
|
7
|
-
fields = line.chomp.split(table.delimiter, -1)
|
8
|
-
if fields.size != table.columns.size
|
7
|
+
@fields = line.chomp.split(table.delimiter, -1)
|
8
|
+
if @fields.size != table.columns.size
|
9
9
|
raise FieldCountError
|
10
10
|
end
|
11
11
|
|
12
12
|
@columns = {}
|
13
13
|
table.each_col_with_index do |col_name, i|
|
14
|
-
@columns[col_name] = fields[i]
|
15
|
-
@columns[col_name.to_sym] = fields[i]
|
14
|
+
@columns[col_name] = @fields[i]
|
15
|
+
@columns[col_name.to_sym] = @fields[i]
|
16
16
|
end
|
17
17
|
end
|
18
18
|
|
19
|
+
# allow for column access via column name as an index
|
20
|
+
# example: rec[:col_name]
|
21
|
+
# or: rec['col_name']
|
22
|
+
# can also use the numeric index as stored in the file
|
23
|
+
# example: rec[7]
|
19
24
|
def [] index
|
25
|
+
return "#{@fields[index]}" if index.is_a? Integer
|
20
26
|
"#{@columns[index.to_sym]}"
|
21
27
|
end
|
22
28
|
|
29
|
+
# allow for column access via column name as a method
|
30
|
+
# example: rec.col_name
|
23
31
|
def method_missing(id, *args)
|
24
32
|
return @columns[id] if @columns[id]
|
25
33
|
raise NoMethodError
|
data/lib/hivemeta/table.rb
CHANGED
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 0
|
8
|
-
- 2
|
9
|
-
version: 0.0.2
|
8
|
+
- 3
|
9
|
+
version: 0.0.3
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Frank Fejes
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-05-
|
17
|
+
date: 2011-05-17 00:00:00 -05:00
|
18
18
|
default_executable:
|
19
19
|
dependencies: []
|
20
20
|
|