hivemeta 0.0.5 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +4 -1
- data/README +43 -2
- data/lib/hivemeta/connection.rb +6 -0
- data/lib/hivemeta/table.rb +18 -0
- metadata +2 -2
data/CHANGELOG
CHANGED
@@ -3,7 +3,10 @@
|
|
3
3
|
- perf: 4x+ faster ... now basically on par with manual split into array
|
4
4
|
- perf: create extra hash for column index by name
|
5
5
|
- perf: remove unnecessary string indexed assignment
|
6
|
-
- clean:
|
6
|
+
- clean: Table#each does each inside rather than each_with_index
|
7
|
+
- new: Table#process works on file input, by default STDIN
|
8
|
+
- new: can now use environment variables in order to minimize code
|
9
|
+
all prefixed by hivemeta_ : db_user, db_pass, db_host, db_name
|
7
10
|
|
8
11
|
* 2011-05-17 - fsf
|
9
12
|
- bugfix: default unspecified delimiter is ^A rather than TAB
|
data/README
CHANGED
@@ -43,11 +43,23 @@ gem install hivemeta
|
|
43
43
|
|
44
44
|
API Usage
|
45
45
|
|
46
|
-
streaming map/reduce code snippet:
|
46
|
+
streaming map/reduce code snippet (abstracted processing loop):
|
47
47
|
|
48
48
|
require 'hivemeta'
|
49
49
|
|
50
|
-
h = HiveMeta::Connection.new
|
50
|
+
h = HiveMeta::Connection.new # see below for detail
|
51
|
+
|
52
|
+
h.table('sample_inventory').process do |row|
|
53
|
+
item_id = row.item_id # can access by method or [:sym] or ['str']
|
54
|
+
count = row.inv_cnt.to_i
|
55
|
+
puts "#{item_id}\t#{count}" if count >= 1000
|
56
|
+
end
|
57
|
+
|
58
|
+
streaming map/reduce code snippet (normal STDIN processing loop):
|
59
|
+
|
60
|
+
require 'hivemeta'
|
61
|
+
|
62
|
+
h = HiveMeta::Connection.new # see below for detail
|
51
63
|
inv_table = h.table 'sample_inventory'
|
52
64
|
|
53
65
|
STDIN.each_line do |line|
|
@@ -62,6 +74,35 @@ STDIN.each_line do |line|
|
|
62
74
|
puts "#{item_id}\t#{count}" if count >= 1000
|
63
75
|
end
|
64
76
|
|
77
|
+
establishing a connection (in ruby code):
|
78
|
+
|
79
|
+
db_user = 'hive'
|
80
|
+
db_pass = 'hivepasshere'
|
81
|
+
db_host = 'localhost'
|
82
|
+
db_name = 'hivemeta'
|
83
|
+
|
84
|
+
dbi_string = "DBI:Mysql:#{db_name}:#{db_host}"
|
85
|
+
h = HiveMeta::Connection.new(dbi_string, db_user, db_pass)
|
86
|
+
|
87
|
+
establishing a connection (environment variables):
|
88
|
+
|
89
|
+
# when no arguments are passed, the following env variables will be used:
|
90
|
+
#
|
91
|
+
# hivemeta_db_host
|
92
|
+
# hivemeta_db_name
|
93
|
+
# hivemeta_db_user
|
94
|
+
# hivemeta_db_pass
|
95
|
+
#
|
96
|
+
# to set these in a streaming map/reduce job, use -D arguments like so (Hadoop Streaming exports job properties as env variables with '.' replaced by '_'):
|
97
|
+
#
|
98
|
+
# -D hivemeta.db_host=mydbhost \
|
99
|
+
# -D hivemeta.db_name=hivemeta \
|
100
|
+
# -D hivemeta.db_user=hive \
|
101
|
+
# -D hivemeta.db_pass=mydbpass \
|
102
|
+
|
103
|
+
# the connection will be made with those env variables without any other code
|
104
|
+
h = HiveMeta::Connection.new
|
105
|
+
|
65
106
|
---
|
66
107
|
|
67
108
|
hivemeta_query.rb Usage
|
data/lib/hivemeta/connection.rb
CHANGED
@@ -6,6 +6,12 @@ module HiveMeta
|
|
6
6
|
|
7
7
|
class Connection
|
8
8
|
def initialize(dbi_string = nil, db_user = nil, db_pass = nil)
|
9
|
+
db_name = ENV['hivemeta_db_name']
|
10
|
+
db_host = ENV['hivemeta_db_host']
|
11
|
+
dbi_string ||= "DBI:Mysql:#{db_name}:#{db_host}"
|
12
|
+
db_user ||= ENV['hivemeta_db_user']
|
13
|
+
db_pass ||= ENV['hivemeta_db_pass']
|
14
|
+
|
9
15
|
@dbi_string = dbi_string
|
10
16
|
@db_user = db_user
|
11
17
|
@db_pass = db_pass
|
data/lib/hivemeta/table.rb
CHANGED
@@ -49,6 +49,24 @@ module HiveMeta
|
|
49
49
|
return Record.new(line, self)
|
50
50
|
end
|
51
51
|
end
|
52
|
+
|
53
|
+
# process all input (default to STDIN for Hadoop Streaming)
|
54
|
+
# via a provided block
|
55
|
+
def process(f = STDIN, warning = nil)
|
56
|
+
if not block_given?
|
57
|
+
return process_row f.readline
|
58
|
+
end
|
59
|
+
|
60
|
+
f.each_line do |line|
|
61
|
+
begin
|
62
|
+
process_row(line) {|row| yield row}
|
63
|
+
rescue HiveMeta::FieldCountError
|
64
|
+
warning ||= "reporter:counter:bad_data,row_size,1"
|
65
|
+
STDERR.puts warning
|
66
|
+
next
|
67
|
+
end
|
68
|
+
end
|
69
|
+
end
|
52
70
|
end
|
53
71
|
|
54
72
|
end
|