bigrecord-driver 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,396 @@
1
+ require File.dirname(__FILE__) + '/../column_descriptor'
2
+ require File.dirname(__FILE__) + '/../exceptions'
3
+ require File.dirname(__FILE__) + '/../bigrecord_server'
4
+
5
+ module BigRecordDriver
6
+
7
+ class HbaseServer < BigRecordServer
8
+ include_class "java.util.TreeMap"
9
+
10
+ include_class "org.apache.hadoop.hbase.client.HTable"
11
+ include_class "org.apache.hadoop.hbase.client.HBaseAdmin"
12
+ include_class "org.apache.hadoop.hbase.io.BatchUpdate"
13
+ include_class "org.apache.hadoop.hbase.io.hfile.Compression"
14
+ include_class "org.apache.hadoop.hbase.HBaseConfiguration"
15
+ include_class "org.apache.hadoop.hbase.HConstants"
16
+ include_class "org.apache.hadoop.hbase.HStoreKey"
17
+ include_class "org.apache.hadoop.hbase.HTableDescriptor"
18
+ include_class "org.apache.hadoop.hbase.HColumnDescriptor"
19
+
20
+ include_class "org.apache.hadoop.io.Writable"
21
+
22
+ # Establish the connection with HBase with the given configuration parameters.
23
+ def configure(config = {})
24
+ config[:zookeeper_quorum] ||= 'localhost'
25
+ config[:zookeeper_client_port] ||= '2181'
26
+
27
+ @config = config
28
+
29
+ init_connection
30
+ end
31
+
32
+ # Atomic row insertion/update. Example:
33
+ # update('entities', 'b9cef848-a4e0-11dc-a7ba-0018f3137ea8', {'attribute:name' => "--- Oahu\n",
34
+ # 'attribute:travel_rank' => "--- 0.90124565\n"})
35
+ # => 'b9cef848-a4e0-11dc-a7ba-0018f3137ea8'
36
+ def update(table_name, row, values, timestamp=nil)
37
+ safe_exec do
38
+ return nil unless row
39
+ table = connect_table(table_name)
40
+
41
+ batch = timestamp ? BatchUpdate.new(row, timestamp) : BatchUpdate.new(row)
42
+
43
+ values.each do |column, value|
44
+ batch.put(column, value.to_bytes)
45
+ end
46
+
47
+ table.commit(batch)
48
+ row
49
+ end
50
+ end
51
+
52
+ # Returns a column of a row. Example:
53
+ # get('entities', 'b9cef848-a4e0-11dc-a7ba-0018f3137ea8', 'attribute:travel_rank')
54
+ # => "--- 0.90124565\n"
55
+ #
56
+ # valid options:
57
+ # :timestamp => integer corresponding to the time when the record was saved in hbase
58
+ # :versions => number of versions to retreive, starting at the specified timestamp (or the latest)
59
+ def get(table_name, row, column, options={})
60
+ safe_exec do
61
+ return nil unless row
62
+ table = connect_table(table_name)
63
+
64
+ # Retreive only the last version by default
65
+ options[:versions] ||= options[:num_versions]
66
+ options[:versions] ||= 1
67
+
68
+ # validate the arguments
69
+ raise ArgumentError, "versions must be >= 1" unless options[:versions] >= 1
70
+
71
+ # get the raw data from hbase
72
+ unless options[:timestamp]
73
+ if options[:versions] == 1
74
+ raw_data = table.get(row, column)
75
+ else
76
+ raw_data = table.get(row,
77
+ column,
78
+ options[:versions])
79
+ end
80
+ else
81
+ raw_data = table.get(row,
82
+ column,
83
+ options[:timestamp],
84
+ options[:versions])
85
+ end
86
+
87
+ # Return either a single value or an array, depending on the number of version that have been requested
88
+ if options[:versions] == 1
89
+ return nil unless raw_data
90
+ raw_data = raw_data[0] if options[:timestamp]
91
+ to_ruby_string(raw_data)
92
+ else
93
+ return [] unless raw_data
94
+ raw_data.collect do |raw_data_version|
95
+ to_ruby_string(raw_data_version)
96
+ end
97
+ end
98
+ end
99
+ end
100
+
101
+ # Returns the last version of the given columns of the given row. The columns works with
102
+ # regular expressions (e.g. 'attribute:' matches all attributes columns). Example:
103
+ # get_columns('entities', 'b9cef848-a4e0-11dc-a7ba-0018f3137ea8', ['attribute:'])
104
+ # => {"attribute:name" => "--- Oahu\n", "attribute:travel_rank" => "--- 0.90124565\n", etc...}
105
+ def get_columns(table_name, row, columns, options={})
106
+ safe_exec do
107
+ return nil unless row
108
+ table_name = table_name.to_s
109
+ table = connect_table(table_name)
110
+
111
+ java_cols = Java::String[columns.size].new
112
+ columns.each_with_index do |col, i|
113
+ java_cols[i] = Java::String.new(col)
114
+ end
115
+
116
+ result =
117
+ if options[:timestamp]
118
+ table.getRow(row, java_cols, options[:timestamp])
119
+ else
120
+ table.getRow(row, java_cols)
121
+ end
122
+
123
+ unless !result or result.isEmpty
124
+ values = {}
125
+ result.entrySet.each do |entry|
126
+ column_name = Java::String.new(entry.getKey).to_s
127
+ values[column_name] = to_ruby_string(entry.getValue)
128
+ end
129
+ values["id"] = row
130
+ values
131
+ else
132
+ nil
133
+ end
134
+ end
135
+ end
136
+
137
+ # Get consecutive rows. Example to get 100 records starting with the one specified and get all the
138
+ # columns in the column family 'attribute:' :
139
+ # get_consecutive_rows('entities', 'b9cef848-a4e0-11dc-a7ba-0018f3137ea8', 100, ['attribute:'])
140
+ def get_consecutive_rows(table_name, start_row, limit, columns, stop_row = nil)
141
+ safe_exec do
142
+ table_name = table_name.to_s
143
+ table = connect_table(table_name)
144
+
145
+ java_cols = Java::String[columns.size].new
146
+ columns.each_with_index do |col, i|
147
+ java_cols[i] = Java::String.new(col)
148
+ end
149
+
150
+ start_row ||= ""
151
+ start_row = start_row.to_s
152
+
153
+ # We cannot set stop_row like start_row because a
154
+ # default stop row would have to be the biggest value possible
155
+ if stop_row
156
+ scanner = table.getScanner(java_cols, start_row, stop_row, HConstants::LATEST_TIMESTAMP)
157
+ else
158
+ scanner = table.getScanner(java_cols, start_row)
159
+ end
160
+
161
+ row_count = 0 if limit
162
+ result = []
163
+ while (row_result = scanner.next) != nil
164
+ if limit
165
+ break if row_count == limit
166
+ row_count += 1
167
+ end
168
+ values = {}
169
+ row_result.entrySet.each do |entry|
170
+ column_name = Java::String.new(entry.getKey).to_s
171
+ data = to_ruby_string(entry.getValue)
172
+ values[column_name] = data
173
+ end
174
+ unless values.empty?
175
+ # TODO: is this really supposed to be hard coded?
176
+ values['id'] = Java::String.new(row_result.getRow).to_s
177
+ result << values
178
+ end
179
+ end
180
+ scanner.close
181
+ result
182
+ end
183
+ end
184
+
185
+ # Delete a whole row.
186
+ def delete(table_name, row, timestamp = nil)
187
+ safe_exec do
188
+ table = connect_table(table_name)
189
+ timestamp ? table.deleteAll(row.to_bytes, timestamp) : table.deleteAll(row.to_bytes)
190
+ end
191
+ end
192
+
193
+ # Create a table
194
+ def create_table(table_name, column_descriptors)
195
+ safe_exec do
196
+ table_name = table_name.to_s
197
+ unless table_exists?(table_name)
198
+ tdesc = HTableDescriptor.new(table_name)
199
+
200
+ column_descriptors.each do |cd|
201
+ cdesc = generate_column_descriptor(cd)
202
+
203
+ tdesc.addFamily(cdesc)
204
+ end
205
+ @admin.createTable(tdesc)
206
+ else
207
+ raise BigRecordDriver::TableAlreadyExists, table_name
208
+ end
209
+ end
210
+ end
211
+
212
+ # Delete a table
213
+ def drop_table(table_name)
214
+ safe_exec do
215
+ table_name = table_name.to_s
216
+
217
+ if @admin.tableExists(table_name)
218
+ @admin.disableTable(table_name)
219
+ @admin.deleteTable(table_name)
220
+
221
+ # Remove the table connection from the cache
222
+ @tables.delete(table_name) if @tables.has_key?(table_name)
223
+ else
224
+ raise BigRecordDriver::TableNotFound, table_name
225
+ end
226
+ end
227
+ end
228
+
229
+ def add_column(table_name, column_descriptor)
230
+ safe_exec do
231
+ table_name = table_name.to_s
232
+
233
+ if @admin.tableExists(table_name)
234
+ @admin.disableTable(table_name)
235
+
236
+ cdesc = generate_column_descriptor(column_descriptor)
237
+ @admin.addColumn(table_name, cdesc)
238
+
239
+ @admin.enableTable(table_name)
240
+ else
241
+ raise BigRecordDriver::TableNotFound, table_name
242
+ end
243
+ end
244
+ end
245
+
246
+ def remove_column(table_name, column_name)
247
+ safe_exec do
248
+ table_name = table_name.to_s
249
+ column_name = column_name.to_s
250
+
251
+ if @admin.tableExists(table_name)
252
+ @admin.disableTable(table_name)
253
+
254
+ column_name << ":" unless column_name =~ /:$/
255
+ @admin.deleteColumn(table_name, column_name)
256
+
257
+ @admin.enableTable(table_name)
258
+ else
259
+ raise BigRecordDriver::TableNotFound, table_name
260
+ end
261
+ end
262
+ end
263
+
264
+ def modify_column(table_name, column_descriptor)
265
+ safe_exec do
266
+ table_name = table_name.to_s
267
+ column_name = column_name.to_s
268
+
269
+ if @admin.tableExists(table_name)
270
+ @admin.disableTable(table_name)
271
+
272
+ cdesc = generate_column_descriptor(column_descriptor)
273
+ @admin.modifyColumn(table_name, column_descriptor.name, cdesc)
274
+
275
+ @admin.enableTable(table_name)
276
+ else
277
+ raise BigRecordDriver::TableNotFound, table_name
278
+ end
279
+ end
280
+ end
281
+
282
+ def truncate_table(table_name)
283
+ safe_exec do
284
+ table_name = table_name.to_s
285
+ table = connect_table(table_name)
286
+ tableDescriptor = table.getTableDescriptor
287
+ drop_table(table_name)
288
+ @admin.createTable(tableDescriptor)
289
+ end
290
+ end
291
+
292
+ def ping
293
+ safe_exec do
294
+ @admin.isMasterRunning
295
+ end
296
+ end
297
+
298
+ def table_exists?(table_name)
299
+ safe_exec do
300
+ @admin.tableExists(table_name.to_s)
301
+ end
302
+ end
303
+
304
+ def table_names
305
+ safe_exec do
306
+ @admin.listTables.collect{|td| Java::String.new(td.getName).to_s}
307
+ end
308
+ end
309
+
310
+ # def const_missing(const)
311
+ # super
312
+ # rescue NameError => ex
313
+ # raise NameError, "uninitialized constant #{const}"
314
+ # end
315
+
316
+ private
317
+ # Create a connection to a Hbase table and keep it in memory.
318
+ def connect_table(table_name)
319
+ safe_exec do
320
+ table_name = table_name.to_s
321
+ return @tables[table_name] if @tables.has_key?(table_name)
322
+
323
+ if table_exists?(table_name)
324
+ @tables[table_name] = HTable.new(@conf, table_name)
325
+ else
326
+ if table_name and !table_name.empty?
327
+ raise BigRecordDriver::TableNotFound, table_name
328
+ else
329
+ raise ArgumentError, "Table name not specified"
330
+ end
331
+ end
332
+ @tables[table_name]
333
+ end
334
+ end
335
+
336
+ def init_connection
337
+ safe_exec do
338
+ @conf = HBaseConfiguration.new
339
+ @conf.set('hbase.zookeeper.quorum', "#{@config[:zookeeper_quorum]}")
340
+ @conf.set('hbase.zookeeper.property.clientPort', "#{@config[:zookeeper_client_port]}")
341
+ @admin = HBaseAdmin.new(@conf)
342
+ @tables = {}
343
+ end
344
+ end
345
+
346
+ def generate_column_descriptor(column_descriptor)
347
+ raise ArgumentError, "a column descriptor is missing a name" unless column_descriptor.name
348
+ raise "bloom_filter option not supported yet" if column_descriptor.bloom_filter
349
+
350
+ if column_descriptor.compression
351
+ compression =
352
+ case column_descriptor.compression.to_s
353
+ when 'none'; Compression::Algorithm::NONE.getName()
354
+ when 'gz'; Compression::Algorithm::GZ.getName()
355
+ when 'lzo'; Compression::Algorithm::LZO.getName()
356
+ else
357
+ raise ArgumentError, "Invalid compression type: #{column_descriptor.compression} for the column_family #{column_descriptor.name}"
358
+ end
359
+ end
360
+
361
+ n_versions = column_descriptor.versions
362
+ in_memory = column_descriptor.in_memory
363
+
364
+ # set the default values of the missing parameters
365
+ n_versions ||= HColumnDescriptor::DEFAULT_VERSIONS
366
+ compression ||= HColumnDescriptor::DEFAULT_COMPRESSION
367
+ in_memory ||= HColumnDescriptor::DEFAULT_IN_MEMORY
368
+ block_cache ||= HColumnDescriptor::DEFAULT_BLOCKCACHE
369
+ block_size ||= HColumnDescriptor::DEFAULT_BLOCKSIZE
370
+ bloomfilter ||= HColumnDescriptor::DEFAULT_BLOOMFILTER
371
+ ttl ||= HColumnDescriptor::DEFAULT_TTL
372
+
373
+ # add the ':' at the end if the user didn't specify it
374
+ column_descriptor.name << ":" unless column_descriptor.name =~ /:$/
375
+
376
+ cdesc = HColumnDescriptor.new(column_descriptor.name.to_bytes,
377
+ n_versions,
378
+ compression,
379
+ in_memory,
380
+ block_cache,
381
+ block_size,
382
+ ttl,
383
+ bloomfilter)
384
+
385
+ return cdesc
386
+ end
387
+
388
+ end
389
+
390
+ end
391
+
392
# Boot script: expose an HbaseServer over DRb and block until the DRb thread
# ends. The port is the first command-line argument, 40000 when none is given.
port = ARGV[0] || 40000
service_uri = "druby://:#{port}"
DRb.start_service(service_uri, BigRecordDriver::HbaseServer.new)
puts "Started drb server on port #{port}."
DRb.thread.join
@@ -0,0 +1,6 @@
1
+ LIB_ROOT = File.dirname(__FILE__)
2
+
3
+ require LIB_ROOT + '/big_record_driver/client'
4
+ require LIB_ROOT + '/big_record_driver/exceptions'
5
+ require LIB_ROOT + '/big_record_driver/column_descriptor'
6
+ require LIB_ROOT + '/big_record_driver/driver_manager'
@@ -0,0 +1 @@
1
+ require 'big_record_driver'