bigrecord-driver 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,396 @@
1
+ require File.dirname(__FILE__) + '/../column_descriptor'
2
+ require File.dirname(__FILE__) + '/../exceptions'
3
+ require File.dirname(__FILE__) + '/../bigrecord_server'
4
+
5
+ module BigRecordDriver
6
+
7
+ class HbaseServer < BigRecordServer
8
+ include_class "java.util.TreeMap"
9
+
10
+ include_class "org.apache.hadoop.hbase.client.HTable"
11
+ include_class "org.apache.hadoop.hbase.client.HBaseAdmin"
12
+ include_class "org.apache.hadoop.hbase.io.BatchUpdate"
13
+ include_class "org.apache.hadoop.hbase.io.hfile.Compression"
14
+ include_class "org.apache.hadoop.hbase.HBaseConfiguration"
15
+ include_class "org.apache.hadoop.hbase.HConstants"
16
+ include_class "org.apache.hadoop.hbase.HStoreKey"
17
+ include_class "org.apache.hadoop.hbase.HTableDescriptor"
18
+ include_class "org.apache.hadoop.hbase.HColumnDescriptor"
19
+
20
+ include_class "org.apache.hadoop.io.Writable"
21
+
22
# Establish the connection with HBase with the given configuration parameters.
#
# Recognized keys (a default is applied when the key is missing or nil):
#   :zookeeper_quorum      => value for 'hbase.zookeeper.quorum' (default 'localhost')
#   :zookeeper_client_port => value for 'hbase.zookeeper.property.clientPort' (default '2181')
#
# The given hash is copied before the defaults are filled in, so the caller's
# hash is never modified and may be frozen.
#
# Returns whatever init_connection returns.
def configure(config = {})
  settings = config.dup
  settings[:zookeeper_quorum] ||= 'localhost'
  settings[:zookeeper_client_port] ||= '2181'

  @config = settings

  init_connection
end
31
+
32
# Inserts or updates several columns of one row with a single batch commit. Example:
#   update('entities', 'b9cef848-a4e0-11dc-a7ba-0018f3137ea8', {'attribute:name' => "--- Oahu\n",
#                                                               'attribute:travel_rank' => "--- 0.90124565\n"})
#   => 'b9cef848-a4e0-11dc-a7ba-0018f3137ea8'
#
# values maps column names to values; each value is converted with #to_bytes
# before being put in the batch. When a timestamp is given it is passed to the
# batch constructor along with the row key.
#
# Returns the row key, or nil when no row key is given.
def update(table_name, row, values, timestamp=nil)
  safe_exec do
    return nil unless row
    htable = connect_table(table_name)

    mutation =
      if timestamp
        BatchUpdate.new(row, timestamp)
      else
        BatchUpdate.new(row)
      end

    values.each { |column, value| mutation.put(column, value.to_bytes) }

    htable.commit(mutation)
    row
  end
end
51
+
52
# Returns a column of a row. Example:
#   get('entities', 'b9cef848-a4e0-11dc-a7ba-0018f3137ea8', 'attribute:travel_rank')
#   => "--- 0.90124565\n"
#
# valid options:
#   :timestamp    => integer corresponding to the time when the record was saved in hbase
#   :versions     => number of versions to retrieve, starting at the specified timestamp (or the latest)
#   :num_versions => used as the number of versions when :versions is not given
#
# Returns nil when row is nil. When a single version is requested (the default)
# the result is one converted value, or nil if nothing was found; otherwise it
# is an array of converted values, empty if nothing was found.
#
# Raises ArgumentError when the requested number of versions is below 1.
#
# The options hash is only read; it is never modified.
def get(table_name, row, column, options={})
  safe_exec do
    return nil unless row
    table = connect_table(table_name)

    # Retrieve only the last version by default
    versions = options[:versions] || options[:num_versions] || 1
    timestamp = options[:timestamp]

    # validate the arguments
    raise ArgumentError, "versions must be >= 1" unless versions >= 1

    # get the raw data from hbase
    raw_data =
      if timestamp
        table.get(row, column, timestamp, versions)
      elsif versions == 1
        table.get(row, column)
      else
        table.get(row, column, versions)
      end

    # Return either a single value or an array, depending on the number of versions that have been requested
    if versions == 1
      return nil unless raw_data
      # the timestamped lookup result is indexed as a list, even for one version
      raw_data = raw_data[0] if timestamp
      to_ruby_string(raw_data)
    else
      return [] unless raw_data
      raw_data.collect { |raw_data_version| to_ruby_string(raw_data_version) }
    end
  end
end
100
+
101
# Fetches the requested columns of one row and returns them as a hash. The
# column names work with regular expressions (e.g. 'attribute:' matches all
# attributes columns). Example:
#   get_columns('entities', 'b9cef848-a4e0-11dc-a7ba-0018f3137ea8', ['attribute:'])
#   => {"attribute:name" => "--- Oahu\n", "attribute:travel_rank" => "--- 0.90124565\n", etc...}
#
# valid options:
#   :timestamp => passed on to the row lookup when present
#
# The returned hash also carries the row key under "id". Returns nil when row
# is nil or when the lookup yields nothing.
def get_columns(table_name, row, columns, options={})
  safe_exec do
    return nil unless row
    table_name = table_name.to_s
    htable = connect_table(table_name)

    # copy the column names into a Java String array
    column_array = Java::String[columns.size].new
    columns.each_with_index { |name, idx| column_array[idx] = Java::String.new(name) }

    row_result =
      if options[:timestamp]
        htable.getRow(row, column_array, options[:timestamp])
      else
        htable.getRow(row, column_array)
      end

    return nil if !row_result or row_result.isEmpty

    record = {}
    row_result.entrySet.each do |cell|
      key = Java::String.new(cell.getKey).to_s
      record[key] = to_ruby_string(cell.getValue)
    end
    record["id"] = row
    record
  end
end
136
+
137
# Get consecutive rows. Example to get 100 records starting with the one specified and get all the
# columns in the column family 'attribute:' :
#   get_consecutive_rows('entities', 'b9cef848-a4e0-11dc-a7ba-0018f3137ea8', 100, ['attribute:'])
#
# Parameters:
#   start_row => key to start scanning from; nil means the empty string
#   limit     => maximum number of rows read from the scanner; nil means no limit
#   columns   => column names handed to the scanner
#   stop_row  => optional key passed to the scanner as the stop row
#
# Returns an array of hashes (column name => value), each also carrying the row
# key under 'id'. Rows without any value are left out of the result but still
# count towards the limit.
#
# The scanner is always closed, even when reading a row raises, and no row is
# fetched beyond the limit.
def get_consecutive_rows(table_name, start_row, limit, columns, stop_row = nil)
  safe_exec do
    table_name = table_name.to_s
    table = connect_table(table_name)

    java_cols = Java::String[columns.size].new
    columns.each_with_index do |col, i|
      java_cols[i] = Java::String.new(col)
    end

    start_row = (start_row || "").to_s

    # We cannot set stop_row like start_row because a
    # default stop row would have to be the biggest value possible
    scanner =
      if stop_row
        table.getScanner(java_cols, start_row, stop_row, HConstants::LATEST_TIMESTAMP)
      else
        table.getScanner(java_cols, start_row)
      end

    result = []
    rows_read = 0
    begin
      # check the limit before asking the scanner for another row
      until limit && rows_read == limit
        row_result = scanner.next
        break if row_result.nil?
        rows_read += 1

        values = {}
        row_result.entrySet.each do |entry|
          column_name = Java::String.new(entry.getKey).to_s
          values[column_name] = to_ruby_string(entry.getValue)
        end

        unless values.empty?
          # TODO: is this really supposed to be hard coded?
          values['id'] = Java::String.new(row_result.getRow).to_s
          result << values
        end
      end
    ensure
      scanner.close
    end
    result
  end
end
184
+
185
# Removes a whole row from the given table. The row key is converted with
# #to_bytes; when a timestamp is given it is passed along to the delete call.
def delete(table_name, row, timestamp = nil)
  safe_exec do
    htable = connect_table(table_name)
    if timestamp
      htable.deleteAll(row.to_bytes, timestamp)
    else
      htable.deleteAll(row.to_bytes)
    end
  end
end
192
+
193
# Creates a table with one column family per given column descriptor.
#
# Raises BigRecordDriver::TableAlreadyExists when a table with that name
# already exists.
def create_table(table_name, column_descriptors)
  safe_exec do
    table_name = table_name.to_s
    raise BigRecordDriver::TableAlreadyExists, table_name if table_exists?(table_name)

    table_descriptor = HTableDescriptor.new(table_name)
    column_descriptors.each do |descriptor|
      table_descriptor.addFamily(generate_column_descriptor(descriptor))
    end

    @admin.createTable(table_descriptor)
  end
end
211
+
212
# Disables and deletes a table, then forgets its cached connection.
#
# Raises BigRecordDriver::TableNotFound when the table does not exist.
def drop_table(table_name)
  safe_exec do
    table_name = table_name.to_s
    raise BigRecordDriver::TableNotFound, table_name unless @admin.tableExists(table_name)

    @admin.disableTable(table_name)
    @admin.deleteTable(table_name)

    # Remove the table connection from the cache
    @tables.delete(table_name) if @tables.key?(table_name)
  end
end
228
+
229
# Adds a column family, described by column_descriptor, to an existing table.
#
# The HBase descriptor is built before the table is touched, so an invalid
# column descriptor leaves the table alone. The table is disabled while the
# column is added and is re-enabled afterwards even when adding it fails.
#
# Raises BigRecordDriver::TableNotFound when the table does not exist.
def add_column(table_name, column_descriptor)
  safe_exec do
    table_name = table_name.to_s
    raise BigRecordDriver::TableNotFound, table_name unless @admin.tableExists(table_name)

    cdesc = generate_column_descriptor(column_descriptor)

    @admin.disableTable(table_name)
    begin
      @admin.addColumn(table_name, cdesc)
    ensure
      # never leave the table disabled behind us
      @admin.enableTable(table_name)
    end
  end
end
245
+
246
# Removes a column family from an existing table.
#
# column_name may be given with or without the trailing ':'; the ':' is added
# to a copy when missing, so the caller's string is never modified. The table
# is disabled while the column is deleted and is re-enabled afterwards even
# when the deletion fails.
#
# Raises BigRecordDriver::TableNotFound when the table does not exist.
def remove_column(table_name, column_name)
  safe_exec do
    table_name = table_name.to_s
    raise BigRecordDriver::TableNotFound, table_name unless @admin.tableExists(table_name)

    family = column_name.to_s
    family = "#{family}:" unless family =~ /:$/

    @admin.disableTable(table_name)
    begin
      @admin.deleteColumn(table_name, family)
    ensure
      # never leave the table disabled behind us
      @admin.enableTable(table_name)
    end
  end
end
263
+
264
# Replaces the definition of an existing column family with the one described
# by column_descriptor.
#
# The HBase descriptor is built before the table is touched, so an invalid
# column descriptor leaves the table alone. The table is disabled while the
# column is modified and is re-enabled afterwards even when the change fails.
#
# Raises BigRecordDriver::TableNotFound when the table does not exist.
def modify_column(table_name, column_descriptor)
  safe_exec do
    table_name = table_name.to_s
    raise BigRecordDriver::TableNotFound, table_name unless @admin.tableExists(table_name)

    cdesc = generate_column_descriptor(column_descriptor)

    @admin.disableTable(table_name)
    begin
      # the name is read after the descriptor was generated, as before, so it
      # carries the trailing ':' that generate_column_descriptor appends
      @admin.modifyColumn(table_name, column_descriptor.name, cdesc)
    ensure
      # never leave the table disabled behind us
      @admin.enableTable(table_name)
    end
  end
end
281
+
282
# Empties a table by dropping it and creating it again from the table
# descriptor that was read before the drop.
def truncate_table(table_name)
  safe_exec do
    name = table_name.to_s
    descriptor = connect_table(name).getTableDescriptor
    drop_table(name)
    @admin.createTable(descriptor)
  end
end
291
+
292
# Asks the HBase admin whether the master is running and returns its answer.
def ping
  safe_exec { @admin.isMasterRunning }
end
297
+
298
# Asks the HBase admin whether a table with the given name (converted to a
# string) exists.
def table_exists?(table_name)
  safe_exec { @admin.tableExists(table_name.to_s) }
end
303
+
304
# Returns the names of the tables listed by the HBase admin, as Ruby strings.
def table_names
  safe_exec do
    @admin.listTables.map do |descriptor|
      Java::String.new(descriptor.getName).to_s
    end
  end
end
309
+
310
+ # def const_missing(const)
311
+ # super
312
+ # rescue NameError => ex
313
+ # raise NameError, "uninitialized constant #{const}"
314
+ # end
315
+
316
+ private
317
# Returns the connection to an HBase table, opening it on first use and
# keeping it in @tables for later calls.
#
# Raises ArgumentError when the table name is empty and
# BigRecordDriver::TableNotFound when the named table does not exist.
def connect_table(table_name)
  safe_exec do
    key = table_name.to_s
    return @tables[key] if @tables.has_key?(key)

    unless table_exists?(key)
      raise ArgumentError, "Table name not specified" if !key or key.empty?
      raise BigRecordDriver::TableNotFound, key
    end

    @tables[key] = HTable.new(@conf, key)
  end
end
335
+
336
# Builds the HBase configuration from @config, creates the admin object and
# starts with an empty table connection cache.
def init_connection
  safe_exec do
    @conf = HBaseConfiguration.new

    # HBase property name paired with the @config key that supplies its value
    [
      ['hbase.zookeeper.quorum', :zookeeper_quorum],
      ['hbase.zookeeper.property.clientPort', :zookeeper_client_port]
    ].each do |property, config_key|
      @conf.set(property, @config[config_key].to_s)
    end

    @admin = HBaseAdmin.new(@conf)
    @tables = {}
  end
end
345
+
346
# Builds the HBase HColumnDescriptor matching the given column descriptor.
#
# The name, compression ('none', 'gz' or 'lzo'), versions and in_memory
# settings are read from column_descriptor; everything that is not given falls
# back to the HColumnDescriptor defaults. A trailing ':' is appended in place
# to the descriptor's name when it is missing.
#
# Raises ArgumentError when the name is missing or the compression type is
# unknown, and a RuntimeError when bloom_filter is set.
def generate_column_descriptor(column_descriptor)
  raise ArgumentError, "a column descriptor is missing a name" unless column_descriptor.name
  raise "bloom_filter option not supported yet" if column_descriptor.bloom_filter

  codec_name = nil
  if column_descriptor.compression
    codec_name =
      case column_descriptor.compression.to_s
      when 'none' then Compression::Algorithm::NONE.getName()
      when 'gz'   then Compression::Algorithm::GZ.getName()
      when 'lzo'  then Compression::Algorithm::LZO.getName()
      else
        raise ArgumentError, "Invalid compression type: #{column_descriptor.compression} for the column_family #{column_descriptor.name}"
      end
  end

  requested_versions = column_descriptor.versions
  requested_in_memory = column_descriptor.in_memory

  # set the default values of the missing parameters
  n_versions  = requested_versions || HColumnDescriptor::DEFAULT_VERSIONS
  compression = codec_name || HColumnDescriptor::DEFAULT_COMPRESSION
  in_memory   = requested_in_memory || HColumnDescriptor::DEFAULT_IN_MEMORY
  block_cache = HColumnDescriptor::DEFAULT_BLOCKCACHE
  block_size  = HColumnDescriptor::DEFAULT_BLOCKSIZE
  bloomfilter = HColumnDescriptor::DEFAULT_BLOOMFILTER
  ttl         = HColumnDescriptor::DEFAULT_TTL

  # add the ':' at the end if the user didn't specify it
  family_name = column_descriptor.name
  family_name << ":" unless family_name =~ /:$/

  HColumnDescriptor.new(column_descriptor.name.to_bytes,
                        n_versions,
                        compression,
                        in_memory,
                        block_cache,
                        block_size,
                        ttl,
                        bloomfilter)
end
387
+
388
+ end
389
+
390
+ end
391
+
392
# Start a DRb service exposing an HbaseServer on the port given as the first
# command line argument (40000 when none is given), then wait on the DRb thread.
port = ARGV[0] || 40000
server = BigRecordDriver::HbaseServer.new
DRb.start_service("druby://:#{port}", server)
puts "Started drb server on port #{port}."
DRb.thread.join
@@ -0,0 +1,6 @@
1
+ LIB_ROOT = File.dirname(__FILE__)
2
+
3
+ require LIB_ROOT + '/big_record_driver/client'
4
+ require LIB_ROOT + '/big_record_driver/exceptions'
5
+ require LIB_ROOT + '/big_record_driver/column_descriptor'
6
+ require LIB_ROOT + '/big_record_driver/driver_manager'
@@ -0,0 +1 @@
1
+ require 'big_record_driver'