hbase-jruby 0.2.6-java → 0.3.0-java
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG.md +16 -0
- data/README.md +303 -207
- data/hbase-jruby.gemspec +1 -1
- data/lib/hbase-jruby/byte_array.rb +25 -5
- data/lib/hbase-jruby/cell.rb +21 -10
- data/lib/hbase-jruby/dependency.rb +1 -5
- data/lib/hbase-jruby/hbase.rb +16 -1
- data/lib/hbase-jruby/row.rb +123 -260
- data/lib/hbase-jruby/schema.rb +115 -0
- data/lib/hbase-jruby/scoped/aggregation.rb +14 -0
- data/lib/hbase-jruby/scoped.rb +30 -23
- data/lib/hbase-jruby/table.rb +44 -22
- data/lib/hbase-jruby/util.rb +39 -5
- data/lib/hbase-jruby/version.rb +1 -1
- data/lib/hbase-jruby.rb +13 -13
- data/test/helper.rb +7 -1
- data/test/test_aggregation.rb +1 -1
- data/test/test_byte_array.rb +1 -1
- data/test/test_cell.rb +4 -5
- data/test/test_schema.rb +275 -0
- data/test/test_scoped.rb +33 -30
- data/test/test_table.rb +49 -86
- data/test/test_table_admin.rb +3 -3
- data/test/test_util.rb +7 -7
- metadata +5 -5
- data/lib/hbase-jruby/column_key.rb +0 -72
- data/test/test_column_key.rb +0 -49
data/README.md
CHANGED
@@ -7,52 +7,99 @@
|
|
7
7
|
- ActiveRecord-like method chaining for data retrieval
|
8
8
|
- Automatic Hadoop/HBase dependency resolution
|
9
9
|
|
10
|
+
## Installation
|
11
|
+
|
12
|
+
gem install hbase-jruby
|
13
|
+
|
10
14
|
## A quick example
|
11
15
|
|
12
16
|
```ruby
|
13
17
|
require 'hbase-jruby'
|
14
18
|
|
15
|
-
|
19
|
+
# Load required JAR files from CDH distribution using Maven
|
20
|
+
HBase.resolve_dependency! 'cdh4.2.1'
|
16
21
|
|
22
|
+
# Connect to HBase on localhost
|
17
23
|
hbase = HBase.new
|
18
|
-
|
24
|
+
|
25
|
+
# Define table schema for easier data access
|
26
|
+
hbase.schema = {
|
27
|
+
book: {
|
28
|
+
# Columns in cf1 family
|
29
|
+
cf1: {
|
30
|
+
title: :string,
|
31
|
+
author: :string,
|
32
|
+
category: :string,
|
33
|
+
year: :short,
|
34
|
+
pages: :fixnum,
|
35
|
+
price: :bigdecimal,
|
36
|
+
weight: :float,
|
37
|
+
in_print: :boolean
|
38
|
+
},
|
39
|
+
# Columns in cf2 family
|
40
|
+
cf2: {
|
41
|
+
summary: :string,
|
42
|
+
reviews: :fixnum,
|
43
|
+
stars: :fixnum,
|
44
|
+
/^comment\d+/ => :string
|
45
|
+
}
|
46
|
+
}
|
47
|
+
}
|
48
|
+
|
49
|
+
# Create book table with two column families
|
50
|
+
table = hbase[:book]
|
51
|
+
unless table.exists?
|
52
|
+
table.create! cf1: { min_versions: 2 },
|
53
|
+
cf2: { bloomfilter: :rowcol, versions: 5 }
|
54
|
+
end
|
19
55
|
|
20
56
|
# PUT
|
21
|
-
table.put
|
57
|
+
table.put 1 => {
|
58
|
+
title: 'The Golden Bough: A Study of Magic and Religion',
|
59
|
+
author: 'Sir James G. Frazer',
|
60
|
+
category: 'Occult',
|
61
|
+
year: 1890,
|
62
|
+
pages: 1006,
|
63
|
+
price: BigDecimal('21.50'),
|
64
|
+
weight: 3.0,
|
65
|
+
in_print: true,
|
66
|
+
summary: 'A wide-ranging, comparative study of mythology and religion',
|
67
|
+
reviews: 52,
|
68
|
+
stars: 226,
|
69
|
+
comment1: 'A must-have',
|
70
|
+
comment2: 'Rewarding purchase'
|
71
|
+
}
|
22
72
|
|
23
73
|
# GET
|
24
|
-
|
25
|
-
|
26
|
-
|
74
|
+
book = table.get(1)
|
75
|
+
title = book[:title]
|
76
|
+
comment2 = book[:comment2]
|
77
|
+
as_hash = book.to_h
|
27
78
|
|
28
79
|
# SCAN
|
29
|
-
table.range(
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
80
|
+
table.range(0..100)
|
81
|
+
.filter(year: 1880...1900,
|
82
|
+
in_print: true,
|
83
|
+
category: ['Comics', 'Fiction', /cult/i],
|
84
|
+
price: { lt: BigDecimal('30.00') },
|
85
|
+
summary: /myth/i)
|
86
|
+
.project(:cf1, :reviews)
|
87
|
+
.each do |book|
|
88
|
+
|
89
|
+
# Update columns
|
90
|
+
table.put book.rowkey, price: book[:price] + BigDecimal('1')
|
91
|
+
|
92
|
+
# Atomic increment
|
93
|
+
table.increment book.rowkey, reviews: 1, stars: 5
|
94
|
+
|
95
|
+
# Delete a column
|
96
|
+
table.delete book.rowkey, :comment1
|
37
97
|
end
|
38
98
|
|
39
|
-
#
|
40
|
-
table.delete
|
99
|
+
# Delete row
|
100
|
+
table.delete 1
|
41
101
|
```
|
42
102
|
|
43
|
-
## Installation
|
44
|
-
|
45
|
-
### From Rubygems
|
46
|
-
|
47
|
-
gem install hbase-jruby
|
48
|
-
|
49
|
-
### From source
|
50
|
-
|
51
|
-
git clone -b devel https://github.com/junegunn/hbase-jruby.git
|
52
|
-
cd hbase-jruby
|
53
|
-
rake build
|
54
|
-
gem install pkg/hbase-jruby-0.2.2-java.gem
|
55
|
-
|
56
103
|
## Setting up
|
57
104
|
|
58
105
|
### Resolving Hadoop/HBase dependency
|
@@ -72,7 +119,7 @@ Call `HBase.resolve_dependency!` helper method passing one of the arguments list
|
|
72
119
|
| cdh4.1[.*] | Cloudera CDH4.1 | cdh4.1.4 | mvn |
|
73
120
|
| cdh3[u*] | Cloudera CDH3 | cdh3u6 | mvn |
|
74
121
|
| 0.95[.*] | Apache HBase 0.95 | 0.95.0 | mvn |
|
75
|
-
| 0.94[.*] | Apache HBase 0.94 | 0.94.
|
122
|
+
| 0.94[.*] | Apache HBase 0.94 | 0.94.7 | mvn |
|
76
123
|
| 0.92[.*] | Apache HBase 0.92 | 0.92.2 | mvn |
|
77
124
|
| *POM PATH* | Custom Maven POM file | - | mvn |
|
78
125
|
| `:local` | Local HBase installation | - | hbase |
|
@@ -84,16 +131,16 @@ Call `HBase.resolve_dependency!` helper method passing one of the arguments list
|
|
84
131
|
|
85
132
|
```ruby
|
86
133
|
# Load JAR files from CDH4 using Maven
|
87
|
-
HBase.resolve_dependency! 'cdh4.2.
|
134
|
+
HBase.resolve_dependency! 'cdh4.2.1'
|
88
135
|
HBase.resolve_dependency! 'cdh4.1.3'
|
89
136
|
|
90
137
|
# Load JAR files of HBase 0.94.x using Maven
|
91
|
-
HBase.resolve_dependency! '0.94.
|
92
|
-
HBase.resolve_dependency! '0.94.2', :
|
138
|
+
HBase.resolve_dependency! '0.94.7'
|
139
|
+
HBase.resolve_dependency! '0.94.2', verbose: true
|
93
140
|
|
94
141
|
# Dependency resolution with custom POM file
|
95
142
|
HBase.resolve_dependency! '/path/to/my/pom.xml'
|
96
|
-
HBase.resolve_dependency! '/path/to/my/pom.xml', :
|
143
|
+
HBase.resolve_dependency! '/path/to/my/pom.xml', profile: 'trunk'
|
97
144
|
|
98
145
|
# Load JAR files from local HBase installation
|
99
146
|
# (equivalent to: export CLASSPATH=$CLASSPATH:`hbase classpath`)
|
@@ -126,10 +173,10 @@ hbase = HBase.new
|
|
126
173
|
hbase = HBase.new 'hbase.zookeeper.quorum' => 'remote-server.mydomain.net'
|
127
174
|
|
128
175
|
# Extra configuration
|
129
|
-
hbase = HBase.new 'hbase.zookeeper.quorum'
|
130
|
-
'hbase.client.retries.number'
|
176
|
+
hbase = HBase.new 'hbase.zookeeper.quorum' => 'remote-server.mydomain.net',
|
177
|
+
'hbase.client.retries.number' => 3,
|
131
178
|
'hbase.client.scanner.caching' => 1000,
|
132
|
-
'hbase.rpc.timeout'
|
179
|
+
'hbase.rpc.timeout' => 120000
|
133
180
|
|
134
181
|
# Close HBase connection
|
135
182
|
hbase.close
|
@@ -154,132 +201,190 @@ table = hbase[:test_table]
|
|
154
201
|
table.drop! if table.exists?
|
155
202
|
|
156
203
|
# Create table with two column families
|
157
|
-
table.create! :
|
158
|
-
:
|
204
|
+
table.create! cf1: {},
|
205
|
+
cf2: { compression: :snappy, bloomfilter: :row }
|
159
206
|
```
|
160
207
|
|
161
208
|
## Basic operations
|
162
209
|
|
210
|
+
### Defining table schema for easier data access
|
211
|
+
|
212
|
+
HBase stores everything as plain Java byte arrays. So it's completely up to
|
213
|
+
users to encode and decode column values of various types into and from byte
|
214
|
+
arrays, which is quite a tedious and error-prone task.
|
215
|
+
|
216
|
+
To remedy this situation, `hbase-jruby` implements the concept of table schema.
|
217
|
+
|
218
|
+
Using table schema greatly simplifies the way you access data:
|
219
|
+
- With schema, byte array conversion becomes automatic
|
220
|
+
- It allows you to omit column family names (e.g. `:title` instead of `"cf1:title"`)
|
221
|
+
|
222
|
+
We'll use the following schema throughout the examples.
|
223
|
+
|
224
|
+
```ruby
|
225
|
+
hbase.schema = {
|
226
|
+
# Schema for `book` table
|
227
|
+
book: {
|
228
|
+
# Columns in cf1 family
|
229
|
+
cf1: {
|
230
|
+
title: :string,
|
231
|
+
author: :string,
|
232
|
+
category: :string,
|
233
|
+
year: :short,
|
234
|
+
pages: :fixnum,
|
235
|
+
price: :bigdecimal,
|
236
|
+
weight: :float,
|
237
|
+
in_print: :boolean
|
238
|
+
},
|
239
|
+
# Columns in cf2 family
|
240
|
+
cf2: {
|
241
|
+
summary: :string,
|
242
|
+
reviews: :fixnum,
|
243
|
+
stars: :fixnum,
|
244
|
+
/^comment\d+/ => :string
|
245
|
+
}
|
246
|
+
}
|
247
|
+
}
|
248
|
+
```
|
249
|
+
|
250
|
+
Columns that are not defined in the schema can be referenced
|
251
|
+
using `FAMILY:QUALIFIER` notation or 2-element Array of column family name (as Symbol) and qualifier,
|
252
|
+
however since there's no type information, they are returned as Java byte arrays,
|
253
|
+
which have to be decoded manually.
|
254
|
+
|
163
255
|
### PUT
|
164
256
|
|
165
257
|
```ruby
|
166
258
|
# Putting a single row
|
167
|
-
|
259
|
+
# - Row keys can be of any type, in this case, we use String type
|
260
|
+
table.put 'rowkey1', title: "Hello World", year: 2013
|
168
261
|
|
169
262
|
# Putting multiple rows
|
170
|
-
table.put 'rowkey1' => {
|
171
|
-
'rowkey2' => {
|
172
|
-
'rowkey3' => {
|
263
|
+
table.put 'rowkey1' => { title: 'foo', year: 2013 },
|
264
|
+
'rowkey2' => { title: "bar", year: 2014 },
|
265
|
+
'rowkey3' => { title: 'foobar', year: 2015 }
|
173
266
|
|
174
267
|
# Putting values with timestamps
|
175
268
|
table.put 'rowkey1' => {
|
176
|
-
|
177
|
-
1353143856665 => "Hello",
|
178
|
-
1352978648642 => "Goodbye"
|
179
|
-
|
269
|
+
title: {
|
270
|
+
1353143856665 => "Hello world",
|
271
|
+
1352978648642 => "Goodbye world"
|
272
|
+
},
|
273
|
+
year: 2013
|
180
274
|
}
|
181
275
|
```
|
182
276
|
|
183
277
|
### GET
|
184
278
|
|
185
|
-
HBase stores everything as a byte array, so when you fetch data from HBase,
|
186
|
-
you need to explicitly specify the type of each value stored.
|
187
|
-
|
188
279
|
```ruby
|
189
|
-
|
280
|
+
book = table.get('rowkey1')
|
190
281
|
|
191
282
|
# Rowkey
|
192
|
-
|
283
|
+
rowkey = book.rowkey # Rowkey as raw Java byte array
|
284
|
+
rowkey = book.rowkey :string # Rowkey as String
|
285
|
+
|
286
|
+
# Access columns in schema
|
287
|
+
title = book[:title]
|
288
|
+
author = book[:author]
|
289
|
+
year = book[:year]
|
193
290
|
|
194
|
-
#
|
195
|
-
|
291
|
+
# Convert to simple Hash
|
292
|
+
hash = book.to_h
|
196
293
|
|
197
|
-
#
|
198
|
-
|
199
|
-
col2 = row.fixnum 'cf1:col2'
|
200
|
-
col3 = row.bigdecimal 'cf1:col3'
|
201
|
-
col4 = row.float 'cf1:col4'
|
202
|
-
col5 = row.boolean 'cf1:col5'
|
203
|
-
col6 = row.symbol 'cf1:col6'
|
294
|
+
# Convert to Hash containing all versions of values indexed by their timestamps
|
295
|
+
all_hash = book.to_H
|
204
296
|
|
205
|
-
#
|
206
|
-
|
207
|
-
|
297
|
+
# Columns not defined in the schema are returned as Java byte arrays
|
298
|
+
# They need to be decoded manually
|
299
|
+
extra = HBase::Util.from_bytes(:bigdecimal, book['cf2:extra'])
|
300
|
+
# or, simply
|
301
|
+
extra = book.bigdecimal 'cf2:extra'
|
208
302
|
```
|
209
303
|
|
210
|
-
|
304
|
+
### Batch-GET
|
211
305
|
|
212
306
|
```ruby
|
213
307
|
# Pass an array of row keys as the parameter
|
214
|
-
|
308
|
+
books = table.get(['rowkey1', 'rowkey2', 'rowkey3'])
|
215
309
|
```
|
216
310
|
|
217
|
-
####
|
311
|
+
#### `to_h`
|
218
312
|
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
313
|
+
`to_h` and `to_H` return the Hash representation of the row.
|
314
|
+
(The latter returns all values with their timestamp)
|
315
|
+
|
316
|
+
If a column is defined in the schema, it is referenced using its qualifier in Symbol type.
|
317
|
+
If a column is not defined, it is represented as a 2-element Array
|
318
|
+
of column family in Symbol and column qualifier as ByteArray.
|
319
|
+
Even so, to make it easier to reference those columns, an extended version of
|
320
|
+
Hash is returned with which you can also reference them with `FAMILY:QUALIFIER`
|
321
|
+
notation or `[cf, cq]` array notation.
|
223
322
|
|
224
|
-
|
225
|
-
|
226
|
-
#
|
227
|
-
#
|
228
|
-
|
229
|
-
#
|
323
|
+
```ruby
|
324
|
+
table.put 1000 => {
|
325
|
+
title: 'Hello world', # Known column
|
326
|
+
comment100: 'foo', # Known column
|
327
|
+
'cf2:extra' => 'bar', # Unknown column
|
328
|
+
[:cf2, 10] => 'foobar' # Unknown column, non-string qualifier
|
329
|
+
}
|
230
330
|
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
331
|
+
book = table.get 1000
|
332
|
+
hash = book.to_h
|
333
|
+
# {
|
334
|
+
# :title => "Hello world",
|
335
|
+
# [:cf2, HBase::ByteArray<0, 0, 0, 0, 0, 0, 0, 10>] =>
|
336
|
+
# byte[102, 111, 111, 98, 97, 114]@6f28bb44,
|
337
|
+
# :comment100 => "foo",
|
338
|
+
# [:cf2, HBase::ByteArray<101, 120, 116, 114, 97>] =>
|
339
|
+
# byte[98, 97, 114]@77190cfc
|
340
|
+
# }
|
341
|
+
|
342
|
+
hash['cf2:extra']
|
343
|
+
# byte[98, 97, 114]@77190cfc
|
344
|
+
|
345
|
+
hash[%w[cf2 extra]]
|
346
|
+
# byte[98, 97, 114]@77190cfc
|
347
|
+
|
348
|
+
hash[[:cf2, HBase::ByteArray['extra']]]
|
349
|
+
# byte[98, 97, 114]@77190cfc
|
350
|
+
|
351
|
+
hash['cf2:extra'].to_s
|
352
|
+
# 'bar'
|
353
|
+
|
354
|
+
# Columns with non-string qualifiers must be referenced using 2-element Array notation
|
355
|
+
hash['cf2:10']
|
356
|
+
# nil
|
357
|
+
hash[[:cf2, 10]]
|
358
|
+
# byte[102, 111, 111, 98, 97, 114]@6f28bb44
|
359
|
+
|
360
|
+
hash_with_versions = book.to_H
|
361
|
+
# {
|
362
|
+
# :title => {1369019227766 => "Hello world"},
|
363
|
+
# [:cf2, HBase::ByteArray<0, 0, 0, 0, 0, 0, 0, 10>] =>
|
364
|
+
# {1369019227766 => byte[102, 111, 111, 98, 97, 114]@6f28bb44},
|
365
|
+
# :comment100 => {1369019227766 => "foo"},
|
366
|
+
# [:cf2, HBase::ByteArray<101, 120, 116, 114, 97>] =>
|
367
|
+
# {1369019227766 => byte[98, 97, 114]@77190cfc}
|
368
|
+
# }
|
239
369
|
```
|
240
370
|
|
241
371
|
#### Intra-row scan
|
242
372
|
|
243
|
-
Intra-row scan can be done
|
373
|
+
Intra-row scan can be done using `each` method which yields `HBase::Cell` instances.
|
244
374
|
|
245
375
|
```ruby
|
246
376
|
# Intra-row scan (all versions)
|
247
377
|
row.each do |cell|
|
248
378
|
family = cell.family
|
249
|
-
qualifier = cell.qualifier
|
379
|
+
qualifier = cell.qualifier :string # Column qualifier as String
|
250
380
|
timestamp = cell.timestamp
|
251
|
-
|
252
|
-
# Cell value as Java byte array
|
253
|
-
bytes = cell.raw
|
254
|
-
|
255
|
-
# Typed access
|
256
|
-
# value_as_string = cell.string
|
257
|
-
# value_as_fixnum = cell.fixnum
|
258
|
-
# ...
|
381
|
+
value = cell.value
|
259
382
|
end
|
260
383
|
|
261
384
|
# Array of HBase::Cells
|
262
385
|
cells = row.to_a
|
263
386
|
```
|
264
387
|
|
265
|
-
#### `to_hash`
|
266
|
-
|
267
|
-
```ruby
|
268
|
-
# Returns the Hash representation of the record with the specified schema
|
269
|
-
schema = {
|
270
|
-
'cf1:col1' => :string,
|
271
|
-
'cf1:col2' => :fixnum,
|
272
|
-
'cf1:col3' => :bigdecimal,
|
273
|
-
'cf1:col4' => :float,
|
274
|
-
'cf1:col5' => :boolean,
|
275
|
-
'cf1:col6' => :symbol }
|
276
|
-
|
277
|
-
table.get('rowkey1').to_hash(schema)
|
278
|
-
|
279
|
-
# Returns all versions for each column indexed by their timestamps
|
280
|
-
table.get('rowkey1').to_hash_with_versions(schema)
|
281
|
-
```
|
282
|
-
|
283
388
|
### DELETE
|
284
389
|
|
285
390
|
```ruby
|
@@ -287,23 +392,23 @@ table.get('rowkey1').to_hash_with_versions(schema)
|
|
287
392
|
table.delete('rowkey1')
|
288
393
|
|
289
394
|
# Deletes all columns in the specified column family
|
290
|
-
table.delete('rowkey1',
|
395
|
+
table.delete('rowkey1', :cf1)
|
291
396
|
|
292
397
|
# Deletes a column
|
293
|
-
table.delete('rowkey1',
|
398
|
+
table.delete('rowkey1', :author)
|
294
399
|
|
295
400
|
# Deletes a column with empty qualifier.
|
296
401
|
# (!= deleting the entire columns in the family. See the trailing colon.)
|
297
402
|
table.delete('rowkey1', 'cf1:')
|
298
403
|
|
299
404
|
# Deletes a version of a column
|
300
|
-
table.delete('rowkey1',
|
405
|
+
table.delete('rowkey1', :author, 1352978648642)
|
301
406
|
|
302
407
|
# Deletes multiple versions of a column
|
303
|
-
table.delete('rowkey1',
|
408
|
+
table.delete('rowkey1', :author, 1352978648642, 1352978649642)
|
304
409
|
|
305
410
|
# Batch delete
|
306
|
-
table.delete(['rowkey1'], ['rowkey2'], ['rowkey3',
|
411
|
+
table.delete(['rowkey1'], ['rowkey2'], ['rowkey3', :author, 1352978648642, 1352978649642])
|
307
412
|
```
|
308
413
|
|
309
414
|
However, the last syntax seems a bit unwieldy when you just wish to delete a few rows.
|
@@ -318,11 +423,16 @@ table.delete_row 'rowkey1', 'rowkey2', 'rowkey3'
|
|
318
423
|
### Atomic increment of column values
|
319
424
|
|
320
425
|
```ruby
|
321
|
-
# Atomically increase
|
322
|
-
table.increment('rowkey1',
|
426
|
+
# Atomically increase cf2:reviews by one
|
427
|
+
table.increment('rowkey1', reviews: 1)
|
323
428
|
|
324
|
-
# Atomically increase two columns by one and
|
325
|
-
table.increment('rowkey1',
|
429
|
+
# Atomically increase two columns by one and five respectively
|
430
|
+
table.increment('rowkey1', reviews: 1, stars: 5)
|
431
|
+
|
432
|
+
# Increase column values of multiple rows.
|
433
|
+
# - Atomicity is only guaranteed within each row.
|
434
|
+
table.increment 'rowkey1' => { reviews: 1, stars: 5 },
|
435
|
+
'rowkey2' => { reviews: 1, stars: 3 }
|
326
436
|
```
|
327
437
|
|
328
438
|
### SCAN
|
@@ -332,10 +442,11 @@ table.increment('rowkey1', 'cf1:counter' => 1, 'cf1:counter2' => 2)
|
|
332
442
|
```ruby
|
333
443
|
# Full scan
|
334
444
|
table.each do |row|
|
335
|
-
|
336
|
-
name = row.string('cf:name')
|
337
|
-
# ...
|
445
|
+
p row.to_h
|
338
446
|
end
|
447
|
+
|
448
|
+
# Returns Enumerator when block is not given
|
449
|
+
table.each.with_index.each_slice(10).to_a
|
339
450
|
```
|
340
451
|
|
341
452
|
## Scoped access
|
@@ -374,13 +485,13 @@ you can retrieve data with the following methods.
|
|
374
485
|
import org.apache.hadoop.hbase.filter.RandomRowFilter
|
375
486
|
|
376
487
|
table.range('A'..'Z'). # Row key range,
|
377
|
-
project(
|
488
|
+
project(:author). # Select cf1:author column
|
378
489
|
project('cf2'). # Select cf2 family as well
|
379
|
-
filter(
|
380
|
-
filter(
|
381
|
-
filter(
|
490
|
+
filter(category: 'Comics'). # Filter by cf1:category value
|
491
|
+
filter(year: [1990, 2000, 2010]). # Set-inclusion condition on cf1:year
|
492
|
+
filter(weight: 2.0..4.0). # Range filter on cf1:weight
|
382
493
|
filter(RandomRowFilter.new(0.5)). # Any Java HBase filter
|
383
|
-
while(
|
494
|
+
while(reviews: { gt: 20 }). # Early termination of scan
|
384
495
|
time_range(Time.now - 600, Time.now). # Scan data of the last 10 minutes
|
385
496
|
limit(10). # Limits the size of the result set
|
386
497
|
versions(2). # Only fetches 2 versions for each value
|
@@ -389,7 +500,7 @@ table.range('A'..'Z'). # Row key range,
|
|
389
500
|
with_java_scan { |scan| # Directly access Java Scan object
|
390
501
|
scan.setCacheBlocks false
|
391
502
|
}.
|
392
|
-
to_a # To Array
|
503
|
+
to_a # To Array of HBase::Rows
|
393
504
|
```
|
394
505
|
|
395
506
|
### *range*
|
@@ -420,15 +531,15 @@ Optionally, prefix filter can be applied as follows.
|
|
420
531
|
# Row keys with "APPLE" prefix
|
421
532
|
# Start key is automatically set to "APPLE",
|
422
533
|
# stop key "APPLF" to avoid unnecessary disk access
|
423
|
-
table.range(:
|
534
|
+
table.range(prefix: 'APPLE')
|
424
535
|
|
425
536
|
# Row keys with "ACE", "BLUE" or "APPLE" prefix
|
426
537
|
# Start key is automatically set to "ACE",
|
427
538
|
# stop key "BLUF"
|
428
|
-
table.range(:
|
539
|
+
table.range(prefix: ['ACE', 'BLUE', 'APPLE'])
|
429
540
|
|
430
541
|
# Prefix filter with start key and stop key.
|
431
|
-
table.range('ACE', 'BLUEMARINE', :
|
542
|
+
table.range('ACE', 'BLUEMARINE', prefix: ['ACE', 'BLUE', 'APPLE'])
|
432
543
|
```
|
433
544
|
|
434
545
|
Subsequent calls to `#range` override the range previously defined.
|
@@ -437,7 +548,7 @@ Subsequent calls to `#range` override the range previously defined.
|
|
437
548
|
# Previous ranges are discarded
|
438
549
|
scope.range(1, 100).
|
439
550
|
range(50..100).
|
440
|
-
range(:
|
551
|
+
range(prefix: 'A').
|
441
552
|
range(1, 1000)
|
442
553
|
# Same as `scope.range(1, 1000)`
|
443
554
|
```
|
@@ -451,27 +562,24 @@ Multiple calls have conjunctive effects.
|
|
451
562
|
# Range scanning the table with filters
|
452
563
|
table.range(nil, 1000).
|
453
564
|
filter(
|
454
|
-
#
|
455
|
-
|
456
|
-
'cf1:b' => 1024,
|
565
|
+
# Equality match
|
566
|
+
year: 2013,
|
457
567
|
|
458
568
|
# Range of numbers or characters: Checks if the value falls within the range
|
459
|
-
|
460
|
-
|
569
|
+
weight: 2.0..4.0,
|
570
|
+
author: 'A'..'C',
|
461
571
|
|
462
572
|
# Regular expression: Checks if the value matches the regular expression
|
463
|
-
|
573
|
+
summary: /classic$/i,
|
464
574
|
|
465
575
|
# Hash: Tests the value with 6 types of operators (:gt, :lt, :gte, :lte, :eq, :ne)
|
466
|
-
|
467
|
-
'cf1:g' => { ne: 1000 },
|
576
|
+
reviews: { gt: 100, lte: 200 },
|
468
577
|
|
469
578
|
# Array of the aforementioned types: OR condition (disjunctive)
|
470
|
-
|
471
|
-
'cf1:i' => ['A'...'B', 'C', /^D/, { lt: 'F' }]).
|
579
|
+
category: ['Fiction', 'Comic', /science/i, { ne: 'Political Science' }]).
|
472
580
|
|
473
581
|
# Multiple calls for conjunctive filtering
|
474
|
-
filter(
|
582
|
+
filter(summary: /instant/i).
|
475
583
|
|
476
584
|
# Any number of Java filters can be applied
|
477
585
|
filter(org.apache.hadoop.hbase.filter.RandomRowFilter.new(0.5)).
|
@@ -489,12 +597,12 @@ See the following example.
|
|
489
597
|
|
490
598
|
```ruby
|
491
599
|
(0...30).each do |idx|
|
492
|
-
table.put idx,
|
600
|
+
table.put idx, year: 2000 + idx % 10
|
493
601
|
end
|
494
602
|
|
495
|
-
table.filter(
|
603
|
+
table.filter(year: { lte: 2001 }).map { |r| r.rowkey :fixnum }
|
496
604
|
# [0, 1, 10, 11, 20, 21]
|
497
|
-
table.while(
|
605
|
+
table.while(year: { lte: 2001 }).map { |r| r.rowkey :fixnum }
|
498
606
|
# [0, 1]
|
499
607
|
# Scan terminates immediately when condition not met.
|
500
608
|
```
|
@@ -505,9 +613,9 @@ table.while('cf1:a' => { lte: 1 }).map { |r| r.rowkey :fixnum }
|
|
505
613
|
Multiple calls have additive effects.
|
506
614
|
|
507
615
|
```ruby
|
508
|
-
# Fetches cf1:
|
509
|
-
scoped.project(
|
510
|
-
project(
|
616
|
+
# Fetches cf1:title, cf1:author, and all columns in column family cf2 and cf3
|
617
|
+
scoped.project(:title, :author, :cf2).
|
618
|
+
project(:cf3)
|
511
619
|
```
|
512
620
|
|
513
621
|
HBase filters can not only filter rows but also columns.
|
@@ -519,17 +627,17 @@ to pass column filter to filter method.
|
|
519
627
|
```ruby
|
520
628
|
# Column prefix filter:
|
521
629
|
# Fetch columns whose qualifiers start with the specified prefixes
|
522
|
-
scoped.project(:
|
523
|
-
project(:
|
630
|
+
scoped.project(prefix: 'alice').
|
631
|
+
project(prefix: %w[alice bob])
|
524
632
|
|
525
633
|
# Column range filter:
|
526
634
|
# Fetch columns whose qualifiers fall within the ranges
|
527
|
-
scoped.project(:
|
528
|
-
project(:
|
635
|
+
scoped.project(range: 'a'...'c').
|
636
|
+
project(range: ['i'...'k', 'x'...'z'])
|
529
637
|
|
530
638
|
# Column pagination filter:
|
531
639
|
# Fetch columns within the specified intra-scan offset and limit
|
532
|
-
scoped.project(:
|
640
|
+
scoped.project(offset: 1000, limit: 10)
|
533
641
|
```
|
534
642
|
|
535
643
|
When using column filters on *fat* rows with many columns,
|
@@ -540,7 +648,7 @@ However setting batch size allows multiple rows with the same row key are return
|
|
540
648
|
```ruby
|
541
649
|
# Let's say that we have rows with more than 10 columns whose qualifiers start with `str`
|
542
650
|
puts scoped.range(1..100).
|
543
|
-
project(:
|
651
|
+
project(prefix: 'str').
|
544
652
|
batch(10).
|
545
653
|
map { |row| [row.rowkey(:fixnum), row.count].map(&:to_s).join ': ' }
|
546
654
|
|
@@ -556,12 +664,10 @@ puts scoped.range(1..100).
|
|
556
664
|
### Scoped SCAN / GET
|
557
665
|
|
558
666
|
```ruby
|
559
|
-
scoped = table.versions(1)
|
560
|
-
|
561
|
-
|
562
|
-
|
563
|
-
range('rowkey0'..'rowkey2') # Range of rowkeys.
|
564
|
-
project('cf1', 'cf2:x') # Projection
|
667
|
+
scoped = table.versions(1) # Limits the number of versions
|
668
|
+
.filter(year: 1990...2000)
|
669
|
+
.range('rowkey0'..'rowkey2') # Range of rowkeys.
|
670
|
+
.project('cf1', 'cf2:x') # Projection
|
565
671
|
|
566
672
|
# Scoped GET
|
567
673
|
# Nonexistent or filtered rows are returned as nils
|
@@ -603,22 +709,22 @@ of the projected columns.
|
|
603
709
|
|
604
710
|
```ruby
|
605
711
|
# cf2:reviews must hold 8-byte integer values
|
606
|
-
table.project(
|
607
|
-
table.project(
|
608
|
-
table.project(
|
609
|
-
table.project(
|
610
|
-
table.project(
|
611
|
-
table.project(
|
712
|
+
table.project(:reviews).aggregate(:sum)
|
713
|
+
table.project(:reviews).aggregate(:avg)
|
714
|
+
table.project(:reviews).aggregate(:min)
|
715
|
+
table.project(:reviews).aggregate(:max)
|
716
|
+
table.project(:reviews).aggregate(:std)
|
717
|
+
table.project(:reviews).aggregate(:row_count)
|
612
718
|
|
613
719
|
# Aggregation of multiple columns
|
614
|
-
table.project(
|
720
|
+
table.project(:reviews, :stars).aggregate(:sum)
|
615
721
|
```
|
616
722
|
|
617
723
|
By default, aggregate method assumes that the projected values are 8-byte integers.
|
618
724
|
For other data types, you can pass your own ColumnInterpreter.
|
619
725
|
|
620
726
|
```ruby
|
621
|
-
table.project(
|
727
|
+
table.project(:price).aggregate(:sum, MyColumnInterpreter.new)
|
622
728
|
```
|
623
729
|
|
624
730
|
## Table inspection
|
@@ -691,8 +797,7 @@ With `regions` method, you can even presplit the new table just like the old one
|
|
691
797
|
```ruby
|
692
798
|
hbase[:dupe_table].create!(
|
693
799
|
table.raw_families,
|
694
|
-
table.raw_properties.merge(
|
695
|
-
:splits => table.regions.map { |r| r[:start_key] }.compact))
|
800
|
+
table.raw_properties.merge(splits: table.regions.map { |r| r[:start_key] }.compact))
|
696
801
|
```
|
697
802
|
|
698
803
|
## Table administration
|
@@ -709,21 +814,22 @@ and come with non-bang, asynchronous counterparts.
|
|
709
814
|
table.create!(
|
710
815
|
# 1st Hash: Column family specification
|
711
816
|
{
|
712
|
-
:
|
713
|
-
:
|
817
|
+
cf1: { compression: :snappy },
|
818
|
+
cf2: { bloomfilter: :row }
|
714
819
|
},
|
715
820
|
|
716
821
|
# 2nd Hash: Table properties
|
717
|
-
:
|
718
|
-
:
|
719
|
-
:
|
822
|
+
max_filesize: 256 * 1024 ** 2,
|
823
|
+
deferred_log_flush: false,
|
824
|
+
splits: [1000, 2000, 3000]
|
825
|
+
)
|
720
826
|
|
721
827
|
# Alter table properties (synchronous with optional block)
|
722
828
|
table.alter!(
|
723
|
-
:
|
724
|
-
:
|
725
|
-
:
|
726
|
-
:
|
829
|
+
max_filesize: 512 * 1024 ** 2,
|
830
|
+
memstore_flushsize: 64 * 1024 ** 2,
|
831
|
+
readonly: false,
|
832
|
+
deferred_log_flush: true
|
727
833
|
) { |progress, total|
|
728
834
|
# Progress report with an optional block
|
729
835
|
puts [progress, total].join('/')
|
@@ -731,10 +837,10 @@ table.alter!(
|
|
731
837
|
|
732
838
|
# Alter table properties (asynchronous)
|
733
839
|
table.alter(
|
734
|
-
:
|
735
|
-
:
|
736
|
-
:
|
737
|
-
:
|
840
|
+
max_filesize: 512 * 1024 ** 2,
|
841
|
+
memstore_flushsize: 64 * 1024 ** 2,
|
842
|
+
readonly: false,
|
843
|
+
deferred_log_flush: true
|
738
844
|
)
|
739
845
|
```
|
740
846
|
|
@@ -780,11 +886,10 @@ http://hbase.apache.org/apidocs/org/apache/hadoop/hbase/HTableDescriptor.html
|
|
780
886
|
|
781
887
|
```ruby
|
782
888
|
# Add column family
|
783
|
-
table.add_family! :cf3, :
|
784
|
-
:bloomfilter => :row
|
889
|
+
table.add_family! :cf3, compression: :snappy, bloomfilter: :row
|
785
890
|
|
786
891
|
# Alter column family
|
787
|
-
table.alter_family! :cf2, :
|
892
|
+
table.alter_family! :cf2, bloomfilter: :rowcol
|
788
893
|
|
789
894
|
# Remove column family
|
790
895
|
table.delete_family! :cf1
|
@@ -797,8 +902,7 @@ table.delete_family! :cf1
|
|
797
902
|
unless table.has_coprocessor?(cp_class_name1)
|
798
903
|
table.add_coprocessor! cp_class_name1
|
799
904
|
end
|
800
|
-
table.add_coprocessor! cp_class_name2,
|
801
|
-
:path => path, :priority => priority, :params => params
|
905
|
+
table.add_coprocessor! cp_class_name2, path: path, priority: priority, params: params
|
802
906
|
|
803
907
|
# Remove coprocessor
|
804
908
|
table.remove_coprocessor! cp_class_name1
|
@@ -868,17 +972,14 @@ table.range('1'..'3').map { |r| r.rowkey :string }
|
|
868
972
|
|
869
973
|
### Non-string column qualifier
|
870
974
|
|
871
|
-
If a column qualifier is not a String,
|
872
|
-
instead of a conventional `FAMILY:QUALIFIER` String.
|
975
|
+
If a column qualifier is not a String, a 2-element Array should be used.
|
873
976
|
|
874
977
|
```ruby
|
875
978
|
table.put 'rowkey',
|
876
|
-
|
877
|
-
|
878
|
-
HBase::ColumnKey(:cf1, bytes) => "Qualifier is an arbitrary byte array"
|
979
|
+
[:cf1, 100 ] => "Byte representation of an 8-byte integer",
|
980
|
+
[:cf1, bytes] => "Qualifier is an arbitrary byte array"
|
879
981
|
|
880
|
-
table.get('rowkey')
|
881
|
-
table.get('rowkey').string(HBase::ColumnKey(:cf1, 100))
|
982
|
+
table.get('rowkey')[:cf1, 100]
|
882
983
|
# ...
|
883
984
|
```
|
884
985
|
|
@@ -895,12 +996,7 @@ table.put({ int: 12345 }, 'cf1:a' => { byte: 100 }, # 1-byte integer
|
|
895
996
|
'cf1:c' => { int: 300 }, # 4-byte integer
|
896
997
|
'cf1:4' => 400) # Ordinary 8-byte integer
|
897
998
|
|
898
|
-
|
899
|
-
|
900
|
-
result.byte('cf1:a') # 100
|
901
|
-
result.short('cf1:b') # 200
|
902
|
-
result.int('cf1:c') # 300
|
903
|
-
# ...
|
999
|
+
row = table.get(int: 12345)
|
904
1000
|
```
|
905
1001
|
|
906
1002
|
### Working with byte arrays
|
@@ -919,7 +1015,7 @@ which makes byte array manipulation much easier.
|
|
919
1015
|
A ByteArray can be created as a concatenation of any number of objects.
|
920
1016
|
|
921
1017
|
```ruby
|
922
|
-
ba = HBase::ByteArray
|
1018
|
+
ba = HBase::ByteArray[100, 3.14, {int: 300}, "Hello World"]
|
923
1019
|
```
|
924
1020
|
|
925
1021
|
Then you can slice it and decode each part,
|
@@ -943,7 +1039,7 @@ ba << { short: 300 }
|
|
943
1039
|
concatenate another ByteArray,
|
944
1040
|
|
945
1041
|
```ruby
|
946
|
-
ba += HBase::ByteArray
|
1042
|
+
ba += HBase::ByteArray[1024]
|
947
1043
|
```
|
948
1044
|
|
949
1045
|
or shift decoded objects from it.
|