hbase-jruby 0.2.6-java → 0.3.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG.md +16 -0
- data/README.md +303 -207
- data/hbase-jruby.gemspec +1 -1
- data/lib/hbase-jruby/byte_array.rb +25 -5
- data/lib/hbase-jruby/cell.rb +21 -10
- data/lib/hbase-jruby/dependency.rb +1 -5
- data/lib/hbase-jruby/hbase.rb +16 -1
- data/lib/hbase-jruby/row.rb +123 -260
- data/lib/hbase-jruby/schema.rb +115 -0
- data/lib/hbase-jruby/scoped/aggregation.rb +14 -0
- data/lib/hbase-jruby/scoped.rb +30 -23
- data/lib/hbase-jruby/table.rb +44 -22
- data/lib/hbase-jruby/util.rb +39 -5
- data/lib/hbase-jruby/version.rb +1 -1
- data/lib/hbase-jruby.rb +13 -13
- data/test/helper.rb +7 -1
- data/test/test_aggregation.rb +1 -1
- data/test/test_byte_array.rb +1 -1
- data/test/test_cell.rb +4 -5
- data/test/test_schema.rb +275 -0
- data/test/test_scoped.rb +33 -30
- data/test/test_table.rb +49 -86
- data/test/test_table_admin.rb +3 -3
- data/test/test_util.rb +7 -7
- metadata +5 -5
- data/lib/hbase-jruby/column_key.rb +0 -72
- data/test/test_column_key.rb +0 -49
data/README.md
CHANGED
@@ -7,52 +7,99 @@
|
|
7
7
|
- ActiveRecord-like method chaining for data retrieval
|
8
8
|
- Automatic Hadoop/HBase dependency resolution
|
9
9
|
|
10
|
+
## Installation
|
11
|
+
|
12
|
+
gem install hbase-jruby
|
13
|
+
|
10
14
|
## A quick example
|
11
15
|
|
12
16
|
```ruby
|
13
17
|
require 'hbase-jruby'
|
14
18
|
|
15
|
-
|
19
|
+
# Load required JAR files from CDH distribution using Maven
|
20
|
+
HBase.resolve_dependency! 'cdh4.2.1'
|
16
21
|
|
22
|
+
# Connect to HBase on localhost
|
17
23
|
hbase = HBase.new
|
18
|
-
|
24
|
+
|
25
|
+
# Define table schema for easier data access
|
26
|
+
hbase.schema = {
|
27
|
+
book: {
|
28
|
+
# Columns in cf1 family
|
29
|
+
cf1: {
|
30
|
+
title: :string,
|
31
|
+
author: :string,
|
32
|
+
category: :string,
|
33
|
+
year: :short,
|
34
|
+
pages: :fixnum,
|
35
|
+
price: :bigdecimal,
|
36
|
+
weight: :float,
|
37
|
+
in_print: :boolean
|
38
|
+
},
|
39
|
+
# Columns in cf2 family
|
40
|
+
cf2: {
|
41
|
+
summary: :string,
|
42
|
+
reviews: :fixnum,
|
43
|
+
stars: :fixnum,
|
44
|
+
/^comment\d+/ => :string
|
45
|
+
}
|
46
|
+
}
|
47
|
+
}
|
48
|
+
|
49
|
+
# Create book table with two column families
|
50
|
+
table = hbase[:book]
|
51
|
+
unless table.exists?
|
52
|
+
table.create! cf1: { min_versions: 2 },
|
53
|
+
cf2: { bloomfilter: :rowcol, versions: 5 }
|
54
|
+
end
|
19
55
|
|
20
56
|
# PUT
|
21
|
-
table.put
|
57
|
+
table.put 1 => {
|
58
|
+
title: 'The Golden Bough: A Study of Magic and Religion',
|
59
|
+
author: 'Sir James G. Frazer',
|
60
|
+
category: 'Occult',
|
61
|
+
year: 1890,
|
62
|
+
pages: 1006,
|
63
|
+
price: BigDecimal('21.50'),
|
64
|
+
weight: 3.0,
|
65
|
+
in_print: true,
|
66
|
+
summary: 'A wide-ranging, comparative study of mythology and religion',
|
67
|
+
reviews: 52,
|
68
|
+
stars: 226,
|
69
|
+
comment1: 'A must-have',
|
70
|
+
comment2: 'Rewarding purchase'
|
71
|
+
}
|
22
72
|
|
23
73
|
# GET
|
24
|
-
|
25
|
-
|
26
|
-
|
74
|
+
book = table.get(1)
|
75
|
+
title = book[:title]
|
76
|
+
comment2 = book[:comment2]
|
77
|
+
as_hash = book.to_h
|
27
78
|
|
28
79
|
# SCAN
|
29
|
-
table.range(
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
80
|
+
table.range(0..100)
|
81
|
+
.filter(year: 1880...1900,
|
82
|
+
in_print: true,
|
83
|
+
category: ['Comics', 'Fiction', /cult/i],
|
84
|
+
price: { lt: BigDecimal('30.00') },
|
85
|
+
summary: /myth/i)
|
86
|
+
.project(:cf1, :reviews)
|
87
|
+
.each do |book|
|
88
|
+
|
89
|
+
# Update columns
|
90
|
+
table.put book.rowkey, price: book[:price] + BigDecimal('1')
|
91
|
+
|
92
|
+
# Atomic increment
|
93
|
+
table.increment book.rowkey, reviews: 1, stars: 5
|
94
|
+
|
95
|
+
# Delete a column
|
96
|
+
table.delete book.rowkey, :comment1
|
37
97
|
end
|
38
98
|
|
39
|
-
#
|
40
|
-
table.delete
|
99
|
+
# Delete row
|
100
|
+
table.delete 1
|
41
101
|
```
|
42
102
|
|
43
|
-
## Installation
|
44
|
-
|
45
|
-
### From Rubygems
|
46
|
-
|
47
|
-
gem install hbase-jruby
|
48
|
-
|
49
|
-
### From source
|
50
|
-
|
51
|
-
git clone -b devel https://github.com/junegunn/hbase-jruby.git
|
52
|
-
cd hbase-jruby
|
53
|
-
rake build
|
54
|
-
gem install pkg/hbase-jruby-0.2.2-java.gem
|
55
|
-
|
56
103
|
## Setting up
|
57
104
|
|
58
105
|
### Resolving Hadoop/HBase dependency
|
@@ -72,7 +119,7 @@ Call `HBase.resolve_dependency!` helper method passing one of the arguments list
|
|
72
119
|
| cdh4.1[.*] | Cloudera CDH4.1 | cdh4.1.4 | mvn |
|
73
120
|
| cdh3[u*] | Cloudera CDH3 | cdh3u6 | mvn |
|
74
121
|
| 0.95[.*] | Apache HBase 0.95 | 0.95.0 | mvn |
|
75
|
-
| 0.94[.*] | Apache HBase 0.94 | 0.94.
|
122
|
+
| 0.94[.*] | Apache HBase 0.94 | 0.94.7 | mvn |
|
76
123
|
| 0.92[.*] | Apache HBase 0.92 | 0.92.2 | mvn |
|
77
124
|
| *POM PATH* | Custom Maven POM file | - | mvn |
|
78
125
|
| `:local` | Local HBase installation | - | hbase |
|
@@ -84,16 +131,16 @@ Call `HBase.resolve_dependency!` helper method passing one of the arguments list
|
|
84
131
|
|
85
132
|
```ruby
|
86
133
|
# Load JAR files from CDH4 using Maven
|
87
|
-
HBase.resolve_dependency! 'cdh4.2.
|
134
|
+
HBase.resolve_dependency! 'cdh4.2.1'
|
88
135
|
HBase.resolve_dependency! 'cdh4.1.3'
|
89
136
|
|
90
137
|
# Load JAR files of HBase 0.94.x using Maven
|
91
|
-
HBase.resolve_dependency! '0.94.
|
92
|
-
HBase.resolve_dependency! '0.94.2', :
|
138
|
+
HBase.resolve_dependency! '0.94.7'
|
139
|
+
HBase.resolve_dependency! '0.94.2', verbose: true
|
93
140
|
|
94
141
|
# Dependency resolution with custom POM file
|
95
142
|
HBase.resolve_dependency! '/path/to/my/pom.xml'
|
96
|
-
HBase.resolve_dependency! '/path/to/my/pom.xml', :
|
143
|
+
HBase.resolve_dependency! '/path/to/my/pom.xml', profile: 'trunk'
|
97
144
|
|
98
145
|
# Load JAR files from local HBase installation
|
99
146
|
# (equivalent to: export CLASSPATH=$CLASSPATH:`hbase classpath`)
|
@@ -126,10 +173,10 @@ hbase = HBase.new
|
|
126
173
|
hbase = HBase.new 'hbase.zookeeper.quorum' => 'remote-server.mydomain.net'
|
127
174
|
|
128
175
|
# Extra configuration
|
129
|
-
hbase = HBase.new 'hbase.zookeeper.quorum'
|
130
|
-
'hbase.client.retries.number'
|
176
|
+
hbase = HBase.new 'hbase.zookeeper.quorum' => 'remote-server.mydomain.net',
|
177
|
+
'hbase.client.retries.number' => 3,
|
131
178
|
'hbase.client.scanner.caching' => 1000,
|
132
|
-
'hbase.rpc.timeout'
|
179
|
+
'hbase.rpc.timeout' => 120000
|
133
180
|
|
134
181
|
# Close HBase connection
|
135
182
|
hbase.close
|
@@ -154,132 +201,190 @@ table = hbase[:test_table]
|
|
154
201
|
table.drop! if table.exists?
|
155
202
|
|
156
203
|
# Create table with two column families
|
157
|
-
table.create! :
|
158
|
-
:
|
204
|
+
table.create! cf1: {},
|
205
|
+
cf2: { compression: :snappy, bloomfilter: :row }
|
159
206
|
```
|
160
207
|
|
161
208
|
## Basic operations
|
162
209
|
|
210
|
+
### Defining table schema for easier data access
|
211
|
+
|
212
|
+
HBase stores everything as plain Java byte arrays. So it's completely up to
|
213
|
+
users to encode and decode column values of various types into and from byte
|
214
|
+
arrays, which is quite a tedious and error-prone task.
|
215
|
+
|
216
|
+
To remedy this situation, `hbase-jruby` implements the concept of table schema.
|
217
|
+
|
218
|
+
Using table schema greatly simplifies the way you access data:
|
219
|
+
- With schema, byte array conversion becomes automatic
|
220
|
+
- It allows you to omit column family names (e.g. `:title` instead of `"cf1:title"`)
|
221
|
+
|
222
|
+
We'll use the following schema throughout the examples.
|
223
|
+
|
224
|
+
```ruby
|
225
|
+
hbase.schema = {
|
226
|
+
# Schema for `book` table
|
227
|
+
book: {
|
228
|
+
# Columns in cf1 family
|
229
|
+
cf1: {
|
230
|
+
title: :string,
|
231
|
+
author: :string,
|
232
|
+
category: :string,
|
233
|
+
year: :short,
|
234
|
+
pages: :fixnum,
|
235
|
+
price: :bigdecimal,
|
236
|
+
weight: :float,
|
237
|
+
in_print: :boolean
|
238
|
+
},
|
239
|
+
# Columns in cf2 family
|
240
|
+
cf2: {
|
241
|
+
summary: :string,
|
242
|
+
reviews: :fixnum,
|
243
|
+
stars: :fixnum,
|
244
|
+
/^comment\d+/ => :string
|
245
|
+
}
|
246
|
+
}
|
247
|
+
}
|
248
|
+
```
|
249
|
+
|
250
|
+
Columns that are not defined in the schema can be referenced
|
251
|
+
using `FAMILY:QUALIFIER` notation or 2-element Array of column family name (as Symbol) and qualifier,
|
252
|
+
however since there's no type information, they are returned as Java byte arrays,
|
253
|
+
which have to be decoded manually.
|
254
|
+
|
163
255
|
### PUT
|
164
256
|
|
165
257
|
```ruby
|
166
258
|
# Putting a single row
|
167
|
-
|
259
|
+
# - Row keys can be of any type, in this case, we use String type
|
260
|
+
table.put 'rowkey1', title: "Hello World", year: 2013
|
168
261
|
|
169
262
|
# Putting multiple rows
|
170
|
-
table.put 'rowkey1' => {
|
171
|
-
'rowkey2' => {
|
172
|
-
'rowkey3' => {
|
263
|
+
table.put 'rowkey1' => { title: 'foo', year: 2013 },
|
264
|
+
'rowkey2' => { title: "bar", year: 2014 },
|
265
|
+
'rowkey3' => { title: 'foobar', year: 2015 }
|
173
266
|
|
174
267
|
# Putting values with timestamps
|
175
268
|
table.put 'rowkey1' => {
|
176
|
-
|
177
|
-
1353143856665 => "Hello",
|
178
|
-
1352978648642 => "Goodbye"
|
179
|
-
|
269
|
+
title: {
|
270
|
+
1353143856665 => "Hello world",
|
271
|
+
1352978648642 => "Goodbye world"
|
272
|
+
},
|
273
|
+
year: 2013
|
180
274
|
}
|
181
275
|
```
|
182
276
|
|
183
277
|
### GET
|
184
278
|
|
185
|
-
HBase stores everything as a byte array, so when you fetch data from HBase,
|
186
|
-
you need to explicitly specify the type of each value stored.
|
187
|
-
|
188
279
|
```ruby
|
189
|
-
|
280
|
+
book = table.get('rowkey1')
|
190
281
|
|
191
282
|
# Rowkey
|
192
|
-
|
283
|
+
rowkey = book.rowkey # Rowkey as raw Java byte array
|
284
|
+
rowkey = book.rowkey :string # Rowkey as String
|
285
|
+
|
286
|
+
# Access columns in schema
|
287
|
+
title = book[:title]
|
288
|
+
author = book[:author]
|
289
|
+
year = book[:year]
|
193
290
|
|
194
|
-
#
|
195
|
-
|
291
|
+
# Convert to simple Hash
|
292
|
+
hash = book.to_h
|
196
293
|
|
197
|
-
#
|
198
|
-
|
199
|
-
col2 = row.fixnum 'cf1:col2'
|
200
|
-
col3 = row.bigdecimal 'cf1:col3'
|
201
|
-
col4 = row.float 'cf1:col4'
|
202
|
-
col5 = row.boolean 'cf1:col5'
|
203
|
-
col6 = row.symbol 'cf1:col6'
|
294
|
+
# Convert to Hash containing all versions of values indexed by their timestamps
|
295
|
+
all_hash = book.to_H
|
204
296
|
|
205
|
-
#
|
206
|
-
|
207
|
-
|
297
|
+
# Columns not defined in the schema are returned as Java byte arrays
|
298
|
+
# They need to be decoded manually
|
299
|
+
extra = HBase::Util.from_bytes(:bigdecimal, book['cf2:extra'])
|
300
|
+
# or, simply
|
301
|
+
extra = book.bigdecimal 'cf2:extra'
|
208
302
|
```
|
209
303
|
|
210
|
-
|
304
|
+
### Batch-GET
|
211
305
|
|
212
306
|
```ruby
|
213
307
|
# Pass an array of row keys as the parameter
|
214
|
-
|
308
|
+
books = table.get(['rowkey1', 'rowkey2', 'rowkey3'])
|
215
309
|
```
|
216
310
|
|
217
|
-
####
|
311
|
+
#### `to_h`
|
218
312
|
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
313
|
+
`to_h` and `to_H` return the Hash representation of the row.
|
314
|
+
(The latter returns all values with their timestamp)
|
315
|
+
|
316
|
+
If a column is defined in the schema, it is referenced using its qualifier in Symbol type.
|
317
|
+
If a column is not defined, it is represented as a 2-element Array
|
318
|
+
of column family in Symbol and column qualifier as ByteArray.
|
319
|
+
Even so, to make it easier to reference those columns, an extended version of
|
320
|
+
Hash is returned with which you can also reference them with `FAMILY:QUALIFIER`
|
321
|
+
notation or `[cf, cq]` array notation.
|
223
322
|
|
224
|
-
|
225
|
-
|
226
|
-
#
|
227
|
-
#
|
228
|
-
|
229
|
-
#
|
323
|
+
```ruby
|
324
|
+
table.put 1000 => {
|
325
|
+
title: 'Hello world', # Known column
|
326
|
+
comment100: 'foo', # Known column
|
327
|
+
'cf2:extra' => 'bar', # Unknown column
|
328
|
+
[:cf2, 10] => 'foobar' # Unknown column, non-string qualifier
|
329
|
+
}
|
230
330
|
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
331
|
+
book = table.get 1000
|
332
|
+
hash = book.to_h
|
333
|
+
# {
|
334
|
+
# :title => "Hello world",
|
335
|
+
# [:cf2, HBase::ByteArray<0, 0, 0, 0, 0, 0, 0, 10>] =>
|
336
|
+
# byte[102, 111, 111, 98, 97, 114]@6f28bb44,
|
337
|
+
# :comment100 => "foo",
|
338
|
+
# [:cf2, HBase::ByteArray<101, 120, 116, 114, 97>] =>
|
339
|
+
# byte[98, 97, 114]@77190cfc
|
340
|
+
# }
|
341
|
+
|
342
|
+
hash['cf2:extra']
|
343
|
+
# byte[98, 97, 114]@77190cfc
|
344
|
+
|
345
|
+
hash[%w[cf2 extra]]
|
346
|
+
# byte[98, 97, 114]@77190cfc
|
347
|
+
|
348
|
+
hash[[:cf2, HBase::ByteArray['extra']]]
|
349
|
+
# byte[98, 97, 114]@77190cfc
|
350
|
+
|
351
|
+
hash['cf2:extra'].to_s
|
352
|
+
# 'bar'
|
353
|
+
|
354
|
+
# Columns with non-string qualifiers must be referenced using 2-element Array notation
|
355
|
+
hash['cf2:10']
|
356
|
+
# nil
|
357
|
+
hash[[:cf2, 10]]
|
358
|
+
# byte[102, 111, 111, 98, 97, 114]@6f28bb44
|
359
|
+
|
360
|
+
hash_with_versions = book.to_H
|
361
|
+
# {
|
362
|
+
# :title => {1369019227766 => "Hello world"},
|
363
|
+
# [:cf2, HBase::ByteArray<0, 0, 0, 0, 0, 0, 0, 10>] =>
|
364
|
+
# {1369019227766 => byte[102, 111, 111, 98, 97, 114]@6f28bb44},
|
365
|
+
# :comment100 => {1369019227766 => "foo"},
|
366
|
+
# [:cf2, HBase::ByteArray<101, 120, 116, 114, 97>] =>
|
367
|
+
# {1369019227766 => byte[98, 97, 114]@77190cfc}
|
368
|
+
# }
|
239
369
|
```
|
240
370
|
|
241
371
|
#### Intra-row scan
|
242
372
|
|
243
|
-
Intra-row scan can be done
|
373
|
+
Intra-row scan can be done using `each` method which yields `HBase::Cell` instances.
|
244
374
|
|
245
375
|
```ruby
|
246
376
|
# Intra-row scan (all versions)
|
247
377
|
row.each do |cell|
|
248
378
|
family = cell.family
|
249
|
-
qualifier = cell.qualifier
|
379
|
+
qualifier = cell.qualifier :string # Column qualifier as String
|
250
380
|
timestamp = cell.timestamp
|
251
|
-
|
252
|
-
# Cell value as Java byte array
|
253
|
-
bytes = cell.raw
|
254
|
-
|
255
|
-
# Typed access
|
256
|
-
# value_as_string = cell.string
|
257
|
-
# value_as_fixnum = cell.fixnum
|
258
|
-
# ...
|
381
|
+
value = cell.value
|
259
382
|
end
|
260
383
|
|
261
384
|
# Array of HBase::Cells
|
262
385
|
cells = row.to_a
|
263
386
|
```
|
264
387
|
|
265
|
-
#### `to_hash`
|
266
|
-
|
267
|
-
```ruby
|
268
|
-
# Returns the Hash representation of the record with the specified schema
|
269
|
-
schema = {
|
270
|
-
'cf1:col1' => :string,
|
271
|
-
'cf1:col2' => :fixnum,
|
272
|
-
'cf1:col3' => :bigdecimal,
|
273
|
-
'cf1:col4' => :float,
|
274
|
-
'cf1:col5' => :boolean,
|
275
|
-
'cf1:col6' => :symbol }
|
276
|
-
|
277
|
-
table.get('rowkey1').to_hash(schema)
|
278
|
-
|
279
|
-
# Returns all versions for each column indexed by their timestamps
|
280
|
-
table.get('rowkey1').to_hash_with_versions(schema)
|
281
|
-
```
|
282
|
-
|
283
388
|
### DELETE
|
284
389
|
|
285
390
|
```ruby
|
@@ -287,23 +392,23 @@ table.get('rowkey1').to_hash_with_versions(schema)
|
|
287
392
|
table.delete('rowkey1')
|
288
393
|
|
289
394
|
# Deletes all columns in the specified column family
|
290
|
-
table.delete('rowkey1',
|
395
|
+
table.delete('rowkey1', :cf1)
|
291
396
|
|
292
397
|
# Deletes a column
|
293
|
-
table.delete('rowkey1',
|
398
|
+
table.delete('rowkey1', :author)
|
294
399
|
|
295
400
|
# Deletes a column with empty qualifier.
|
296
401
|
# (!= deleting all the columns in the family. See the trailing colon.)
|
297
402
|
table.delete('rowkey1', 'cf1:')
|
298
403
|
|
299
404
|
# Deletes a version of a column
|
300
|
-
table.delete('rowkey1',
|
405
|
+
table.delete('rowkey1', :author, 1352978648642)
|
301
406
|
|
302
407
|
# Deletes multiple versions of a column
|
303
|
-
table.delete('rowkey1',
|
408
|
+
table.delete('rowkey1', :author, 1352978648642, 1352978649642)
|
304
409
|
|
305
410
|
# Batch delete
|
306
|
-
table.delete(['rowkey1'], ['rowkey2'], ['rowkey3',
|
411
|
+
table.delete(['rowkey1'], ['rowkey2'], ['rowkey3', :author, 1352978648642, 1352978649642])
|
307
412
|
```
|
308
413
|
|
309
414
|
However, the last syntax seems a bit unwieldy when you just wish to delete a few rows.
|
@@ -318,11 +423,16 @@ table.delete_row 'rowkey1', 'rowkey2', 'rowkey3'
|
|
318
423
|
### Atomic increment of column values
|
319
424
|
|
320
425
|
```ruby
|
321
|
-
# Atomically increase
|
322
|
-
table.increment('rowkey1',
|
426
|
+
# Atomically increase cf2:reviews by one
|
427
|
+
table.increment('rowkey1', reviews: 1)
|
323
428
|
|
324
|
-
# Atomically increase two columns by one and
|
325
|
-
table.increment('rowkey1',
|
429
|
+
# Atomically increase two columns by one and five respectively
|
430
|
+
table.increment('rowkey1', reviews: 1, stars: 5)
|
431
|
+
|
432
|
+
# Increase column values of multiple rows.
|
433
|
+
# - Atomicity is only guaranteed within each row.
|
434
|
+
table.increment 'rowkey1' => { reviews: 1, stars: 5 },
|
435
|
+
'rowkey2' => { reviews: 1, stars: 3 }
|
326
436
|
```
|
327
437
|
|
328
438
|
### SCAN
|
@@ -332,10 +442,11 @@ table.increment('rowkey1', 'cf1:counter' => 1, 'cf1:counter2' => 2)
|
|
332
442
|
```ruby
|
333
443
|
# Full scan
|
334
444
|
table.each do |row|
|
335
|
-
|
336
|
-
name = row.string('cf:name')
|
337
|
-
# ...
|
445
|
+
p row.to_h
|
338
446
|
end
|
447
|
+
|
448
|
+
# Returns Enumerator when block is not given
|
449
|
+
table.each.with_index.each_slice(10).to_a
|
339
450
|
```
|
340
451
|
|
341
452
|
## Scoped access
|
@@ -374,13 +485,13 @@ you can retrieve data with the following methods.
|
|
374
485
|
import org.apache.hadoop.hbase.filter.RandomRowFilter
|
375
486
|
|
376
487
|
table.range('A'..'Z'). # Row key range,
|
377
|
-
project(
|
488
|
+
project(:author). # Select cf1:author column
|
378
489
|
project('cf2'). # Select cf2 family as well
|
379
|
-
filter(
|
380
|
-
filter(
|
381
|
-
filter(
|
490
|
+
filter(category: 'Comics'). # Filter by cf1:category value
|
491
|
+
filter(year: [1990, 2000, 2010]). # Set-inclusion condition on cf1:year
|
492
|
+
filter(weight: 2.0..4.0). # Range filter on cf1:weight
|
382
493
|
filter(RandomRowFilter.new(0.5)). # Any Java HBase filter
|
383
|
-
while(
|
494
|
+
while(reviews: { gt: 20 }). # Early termination of scan
|
384
495
|
time_range(Time.now - 600, Time.now). # Scan data of the last 10 minutes
|
385
496
|
limit(10). # Limits the size of the result set
|
386
497
|
versions(2). # Only fetches 2 versions for each value
|
@@ -389,7 +500,7 @@ table.range('A'..'Z'). # Row key range,
|
|
389
500
|
with_java_scan { |scan| # Directly access Java Scan object
|
390
501
|
scan.setCacheBlocks false
|
391
502
|
}.
|
392
|
-
to_a # To Array
|
503
|
+
to_a # To Array of HBase::Rows
|
393
504
|
```
|
394
505
|
|
395
506
|
### *range*
|
@@ -420,15 +531,15 @@ Optionally, prefix filter can be applied as follows.
|
|
420
531
|
# Row keys with "APPLE" prefix
|
421
532
|
# Start key is automatically set to "APPLE",
|
422
533
|
# stop key "APPLF" to avoid unnecessary disk access
|
423
|
-
table.range(:
|
534
|
+
table.range(prefix: 'APPLE')
|
424
535
|
|
425
536
|
# Row keys with "ACE", "BLUE" or "APPLE" prefix
|
426
537
|
# Start key is automatically set to "ACE",
|
427
538
|
# stop key "BLUF"
|
428
|
-
table.range(:
|
539
|
+
table.range(prefix: ['ACE', 'BLUE', 'APPLE'])
|
429
540
|
|
430
541
|
# Prefix filter with start key and stop key.
|
431
|
-
table.range('ACE', 'BLUEMARINE', :
|
542
|
+
table.range('ACE', 'BLUEMARINE', prefix: ['ACE', 'BLUE', 'APPLE'])
|
432
543
|
```
|
433
544
|
|
434
545
|
Subsequent calls to `#range` override the range previously defined.
|
@@ -437,7 +548,7 @@ Subsequent calls to `#range` override the range previously defined.
|
|
437
548
|
# Previous ranges are discarded
|
438
549
|
scope.range(1, 100).
|
439
550
|
range(50..100).
|
440
|
-
range(:
|
551
|
+
range(prefix: 'A').
|
441
552
|
range(1, 1000)
|
442
553
|
# Same as `scope.range(1, 1000)`
|
443
554
|
```
|
@@ -451,27 +562,24 @@ Multiple calls have conjunctive effects.
|
|
451
562
|
# Range scanning the table with filters
|
452
563
|
table.range(nil, 1000).
|
453
564
|
filter(
|
454
|
-
#
|
455
|
-
|
456
|
-
'cf1:b' => 1024,
|
565
|
+
# Equality match
|
566
|
+
year: 2013,
|
457
567
|
|
458
568
|
# Range of numbers or characters: Checks if the value falls within the range
|
459
|
-
|
460
|
-
|
569
|
+
weight: 2.0..4.0,
|
570
|
+
author: 'A'..'C',
|
461
571
|
|
462
572
|
# Regular expression: Checks if the value matches the regular expression
|
463
|
-
|
573
|
+
summary: /classic$/i,
|
464
574
|
|
465
575
|
# Hash: Tests the value with 6 types of operators (:gt, :lt, :gte, :lte, :eq, :ne)
|
466
|
-
|
467
|
-
'cf1:g' => { ne: 1000 },
|
576
|
+
reviews: { gt: 100, lte: 200 },
|
468
577
|
|
469
578
|
# Array of the aforementioned types: OR condition (disjunctive)
|
470
|
-
|
471
|
-
'cf1:i' => ['A'...'B', 'C', /^D/, { lt: 'F' }]).
|
579
|
+
category: ['Fiction', 'Comic', /science/i, { ne: 'Political Science' }]).
|
472
580
|
|
473
581
|
# Multiple calls for conjunctive filtering
|
474
|
-
filter(
|
582
|
+
filter(summary: /instant/i).
|
475
583
|
|
476
584
|
# Any number of Java filters can be applied
|
477
585
|
filter(org.apache.hadoop.hbase.filter.RandomRowFilter.new(0.5)).
|
@@ -489,12 +597,12 @@ See the following example.
|
|
489
597
|
|
490
598
|
```ruby
|
491
599
|
(0...30).each do |idx|
|
492
|
-
table.put idx,
|
600
|
+
table.put idx, year: 2000 + idx % 10
|
493
601
|
end
|
494
602
|
|
495
|
-
table.filter(
|
603
|
+
table.filter(year: { lte: 2001 }).map { |r| r.rowkey :fixnum }
|
496
604
|
# [0, 1, 10, 11, 20, 21]
|
497
|
-
table.while(
|
605
|
+
table.while(year: { lte: 2001 }).map { |r| r.rowkey :fixnum }
|
498
606
|
# [0, 1]
|
499
607
|
# Scan terminates immediately when condition not met.
|
500
608
|
```
|
@@ -505,9 +613,9 @@ table.while('cf1:a' => { lte: 1 }).map { |r| r.rowkey :fixnum }
|
|
505
613
|
Multiple calls have additive effects.
|
506
614
|
|
507
615
|
```ruby
|
508
|
-
# Fetches cf1:
|
509
|
-
scoped.project(
|
510
|
-
project(
|
616
|
+
# Fetches cf1:title, cf1:author, and all columns in column family cf2 and cf3
|
617
|
+
scoped.project(:title, :author, :cf2).
|
618
|
+
project(:cf3)
|
511
619
|
```
|
512
620
|
|
513
621
|
HBase filters can not only filter rows but also columns.
|
@@ -519,17 +627,17 @@ to pass column filter to filter method.
|
|
519
627
|
```ruby
|
520
628
|
# Column prefix filter:
|
521
629
|
# Fetch columns whose qualifiers start with the specified prefixes
|
522
|
-
scoped.project(:
|
523
|
-
project(:
|
630
|
+
scoped.project(prefix: 'alice').
|
631
|
+
project(prefix: %w[alice bob])
|
524
632
|
|
525
633
|
# Column range filter:
|
526
634
|
# Fetch columns whose qualifiers are within the ranges
|
527
|
-
scoped.project(:
|
528
|
-
project(:
|
635
|
+
scoped.project(range: 'a'...'c').
|
636
|
+
project(range: ['i'...'k', 'x'...'z'])
|
529
637
|
|
530
638
|
# Column pagination filter:
|
531
639
|
# Fetch columns within the specified intra-scan offset and limit
|
532
|
-
scoped.project(:
|
640
|
+
scoped.project(offset: 1000, limit: 10)
|
533
641
|
```
|
534
642
|
|
535
643
|
When using column filters on *fat* rows with many columns,
|
@@ -540,7 +648,7 @@ However setting batch size allows multiple rows with the same row key are return
|
|
540
648
|
```ruby
|
541
649
|
# Let's say that we have rows with more than 10 columns whose qualifiers start with `str`
|
542
650
|
puts scoped.range(1..100).
|
543
|
-
project(:
|
651
|
+
project(prefix: 'str').
|
544
652
|
batch(10).
|
545
653
|
map { |row| [row.rowkey(:fixnum), row.count].map(&:to_s).join ': ' }
|
546
654
|
|
@@ -556,12 +664,10 @@ puts scoped.range(1..100).
|
|
556
664
|
### Scoped SCAN / GET
|
557
665
|
|
558
666
|
```ruby
|
559
|
-
scoped = table.versions(1)
|
560
|
-
|
561
|
-
|
562
|
-
|
563
|
-
range('rowkey0'..'rowkey2') # Range of rowkeys.
|
564
|
-
project('cf1', 'cf2:x') # Projection
|
667
|
+
scoped = table.versions(1) # Limits the number of versions
|
668
|
+
.filter(year: 1990...2000)
|
669
|
+
.range('rowkey0'..'rowkey2') # Range of rowkeys.
|
670
|
+
.project('cf1', 'cf2:x') # Projection
|
565
671
|
|
566
672
|
# Scoped GET
|
567
673
|
# Nonexistent or filtered rows are returned as nils
|
@@ -603,22 +709,22 @@ of the projected columns.
|
|
603
709
|
|
604
710
|
```ruby
|
605
711
|
# cf1:a must hold 8-byte integer values
|
606
|
-
table.project(
|
607
|
-
table.project(
|
608
|
-
table.project(
|
609
|
-
table.project(
|
610
|
-
table.project(
|
611
|
-
table.project(
|
712
|
+
table.project(:reviews).aggregate(:sum)
|
713
|
+
table.project(:reviews).aggregate(:avg)
|
714
|
+
table.project(:reviews).aggregate(:min)
|
715
|
+
table.project(:reviews).aggregate(:max)
|
716
|
+
table.project(:reviews).aggregate(:std)
|
717
|
+
table.project(:reviews).aggregate(:row_count)
|
612
718
|
|
613
719
|
# Aggregation of multiple columns
|
614
|
-
table.project(
|
720
|
+
table.project(:reviews, :stars).aggregate(:sum)
|
615
721
|
```
|
616
722
|
|
617
723
|
By default, aggregate method assumes that the projected values are 8-byte integers.
|
618
724
|
For other data types, you can pass your own ColumnInterpreter.
|
619
725
|
|
620
726
|
```ruby
|
621
|
-
table.project(
|
727
|
+
table.project(:price).aggregate(:sum, MyColumnInterpreter.new)
|
622
728
|
```
|
623
729
|
|
624
730
|
## Table inspection
|
@@ -691,8 +797,7 @@ With `regions` method, you can even presplit the new table just like the old one
|
|
691
797
|
```ruby
|
692
798
|
hbase[:dupe_table].create!(
|
693
799
|
table.raw_families,
|
694
|
-
table.raw_properties.merge(
|
695
|
-
:splits => table.regions.map { |r| r[:start_key] }.compact))
|
800
|
+
table.raw_properties.merge(splits: table.regions.map { |r| r[:start_key] }.compact))
|
696
801
|
```
|
697
802
|
|
698
803
|
## Table administration
|
@@ -709,21 +814,22 @@ and come with non-bang, asynchronous counterparts.
|
|
709
814
|
table.create!(
|
710
815
|
# 1st Hash: Column family specification
|
711
816
|
{
|
712
|
-
:
|
713
|
-
:
|
817
|
+
cf1: { compression: :snappy },
|
818
|
+
cf2: { bloomfilter: :row }
|
714
819
|
},
|
715
820
|
|
716
821
|
# 2nd Hash: Table properties
|
717
|
-
:
|
718
|
-
:
|
719
|
-
:
|
822
|
+
max_filesize: 256 * 1024 ** 2,
|
823
|
+
deferred_log_flush: false,
|
824
|
+
splits: [1000, 2000, 3000]
|
825
|
+
)
|
720
826
|
|
721
827
|
# Alter table properties (synchronous with optional block)
|
722
828
|
table.alter!(
|
723
|
-
:
|
724
|
-
:
|
725
|
-
:
|
726
|
-
:
|
829
|
+
max_filesize: 512 * 1024 ** 2,
|
830
|
+
memstore_flushsize: 64 * 1024 ** 2,
|
831
|
+
readonly: false,
|
832
|
+
deferred_log_flush: true
|
727
833
|
) { |progress, total|
|
728
834
|
# Progress report with an optional block
|
729
835
|
puts [progress, total].join('/')
|
@@ -731,10 +837,10 @@ table.alter!(
|
|
731
837
|
|
732
838
|
# Alter table properties (asynchronous)
|
733
839
|
table.alter(
|
734
|
-
:
|
735
|
-
:
|
736
|
-
:
|
737
|
-
:
|
840
|
+
max_filesize: 512 * 1024 ** 2,
|
841
|
+
memstore_flushsize: 64 * 1024 ** 2,
|
842
|
+
readonly: false,
|
843
|
+
deferred_log_flush: true
|
738
844
|
)
|
739
845
|
```
|
740
846
|
|
@@ -780,11 +886,10 @@ http://hbase.apache.org/apidocs/org/apache/hadoop/hbase/HTableDescriptor.html
|
|
780
886
|
|
781
887
|
```ruby
|
782
888
|
# Add column family
|
783
|
-
table.add_family! :cf3, :
|
784
|
-
:bloomfilter => :row
|
889
|
+
table.add_family! :cf3, compression: :snappy, bloomfilter: :row
|
785
890
|
|
786
891
|
# Alter column family
|
787
|
-
table.alter_family! :cf2, :
|
892
|
+
table.alter_family! :cf2, bloomfilter: :rowcol
|
788
893
|
|
789
894
|
# Remove column family
|
790
895
|
table.delete_family! :cf1
|
@@ -797,8 +902,7 @@ table.delete_family! :cf1
|
|
797
902
|
unless table.has_coprocessor?(cp_class_name1)
|
798
903
|
table.add_coprocessor! cp_class_name1
|
799
904
|
end
|
800
|
-
table.add_coprocessor! cp_class_name2,
|
801
|
-
:path => path, :priority => priority, :params => params
|
905
|
+
table.add_coprocessor! cp_class_name2, path: path, priority: priority, params: params
|
802
906
|
|
803
907
|
# Remove coprocessor
|
804
908
|
table.remove_coprocessor! cp_class_name1
|
@@ -868,17 +972,14 @@ table.range('1'..'3').map { |r| r.rowkey :string }
|
|
868
972
|
|
869
973
|
### Non-string column qualifier
|
870
974
|
|
871
|
-
If a column qualifier is not a String,
|
872
|
-
instead of a conventional `FAMILY:QUALIFIER` String.
|
975
|
+
If a column qualifier is not a String, a 2-element Array should be used.
|
873
976
|
|
874
977
|
```ruby
|
875
978
|
table.put 'rowkey',
|
876
|
-
|
877
|
-
|
878
|
-
HBase::ColumnKey(:cf1, bytes) => "Qualifier is an arbitrary byte array"
|
979
|
+
[:cf1, 100 ] => "Byte representation of an 8-byte integer",
|
980
|
+
[:cf1, bytes] => "Qualifier is an arbitrary byte array"
|
879
981
|
|
880
|
-
table.get('rowkey')
|
881
|
-
table.get('rowkey').string(HBase::ColumnKey(:cf1, 100))
|
982
|
+
table.get('rowkey')[:cf1, 100]
|
882
983
|
# ...
|
883
984
|
```
|
884
985
|
|
@@ -895,12 +996,7 @@ table.put({ int: 12345 }, 'cf1:a' => { byte: 100 }, # 1-byte integer
|
|
895
996
|
'cf1:c' => { int: 300 }, # 4-byte integer
|
896
997
|
'cf1:4' => 400) # Ordinary 8-byte integer
|
897
998
|
|
898
|
-
|
899
|
-
|
900
|
-
result.byte('cf1:a') # 100
|
901
|
-
result.short('cf1:b') # 200
|
902
|
-
result.int('cf1:c') # 300
|
903
|
-
# ...
|
999
|
+
row = table.get(int: 12345)
|
904
1000
|
```
|
905
1001
|
|
906
1002
|
### Working with byte arrays
|
@@ -919,7 +1015,7 @@ which makes byte array manipulation much easier.
|
|
919
1015
|
A ByteArray can be created as a concatenation of any number of objects.
|
920
1016
|
|
921
1017
|
```ruby
|
922
|
-
ba = HBase::ByteArray
|
1018
|
+
ba = HBase::ByteArray[100, 3.14, {int: 300}, "Hello World"]
|
923
1019
|
```
|
924
1020
|
|
925
1021
|
Then you can slice it and decode each part,
|
@@ -943,7 +1039,7 @@ ba << { short: 300 }
|
|
943
1039
|
concatenate another ByteArray,
|
944
1040
|
|
945
1041
|
```ruby
|
946
|
-
ba += HBase::ByteArray
|
1042
|
+
ba += HBase::ByteArray[1024]
|
947
1043
|
```
|
948
1044
|
|
949
1045
|
or shift decoded objects from it.
|