purecdb 0.1.0 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +3 -3
- data/lib/purecdb.rb +10 -1
- data/lib/purecdb/base.rb +8 -25
- data/lib/purecdb/reader.rb +22 -13
- data/lib/purecdb/version.rb +1 -2
- data/lib/purecdb/writer.rb +78 -33
- metadata +2 -2
data/README.md
CHANGED
@@ -45,7 +45,7 @@ To create a 32 bit (standard) CDB file:
|
|
45
45
|
To instead create a 64 bit file, pass {mode: 64} as the second argument to PureCDB::Writer#open.
|
46
46
|
|
47
47
|
|
48
|
-
To read a
|
48
|
+
To read a CDB file (auto-detecting standard 32-bit or extended 64-bit):
|
49
49
|
|
50
50
|
```ruby
|
51
51
|
PureCDB::Reader.open("/tmp/somecdbfile.cdb") do |r|
|
@@ -53,8 +53,8 @@ To read a 32 bit (standard) CDB file:
|
|
53
53
|
end
|
54
54
|
```
|
55
55
|
|
56
|
-
To
|
57
|
-
|
56
|
+
To require a 32 or 64 bit file specifically, pass {mode: 32} or {mode: 64} as
|
57
|
+
the second argument to PureCDB::Reader#open.
|
58
58
|
|
59
59
|
See PureCDB::Reader#new for additional usage.
|
60
60
|
|
data/lib/purecdb.rb
CHANGED
@@ -3,6 +3,15 @@ require "purecdb/base"
|
|
3
3
|
require "purecdb/reader"
|
4
4
|
require "purecdb/writer"
|
5
5
|
|
6
|
-
# :nodoc:
|
7
6
|
module PureCDB
|
7
|
+
|
8
|
+
# Convenience alternative to PureCDB::Reader.new(target, *options, &block)
|
9
|
+
def self.reader(target,*options, &block)
|
10
|
+
PureCDB::Reader.new(target, *options, &block)
|
11
|
+
end
|
12
|
+
|
13
|
+
# Convenience alternative to PureCDB::Writer.new(target, *options, &block)
|
14
|
+
def self.writer(target,*options, &block)
|
15
|
+
PureCDB::Writer.new(target, *options, &block)
|
16
|
+
end
|
8
17
|
end
|
data/lib/purecdb/base.rb
CHANGED
@@ -4,6 +4,9 @@ module PureCDB
|
|
4
4
|
# Base class with shared functionality for PureCDB::Reader and
|
5
5
|
# PureCDB::Writer that abstracts away 32 bit vs. 64 bit format
|
6
6
|
# details. You should not need to use this directly.
|
7
|
+
#
|
8
|
+
# Changing the constants defined here is likely to break all
|
9
|
+
# kinds of stuff.
|
7
10
|
#
|
8
11
|
class Base
|
9
12
|
# The CDB format contains 256 separate hashes by default.
|
@@ -17,9 +20,6 @@ module PureCDB
|
|
17
20
|
# 64 bit format this is multiplied by 2.
|
18
21
|
DEFAULT_HASHPTR_SIZE = 4
|
19
22
|
|
20
|
-
# Number of bytes that will be buffered
|
21
|
-
BUFFER_SIZE = 4096
|
22
|
-
|
23
23
|
# Magic cookie used to indicate that this is a 64 bit (non-standard) CDB file
|
24
24
|
# rather than a 32-bit CDB file.
|
25
25
|
CDB64_MAGIC = "cdb64:01"
|
@@ -53,12 +53,15 @@ module PureCDB
|
|
53
53
|
def set_mode mode
|
54
54
|
@mode = mode
|
55
55
|
@num_hashes = DEFAULT_NUM_HASHES
|
56
|
+
|
56
57
|
if @mode == 64
|
57
58
|
@length_size = DEFAULT_LENGTH_SIZE * 2
|
58
59
|
@hashptr_size = DEFAULT_HASHPTR_SIZE * 2
|
60
|
+
@format = "Q<"
|
59
61
|
else
|
60
62
|
@length_size = DEFAULT_LENGTH_SIZE
|
61
63
|
@hashptr_size = DEFAULT_HASHPTR_SIZE
|
64
|
+
@format = "V"
|
62
65
|
end
|
63
66
|
end
|
64
67
|
|
@@ -74,9 +77,6 @@ module PureCDB
|
|
74
77
|
end
|
75
78
|
end
|
76
79
|
|
77
|
-
# Used to speed up 64bit pack/unpack
|
78
|
-
@little_endian = [123456789].pack("L") == [123456789].pack("V")
|
79
|
-
|
80
80
|
if mode == :detect
|
81
81
|
@mode = :detect
|
82
82
|
else
|
@@ -112,29 +112,12 @@ module PureCDB
|
|
112
112
|
|
113
113
|
private
|
114
114
|
|
115
|
-
# Due to Array#pack's lack of a little/big endian specific 64 bit operator
|
116
115
|
def ary_pack(ary)
|
117
|
-
|
118
|
-
ary.pack("V*")
|
119
|
-
elsif @little_endian
|
120
|
-
ary.pack("Q*")
|
121
|
-
else
|
122
|
-
ary.collect {|a| [a & 0xffffffff, (a >> 32) & 0xffffffff] }.flatten.pack("V*")
|
123
|
-
end
|
116
|
+
ary.pack("#{@format}*")
|
124
117
|
end
|
125
118
|
|
126
|
-
# Due to String#unpack's lack of a little/big endian specific 64 bit operator
|
127
119
|
def ary_unpack(data, num)
|
128
|
-
|
129
|
-
data.unpack("V#{num}")
|
130
|
-
elsif @little_endian
|
131
|
-
data.unpack("Q#{num}")
|
132
|
-
else
|
133
|
-
ret = []
|
134
|
-
data = data.unpack("V#{num*2}")
|
135
|
-
data.each_slice(2) {|a| ret << (a[0] + (a[1] << 32)) }
|
136
|
-
ret
|
137
|
-
end
|
120
|
+
data.unpack("#{@format}#{num}")
|
138
121
|
end
|
139
122
|
end
|
140
123
|
|
data/lib/purecdb/reader.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
module PureCDB
|
2
2
|
|
3
3
|
#
|
4
|
-
# Read 32 bit or
|
4
|
+
# Read 32 bit or 64 bit CDB files.
|
5
5
|
#
|
6
6
|
class Reader < Base
|
7
7
|
include Enumerable
|
@@ -41,7 +41,6 @@ module PureCDB
|
|
41
41
|
raise "#{mode}bit mode detected in file; options request #{@mode}bit mode"
|
42
42
|
end
|
43
43
|
|
44
|
-
# FIXME: It seems like there are bugs triggered if mmap fails
|
45
44
|
@m = Mmap.new(target,"r", Mmap::MAP_SHARED) rescue nil
|
46
45
|
read_hashes
|
47
46
|
|
@@ -55,12 +54,15 @@ module PureCDB
|
|
55
54
|
end
|
56
55
|
|
57
56
|
#
|
58
|
-
#
|
57
|
+
# Alternative to PureCDB::Reader.new(target,options) ..
|
59
58
|
#
|
60
59
|
def self.open(target, *options, &block)
|
61
60
|
Reader.new(target, *options, &block)
|
62
61
|
end
|
63
62
|
|
63
|
+
#
|
64
|
+
# Close the CDB file
|
65
|
+
#
|
64
66
|
def close
|
65
67
|
@io.close if @io
|
66
68
|
@m.unmap if @m
|
@@ -68,6 +70,11 @@ module PureCDB
|
|
68
70
|
@io = nil
|
69
71
|
end
|
70
72
|
|
73
|
+
# Iterate over all key/value pairs in the order they occur in the file.
|
74
|
+
# This is *not* sorted or insertion order.
|
75
|
+
#
|
76
|
+
# +each+ will yield each key,value pair separately even when a key is
|
77
|
+
# duplicate.
|
71
78
|
def each
|
72
79
|
pos = hash_size
|
73
80
|
hoff0 = @hashes[0]
|
@@ -78,16 +85,7 @@ module PureCDB
|
|
78
85
|
end
|
79
86
|
end
|
80
87
|
|
81
|
-
|
82
|
-
keylen, datalen = read_header(pos)
|
83
|
-
return nil,nil if !keylen
|
84
|
-
pos += hashref_size
|
85
|
-
rkey = read(pos .. pos + keylen - 1)
|
86
|
-
pos += keylen
|
87
|
-
value = read(pos .. pos + datalen - 1)
|
88
|
-
return rkey, value
|
89
|
-
end
|
90
|
-
|
88
|
+
# Returns all values for +key+ in an array
|
91
89
|
def values(key)
|
92
90
|
h = hash(key)
|
93
91
|
|
@@ -121,6 +119,17 @@ module PureCDB
|
|
121
119
|
|
122
120
|
private
|
123
121
|
|
122
|
+
def read_entry(pos)
|
123
|
+
keylen, datalen = read_header(pos)
|
124
|
+
return nil,nil if !keylen
|
125
|
+
pos += hashref_size
|
126
|
+
rkey = read(pos .. pos + keylen - 1)
|
127
|
+
pos += keylen
|
128
|
+
value = read(pos .. pos + datalen - 1)
|
129
|
+
return rkey, value
|
130
|
+
end
|
131
|
+
|
132
|
+
|
124
133
|
# Warning: This will be very slow if not mmap'd
|
125
134
|
def read r
|
126
135
|
@m = nil
|
data/lib/purecdb/version.rb
CHANGED
data/lib/purecdb/writer.rb
CHANGED
@@ -1,15 +1,58 @@
|
|
1
1
|
module PureCDB
|
2
|
+
#
|
3
|
+
# Write 32 or 64 bit CDB files
|
4
|
+
#
|
5
|
+
# == Memory considerations
|
6
|
+
#
|
7
|
+
# While the entry is written to the target object immediately on calling #store, the actual
|
8
|
+
# hash tables can not be written until the full dataset is ready. You must therefore be able
|
9
|
+
# to hold the hash of each key (including duplicates) and the position in the file the full
|
10
|
+
# entry is stored at in memory while building the CDB file.
|
11
|
+
#
|
12
|
+
# It would be possible to write this to a temporary file at the cost of performance, but the
|
13
|
+
# current implementation does not do this.
|
14
|
+
#
|
15
|
+
# As a compromise, the current implementation stores the hashes and positions as a BER encoded
|
16
|
+
# string per hash bucket until it is ready to write it to disk.
|
17
|
+
#
|
2
18
|
class Writer < Base
|
3
|
-
#
|
4
|
-
#
|
5
|
-
# records
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
19
|
+
# How full any given hash table is allowed to get, as a float between 0 and 1.
|
20
|
+
#
|
21
|
+
# Needs to be <= 1. The lower it is, the fewer records will collide. The closer to 1 it is,
|
22
|
+
# the more frequently the reader may have to engage in potentially lengthy (worst case
|
23
|
+
# scanning all the records) probing to find the right entry
|
24
|
+
attr_accessor :hash_fill_factor
|
25
|
+
|
26
|
+
# Open a CDB file for writing, or prepare an IO-like object for writing.
|
27
|
+
#
|
28
|
+
# :call-seq:
|
29
|
+
# w = PureCDB::Writer.new(target)
|
30
|
+
# w = PureCDB::Writer.new(target, *options)
|
31
|
+
# PureCDB::Writer.new(target) {|w| ... }
|
32
|
+
# PureCDB::Writer.new(target, *options) {|w| ... }
|
33
|
+
#
|
34
|
+
# If +:mode+ is passed in +options+, it must be the integers 32 or 64, indicating whether
|
35
|
+
# you wish to write a standard (32 bit) CDB file, or a 64 bit CDB-like file. The default
|
36
|
+
# is 32.
|
37
|
+
#
|
38
|
+
# If +target+ is a +String+ it is treated as a filename of a file to be opened to write to.
|
39
|
+
# Otherwise +target+ is assumed to be an IO-like object that ideally responds to #sysseek
|
40
|
+
# and #syswrite. If it doesn't, it will be wrapped with an object delegating #sysseek and
|
41
|
+
# #syswrite to #seek and #write respectively, and these must be present.
|
42
|
+
#
|
43
|
+
# (+IO+ and +StringIO+ both satisfy these requirements)
|
44
|
+
#
|
45
|
+
# If passed a block, the writer is yielded to the block and PureCDB::Writer#close is called
|
46
|
+
# afterwards.
|
47
|
+
#
|
48
|
+
# **WARNING:** To complete writing the hash tables, you *must* ensure #close is called
|
49
|
+
# when you are done.
|
50
|
+
#
|
10
51
|
def initialize target, *options
|
11
52
|
super *options
|
12
53
|
|
54
|
+
@hash_fill_factor = 0.7
|
55
|
+
|
13
56
|
set_mode(32) if @mode == :detect
|
14
57
|
|
15
58
|
if target.is_a?(String)
|
@@ -34,36 +77,38 @@ module PureCDB
|
|
34
77
|
end
|
35
78
|
end
|
36
79
|
|
80
|
+
# Write out the hashes and hash pointers, and close the target if it responds to #close
|
81
|
+
#
|
37
82
|
def close
|
38
83
|
write_hashes
|
39
84
|
write_hashptrs
|
40
85
|
@io.close if @io.respond_to?(:close)
|
41
86
|
end
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
87
|
+
|
88
|
+
# Store 'value' under 'key'.
|
89
|
+
#
|
90
|
+
# Multiple values can be stored for the same key by calling #store multiple times
|
91
|
+
# with the same key value.
|
92
|
+
def store key,value
|
93
|
+
# In an attempt to save memory, we pack the hash data we gather into
|
94
|
+
# strings of BER compressed integers...
|
95
|
+
h = hash(key)
|
96
|
+
hi = (h % num_hashes)
|
97
|
+
@hashes[hi] ||= ""
|
98
|
+
|
99
|
+
header = build_header(key.length, value.length)
|
100
|
+
@io.syswrite(header+key+value)
|
101
|
+
size = header.size + key.size + value.size
|
102
|
+
@hashes[hi] += [h,@pos].pack("ww") # BER compressed
|
103
|
+
@pos += size
|
104
|
+
end
|
105
|
+
|
106
|
+
#
|
107
|
+
# Alternative to PureCDB::Writer.new(target,options) ..
|
108
|
+
#
|
109
|
+
def self.open target, *options, &block
|
110
|
+
Writer.new(target, *options, &block)
|
111
|
+
end
|
67
112
|
|
68
113
|
private
|
69
114
|
def write_hashes
|
@@ -88,7 +133,7 @@ module PureCDB
|
|
88
133
|
while ary[off*2] != 0
|
89
134
|
off = (off + 1) % len
|
90
135
|
end
|
91
|
-
|
136
|
+
free_slots -= 1
|
92
137
|
ary[off*2] = entry[0]
|
93
138
|
ary[off*2+1] = entry[1]
|
94
139
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: purecdb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 1.0.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: exe
|
11
11
|
cert_chain: []
|
12
|
-
date: 2016-07-
|
12
|
+
date: 2016-07-05 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bundler
|