purecdb 0.1.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +3 -3
- data/lib/purecdb.rb +10 -1
- data/lib/purecdb/base.rb +8 -25
- data/lib/purecdb/reader.rb +22 -13
- data/lib/purecdb/version.rb +1 -2
- data/lib/purecdb/writer.rb +78 -33
- metadata +2 -2
data/README.md
CHANGED
@@ -45,7 +45,7 @@ To create a 32 bit (standard) CDB file:
|
|
45
45
|
To instead create a 64 bit file, pass {mode: 64} as the second argument to PureCDB::Writer#open .
|
46
46
|
|
47
47
|
|
48
|
-
To read a 32 bit (standard) CDB file:
|
48
|
+
To read a CDB file (auto-detecting standard 32-bit or extended 64-bit format):
|
49
49
|
|
50
50
|
```ruby
|
51
51
|
PureCDB::Reader.open("/tmp/somecdbfile.cdb") do |r|
|
@@ -53,8 +53,8 @@ To read a 32 bit (standard) CDB file:
|
|
53
53
|
end
|
54
54
|
```
|
55
55
|
|
56
|
-
To
|
57
|
-
|
56
|
+
To require a 32 or 64 bit file specifically, pass {mode: 32} or {mode: 64} as
|
57
|
+
the second argument to PureCDB::Reader#open.
|
58
58
|
|
59
59
|
See PureCDB::Reader#new for additional usage.
|
60
60
|
|
data/lib/purecdb.rb
CHANGED
@@ -3,6 +3,15 @@ require "purecdb/base"
|
|
3
3
|
require "purecdb/reader"
|
4
4
|
require "purecdb/writer"
|
5
5
|
|
6
|
-
# :nodoc:
|
7
6
|
module PureCDB
|
7
|
+
|
8
|
+
# Convenience alternative to PureCDB::Reader.new(target, *options, &block)
|
9
|
+
def self.reader(target,*options, &block)
|
10
|
+
PureCDB::Reader.new(target, *options, &block)
|
11
|
+
end
|
12
|
+
|
13
|
+
# Convenience alternative to PureCDB::Writer.new(target, *options, &block)
|
14
|
+
def self.writer(target,*options, &block)
|
15
|
+
PureCDB::Writer.new(target, *options, &block)
|
16
|
+
end
|
8
17
|
end
|
data/lib/purecdb/base.rb
CHANGED
@@ -4,6 +4,9 @@ module PureCDB
|
|
4
4
|
# Base class with shared functionality for PureCDB::Reader and
|
5
5
|
# PureCDB::Writer that abstracts away 32 bit vs. 64 bit format
|
6
6
|
# details. You should not need to use this directly.
|
7
|
+
#
|
8
|
+
# Changing the constants defined here is likely to break all
|
9
|
+
# kinds of stuff.
|
7
10
|
#
|
8
11
|
class Base
|
9
12
|
# The CDB format contains 256 separate hashes by default.
|
@@ -17,9 +20,6 @@ module PureCDB
|
|
17
20
|
# 64 bit format this is multiplied by 2.
|
18
21
|
DEFAULT_HASHPTR_SIZE = 4
|
19
22
|
|
20
|
-
# Number of bytes that will be buffered
|
21
|
-
BUFFER_SIZE = 4096
|
22
|
-
|
23
23
|
# Magic cookie used to indicate that this is a 64 bit (non-standard) CDB file
|
24
24
|
# rather than a 32-bit CDB file.
|
25
25
|
CDB64_MAGIC = "cdb64:01"
|
@@ -53,12 +53,15 @@ module PureCDB
|
|
53
53
|
def set_mode mode
|
54
54
|
@mode = mode
|
55
55
|
@num_hashes = DEFAULT_NUM_HASHES
|
56
|
+
|
56
57
|
if @mode == 64
|
57
58
|
@length_size = DEFAULT_LENGTH_SIZE * 2
|
58
59
|
@hashptr_size = DEFAULT_HASHPTR_SIZE * 2
|
60
|
+
@format = "Q<"
|
59
61
|
else
|
60
62
|
@length_size = DEFAULT_LENGTH_SIZE
|
61
63
|
@hashptr_size = DEFAULT_HASHPTR_SIZE
|
64
|
+
@format = "V"
|
62
65
|
end
|
63
66
|
end
|
64
67
|
|
@@ -74,9 +77,6 @@ module PureCDB
|
|
74
77
|
end
|
75
78
|
end
|
76
79
|
|
77
|
-
# Used to speed up 64bit pack/unpack
|
78
|
-
@little_endian = [123456789].pack("L") == [123456789].pack("V")
|
79
|
-
|
80
80
|
if mode == :detect
|
81
81
|
@mode = :detect
|
82
82
|
else
|
@@ -112,29 +112,12 @@ module PureCDB
|
|
112
112
|
|
113
113
|
private
|
114
114
|
|
115
|
-
# Due to Array#pack's lack of a little/big endian specific 64 bit operator
|
116
115
|
def ary_pack(ary)
|
117
|
-
|
118
|
-
ary.pack("V*")
|
119
|
-
elsif @little_endian
|
120
|
-
ary.pack("Q*")
|
121
|
-
else
|
122
|
-
ary.collect {|a| [a & 0xffffffff, (a >> 32) & 0xffffffff] }.flatten.pack("V*")
|
123
|
-
end
|
116
|
+
ary.pack("#{@format}*")
|
124
117
|
end
|
125
118
|
|
126
|
-
# Due to String#unpack's lack of a little/big endian specific 64 bit operator
|
127
119
|
def ary_unpack(data, num)
|
128
|
-
|
129
|
-
data.unpack("V#{num}")
|
130
|
-
elsif @little_endian
|
131
|
-
data.unpack("Q#{num}")
|
132
|
-
else
|
133
|
-
ret = []
|
134
|
-
data = data.unpack("V#{num*2}")
|
135
|
-
data.each_slice(2) {|a| ret << (a[0] + (a[1] << 32)) }
|
136
|
-
ret
|
137
|
-
end
|
120
|
+
data.unpack("#{@format}#{num}")
|
138
121
|
end
|
139
122
|
end
|
140
123
|
|
data/lib/purecdb/reader.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
module PureCDB
|
2
2
|
|
3
3
|
#
|
4
|
-
# Read 32 bit or
|
4
|
+
# Read 32 bit or 64 bit CDB files.
|
5
5
|
#
|
6
6
|
class Reader < Base
|
7
7
|
include Enumerable
|
@@ -41,7 +41,6 @@ module PureCDB
|
|
41
41
|
raise "#{mode}bit mode detected in file; options request #{@mode}bit mode"
|
42
42
|
end
|
43
43
|
|
44
|
-
# FIXME: It seems like there are bugs triggered if mmap fails
|
45
44
|
@m = Mmap.new(target,"r", Mmap::MAP_SHARED) rescue nil
|
46
45
|
read_hashes
|
47
46
|
|
@@ -55,12 +54,15 @@ module PureCDB
|
|
55
54
|
end
|
56
55
|
|
57
56
|
#
|
58
|
-
#
|
57
|
+
# Alternative to PureCDB::Reader.new(target,options) ..
|
59
58
|
#
|
60
59
|
def self.open(target, *options, &block)
|
61
60
|
Reader.new(target, *options, &block)
|
62
61
|
end
|
63
62
|
|
63
|
+
#
|
64
|
+
# Close the CDB file
|
65
|
+
#
|
64
66
|
def close
|
65
67
|
@io.close if @io
|
66
68
|
@m.unmap if @m
|
@@ -68,6 +70,11 @@ module PureCDB
|
|
68
70
|
@io = nil
|
69
71
|
end
|
70
72
|
|
73
|
+
# Iterate over all key/value pairs in the order they occur in the file.
|
74
|
+
# This is *not* sorted or insertion order.
|
75
|
+
#
|
76
|
+
# +each+ will yield each key,value pair separately even when a key is
|
77
|
+
# duplicated.
|
71
78
|
def each
|
72
79
|
pos = hash_size
|
73
80
|
hoff0 = @hashes[0]
|
@@ -78,16 +85,7 @@ module PureCDB
|
|
78
85
|
end
|
79
86
|
end
|
80
87
|
|
81
|
-
|
82
|
-
keylen, datalen = read_header(pos)
|
83
|
-
return nil,nil if !keylen
|
84
|
-
pos += hashref_size
|
85
|
-
rkey = read(pos .. pos + keylen - 1)
|
86
|
-
pos += keylen
|
87
|
-
value = read(pos .. pos + datalen - 1)
|
88
|
-
return rkey, value
|
89
|
-
end
|
90
|
-
|
88
|
+
# Returns all values for +key+ in an array
|
91
89
|
def values(key)
|
92
90
|
h = hash(key)
|
93
91
|
|
@@ -121,6 +119,17 @@ module PureCDB
|
|
121
119
|
|
122
120
|
private
|
123
121
|
|
122
|
+
def read_entry(pos)
|
123
|
+
keylen, datalen = read_header(pos)
|
124
|
+
return nil,nil if !keylen
|
125
|
+
pos += hashref_size
|
126
|
+
rkey = read(pos .. pos + keylen - 1)
|
127
|
+
pos += keylen
|
128
|
+
value = read(pos .. pos + datalen - 1)
|
129
|
+
return rkey, value
|
130
|
+
end
|
131
|
+
|
132
|
+
|
124
133
|
# Warning: This will be very slow if not mmap'd
|
125
134
|
def read r
|
126
135
|
@m = nil
|
data/lib/purecdb/version.rb
CHANGED
data/lib/purecdb/writer.rb
CHANGED
@@ -1,15 +1,58 @@
|
|
1
1
|
module PureCDB
|
2
|
+
#
|
3
|
+
# Write 32 or 64 bit CDB files
|
4
|
+
#
|
5
|
+
# == Memory considerations
|
6
|
+
#
|
7
|
+
# While the entry is written to the target object immediately on calling #store, the actual
|
8
|
+
# hash tables can not be written until the full dataset is ready. You must therefore be able
|
9
|
+
# to hold the hash of each key (including duplicates) and the position in the file the full
|
10
|
+
# entry is stored at in memory while building the CDB file.
|
11
|
+
#
|
12
|
+
# It would be possible to write this to a temporary file at the cost of performance, but the
|
13
|
+
# current implementation does not do this.
|
14
|
+
#
|
15
|
+
# As a compromise, the current implementation stores the hashes and positions as a BER encoded
|
16
|
+
# string per hash bucket until it is ready to write it to disk.
|
17
|
+
#
|
2
18
|
class Writer < Base
|
3
|
-
#
|
4
|
-
#
|
5
|
-
# records
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
19
|
+
# How full any given hash table is allowed to get, as a float between 0 and 1.
|
20
|
+
#
|
21
|
+
# Needs to be <= 1. The lower it is, the fewer records will collide. The closer to 1 it is,
|
22
|
+
# the more frequently the reader may have to engage in potentially lengthy (worst case
|
23
|
+
# scanning all the records) probing to find the right entry
|
24
|
+
attr_accessor :hash_fill_factor
|
25
|
+
|
26
|
+
# Open a CDB file for writing, or prepare an IO-like object for writing.
|
27
|
+
#
|
28
|
+
# :call-seq:
|
29
|
+
# w = PureCDB::Writer.new(target)
|
30
|
+
# w = PureCDB::Writer.new(target, *options)
|
31
|
+
# PureCDB::Writer.new(target) {|w| ... }
|
32
|
+
# PureCDB::Writer.new(target, *options) {|w| ... }
|
33
|
+
#
|
34
|
+
# If +:mode+ is passed in +options+, it must be the integers 32 or 64, indicating whether
|
35
|
+
# you wish to write a standard (32 bit) CDB file, or a 64 bit CDB-like file. The default
|
36
|
+
# is 32.
|
37
|
+
#
|
38
|
+
# If +target+ is a +String+ it is treated as a filename of a file to be opened to write to.
|
39
|
+
# Otherwise +target+ is assumed to be an IO-like object that ideally responds to #sysseek
|
40
|
+
# and #syswrite. If it doesn't, it will be wrapped with an object delegating #sysseek and
|
41
|
+
# #syswrite to #seek and #write respectively, and these must be present.
|
42
|
+
#
|
43
|
+
# (+IO+ and +StringIO+ both satisfy these requirements)
|
44
|
+
#
|
45
|
+
# If passed a block, the writer is yielded to the block and PureCDB::Writer#close is called
|
46
|
+
# afterwards.
|
47
|
+
#
|
48
|
+
# **WARNING:** To complete writing the hash tables, you *must* ensure #close is called
|
49
|
+
# when you are done.
|
50
|
+
#
|
10
51
|
def initialize target, *options
|
11
52
|
super *options
|
12
53
|
|
54
|
+
@hash_fill_factor = 0.7
|
55
|
+
|
13
56
|
set_mode(32) if @mode == :detect
|
14
57
|
|
15
58
|
if target.is_a?(String)
|
@@ -34,36 +77,38 @@ module PureCDB
|
|
34
77
|
end
|
35
78
|
end
|
36
79
|
|
80
|
+
# Write out the hashes and hash pointers, and close the target if it responds to #close
|
81
|
+
#
|
37
82
|
def close
|
38
83
|
write_hashes
|
39
84
|
write_hashptrs
|
40
85
|
@io.close if @io.respond_to?(:close)
|
41
86
|
end
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
87
|
+
|
88
|
+
# Store 'value' under 'key'.
|
89
|
+
#
|
90
|
+
# Multiple values can be stored for the same key by calling #store multiple times
|
91
|
+
# with the same key value.
|
92
|
+
def store key,value
|
93
|
+
# In an attempt to save memory, we pack the hash data we gather into
|
94
|
+
# strings of BER compressed integers...
|
95
|
+
h = hash(key)
|
96
|
+
hi = (h % num_hashes)
|
97
|
+
@hashes[hi] ||= ""
|
98
|
+
|
99
|
+
header = build_header(key.length, value.length)
|
100
|
+
@io.syswrite(header+key+value)
|
101
|
+
size = header.size + key.size + value.size
|
102
|
+
@hashes[hi] += [h,@pos].pack("ww") # BER compressed
|
103
|
+
@pos += size
|
104
|
+
end
|
105
|
+
|
106
|
+
#
|
107
|
+
# Alternative to PureCDB::Writer.new(target,options) ..
|
108
|
+
#
|
109
|
+
def self.open target, *options, &block
|
110
|
+
Writer.new(target, *options, &block)
|
111
|
+
end
|
67
112
|
|
68
113
|
private
|
69
114
|
def write_hashes
|
@@ -88,7 +133,7 @@ module PureCDB
|
|
88
133
|
while ary[off*2] != 0
|
89
134
|
off = (off + 1) % len
|
90
135
|
end
|
91
|
-
|
136
|
+
free_slots -= 1
|
92
137
|
ary[off*2] = entry[0]
|
93
138
|
ary[off*2+1] = entry[1]
|
94
139
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: purecdb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 1.0.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: exe
|
11
11
|
cert_chain: []
|
12
|
-
date: 2016-07-
|
12
|
+
date: 2016-07-05 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: bundler
|