purecdb 0.1.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md CHANGED
@@ -45,7 +45,7 @@ To create a 32 bit (standard) CDB file:
45
45
  To instead create a 64 bit file, pass {mode: 64} as the second argument to PureCDB::Writer#open .
46
46
 
47
47
 
48
- To read a 32 bit (standard) CDB file:
48
+ To read a CDB file (auto-detecting standard 32-bit or extended 64-bit):
49
49
 
50
50
  ```ruby
51
51
  PureCDB::Reader.open("/tmp/somecdbfile.cdb") do |r|
@@ -53,8 +53,8 @@ To read a 32 bit (standard) CDB file:
53
53
  end
54
54
  ```
55
55
 
56
- To instead create a 64 bit file, pass {mode: 64} as the second argument to PureCDB::Reader#open,
57
- or let the reader auto-detect the format.
56
+ To require a 32 or 64 bit file specifically, pass {mode: 32} or {mode: 64} as
57
+ the second argument to PureCDB::Reader#open.
58
58
 
59
59
  See PureCDB::Reader#new for additional usage.
60
60
 
@@ -3,6 +3,15 @@ require "purecdb/base"
3
3
  require "purecdb/reader"
4
4
  require "purecdb/writer"
5
5
 
6
- # :nodoc:
7
6
  module PureCDB
7
+
8
+ # Convenience alternative to PureCDB::Reader.new(target, *options, &block)
9
+ def self.reader(target,*options, &block)
10
+ PureCDB::Reader.new(target, *options, &block)
11
+ end
12
+
13
+ # Convenience alternative to PureCDB::Writer.new(target, *options, &block)
14
+ def self.writer(target,*options, &block)
15
+ PureCDB::Writer.new(target, *options, &block)
16
+ end
8
17
  end
@@ -4,6 +4,9 @@ module PureCDB
4
4
  # Base class with shared functionality for PureCDB::Reader and
5
5
  # PureCDB::Writer that abstracts away 32 bit vs. 64 bit format
6
6
  # details. You should not need to use this directly.
7
+ #
8
+ # Changing the constants defined here is likely to break all
9
+ # kinds of stuff.
7
10
  #
8
11
  class Base
9
12
  # The CDB format contains 256 separate hashes by default.
@@ -17,9 +20,6 @@ module PureCDB
17
20
  # 64 bit format this is multiplied by 2.
18
21
  DEFAULT_HASHPTR_SIZE = 4
19
22
 
20
- # Number of bytes that will be buffered
21
- BUFFER_SIZE = 4096
22
-
23
23
  # Magic cookie used to indicate that this is a 64 bit (non-standard) CDB file
24
24
  # rather than a 32-bit CDB file.
25
25
  CDB64_MAGIC = "cdb64:01"
@@ -53,12 +53,15 @@ module PureCDB
53
53
  def set_mode mode
54
54
  @mode = mode
55
55
  @num_hashes = DEFAULT_NUM_HASHES
56
+
56
57
  if @mode == 64
57
58
  @length_size = DEFAULT_LENGTH_SIZE * 2
58
59
  @hashptr_size = DEFAULT_HASHPTR_SIZE * 2
60
+ @format = "Q<"
59
61
  else
60
62
  @length_size = DEFAULT_LENGTH_SIZE
61
63
  @hashptr_size = DEFAULT_HASHPTR_SIZE
64
+ @format = "V"
62
65
  end
63
66
  end
64
67
 
@@ -74,9 +77,6 @@ module PureCDB
74
77
  end
75
78
  end
76
79
 
77
- # Used to speed up 64bit pack/unpack
78
- @little_endian = [123456789].pack("L") == [123456789].pack("V")
79
-
80
80
  if mode == :detect
81
81
  @mode = :detect
82
82
  else
@@ -112,29 +112,12 @@ module PureCDB
112
112
 
113
113
  private
114
114
 
115
- # Due to Array#pack's lack of a little/big endian specific 64 bit operator
116
115
  def ary_pack(ary)
117
- if @mode == 32
118
- ary.pack("V*")
119
- elsif @little_endian
120
- ary.pack("Q*")
121
- else
122
- ary.collect {|a| [a & 0xffffffff, (a >> 32) & 0xffffffff] }.flatten.pack("V*")
123
- end
116
+ ary.pack("#{@format}*")
124
117
  end
125
118
 
126
- # Due to String#unpack's lack of a little/big endian specific 64 bit operator
127
119
  def ary_unpack(data, num)
128
- if @mode == 32
129
- data.unpack("V#{num}")
130
- elsif @little_endian
131
- data.unpack("Q#{num}")
132
- else
133
- ret = []
134
- data = data.unpack("V#{num*2}")
135
- data.each_slice(2) {|a| ret << (a[0] + (a[1] << 32)) }
136
- ret
137
- end
120
+ data.unpack("#{@format}#{num}")
138
121
  end
139
122
  end
140
123
 
@@ -1,7 +1,7 @@
1
1
  module PureCDB
2
2
 
3
3
  #
4
- # Read 32 bit or 54 bit CDB file CDB files.
4
+ # Read 32 bit or 64 bit CDB files.
5
5
  #
6
6
  class Reader < Base
7
7
  include Enumerable
@@ -41,7 +41,6 @@ module PureCDB
41
41
  raise "#{mode}bit mode detected in file; options request #{@mode}bit mode"
42
42
  end
43
43
 
44
- # FIXME: It seems like there are bugs triggered if mmap fails
45
44
  @m = Mmap.new(target,"r", Mmap::MAP_SHARED) rescue nil
46
45
  read_hashes
47
46
 
@@ -55,12 +54,15 @@ module PureCDB
55
54
  end
56
55
 
57
56
  #
58
- # Shortcut for PureCDB::Reader.new(target,options) ..
57
+ # Alternative to PureCDB::Reader.new(target,options) ..
59
58
  #
60
59
  def self.open(target, *options, &block)
61
60
  Reader.new(target, *options, &block)
62
61
  end
63
62
 
63
+ #
64
+ # Close the CDB file
65
+ #
64
66
  def close
65
67
  @io.close if @io
66
68
  @m.unmap if @m
@@ -68,6 +70,11 @@ module PureCDB
68
70
  @io = nil
69
71
  end
70
72
 
73
+ # Iterate over all key/value pairs in the order they occur in the file.
74
+ # This is *not* sorted or insertion order.
75
+ #
76
+ # +each+ will yield each key,value pair separately even when a key is
77
+ # duplicate.
71
78
  def each
72
79
  pos = hash_size
73
80
  hoff0 = @hashes[0]
@@ -78,16 +85,7 @@ module PureCDB
78
85
  end
79
86
  end
80
87
 
81
- def read_entry(pos)
82
- keylen, datalen = read_header(pos)
83
- return nil,nil if !keylen
84
- pos += hashref_size
85
- rkey = read(pos .. pos + keylen - 1)
86
- pos += keylen
87
- value = read(pos .. pos + datalen - 1)
88
- return rkey, value
89
- end
90
-
88
+ # Returns all values for +key+ in an array
91
89
  def values(key)
92
90
  h = hash(key)
93
91
 
@@ -121,6 +119,17 @@ module PureCDB
121
119
 
122
120
  private
123
121
 
122
+ def read_entry(pos)
123
+ keylen, datalen = read_header(pos)
124
+ return nil,nil if !keylen
125
+ pos += hashref_size
126
+ rkey = read(pos .. pos + keylen - 1)
127
+ pos += keylen
128
+ value = read(pos .. pos + datalen - 1)
129
+ return rkey, value
130
+ end
131
+
132
+
124
133
  # Warning: This will be very slow if not mmap'd
125
134
  def read r
126
135
  @m = nil
@@ -1,4 +1,3 @@
1
- #
2
1
  module PureCDB
3
- VERSION = "0.1.0"
2
+ VERSION = "1.0.0"
4
3
  end
@@ -1,15 +1,58 @@
1
1
  module PureCDB
2
+ #
3
+ # Write 32 or 64 bit CDB files
4
+ #
5
+ # == Memory considerations
6
+ #
7
+ # While the entry is written to the target object immediately on calling #store, the actual
8
+ # hash tables can not be written until the full dataset is ready. You must therefore be able
9
+ # to hold the hash of each key (including duplicates) and the position in the file the full
10
+ # entry is stored at in memory while building the CDB file.
11
+ #
12
+ # It would be possible to write this to a temporary file at the cost of performance, but the
13
+ # current implementation does not do this.
14
+ #
15
+ # As a compromise, the current implementation stores the hashes and positions as a BER encoded
16
+ # string per hash bucket until it is ready to write it to disk.
17
+ #
2
18
  class Writer < Base
3
- # This just needs to be <= 1. The lower it is, the fewer records will collide. The closer to 1 it is,
4
- # the more frequently the reader may have to engage in potentially lengthy (worst case scanning all the
5
- # records) probing to find the right entry
6
- def hash_fill_factor
7
- 0.7
8
- end
9
-
19
+ # How full any given hash table is allowed to get, as a float between 0 and 1.
20
+ #
21
+ # Needs to be <= 1. The lower it is, the fewer records will collide. The closer to 1 it is,
22
+ # the more frequently the reader may have to engage in potentially lengthy (worst case
23
+ # scanning all the records) probing to find the right entry
24
+ attr_accessor :hash_fill_factor
25
+
26
+ # Open a CDB file for writing, or prepare an IO-like object for writing.
27
+ #
28
+ # :call-seq:
29
+ # w = PureCDB::Writer.new(target)
30
+ # w = PureCDB::Writer.new(target, *options)
31
+ # PureCDB::Writer.new(target) {|w| ... }
32
+ # PureCDB::Writer.new(target, *options) {|w| ... }
33
+ #
34
+ # If +:mode+ is passed in +options+, it must be the integers 32 or 64, indicating whether
35
+ # you wish to write a standard (32 bit) CDB file, or a 64 bit CDB-like file. The default
36
+ # is 32.
37
+ #
38
+ # If +target+ is a +String+ it is treated as a filename of a file to be opened to write to.
39
+ # Otherwise +target+ is assumed to be an IO-like object that ideally responds to #sysseek
40
+ # and #syswrite. If it doesn't, it will be wrapped with an object delegating #sysseek and
41
+ # #syswrite to #seek and #write respectively, and these must be present.
42
+ #
43
+ # (+IO+ and +StringIO+ both satisfy these requirements)
44
+ #
45
+ # If passed a block, the writer is yielded to the block and PureCDB::Writer#close is called
46
+ # afterwards.
47
+ #
48
+ # **WARNING:** To complete writing the hash tables, you *must* ensure #close is called
49
+ # when you are done.
50
+ #
10
51
  def initialize target, *options
11
52
  super *options
12
53
 
54
+ @hash_fill_factor = 0.7
55
+
13
56
  set_mode(32) if @mode == :detect
14
57
 
15
58
  if target.is_a?(String)
@@ -34,36 +77,38 @@ module PureCDB
34
77
  end
35
78
  end
36
79
 
80
+ # Write out the hashes and hash pointers, and close the target if it responds to #close
81
+ #
37
82
  def close
38
83
  write_hashes
39
84
  write_hashptrs
40
85
  @io.close if @io.respond_to?(:close)
41
86
  end
42
-
43
- # For compatibility w/cdb / CDBMaker
44
- def store key,value
45
- add key,value
46
- end
47
-
48
- def add key,value
49
- # In an attempt to save memory, we pack the hash data we gather into
50
- # strings of BER compressed integers...
51
-
52
- h = hash(key)
53
- hi = (h % num_hashes)
54
- @hashes[hi] ||= ""
55
-
56
- header = build_header(key.length, value.length)
57
- @io.syswrite(header+key+value)
58
- size = header.size + key.size + value.size
59
- @hashes[hi] += [h,@pos].pack("ww") # BER compressed
60
-
61
- @pos += size
62
- end
63
-
64
- def self.open target, *options, &block
65
- Writer.new(target, *options, &block)
66
- end
87
+
88
+ # Store 'value' under 'key'.
89
+ #
90
+ # Multiple values can be stored for the same key by calling #store multiple times
91
+ # with the same key value.
92
+ def store key,value
93
+ # In an attempt to save memory, we pack the hash data we gather into
94
+ # strings of BER compressed integers...
95
+ h = hash(key)
96
+ hi = (h % num_hashes)
97
+ @hashes[hi] ||= ""
98
+
99
+ header = build_header(key.length, value.length)
100
+ @io.syswrite(header+key+value)
101
+ size = header.size + key.size + value.size
102
+ @hashes[hi] += [h,@pos].pack("ww") # BER compressed
103
+ @pos += size
104
+ end
105
+
106
+ #
107
+ # Alternative to PureCDB::Writer.new(target,options) ..
108
+ #
109
+ def self.open target, *options, &block
110
+ Writer.new(target, *options, &block)
111
+ end
67
112
 
68
113
  private
69
114
  def write_hashes
@@ -88,7 +133,7 @@ module PureCDB
88
133
  while ary[off*2] != 0
89
134
  off = (off + 1) % len
90
135
  end
91
- free_slots -= 1
136
+ free_slots -= 1
92
137
  ary[off*2] = entry[0]
93
138
  ary[off*2+1] = entry[1]
94
139
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: purecdb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 1.0.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: exe
11
11
  cert_chain: []
12
- date: 2016-07-03 00:00:00.000000000 Z
12
+ date: 2016-07-05 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bundler