purecdb 0.1.0 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
data/README.md CHANGED
@@ -45,7 +45,7 @@ To create a 32 bit (standard) CDB file:
45
45
  To instead create a 64 bit file, pass {mode: 64} as the second argument to PureCDB::Writer#open.
46
46
 
47
47
 
48
- To read a 32 bit (standard) CDB file:
48
+ To read a CDB file (auto-detecting standard 32-bit or extended 64-bit):
49
49
 
50
50
  ```ruby
51
51
  PureCDB::Reader.open("/tmp/somecdbfile.cdb") do |r|
@@ -53,8 +53,8 @@ To read a 32 bit (standard) CDB file:
53
53
  end
54
54
  ```
55
55
 
56
- To instead create a 64 bit file, pass {mode: 64} as the second argument to PureCDB::Reader#open,
57
- or let the reader auto-detect the format.
56
+ To require a 32 or 64 bit file specifically, pass {mode: 32} or {mode: 64} as
57
+ the second argument to PureCDB::Reader#open.
58
58
 
59
59
  See PureCDB::Reader#new for additional usage.
60
60
 
@@ -3,6 +3,15 @@ require "purecdb/base"
3
3
  require "purecdb/reader"
4
4
  require "purecdb/writer"
5
5
 
6
- # :nodoc:
7
6
  module PureCDB
7
+
8
+ # Convenience alternative to PureCDB::Reader.new(target, *options, &block)
9
+ def self.reader(target,*options, &block)
10
+ PureCDB::Reader.new(target, *options, &block)
11
+ end
12
+
13
+ # Convenience alternative to PureCDB::Writer.new(target, *options, &block)
14
+ def self.writer(target,*options, &block)
15
+ PureCDB::Writer.new(target, *options, &block)
16
+ end
8
17
  end
@@ -4,6 +4,9 @@ module PureCDB
4
4
  # Base class with shared functionality for PureCDB::Reader and
5
5
  # PureCDB::Writer that abstracts away 32 bit vs. 64 bit format
6
6
  # details. You should not need to use this directly.
7
+ #
8
+ # Changing the constants defined here is likely to break all
9
+ # kinds of stuff.
7
10
  #
8
11
  class Base
9
12
  # The CDB format contains 256 separate hashes by default.
@@ -17,9 +20,6 @@ module PureCDB
17
20
  # 64 bit format this is multiplied by 2.
18
21
  DEFAULT_HASHPTR_SIZE = 4
19
22
 
20
- # Number of bytes that will be buffered
21
- BUFFER_SIZE = 4096
22
-
23
23
  # Magic cookie used to indicate that this is a 64 bit (non-standard) CDB file
24
24
  # rather than a 32-bit CDB file.
25
25
  CDB64_MAGIC = "cdb64:01"
@@ -53,12 +53,15 @@ module PureCDB
53
53
  def set_mode mode
54
54
  @mode = mode
55
55
  @num_hashes = DEFAULT_NUM_HASHES
56
+
56
57
  if @mode == 64
57
58
  @length_size = DEFAULT_LENGTH_SIZE * 2
58
59
  @hashptr_size = DEFAULT_HASHPTR_SIZE * 2
60
+ @format = "Q<"
59
61
  else
60
62
  @length_size = DEFAULT_LENGTH_SIZE
61
63
  @hashptr_size = DEFAULT_HASHPTR_SIZE
64
+ @format = "V"
62
65
  end
63
66
  end
64
67
 
@@ -74,9 +77,6 @@ module PureCDB
74
77
  end
75
78
  end
76
79
 
77
- # Used to speed up 64bit pack/unpack
78
- @little_endian = [123456789].pack("L") == [123456789].pack("V")
79
-
80
80
  if mode == :detect
81
81
  @mode = :detect
82
82
  else
@@ -112,29 +112,12 @@ module PureCDB
112
112
 
113
113
  private
114
114
 
115
- # Due to Array#pack's lack of a little/big endian specific 64 bit operator
116
115
  def ary_pack(ary)
117
- if @mode == 32
118
- ary.pack("V*")
119
- elsif @little_endian
120
- ary.pack("Q*")
121
- else
122
- ary.collect {|a| [a & 0xffffffff, (a >> 32) & 0xffffffff] }.flatten.pack("V*")
123
- end
116
+ ary.pack("#{@format}*")
124
117
  end
125
118
 
126
- # Due to String#unpack's lack of a little/big endian specific 64 bit operator
127
119
  def ary_unpack(data, num)
128
- if @mode == 32
129
- data.unpack("V#{num}")
130
- elsif @little_endian
131
- data.unpack("Q#{num}")
132
- else
133
- ret = []
134
- data = data.unpack("V#{num*2}")
135
- data.each_slice(2) {|a| ret << (a[0] + (a[1] << 32)) }
136
- ret
137
- end
120
+ data.unpack("#{@format}#{num}")
138
121
  end
139
122
  end
140
123
 
@@ -1,7 +1,7 @@
1
1
  module PureCDB
2
2
 
3
3
  #
4
- # Read 32 bit or 54 bit CDB file CDB files.
4
+ # Read 32 bit or 64 bit CDB files.
5
5
  #
6
6
  class Reader < Base
7
7
  include Enumerable
@@ -41,7 +41,6 @@ module PureCDB
41
41
  raise "#{mode}bit mode detected in file; options request #{@mode}bit mode"
42
42
  end
43
43
 
44
- # FIXME: It seems like there are bugs triggered if mmap fails
45
44
  @m = Mmap.new(target,"r", Mmap::MAP_SHARED) rescue nil
46
45
  read_hashes
47
46
 
@@ -55,12 +54,15 @@ module PureCDB
55
54
  end
56
55
 
57
56
  #
58
- # Shortcut for PureCDB::Reader.new(target,options) ..
57
+ # Alternative to PureCDB::Reader.new(target,options) ..
59
58
  #
60
59
  def self.open(target, *options, &block)
61
60
  Reader.new(target, *options, &block)
62
61
  end
63
62
 
63
+ #
64
+ # Close the CDB file
65
+ #
64
66
  def close
65
67
  @io.close if @io
66
68
  @m.unmap if @m
@@ -68,6 +70,11 @@ module PureCDB
68
70
  @io = nil
69
71
  end
70
72
 
73
+ # Iterate over all key/value pairs in the order they occur in the file.
74
+ # This is *not* sorted or insertion order.
75
+ #
76
+ # +each+ will yield each key,value pair separately even when a key is
77
+ # duplicated.
71
78
  def each
72
79
  pos = hash_size
73
80
  hoff0 = @hashes[0]
@@ -78,16 +85,7 @@ module PureCDB
78
85
  end
79
86
  end
80
87
 
81
- def read_entry(pos)
82
- keylen, datalen = read_header(pos)
83
- return nil,nil if !keylen
84
- pos += hashref_size
85
- rkey = read(pos .. pos + keylen - 1)
86
- pos += keylen
87
- value = read(pos .. pos + datalen - 1)
88
- return rkey, value
89
- end
90
-
88
+ # Returns all values for +key+ in an array
91
89
  def values(key)
92
90
  h = hash(key)
93
91
 
@@ -121,6 +119,17 @@ module PureCDB
121
119
 
122
120
  private
123
121
 
122
+ def read_entry(pos)
123
+ keylen, datalen = read_header(pos)
124
+ return nil,nil if !keylen
125
+ pos += hashref_size
126
+ rkey = read(pos .. pos + keylen - 1)
127
+ pos += keylen
128
+ value = read(pos .. pos + datalen - 1)
129
+ return rkey, value
130
+ end
131
+
132
+
124
133
  # Warning: This will be very slow if not mmap'd
125
134
  def read r
126
135
  @m = nil
@@ -1,4 +1,3 @@
1
- #
2
1
  module PureCDB
3
- VERSION = "0.1.0"
2
+ VERSION = "1.0.0"
4
3
  end
@@ -1,15 +1,58 @@
1
1
  module PureCDB
2
+ #
3
+ # Write 32 or 64 bit CDB files
4
+ #
5
+ # == Memory considerations
6
+ #
7
+ # While the entry is written to the target object immediately on calling #store, the actual
8
+ # hash tables can not be written until the full dataset is ready. You must therefore be able
9
+ # to hold the hash of each key (including duplicates) and the position in the file the full
10
+ # entry is stored at in memory while building the CDB file.
11
+ #
12
+ # It would be possible to write this to a temporary file at the cost of performance, but the
13
+ # current implementation does not do this.
14
+ #
15
+ # As a compromise, the current implementation stores the hashes and positions as a BER encoded
16
+ # string per hash bucket until it is ready to write it to disk.
17
+ #
2
18
  class Writer < Base
3
- # This just needs to be <= 1. The lower it is, the fewer records will collide. The closer to 1 it is,
4
- # the more frequently the reader may have to engage in potentially lengthy (worst case scanning all the
5
- # records) probing to find the right entry
6
- def hash_fill_factor
7
- 0.7
8
- end
9
-
19
+ # How full any given hash table is allowed to get, as a float between 0 and 1.
20
+ #
21
+ # Needs to be <= 1. The lower it is, the fewer records will collide. The closer to 1 it is,
22
+ # the more frequently the reader may have to engage in potentially lengthy (worst case
23
+ # scanning all the records) probing to find the right entry
24
+ attr_accessor :hash_fill_factor
25
+
26
+ # Open a CDB file for writing, or prepare an IO-like object for writing.
27
+ #
28
+ # :call-seq:
29
+ # w = PureCDB::Writer.new(target)
30
+ # w = PureCDB::Writer.new(target, *options)
31
+ # PureCDB::Writer.new(target) {|w| ... }
32
+ # PureCDB::Writer.new(target, *options) {|w| ... }
33
+ #
34
+ # If +:mode+ is passed in +options+, it must be the integers 32 or 64, indicating whether
35
+ # you wish to write a standard (32 bit) CDB file, or a 64 bit CDB-like file. The default
36
+ # is 32.
37
+ #
38
+ # If +target+ is a +String+ it is treated as a filename of a file to be opened to write to.
39
+ # Otherwise +target+ is assumed to be an IO-like object that ideally responds to #sysseek
40
+ # and #syswrite. If it doesn't, it will be wrapped with an object delegating #sysseek and
41
+ # #syswrite to #seek and #write respectively, and these must be present.
42
+ #
43
+ # (+IO+ and +StringIO+ both satisfy these requirements)
44
+ #
45
+ # If passed a block, the writer is yielded to the block and PureCDB::Writer#close is called
46
+ # afterwards.
47
+ #
48
+ # **WARNING:** To complete writing the hash tables, you *must* ensure #close is called
49
+ # when you are done.
50
+ #
10
51
  def initialize target, *options
11
52
  super *options
12
53
 
54
+ @hash_fill_factor = 0.7
55
+
13
56
  set_mode(32) if @mode == :detect
14
57
 
15
58
  if target.is_a?(String)
@@ -34,36 +77,38 @@ module PureCDB
34
77
  end
35
78
  end
36
79
 
80
+ # Write out the hashes and hash pointers, and close the target if it responds to #close
81
+ #
37
82
  def close
38
83
  write_hashes
39
84
  write_hashptrs
40
85
  @io.close if @io.respond_to?(:close)
41
86
  end
42
-
43
- # For compatibility w/cdb / CDBMaker
44
- def store key,value
45
- add key,value
46
- end
47
-
48
- def add key,value
49
- # In an attempt to save memory, we pack the hash data we gather into
50
- # strings of BER compressed integers...
51
-
52
- h = hash(key)
53
- hi = (h % num_hashes)
54
- @hashes[hi] ||= ""
55
-
56
- header = build_header(key.length, value.length)
57
- @io.syswrite(header+key+value)
58
- size = header.size + key.size + value.size
59
- @hashes[hi] += [h,@pos].pack("ww") # BER compressed
60
-
61
- @pos += size
62
- end
63
-
64
- def self.open target, *options, &block
65
- Writer.new(target, *options, &block)
66
- end
87
+
88
+ # Store 'value' under 'key'.
89
+ #
90
+ # Multiple values can be stored for the same key by calling #store multiple times
91
+ # with the same key value.
92
+ def store key,value
93
+ # In an attempt to save memory, we pack the hash data we gather into
94
+ # strings of BER compressed integers...
95
+ h = hash(key)
96
+ hi = (h % num_hashes)
97
+ @hashes[hi] ||= ""
98
+
99
+ header = build_header(key.length, value.length)
100
+ @io.syswrite(header+key+value)
101
+ size = header.size + key.size + value.size
102
+ @hashes[hi] += [h,@pos].pack("ww") # BER compressed
103
+ @pos += size
104
+ end
105
+
106
+ #
107
+ # Alternative to PureCDB::Writer.new(target,options) ..
108
+ #
109
+ def self.open target, *options, &block
110
+ Writer.new(target, *options, &block)
111
+ end
67
112
 
68
113
  private
69
114
  def write_hashes
@@ -88,7 +133,7 @@ module PureCDB
88
133
  while ary[off*2] != 0
89
134
  off = (off + 1) % len
90
135
  end
91
- free_slots -= 1
136
+ free_slots -= 1
92
137
  ary[off*2] = entry[0]
93
138
  ary[off*2+1] = entry[1]
94
139
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: purecdb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 1.0.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: exe
11
11
  cert_chain: []
12
- date: 2016-07-03 00:00:00.000000000 Z
12
+ date: 2016-07-05 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: bundler