geotree 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,81 @@
1
+ require_relative 'loc'
2
+
3
module GeoTreeModule

  # An axis-aligned rectangle, stored as an integer origin (x,y)
  # plus a size (w,h).
  class Bounds

    attr_accessor :x, :y, :w, :h

    # Constructor.
    # If x is a Float, it assumes that x,y,w,h are expressed
    # in terms of latitudes and longitudes, and converts them
    # to integers accordingly.
    def initialize(x = 0, y = 0, w = 0, h = 0)
      if x.is_a? Float
        left = Loc.cvt_latlong_to_int(x)
        bottom = Loc.cvt_latlong_to_int(y)
        w = Loc.cvt_latlong_to_int(x + w) - left
        h = Loc.cvt_latlong_to_int(y + h) - bottom
        x = left
        y = bottom
      end

      @x = x
      @y = y
      @w = w
      @h = h
    end

    # Exclusive right edge (x + w).
    def x2
      @x + @w
    end

    # Exclusive top edge (y + h).
    def y2
      @y + @h
    end

    def to_s
      "[#{@x},#{@y}..#{x2},#{y2}]"
    end

    def inspect
      to_s
    end

    # True if loc lies within the half-open ranges [x,x2) and [y,y2).
    def contains_point(loc)
      loc.x >= @x && loc.y >= @y && loc.x < x2 && loc.y < y2
    end

    # A copy of this bounds with the x/y axes exchanged.
    def flip
      Bounds.new(@y, @x, @h, @w)
    end

    # True if rectangles a and b overlap; shared edges do not count.
    def self.intersect(a, b)
      !(a.x2 <= b.x || b.x2 <= a.x || a.y2 <= b.y || b.y2 <= a.y)
    end

    # Construct a random bounds
    #
    def self.rnd
      ax = rand(1000)
      ay = rand(1000)
      bx = rand(1000)
      by = rand(1000)
      ax, bx = [ax, bx].min, [ax, bx].max
      ay, by = [ay, by].min, [ay, by].max
      sz = rand() * rand() * 1000
      sz = [sz.to_i, 1].max
      ix = [0, bx - ax - sz].max
      iy = [0, by - ay - sz].max
      sx = (bx - ax - ix) / 2
      sy = (by - ay - iy) / 2

      cx = (ax + bx) / 2
      cy = (ay + by) / 2
      Bounds.new(cx - sx, cy - sy, sx * 2, sy * 2)
    end

    # Construct an array of count random bounds.
    def self.rnd_many(count)
      Array.new(count) { rnd }
    end

  end
end
@@ -0,0 +1,68 @@
1
+ require_relative 'bounds'
2
+
3
module GeoTreeModule

  MAX_POINT_WEIGHT = 16 # 1 + the maximum datapoint weight; must be power of 2

  # Represents a point to be stored in a GeoTree.
  #
  # A point has these fields.
  # ----------------------------------------
  # [] A name, which is a unique integer identifier. This could be, e.g.,
  #    the id of a larger database record associated with the point.
  # [] A position, stored as a Loc object (two integers, x and y).
  # [] A weight. This is an integer, and is unused by the GeoTree except that
  #    the MultiTree class assumes that the lower 4 bits hold the point's
  #    detail level (a lower value means the point is less likely to show
  #    up at lower detail levels).
  #
  class DataPoint
    attr_accessor :loc, :name, :weight

    # @param name unique integer identifier
    # @param weight integer weight (low 4 bits may encode a detail level)
    # @param loc Loc position
    def initialize(name, weight, loc)
      @name = name
      @loc = loc
      @weight = weight
    end

    # A copy of this point with its x/y coordinates exchanged.
    def flip
      DataPoint.new(@name, @weight, @loc.flip)
    end

    def to_s
      "[##{name}: #{loc} w#{weight}]"
    end

    def inspect
      to_s
    end

    # Counter used to generate unique names for random points.
    # Fix: stored as a class-level instance variable instead of a @@class
    # variable, which would be shared across the whole inheritance tree.
    @next_rnd_name = 200

    class << self

      # Construct a random point, one with a unique name (assumes no other
      # process is generating point names)
      #
      def rnd
        wt = (rand() * rand() * MAX_POINT_WEIGHT).to_i
        x = rand(1000)
        y = rand(1000)
        @next_rnd_name += 1
        DataPoint.new(@next_rnd_name, wt, Loc.new(x, y))
      end

      # Construct an array of count random points.
      def rnd_many(count)
        a = []
        count.times { a << rnd }
        a
      end

      # Sorted list of the names of the given data points.
      def name_list(dp_list)
        dp_list.map { |x| x.name }.sort
      end

      # True if two points agree in name and position (weight is ignored).
      def match(a, b)
        a.name == b.name && a.loc.x == b.loc.x && a.loc.y == b.loc.y
      end

    end
  end

end
@@ -0,0 +1,64 @@
1
+ require_relative 'blockfile'
2
+
3
# Block file that stores its contents to disk
#
class DiskBlockFile < BlockFile

  # @param block_size size of each block, in bytes
  # @param path filesystem path of the backing file
  def initialize(block_size, path)
    @path = path
    super(block_size)
  end

  # Read a block from disk.
  # @param block_name index of the block within the file
  # @param dest_buffer buffer to read into; if nil, a new one is allocated
  # @return the buffer holding the block's bytes
  # @raise IOError if a full block could not be read
  def read(block_name, dest_buffer = nil)
    db = false

    dest_buffer ||= alloc_buffer

    offset = block_size * block_name

    @file.pos = offset

    @file.read(block_size, dest_buffer)
    raise IOError if (dest_buffer.size != block_size)

    !db || hex_dump(dest_buffer, "Disk.read #{block_name}")

    dest_buffer
  end

  # Write a block to disk.
  # @param block_name index of the block within the file
  # @param src_buffer buffer holding exactly one block of bytes
  # @raise ArgumentError if the buffer is not exactly one block in size
  # @raise IOError if the write was incomplete
  def write(block_name, src_buffer)

    db = false
    !db || pr("Disk.write %d\n", block_name)
    !db || hex_dump(src_buffer)

    offset = block_size * block_name
    @file.pos = offset

    raise ArgumentError if src_buffer.size != block_size

    count = @file.write(src_buffer)

    if count != src_buffer.size
      raise IOError, "wrote #{count} bytes instead of #{src_buffer.size}"
    end
  end

  # Open (or create) the backing file in binary read/write mode.
  # @return true if the file already existed
  def open_storage
    existed = File.file?(@path)
    @file = File.open(@path, existed ? "r+b" : "w+b")
    raise IOError if !@file

    existed
  end

  # Flush and close the backing file.
  # Fix: the file handle is now actually closed; previously it was only
  # flushed and the open descriptor leaked until garbage collection.
  def close_storage
    flush
    @file.close
    @file = nil
  end

  def flush
    @file.flush
  end
end
64
+
@@ -0,0 +1,369 @@
1
+ require_relative 'tools'
2
+
3
+ module ExternalSortModule
4
+
5
  # Maximum number of bytes held in memory per chunk; flip the condition
  # to true to exercise the external sort with a tiny chunk size in tests.
  if false
    warn("using small chunk size")
    MAX_CHUNK_SIZE_ = 128
  else
    MAX_CHUNK_SIZE_ = 4_000_000
  end

  # Helper from the project's tools.rb — presumably hides the
  # underscore-suffixed constants from outside callers; confirm there.
  privatize(self)
13
+
14
+ # Base class for chunking file access.
15
+ # Essentially a buffer that acts as a sliding window into a binary file.
16
+ #
17
+ class Chunk
18
+ # Constructor
19
+ # @param target_file file containing target area
20
+ # @param target_offset offset to start of the target area for this chunk
21
+ # @param target_length length of target area
22
+ # @param element_size size of each element; target_length must be a multiple of this
23
+ #
24
+ def initialize(target_file, target_offset, target_length, element_size, chunk_size = MAX_CHUNK_SIZE_)
25
+ @target_file = target_file
26
+ @target_offset = target_offset
27
+ @target_length = target_length
28
+
29
+ @target_end_offset = target_offset + target_length
30
+ @element_size = element_size
31
+ raise ArgumentError if target_length % element_size != 0
32
+
33
+ set_chunk_size(chunk_size)
34
+
35
+ @buffer = []
36
+ @buffer_offset = 0
37
+ end
38
+
39
+ def set_chunk_size(n)
40
+ n -= (n % @element_size)
41
+ raise ArgumentError if n <= 0
42
+ @max_chunk_size = [n,@target_length].min
43
+ end
44
+
45
+ def done
46
+ @buffer_offset == @buffer.size && @target_offset == @target_end_offset
47
+ end
48
+ end
49
+
50
+ # A subclass of Chunk that does not use a sliding window, and
51
+ # instead can contain the entire target length;
52
+ # includes methods for accessing target elements in arbitrary (non-streaming) order
53
+ class ChunkRandomAccess < Chunk
54
+
55
+ attr_reader :num_elements;
56
+ # Construct chunk, and read the complete targeted bytes to the buffer
57
+ #
58
+ def initialize(target_file, target_offset, target_length, element_size)
59
+ super(target_file,target_offset,target_length,element_size,target_length)
60
+
61
+ @num_elements = target_length / element_size
62
+
63
+ chunk_size = target_length
64
+
65
+ f = @target_file
66
+ f.pos = @target_offset
67
+ @buffer = f.read(chunk_size)
68
+ raise IOError if !@buffer || @buffer.size != chunk_size
69
+ end
70
+
71
+ # Get element from chunk
72
+ # @param index of element,
73
+ def element(index)
74
+ raise ArgumentError if index < 0 || index >= num_elements
75
+ off = index * @element_size
76
+ [@buffer,off]
77
+ end
78
+
79
+ # Replace existing buffer
80
+ def replace_buffer_with(b)
81
+ raise IllegalArgumentException if b.size != @buffer.size
82
+ @buffer = b
83
+ end
84
+
85
+ # Write buffer to target
86
+ def write
87
+ f = @target_file
88
+ f.pos = @target_end_offset - @target_length
89
+ bytes_written = f.write(@buffer)
90
+ raise IOError if @buffer.size != bytes_written
91
+ end
92
+
93
+ end
94
+
95
+ # Chunk subclass that performs streaming reading of target with sliding window
96
+ #
97
+ class ChunkReader < Chunk
98
+ def initialize(target_file, target_offset, target_length, element_size, chunk_size = MAX_CHUNK_SIZE_)
99
+ super(target_file,target_offset,target_length,element_size, chunk_size)
100
+ end
101
+
102
+ # Display record being viewed using hex dump
103
+ def peek_dump
104
+ "(done)" if done
105
+
106
+ buff, off = peek
107
+ "Next element: "+hex_dump_to_string(buff,nil,off,@element_size)
108
+ end
109
+
110
+ # Get next element
111
+ # @return (array, offset) containing element, or nil if chunk is done
112
+ def peek
113
+ nil if done
114
+
115
+ # If no more elements exist in the buffer, fill it from the target
116
+ if @buffer_offset == @buffer.size
117
+ max_size = @max_chunk_size
118
+
119
+ chunk_size = [@target_end_offset - @target_offset, max_size].min
120
+
121
+ f = @target_file
122
+ f.pos = @target_offset
123
+ @buffer = f.read(chunk_size)
124
+ raise IOError if !@buffer || @buffer.size != chunk_size
125
+
126
+ @target_offset += chunk_size
127
+ @buffer_offset = 0
128
+ end
129
+ [@buffer, @buffer_offset]
130
+ end
131
+
132
+ # Read next element, advance pointers
133
+ # @return (array, offset) containing element
134
+ # @raise IllegalStateException if already done
135
+ def read
136
+ ret = peek
137
+ raise IllegalStateException if !ret
138
+ @buffer_offset += @element_size
139
+ ret
140
+ end
141
+ end
142
+
143
+ # Chunk subclass that performs streaming writing to target with sliding window
144
+ #
145
+ class ChunkWriter < Chunk
146
+ def initialize(target_file, target_offset, target_length, element_size, chunk_size = MAX_CHUNK_SIZE_)
147
+ super(target_file,target_offset,target_length,element_size, chunk_size)
148
+ end
149
+
150
+ # Write an element to the target
151
+ # @param src_buffer source of element
152
+ # @param src_offset offset into source
153
+ #
154
+ def write(src_buffer, src_offset = 0)
155
+ raise IllegalStateException if done
156
+ raise ArgumentError if (src_buffer.size - src_offset < @element_size)
157
+
158
+ if @buffer_offset == @buffer.length
159
+ max_size = @max_chunk_size
160
+ chunk_size = [@target_end_offset - @target_offset, max_size].min
161
+ @buffer = zero_bytes(chunk_size)
162
+ @buffer_offset = 0
163
+ end
164
+
165
+ @buffer[@buffer_offset,@element_size] = src_buffer[src_offset,@element_size]
166
+ @buffer_offset += @element_size
167
+
168
+ # If buffer is now full, flush to target
169
+ if @buffer_offset == @buffer.size
170
+ f = @target_file
171
+ f.pos = @target_offset
172
+ bytes_written = f.write(@buffer)
173
+ raise IOError if @buffer.size != bytes_written
174
+ @target_offset += bytes_written
175
+ end
176
+ end
177
+ end
178
+
179
+ # Performs an external sort of a binary file.
180
+ # Used by the GeoTree module to shuffle buffered point sets into a random
181
+ # order prior to adding to the tree, in order to create a balanced tree.
182
+ #
183
+ class Sorter
184
+
185
+ MAX_CHUNKS_ = 8
186
+ privatize(self)
187
+
188
+ # Constructor
189
+ # @param path of file to sort
190
+ # @param element_size size, in bytes, of each element
191
+ # @param comparator to compare elements; if nil, compares the bytes as substrings
192
+ #
193
+ def initialize(path, element_size, comparator=nil, max_chunk_size = MAX_CHUNK_SIZE_, max_chunks = MAX_CHUNKS_)
194
+ raise ArgumentError,"no such file" if !File.file?(path)
195
+
196
+ @comparator = comparator || Proc.new do |x,y|
197
+ bx,ox = x
198
+ by,oy = y
199
+ bx[ox,@element_size] <=> by[oy,@element_size]
200
+ end
201
+
202
+ @path = path
203
+
204
+ @work_file = nil
205
+
206
+ @file_len = File.size(path)
207
+ if @file_len == 0 || @file_len % element_size != 0
208
+ raise ArgumentError,"File length #{@file_len} is not a positive multiple of element size #{element_size}"
209
+ end
210
+ @element_size = element_size
211
+ @max_chunks = max_chunks
212
+ @max_chunk_size = max_chunk_size - max_chunk_size % element_size
213
+ raise ArgumentError if @max_chunk_size <= 0
214
+ end
215
+
216
+ def sort
217
+ @file = File.open(@path,"r+b")
218
+
219
+ # Break file into chunks, sorting them in place
220
+ build_initial_segments
221
+ sort_chunks_in_place
222
+
223
+ require 'tempfile'
224
+
225
+ @work_file = Tempfile.new('_externalsort_')
226
+ @work_file.binmode
227
+
228
+ while @segments.size > 1
229
+ @segments = merge_segments(@segments)
230
+ end
231
+
232
+ @work_file.unlink
233
+ end
234
+
235
+ private
236
+
237
+ # Merge segments into one; if too many to handle at once, process recursively
238
+ def merge_segments(segs)
239
+
240
+ return segs if segs.size <= 1
241
+
242
+ if segs.size > MAX_CHUNKS_
243
+ k = segs.size/2
244
+ s1 = segs[0 .. k]
245
+ s2 = segs[k+1 .. -1]
246
+ ret = merge_segments(s1)
247
+ ret.concat(merge_segments(s2))
248
+ return ret
249
+ end
250
+
251
+ # Build a chunk for reading each segment; also, determine
252
+ # bounds of the set of segments.
253
+
254
+ # Sort the chunks by their next elements.
255
+
256
+ segset_start = nil
257
+ segset_end = nil
258
+
259
+ chunks = []
260
+ segs.each do |sg|
261
+ off,len = sg
262
+
263
+ ch = ChunkReader.new(@file, off, len, @element_size, @max_chunk_size)
264
+ chunks << ch
265
+ if !segset_start
266
+ segset_start = off
267
+ segset_end = off+len
268
+ else
269
+ segset_start = [segset_start,off].min
270
+ segset_end = [segset_end,off+len].max
271
+ end
272
+ end
273
+ segset_size = segset_end - segset_start
274
+
275
+ # Sort the chunks into order by their peek items, so the lowest item is at the end of the array
276
+ chunks.sort! do |a,b|
277
+ ex = a.peek
278
+ ey = b.peek
279
+ @comparator.call(ey,ex)
280
+ end
281
+
282
+ # Build a chunk for writing merged result to work file
283
+ wch = ChunkWriter.new(@work_file,0,segset_size, @element_size, @max_chunk_size)
284
+
285
+ while !chunks.empty?
286
+ ch = chunks.pop
287
+ buff,off = ch.peek
288
+ wch.write(buff,off)
289
+ ch.read
290
+
291
+ next if ch.done
292
+
293
+ # Examine this chunk's next item to reinsert the chunk back into the sorted array.
294
+ # Perform a binary search:
295
+ i0 = 0
296
+ i1 = chunks.size
297
+ while i0 < i1
298
+ i = (i0+i1)/2
299
+ ci = chunks[i]
300
+ if @comparator.call(ci.peek, ch.peek) > 0
301
+ i0 = i+1
302
+ else
303
+ i1 = i
304
+ end
305
+ end
306
+ chunks.insert(i1, ch)
307
+ end
308
+
309
+ # Read from work file and write to segment set's position in original file
310
+
311
+ rch = ChunkReader.new(@work_file,0,segset_size, @element_size, @max_chunk_size)
312
+ wch = ChunkWriter.new(@file,segset_start,segset_size, @element_size, @max_chunk_size)
313
+
314
+ while !rch.done
315
+ buff,off = rch.peek
316
+ wch.write(buff,off)
317
+ rch.read
318
+ end
319
+
320
+ # We must flush the file we're writing to, now that the
321
+ # operation is complete
322
+ @file.flush
323
+
324
+ [[segset_start,segset_size]]
325
+ end
326
+
327
+ # Partition the file into segments, each the size of a chunk
328
+ def build_initial_segments
329
+ db = false
330
+
331
+ !db || pr("build_initial_segments, @file_len=#@file_len\n")
332
+ raise IllegalStateException if @file_len == 0
333
+
334
+ @segments = []
335
+ off = 0
336
+ while off < @file_len
337
+ seg_len = [@file_len - off, @max_chunk_size].min
338
+ @segments << [off, seg_len]
339
+ off += seg_len
340
+ end
341
+ end
342
+
343
+ def sort_chunks_in_place
344
+ @segments.each do |offset,length|
345
+ ch = ChunkRandomAccess.new(@file, offset, length, @element_size)
346
+
347
+ a = (0 ... ch.num_elements).to_a
348
+
349
+ a.sort! do |x,y|
350
+ ex = ch.element(x)
351
+ ey = ch.element(y)
352
+ @comparator.call(ex,ey)
353
+ end
354
+
355
+ # Construct another buffer, in the sorted order
356
+ b = zero_bytes(@element_size * a.size)
357
+ j = 0
358
+ a.each do |i|
359
+ buff,off = ch.element(i)
360
+ b[j, @element_size] = buff[off,@element_size]
361
+ j += @element_size
362
+ end
363
+ ch.replace_buffer_with(b)
364
+ ch.write
365
+ end
366
+ end
367
+ end
368
+
369
+ end # module