geotree 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,81 @@
1
+ require_relative 'loc'
2
+
3
module GeoTreeModule
  # An axis-aligned rectangle with integer coordinates; used by the
  # GeoTree as a storage/query region.
  class Bounds
    attr_accessor :x, :y, :w, :h

    # Construct a bounds rectangle.
    #
    # If x is a Float, all four arguments are interpreted as
    # latitude/longitude degrees and are converted to the integer
    # representation via Loc.cvt_latlong_to_int.
    def initialize(x = 0, y = 0, w = 0, h = 0)
      if x.is_a? Float
        left = Loc.cvt_latlong_to_int(x)
        bottom = Loc.cvt_latlong_to_int(y)
        # Width/height are converted from the far edges, relative to
        # the converted origin.
        w = Loc.cvt_latlong_to_int(x + w) - left
        h = Loc.cvt_latlong_to_int(y + h) - bottom
        x = left
        y = bottom
      end

      @x = x
      @y = y
      @w = w
      @h = h
    end

    # Right edge (exclusive).
    def x2
      @w + @x
    end

    # Top edge (exclusive).
    def y2
      @h + @y
    end

    def to_s
      "[#{@x},#{@y}..#{x2},#{y2}]"
    end

    def inspect
      to_s
    end

    # True if loc lies within the rectangle; the x2/y2 edges are exclusive.
    def contains_point(loc)
      loc.x >= @x && loc.y >= @y && loc.x < x2 && loc.y < y2
    end

    # A copy of these bounds with the x and y axes exchanged.
    def flip
      Bounds.new(@y, @x, @h, @w)
    end

    # True if the two bounds overlap; merely touching edges do not count.
    def self.intersect(a, b)
      return false if a.x2 <= b.x || b.x2 <= a.x
      return false if a.y2 <= b.y || b.y2 <= a.y
      true
    end

    # Construct a random bounds
    #
    def self.rnd
      xa = rand(1000)
      ya = rand(1000)
      xb = rand(1000)
      yb = rand(1000)
      xa, xb = [xa, xb].min, [xa, xb].max
      ya, yb = [ya, yb].min, [ya, yb].max
      size = rand() * rand() * 1000
      size = [size.to_i, 1].max
      inset_x = [0, xb - xa - size].max
      inset_y = [0, yb - ya - size].max
      half_w = (xb - xa - inset_x) / 2
      half_h = (yb - ya - inset_y) / 2

      center_x = (xa + xb) / 2
      center_y = (ya + yb) / 2
      Bounds.new(center_x - half_w, center_y - half_h, half_w * 2, half_h * 2)
    end

    # Construct an array of count random bounds.
    def self.rnd_many(count)
      Array.new(count) { rnd }
    end

  end
end
@@ -0,0 +1,68 @@
1
+ require_relative 'bounds'
2
+
3
module GeoTreeModule

  MAX_POINT_WEIGHT = 16 # 1 + the maximum datapoint weight; must be power of 2

  # Represents a point to be stored in a GeoTree.
  #
  # A point has these fields.
  # ----------------------------------------
  # [] A name, which is a unique integer identifier. This could be, e.g.,
  #    the id of a larger database record associated with the point.
  # [] A position, stored as a Loc object (two integers, x and y).
  # [] A weight. This is an integer, and is unused by the GeoTree except that
  #    the MultiTree class assumes that the lower 4 bits hold the point's
  #    detail level (a lower value means the point is less likely to show
  #    up at lower detail levels).
  #
  class DataPoint
    attr_accessor :loc, :name, :weight

    # Counter seeding the unique names handed out by DataPoint.rnd.
    # A class-level instance variable is used instead of a @@class
    # variable, since the latter would be shared across any subclasses.
    @next_rnd_name = 200

    # @param name unique integer identifier
    # @param weight integer weight (see class comments)
    # @param loc position, a Loc object
    def initialize(name, weight, loc)
      @name = name
      @loc = loc
      @weight = weight
    end

    # A copy of this point with its x and y coordinates exchanged.
    def flip
      DataPoint.new(@name, @weight, @loc.flip)
    end

    def to_s
      "[##{name}: #{loc} w#{weight}]"
    end

    def inspect
      to_s
    end

    # Construct a random point, one with a unique name (assumes no other
    # process is generating point names)
    #
    def self.rnd
      wt = (rand() * rand() * MAX_POINT_WEIGHT).to_i
      x = rand(1000)
      y = rand(1000)
      @next_rnd_name += 1
      DataPoint.new(@next_rnd_name, wt, Loc.new(x, y))
    end

    # Construct an array of count random points.
    def self.rnd_many(count)
      Array.new(count) { rnd }
    end

    # Sorted array of the names of the given points.
    def self.name_list(dp_list)
      dp_list.map(&:name).sort
    end

    # True if two points agree in name and position (weight is ignored).
    def self.match(a, b)
      a.name == b.name && a.loc.x == b.loc.x && a.loc.y == b.loc.y
    end

  end

end
@@ -0,0 +1,64 @@
1
+ require_relative 'blockfile'
2
+
3
+ # Block file that stores its contents to disk
4
+ #
5
# Block file that stores its contents to disk
#
class DiskBlockFile < BlockFile

  # @param block_size size of each block, in bytes
  # @param path filesystem path of the backing file
  def initialize(block_size, path)
    @path = path
    super(block_size)
  end

  # Read a block from disk.
  # @param block_name index of the block within the file
  # @param dest_buffer buffer to read into; allocated if nil
  # @return the filled buffer
  # @raise IOError if a full block could not be read
  def read(block_name, dest_buffer = nil)
    dest_buffer ||= alloc_buffer

    @file.pos = block_size * block_name
    @file.read(block_size, dest_buffer)
    raise IOError if dest_buffer.size != block_size

    dest_buffer
  end

  # Write a block to disk.
  # @param block_name index of the block within the file
  # @param src_buffer buffer to write; must be exactly one block long
  # @raise ArgumentError if src_buffer has the wrong size
  # @raise IOError if the write was incomplete
  def write(block_name, src_buffer)
    raise ArgumentError if src_buffer.size != block_size

    @file.pos = block_size * block_name
    count = @file.write(src_buffer)

    if count != src_buffer.size
      raise IOError, "wrote #{count} bytes instead of #{src_buffer.size}"
    end
  end

  # Open the backing file, creating it if necessary.
  # @return true if the file already existed
  def open_storage
    existed = File.file?(@path)
    @file = File.open(@path, existed ? "r+b" : "w+b")
    raise IOError if !@file

    existed
  end

  # Flush and close the backing file.
  def close_storage
    flush
    # BUG FIX: the original only flushed and dropped the reference,
    # leaking the OS file handle; close it explicitly.
    @file.close
    @file = nil
  end

  def flush
    @file.flush
  end
end
64
+
@@ -0,0 +1,369 @@
1
+ require_relative 'tools'
2
+
3
+ module ExternalSortModule
4
+
5
  # Debug toggle: flip the condition to true to exercise the sort with a
  # tiny sliding-window size (forces many chunk reloads and merge passes).
  if false
    warn("using small chunk size")
    MAX_CHUNK_SIZE_ = 128
  else
    MAX_CHUNK_SIZE_ = 4_000_000
  end

  # Helper from 'tools' — presumably hides this module's internals;
  # TODO(review): confirm its exact effect against tools.rb.
  privatize(self)
13
+
14
+ # Base class for chunking file access.
15
+ # Essentially a buffer that acts as a sliding window into a binary file.
16
+ #
17
+ class Chunk
18
+ # Constructor
19
+ # @param target_file file containing target area
20
+ # @param target_offset offset to start of the target area for this chunk
21
+ # @param target_length length of target area
22
+ # @param element_size size of each element; target_length must be a multiple of this
23
+ #
24
+ def initialize(target_file, target_offset, target_length, element_size, chunk_size = MAX_CHUNK_SIZE_)
25
+ @target_file = target_file
26
+ @target_offset = target_offset
27
+ @target_length = target_length
28
+
29
+ @target_end_offset = target_offset + target_length
30
+ @element_size = element_size
31
+ raise ArgumentError if target_length % element_size != 0
32
+
33
+ set_chunk_size(chunk_size)
34
+
35
+ @buffer = []
36
+ @buffer_offset = 0
37
+ end
38
+
39
+ def set_chunk_size(n)
40
+ n -= (n % @element_size)
41
+ raise ArgumentError if n <= 0
42
+ @max_chunk_size = [n,@target_length].min
43
+ end
44
+
45
+ def done
46
+ @buffer_offset == @buffer.size && @target_offset == @target_end_offset
47
+ end
48
+ end
49
+
50
+ # A subclass of Chunk that does not use a sliding window, and
51
+ # instead can contain the entire target length;
52
+ # includes methods for accessing target elements in arbitrary (non-streaming) order
53
+ class ChunkRandomAccess < Chunk
54
+
55
+ attr_reader :num_elements;
56
+ # Construct chunk, and read the complete targeted bytes to the buffer
57
+ #
58
+ def initialize(target_file, target_offset, target_length, element_size)
59
+ super(target_file,target_offset,target_length,element_size,target_length)
60
+
61
+ @num_elements = target_length / element_size
62
+
63
+ chunk_size = target_length
64
+
65
+ f = @target_file
66
+ f.pos = @target_offset
67
+ @buffer = f.read(chunk_size)
68
+ raise IOError if !@buffer || @buffer.size != chunk_size
69
+ end
70
+
71
+ # Get element from chunk
72
+ # @param index of element,
73
+ def element(index)
74
+ raise ArgumentError if index < 0 || index >= num_elements
75
+ off = index * @element_size
76
+ [@buffer,off]
77
+ end
78
+
79
+ # Replace existing buffer
80
+ def replace_buffer_with(b)
81
+ raise IllegalArgumentException if b.size != @buffer.size
82
+ @buffer = b
83
+ end
84
+
85
+ # Write buffer to target
86
+ def write
87
+ f = @target_file
88
+ f.pos = @target_end_offset - @target_length
89
+ bytes_written = f.write(@buffer)
90
+ raise IOError if @buffer.size != bytes_written
91
+ end
92
+
93
+ end
94
+
95
+ # Chunk subclass that performs streaming reading of target with sliding window
96
+ #
97
+ class ChunkReader < Chunk
98
+ def initialize(target_file, target_offset, target_length, element_size, chunk_size = MAX_CHUNK_SIZE_)
99
+ super(target_file,target_offset,target_length,element_size, chunk_size)
100
+ end
101
+
102
+ # Display record being viewed using hex dump
103
+ def peek_dump
104
+ "(done)" if done
105
+
106
+ buff, off = peek
107
+ "Next element: "+hex_dump_to_string(buff,nil,off,@element_size)
108
+ end
109
+
110
+ # Get next element
111
+ # @return (array, offset) containing element, or nil if chunk is done
112
+ def peek
113
+ nil if done
114
+
115
+ # If no more elements exist in the buffer, fill it from the target
116
+ if @buffer_offset == @buffer.size
117
+ max_size = @max_chunk_size
118
+
119
+ chunk_size = [@target_end_offset - @target_offset, max_size].min
120
+
121
+ f = @target_file
122
+ f.pos = @target_offset
123
+ @buffer = f.read(chunk_size)
124
+ raise IOError if !@buffer || @buffer.size != chunk_size
125
+
126
+ @target_offset += chunk_size
127
+ @buffer_offset = 0
128
+ end
129
+ [@buffer, @buffer_offset]
130
+ end
131
+
132
+ # Read next element, advance pointers
133
+ # @return (array, offset) containing element
134
+ # @raise IllegalStateException if already done
135
+ def read
136
+ ret = peek
137
+ raise IllegalStateException if !ret
138
+ @buffer_offset += @element_size
139
+ ret
140
+ end
141
+ end
142
+
143
+ # Chunk subclass that performs streaming writing to target with sliding window
144
+ #
145
+ class ChunkWriter < Chunk
146
+ def initialize(target_file, target_offset, target_length, element_size, chunk_size = MAX_CHUNK_SIZE_)
147
+ super(target_file,target_offset,target_length,element_size, chunk_size)
148
+ end
149
+
150
+ # Write an element to the target
151
+ # @param src_buffer source of element
152
+ # @param src_offset offset into source
153
+ #
154
+ def write(src_buffer, src_offset = 0)
155
+ raise IllegalStateException if done
156
+ raise ArgumentError if (src_buffer.size - src_offset < @element_size)
157
+
158
+ if @buffer_offset == @buffer.length
159
+ max_size = @max_chunk_size
160
+ chunk_size = [@target_end_offset - @target_offset, max_size].min
161
+ @buffer = zero_bytes(chunk_size)
162
+ @buffer_offset = 0
163
+ end
164
+
165
+ @buffer[@buffer_offset,@element_size] = src_buffer[src_offset,@element_size]
166
+ @buffer_offset += @element_size
167
+
168
+ # If buffer is now full, flush to target
169
+ if @buffer_offset == @buffer.size
170
+ f = @target_file
171
+ f.pos = @target_offset
172
+ bytes_written = f.write(@buffer)
173
+ raise IOError if @buffer.size != bytes_written
174
+ @target_offset += bytes_written
175
+ end
176
+ end
177
+ end
178
+
179
+ # Performs an external sort of a binary file.
180
+ # Used by the GeoTree module to shuffle buffered point sets into a random
181
+ # order prior to adding to the tree, in order to create a balanced tree.
182
+ #
183
+ class Sorter
184
+
185
+ MAX_CHUNKS_ = 8
186
+ privatize(self)
187
+
188
+ # Constructor
189
+ # @param path of file to sort
190
+ # @param element_size size, in bytes, of each element
191
+ # @param comparator to compare elements; if nil, compares the bytes as substrings
192
+ #
193
+ def initialize(path, element_size, comparator=nil, max_chunk_size = MAX_CHUNK_SIZE_, max_chunks = MAX_CHUNKS_)
194
+ raise ArgumentError,"no such file" if !File.file?(path)
195
+
196
+ @comparator = comparator || Proc.new do |x,y|
197
+ bx,ox = x
198
+ by,oy = y
199
+ bx[ox,@element_size] <=> by[oy,@element_size]
200
+ end
201
+
202
+ @path = path
203
+
204
+ @work_file = nil
205
+
206
+ @file_len = File.size(path)
207
+ if @file_len == 0 || @file_len % element_size != 0
208
+ raise ArgumentError,"File length #{@file_len} is not a positive multiple of element size #{element_size}"
209
+ end
210
+ @element_size = element_size
211
+ @max_chunks = max_chunks
212
+ @max_chunk_size = max_chunk_size - max_chunk_size % element_size
213
+ raise ArgumentError if @max_chunk_size <= 0
214
+ end
215
+
216
+ def sort
217
+ @file = File.open(@path,"r+b")
218
+
219
+ # Break file into chunks, sorting them in place
220
+ build_initial_segments
221
+ sort_chunks_in_place
222
+
223
+ require 'tempfile'
224
+
225
+ @work_file = Tempfile.new('_externalsort_')
226
+ @work_file.binmode
227
+
228
+ while @segments.size > 1
229
+ @segments = merge_segments(@segments)
230
+ end
231
+
232
+ @work_file.unlink
233
+ end
234
+
235
+ private
236
+
237
+ # Merge segments into one; if too many to handle at once, process recursively
238
+ def merge_segments(segs)
239
+
240
+ return segs if segs.size <= 1
241
+
242
+ if segs.size > MAX_CHUNKS_
243
+ k = segs.size/2
244
+ s1 = segs[0 .. k]
245
+ s2 = segs[k+1 .. -1]
246
+ ret = merge_segments(s1)
247
+ ret.concat(merge_segments(s2))
248
+ return ret
249
+ end
250
+
251
+ # Build a chunk for reading each segment; also, determine
252
+ # bounds of the set of segments.
253
+
254
+ # Sort the chunks by their next elements.
255
+
256
+ segset_start = nil
257
+ segset_end = nil
258
+
259
+ chunks = []
260
+ segs.each do |sg|
261
+ off,len = sg
262
+
263
+ ch = ChunkReader.new(@file, off, len, @element_size, @max_chunk_size)
264
+ chunks << ch
265
+ if !segset_start
266
+ segset_start = off
267
+ segset_end = off+len
268
+ else
269
+ segset_start = [segset_start,off].min
270
+ segset_end = [segset_end,off+len].max
271
+ end
272
+ end
273
+ segset_size = segset_end - segset_start
274
+
275
+ # Sort the chunks into order by their peek items, so the lowest item is at the end of the array
276
+ chunks.sort! do |a,b|
277
+ ex = a.peek
278
+ ey = b.peek
279
+ @comparator.call(ey,ex)
280
+ end
281
+
282
+ # Build a chunk for writing merged result to work file
283
+ wch = ChunkWriter.new(@work_file,0,segset_size, @element_size, @max_chunk_size)
284
+
285
+ while !chunks.empty?
286
+ ch = chunks.pop
287
+ buff,off = ch.peek
288
+ wch.write(buff,off)
289
+ ch.read
290
+
291
+ next if ch.done
292
+
293
+ # Examine this chunk's next item to reinsert the chunk back into the sorted array.
294
+ # Perform a binary search:
295
+ i0 = 0
296
+ i1 = chunks.size
297
+ while i0 < i1
298
+ i = (i0+i1)/2
299
+ ci = chunks[i]
300
+ if @comparator.call(ci.peek, ch.peek) > 0
301
+ i0 = i+1
302
+ else
303
+ i1 = i
304
+ end
305
+ end
306
+ chunks.insert(i1, ch)
307
+ end
308
+
309
+ # Read from work file and write to segment set's position in original file
310
+
311
+ rch = ChunkReader.new(@work_file,0,segset_size, @element_size, @max_chunk_size)
312
+ wch = ChunkWriter.new(@file,segset_start,segset_size, @element_size, @max_chunk_size)
313
+
314
+ while !rch.done
315
+ buff,off = rch.peek
316
+ wch.write(buff,off)
317
+ rch.read
318
+ end
319
+
320
+ # We must flush the file we're writing to, now that the
321
+ # operation is complete
322
+ @file.flush
323
+
324
+ [[segset_start,segset_size]]
325
+ end
326
+
327
+ # Partition the file into segments, each the size of a chunk
328
+ def build_initial_segments
329
+ db = false
330
+
331
+ !db || pr("build_initial_segments, @file_len=#@file_len\n")
332
+ raise IllegalStateException if @file_len == 0
333
+
334
+ @segments = []
335
+ off = 0
336
+ while off < @file_len
337
+ seg_len = [@file_len - off, @max_chunk_size].min
338
+ @segments << [off, seg_len]
339
+ off += seg_len
340
+ end
341
+ end
342
+
343
+ def sort_chunks_in_place
344
+ @segments.each do |offset,length|
345
+ ch = ChunkRandomAccess.new(@file, offset, length, @element_size)
346
+
347
+ a = (0 ... ch.num_elements).to_a
348
+
349
+ a.sort! do |x,y|
350
+ ex = ch.element(x)
351
+ ey = ch.element(y)
352
+ @comparator.call(ex,ey)
353
+ end
354
+
355
+ # Construct another buffer, in the sorted order
356
+ b = zero_bytes(@element_size * a.size)
357
+ j = 0
358
+ a.each do |i|
359
+ buff,off = ch.element(i)
360
+ b[j, @element_size] = buff[off,@element_size]
361
+ j += @element_size
362
+ end
363
+ ch.replace_buffer_with(b)
364
+ ch.write
365
+ end
366
+ end
367
+ end
368
+
369
+ end # module