geotree 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.txt +3 -0
- data/README.txt +6 -0
- data/lib/geotree.rb +1 -0
- data/lib/geotree/blockfile.rb +453 -0
- data/lib/geotree/bounds.rb +81 -0
- data/lib/geotree/datapoint.rb +68 -0
- data/lib/geotree/diskblockfile.rb +64 -0
- data/lib/geotree/externalsort.rb +369 -0
- data/lib/geotree/geotree.rb +980 -0
- data/lib/geotree/loc.rb +76 -0
- data/lib/geotree/multitree.rb +190 -0
- data/lib/geotree/node.rb +252 -0
- data/lib/geotree/pswriter.rb +471 -0
- data/lib/geotree/ptbuffer.rb +120 -0
- data/lib/geotree/tools.rb +626 -0
- data/test/test_blockfile.rb +153 -0
- data/test/test_externalsort.rb +139 -0
- data/test/test_geotree.rb +432 -0
- data/test/test_ps.rb +56 -0
- metadata +76 -0
@@ -0,0 +1,980 @@
|
|
1
|
+
require_relative 'node'
|
2
|
+
|
3
|
+
req 'diskblockfile ptbuffer'
|
4
|
+
|
5
|
+
module GeoTreeModule
|
6
|
+
|
7
|
+
# A variant of a kd-tree, it is capable of maintaining sets of 2D points and efficiently
|
8
|
+
# reporting all points lying within (axis-aligned) query rectangles.
|
9
|
+
#
|
10
|
+
# Like a B+ tree, it has a large branching factor
|
11
|
+
# and the nodes are large to improve performance when the tree is stored
|
12
|
+
# on disk.
|
13
|
+
#
|
14
|
+
# A GeoTree is usually stored within a disk file, though it is also possible to
|
15
|
+
# construct a tree that exists only in memory; see the initialize(...) method.
|
16
|
+
#
|
17
|
+
# Usage:
|
18
|
+
#
|
19
|
+
# [] Open a tree. If no tree exists, a new, empty one is created.
|
20
|
+
#
|
21
|
+
# t = GeoTree.open("treepath.bin")
|
22
|
+
#
|
23
|
+
# [] Add datapoints.
|
24
|
+
#
|
25
|
+
# dp = DataPoint.new(...)
|
26
|
+
# t.add(dp)
|
27
|
+
#
|
28
|
+
# [] Remove datapoints.
|
29
|
+
#
|
30
|
+
# t.remove(dp)
|
31
|
+
#
|
32
|
+
# [] Find all points within a particular rectangle.
|
33
|
+
#
|
34
|
+
# b = Bounds.new(x,y,width,height)
|
35
|
+
#
|
36
|
+
# pts = t.find(b)
|
37
|
+
#
|
38
|
+
# [] Close tree; flush any changes.
|
39
|
+
#
|
40
|
+
# t.close()
|
41
|
+
#
|
42
|
+
#
|
43
|
+
# One of the problems with kd-trees (including this one) is that they can become
|
44
|
+
# unbalanced after a number of insertions and deletions. To deal with this,
|
45
|
+
# consider these two suggestions:
|
46
|
+
#
|
47
|
+
# 1) When constructing the initial tree, if the datapoints are given in a random
|
48
|
+
# order, the tree will (with high probability) be constructed in a balanced form.
|
49
|
+
# By contrast, consider what happens if the points (1,1), (2,2), (3,3), ... are
|
50
|
+
# added in sequence to an initially empty tree. The tree will be very unbalanced,
|
51
|
+
# with poor performance.
|
52
|
+
# To address this problem, if you are not confident that the points you initially
|
53
|
+
# provide are in a sufficiently random sequence, you can enable 'point buffering':
|
54
|
+
#
|
55
|
+
# t = GeoTree.open("treepath.bin")
|
56
|
+
#
|
57
|
+
# t.buffering = true # buffering is now active
|
58
|
+
#
|
59
|
+
# t.add(dp1)
|
60
|
+
# t.add(dp2) # these points are stored in a temporary disk file
|
61
|
+
# t.add(dp3)
|
62
|
+
# :
|
63
|
+
#
|
64
|
+
# t.buffering = false # the points will be shuffled into a random sequence and
|
65
|
+
# # added to the tree
|
66
|
+
#
|
67
|
+
#
|
68
|
+
# 2) Periodically, you can start with a new tree, and add all of the datapoints using the
|
69
|
+
# above buffering technique. This is easy to do if the datapoints are also stored
|
70
|
+
# externally to the GeoTree (for instance, as parts of larger records in some database).
|
71
|
+
# Otherwise, (i) the datapoints can be retrieved from the tree to an array
|
72
|
+
# (by using a sufficiently large query rectangle), (ii) a new tree can be constructed,
|
73
|
+
# and (iii) each of the points in the array can be added to the new tree.
|
74
|
+
#
|
75
|
+
class GeoTree
|
76
|
+
|
77
|
+
# Name (block id) of the root node: the first block id of the block file.
ROOT_NODE_NAME_ = BlockFile::FIRST_BLOCK_ID
|
78
|
+
|
79
|
+
# NOTE(review): privatize is a project helper not visible in this file;
# presumably it restricts access to the trailing-underscore constant above — confirm.
privatize(self)
|
80
|
+
# Switch point buffering on or off by forwarding the flag to the PtBuffer.
# @param val new value for the buffer's 'active' flag
# @raise IllegalStateException if the tree is not open
def buffering=(val)
  debug = false
  pr("\nSetting buffering to #{val} (was #{@buffer.active})\n\n") if debug

  raise IllegalStateException unless open?

  @buffer.active = val
end
|
90
|
+
|
91
|
+
# Construct GeoTree
|
92
|
+
# @param block_file if nil, creates in-memory tree
|
93
|
+
# Build the tree on top of a block file, set up the (empty) node cache and
# make sure a root node exists.
# @param block_file block file used to store the nodes; if nil, a plain
#   BlockFile is created instead
def initialize(block_file = nil)
  block_file ||= BlockFile.new(KDTREE_BLOCKSIZE)
  @block_file = block_file
  @buffer = PtBuffer.new(self)

  @mod_nodes = Set.new # names of modified nodes
  @cache_dict = {}     # node name => cached node

  # Sentinel nodes for the two ends of the cache's linked list; they are
  # joined to each other here, so the list starts out empty.
  @c_start = NodeI.new(555, false, Bounds.new(0, 0, 0, 0))
  @c_end = NodeI.new(666, false, Bounds.new(0, 0, 0, 0))
  GeoTree.join_nodes(@c_start, @c_end)

  @block_file.open

  # The root node, if it exists, will be in the first block.
  if @block_file.name_max <= ROOT_NODE_NAME_
    root = NodeL.new(ROOT_NODE_NAME_, false, @@start_bounds)
    # we need to add this node to the cache since it's just been built
    cache_node(root)
    # Reserve the root's block; the returned name is not needed because the
    # node was already constructed with ROOT_NODE_NAME_.
    @block_file.alloc(encode_block(root))
    write_node(root)
  end
end
|
117
|
+
|
118
|
+
# True while the tree still holds its block file (close sets it to nil).
def open?
  !@block_file.nil?
end
|
121
|
+
|
122
|
+
# Close the tree: stop buffering, flush pending changes, close the block file.
# @raise IllegalStateException if the tree is not open
def close
  raise IllegalStateException unless open?

  # Deactivate buffering first, so any buffered points reach the tree
  @buffer.active = false

  # Writes out modified nodes and flushes the block file
  done_operation

  @block_file.close
  # A nil block file is what open? uses to detect a closed tree
  @block_file = nil
end
|
134
|
+
|
135
|
+
# Insert a datapoint directly into the tree, then bump the population of
# every interior node recorded on the insertion path.
# @param data_point the DataPoint to insert
def add_buffered_point(data_point)
  # add_data_point appends each interior node it passes through
  visited = []
  add_data_point(data_point, ROOT_NODE_NAME_, visited, @@start_bounds, false)

  # one population increment per recorded entry
  visited.each do |inode|
    inode.adjust_population(1)
    write_node(inode)
  end
end
|
146
|
+
|
147
|
+
private

# c_start / c_end: sentinel nodes at the two ends of the cache's linked list.
# cache_dict: node name => cached node; mod_nodes: names of modified nodes;
# block_file: the block file holding the tree.
attr_accessor :c_start, :c_end, :cache_dict, :mod_nodes, :block_file

# Bounds given to the root node: a square starting at (LOC_MIN, LOC_MIN)
# whose sides are LOC_MAX - LOC_MIN long.
@@start_bounds = Bounds.new(LOC_MIN, LOC_MIN, LOC_MAX - LOC_MIN, LOC_MAX - LOC_MIN)

public
|
155
|
+
|
156
|
+
# Bounds of the root node, i.e. the largest region the tree covers.
# @return the shared Bounds object stored in @@start_bounds
def self.max_bounds
|
157
|
+
@@start_bounds
|
158
|
+
end
|
159
|
+
|
160
|
+
# Open tree from file; if it doesn't exist, creates an empty tree, one prepared to
|
161
|
+
# use that file to persist it
|
162
|
+
# @param path path of file; if nil, constructs tree in memory only
|
163
|
+
#
|
164
|
+
# Factory: build a GeoTree backed by a DiskBlockFile at the given path, or
# by the default block file when no path is given.
def self.open(path = nil)
  debug = false
  pr("GeoTree.open path=#{path}\n") if debug

  block_file = nil
  if path
    if debug
      pr(" exists=#{File.file?(path)}\n")
      hex_dump(read_text_file(path),"path #{path}") if File.file?(path)
    end

    block_file = DiskBlockFile.new(KDTREE_BLOCKSIZE, path)
  end
  GeoTree.new(block_file)
end
|
180
|
+
|
181
|
+
# Add a datapoint to the tree.
|
182
|
+
# Does not check whether a datapoint with this name already exists in the
|
183
|
+
# tree, even if it has the same location.
|
184
|
+
#
|
185
|
+
# Hand the datapoint to the point buffer.
# @raise IllegalStateException if the tree is not open
def add(data_point)
  raise IllegalStateException unless open?

  @buffer.add(data_point)
end
|
189
|
+
|
190
|
+
# Remove a datapoint. Returns the datapoint if it was found and removed,
|
191
|
+
# otherwise nil.
|
192
|
+
# A datapoint will be removed iff both its name and location match
|
193
|
+
# the sought point; the weight is ignored.
|
194
|
+
# @param data_point the point to remove (matched via the leaf's find_point)
# @return the stored datapoint that was removed, or nil if none was found
# @raise IllegalStateException if the tree is closed or buffering is active
def remove(data_point)
  # Check open? as add/find do, so a closed tree fails with
  # IllegalStateException rather than a NoMethodError on the nil block file.
  raise IllegalStateException if !open? || @buffer.active

  db = false
  !db || pr("remove #{data_point}\n")

  removed = nil
  block do
    # construct path of interior nodes leading to the leaf node set that
    # contains the point (if one exists)
    internal_path = []

    n = read_root_node

    while !n.leaf
      !db || pr(" add #{n} to internal path\n")
      internal_path << n

      # find the child that will contain the point
      child_slot = n.slot_intersecting_line(n.vertical ? data_point.loc.y : data_point.loc.x)
      next_name = n.slot_child(child_slot)
      !db || pr(" child_slot=#{child_slot}, next_name=#{next_name}\n")
      if next_name == 0
        # empty slot: the point cannot be in the tree
        n = nil
        break
      end
      n = read_node(next_name, n.slot_bounds(child_slot), !n.vertical)
    end
    break if !n

    # the leaf together with its chain of overflow nodes
    leaf_set = build_leaf_set(n)
    !db || pr(" built leaf set: #{d(leaf_set)}\n")

    # find the leaf (and the slot within it) containing this point
    found_slot = -1
    found_leaf_index = leaf_set.index do |leaf|
      found_slot = leaf.find_point(data_point)
      found_slot >= 0
    end
    break unless found_leaf_index

    # copy last datapoint of the set over the found one, then drop the last
    leaf_node = leaf_set[found_leaf_index]
    removed = leaf_node.data_point(found_slot)

    last_leaf_node = leaf_set.last
    lu = last_leaf_node.used

    leaf_node.set_data_point(found_slot, last_leaf_node.data_point(lu - 1))
    last_leaf_node.pop_last_point

    write_node(last_leaf_node)
    write_node(leaf_node)

    # If the last leaf is now empty, remove it
    if last_leaf_node.used == 0
      leaf_set.pop
      if leaf_set.size != 0
        # unlink the emptied overflow node from its predecessor and free it
        prev_leaf = leaf_set.last
        prev_leaf.overflow = 0
        write_node(prev_leaf)
        delete_node(last_leaf_node)
      elsif last_leaf_node.name != ROOT_NODE_NAME_
        # It was the first leaf in the set, so remove it from its parent
        # (NodeI) slot; the root leaf is kept even when empty.
        # NOTE(review): unlike the overflow case above, the emptied leaf is
        # not passed to delete_node here, so its block is never freed —
        # confirm whether that is intended.
        parent = internal_path.last
        parent.remove_child_named(last_leaf_node.name)
        write_node(parent)
      end
    end

    # for each internal node in the path (deepest first):
    #  [] adjust population by -1
    #  [] if population has dropped below half of SPLIT_SIZE,
    #     convert subtree to leaf node
    while internal_path.size != 0
      inode = internal_path.pop

      inode.adjust_population(-1)
      write_node(inode)

      collapse_internal_node(inode) if inode.population < SPLIT_SIZE / 2
    end
  end
  done_operation
  removed
end
|
296
|
+
|
297
|
+
# Find all points intersecting a rectangle.
|
298
|
+
#
|
299
|
+
# Collect the datapoints whose locations lie within rect.
# @param rect query rectangle (a Bounds)
# @return array of matching DataPoints
# @raise IllegalStateException if the tree is closed or buffering is active
def find(rect)
  raise IllegalStateException unless open? && !@buffer.active

  [].tap do |matches|
    find_aux(rect, matches, ROOT_NODE_NAME_, @@start_bounds, false)
    done_operation
  end
end
|
306
|
+
|
307
|
+
# Determine if a particular datapoint lies in the tree
|
308
|
+
# Queries a 1x1 rectangle at df's location and looks for a result with the
# same name.
# @param df datapoint to look for
# @return true if such a point was found, false otherwise
# @raise IllegalStateException if the tree is closed or buffering is active
def find_point(df)
  raise IllegalStateException unless open? && !@buffer.active

  probe = Bounds.new(df.loc.x,df.loc.y,1,1)
  find(probe).any? { |dp| dp.name == df.name }
end
|
321
|
+
|
322
|
+
# Calculate some statistics about the tree
|
323
|
+
# @return dictionary of field(string) => value
|
324
|
+
# Walks every node reachable from the root and feeds it to a TreeStats.
# @raise IllegalStateException if the tree is not open
def statistics
  raise IllegalStateException unless open?

  st = TreeStats.new
  # root: not vertical, not an overflow node, depth 0
  aux_stats(ROOT_NODE_NAME_, @@start_bounds, false, false, 0, st)

  st.summary
end
|
333
|
+
|
334
|
+
# Dump tree in graphical form
|
335
|
+
# Render the subtree below root_node (default: the tree's root) as text,
# framed by two rules of dashes.
# @return the resulting String
# @raise IllegalStateException if the tree is not open
def dump(root_node = nil)
  raise IllegalStateException unless open?

  root_node ||= read_root_node

  rule = ("-" * 50) + "\n"
  out = "KDTree (rooted at #{root_node.name})\n"
  out << rule
  dump_aux(out, root_node, 0, {})
  out << rule
end
|
348
|
+
|
349
|
+
# Generate random datapoints inside the fixed rectangle
# Bounds.new(100,100,900,900); see rnd_points_within.
def self.rnd_points(count)
  rnd_points_within(count, Bounds.new(100,100,900,900))
end
|
353
|
+
|
354
|
+
# Name given to the next randomly generated datapoint
@@next_pt_id = 500

# Generate up to count random datapoints within bounds. Candidates that
# fall outside the tree's maximum bounds are skipped, so fewer than count
# points may be returned.
# @return array of DataPoints with consecutive names
def self.rnd_points_within(count, bounds)
  points = []
  count.times do
    loc = Loc.new(bounds.x + rand(1 + bounds.w), bounds.y + rand(1 + bounds.h))
    next unless @@start_bounds.contains_point(loc)

    # product of three uniform samples: skews the weights toward small values
    weight = (rand * rand * rand * MAX_POINT_WEIGHT).to_i
    points << DataPoint.create_with_name(@@next_pt_id,weight,loc)
    @@next_pt_id += 1
  end
  points
end
|
369
|
+
|
370
|
+
# Decode a datapoint from four consecutive ints of b starting at offset:
# name, weight, x, y.
# @return a new DataPoint
def self.read_data_point_from(b, offset)
  name, weight, x, y = (0..3).map { |i| BlockFile.read_int(b, offset + i) }
  DataPoint.new(name, weight, Loc.new(x, y))
end
|
376
|
+
|
377
|
+
# Encode a datapoint into four consecutive ints of b starting at offset:
# name, weight, x, y (the layout read_data_point_from expects).
def self.write_data_point(dp, b, offset)
  fields = [dp.name, dp.weight, dp.loc.x, dp.loc.y]
  # .last keeps the final write's result as the return value
  fields.each_with_index.map { |value, i| BlockFile.write_int(b, offset + i, value) }.last
end
|
383
|
+
|
384
|
+
private
|
385
|
+
|
386
|
+
# Collect everything below node n.
# @param n starting node (n itself is never added to node_set)
# @param dp_set array receiving every datapoint found in the subtree
# @param node_set array receiving every descendant / overflow node visited
def gather_datapoints(n,dp_set,node_set)
  if n.leaf
    # walk the leaf and its chain of overflow nodes
    loop do
      dp_set.concat(n.pts)
      next_name = n.overflow
      break if next_name == 0
      n = read_node(next_name,n.bounds,n.vertical)
      node_set << n
    end
  else
    NODEI_CHILDREN.times do |slot|
      child_name = n.slot_child(slot)
      next if child_name == 0 # empty slot

      # children use the opposite orientation to their parent
      child_node = read_node(child_name, n.slot_bounds(slot), !n.vertical)

      node_set << child_node
      gather_datapoints(child_node, dp_set,node_set)
    end
  end
end
|
408
|
+
|
409
|
+
# Replace an internal node with a leaf node, one containing all the
|
410
|
+
# datapoints in the internal node's subtree.
|
411
|
+
# Steps: gather the subtree's points, delete all descendant nodes, put a
# fresh leaf under the same name, then refill it (and overflow nodes as
# needed) with the gathered points.
# @raise IllegalStateException if the gathered point count differs from
#   the node's stored population
def collapse_internal_node(n)
  db = false
  if db
    pr("internal node population has dropped below half leaf set capacity;\n%s\n",d(n))
    puts(dump)
  end

  dp_set = []
  node_set = []
  gather_datapoints(n,dp_set,node_set)

  unless dp_set.size == n.population
    raise IllegalStateException,
          "Interior node actual population #{dp_set.size} disagrees with stored value #{n.population};\n#{dump(n)}"
  end

  if db
    pr("\ndp_set=#{d2(dp_set)}\n\n")
    pr("node_set=#{d2(node_set)}\n\n")
  end

  node_set.each do |stale|
    !db || pr(" removing #{stale} from mod/cache\n")
    delete_node(stale)
  end

  # the replacement leaf reuses the interior node's name
  leaf = NodeL.new(n.name,n.vertical,n.bounds)
  replace_node(n,leaf)

  loop do
    # move as many points as fit into the current leaf
    batch = [dp_set.size, NODEL_CAPACITY].min
    pts = leaf.pts()
    batch.times { pts << dp_set.pop }
    if dp_set.empty?
      write_node(leaf)
      break
    end

    # points remain: continue with the next overflow node
    successor = get_next_overflow(leaf)
    write_node(leaf)
    leaf = successor
  end

  !db || printf("After collapsing\n#{dump}\n\n")
end
|
453
|
+
|
454
|
+
# Recursive helper for statistics: report one node to st, then visit its
# children (interior node) or its overflow node (leaf).
# @param node_name name of the node to read
# @param b bounds of the node
# @param v orientation flag passed to read_node
# @param overflow true if the node was reached through an overflow link
# @param depth depth of the node; overflow nodes keep their leaf's depth
# @param st TreeStats accumulator
def aux_stats(node_name, b,v,overflow,depth, st)
  n = read_node(node_name,b,v)
  st.process_node(n,overflow,depth)

  if n.leaf
    ov = n.overflow
    aux_stats(ov, b, v, true, depth, st) if ov != 0
  else
    NODEI_CHILDREN.times do |slot|
      child_name = n.slot_child(slot)
      next if child_name == 0
      aux_stats(child_name, n.slot_bounds(slot), !v, false, depth+1, st)
    end
  end
end
|
472
|
+
|
473
|
+
# Link two nodes so that b directly follows a in the doubly linked list.
# @param a node whose next_node becomes b
# @param b node whose prev_node becomes a
def self.join_nodes(a,b)
|
474
|
+
a.next_node = b
|
475
|
+
b.prev_node = a
|
476
|
+
end
|
477
|
+
|
478
|
+
# Detach a node from the cache dictionary and/or the cache's linked list.
# @param node the node to detach
# @param from_cache if true, delete the node's name from the dictionary
# @param from_list if true, unlink the node from the list (skipped when the
#   node has no prev_node, i.e. is not currently linked)
def remove_from(node, from_cache, from_list)
  @cache_dict.delete(node.name) if from_cache
  return unless from_list && node.prev_node

  before = node.prev_node
  after = node.next_node
  node.next_node = nil
  node.prev_node = nil
  # close the gap left by the node
  GeoTree.join_nodes(before, after)
end
|
490
|
+
|
491
|
+
# Add node to cache; move to front
|
492
|
+
# Register node in the cache dictionary and make it the first entry of the
# cache's linked list (directly after the start sentinel).
def cache_node(node)
  head = @c_start
  if head.next_node != node
    # unlink from its current position (no-op if not linked), then splice
    # it in between the sentinel and the previous first entry
    remove_from(node,false,true)
    old_first = head.next_node
    GeoTree.join_nodes(head,node)
    GeoTree.join_nodes(node,old_first)
  end
  @cache_dict[node.name] = node
end
|
502
|
+
|
503
|
+
# Calculate where partitions should go in a node
|
504
|
+
#
|
505
|
+
# If any slots end up having zero width, these are placed at the
|
506
|
+
# end of the list
|
507
|
+
#
|
508
|
+
# @param bounds bounds of node
|
509
|
+
# @param unsorted_pts array of DataPoints
|
510
|
+
# @param vertical orientation
|
511
|
+
# @return locations of partitions (NODEI_CHILDREN of them, the first being the node's left boundary)
|
512
|
+
#
|
513
|
+
# Returns an array of NODEI_CHILDREN start positions; the first is the
# node's own start coordinate along the splitting axis.
def self.calc_partitions(bounds, unsorted_pts, vertical)
  db = false
  pr("calc_partitions for bounds #{bounds}\n") if db

  # Convert inputs so we need deal only with x coordinates
  if vertical
    bounds = bounds.flip
    unsorted_pts = unsorted_pts.map { |pt| pt.flip }
  end

  # Only the sorted x values are used below, so sort_by is sufficient
  pts = unsorted_pts.sort_by { |pt| pt.loc.x }
  right_edge = bounds.x + bounds.w
  pr(" starting with left boundary #{bounds.x}\n") if db

  # Start with the location of the left boundary
  partitions = [bounds.x]

  # how many zones are we cutting it into?
  n_zones = NODEI_CHILDREN

  # how many zones are the items cutting it into at present?
  n_items = pts.size + 1
  f_step = n_items / n_zones.to_f
  puts(" n_items=#{n_items}, zones=#{n_zones}, step=#{f_step}") if db

  while partitions.size < n_zones
    # fractional position, measured in items, of this partition
    f_pos = f_step * partitions.size
    left_item = f_pos.floor
    f_rem = f_pos - left_item

    # x0/x1: coordinates of the items (or node edges) on either side
    x0 = left_item == 0 ? bounds.x : pts[left_item - 1].loc.x

    if left_item == pts.size
      x1 = right_edge
      assert!(x1 >= bounds.x)
    else
      x1 = pts[left_item].loc.x
    end

    # interpolate between the two neighbours
    x_new = (((x1 - x0) * f_rem) + x0).to_i

    # make sure we are at least one unit further than the previous value
    # (unless we've reached the right edge)
    prev = partitions.last
    x_new = [prev + 1, right_edge].min if x_new <= prev

    pr(" adding #{x_new}, for f_step #{f_step}\n") if db
    partitions << x_new
  end
  pr("partitions=#{partitions} (bounds=#{bounds})\n") if db
  partitions
end
|
577
|
+
|
578
|
+
# Fetch a node from the cache dictionary and move it to the front of the
# cache list.
# NOTE(review): there is no handling for a name missing from the cache;
# callers presumably only pass names known to be cached — confirm.
def read_cached_node(node_name)
  @cache_dict[node_name].tap { |node| cache_node(node) }
end
|
584
|
+
|
585
|
+
# Return the node with the given name, taking it from the cache when
# present and otherwise decoding it from its block. Either way the node
# ends up at the front of the cache.
# @param node_name name (block id) of the node
# @param bounds bounds to give the node if it must be decoded
# @param vertical orientation to give the node if it must be decoded
def read_node(node_name, bounds, vertical)
  db = false
  node = @cache_dict[node_name]
  pr("read_node #{node_name}, from cache=#{node}\n") if db

  # cache miss: read the block and decode it
  node ||= decode_block(@block_file.read(node_name), node_name, vertical, bounds)

  cache_node(node)
  node
end
|
597
|
+
|
598
|
+
# Serialize node to bytes and write to blockfile
|
599
|
+
# (actually, just mark it as modified so this serialization/writing
|
600
|
+
# occurs at the end of the current operation)
|
601
|
+
#
|
602
|
+
# Flag the node as modified and remember its name; nodes already flagged
# are left alone.
def write_node(node)
  return if node.modified

  node.modified = true
  @mod_nodes.add(node.name)
end
|
608
|
+
|
609
|
+
# Finish an operation: write out every modified node, flush the block
# file, then shrink the cache back to at most KD_CACHE_SIZE entries.
def done_operation
  pending = @mod_nodes
  pending.each { |name| flush_modified_node(read_cached_node(name)) }
  pending.clear
  @block_file.flush

  # While cache size is too large, remove last item of the cache list
  surplus = @cache_dict.size - KD_CACHE_SIZE
  while surplus > 0
    remove_from(@c_end.prev_node, true, true)
    surplus -= 1
  end
end
|
627
|
+
|
628
|
+
# Encode the node, write it to its block, and clear its modified flag.
def flush_modified_node(node)
  @block_file.write(node.name, encode_block(node))
  node.modified = false
end
|
633
|
+
|
634
|
+
# Encode a node to a block of bytes
|
635
|
+
# Layout: the flags int (bit 0 set for a leaf) at HDR_FLAGS, followed by
#  - interior node: population, then NODEI_CHILDREN partitions, each
#    PARTITION_INTS apart (start position, child name)
#  - leaf node: overflow name, used count, then the datapoints, each
#    DATAPOINT_INTS apart
# @param n node to encode
# @return a buffer obtained from the block file's alloc_buffer
def encode_block(n)
  db = false
  !db || pr("encode_block for #{n}\n")

  b = @block_file.alloc_buffer

  BlockFile.write_int(b, HDR_FLAGS, n.leaf ? 1 : 0)

  if n.leaf
    BlockFile.write_int(b, LFLD_OVERFLOW, n.overflow)
    BlockFile.write_int(b, LFLD_USED, n.used)
    n.used.times do |i|
      GeoTree.write_data_point(n.data_point(i), b, LFLD_DATAPOINTS + i * DATAPOINT_INTS)
    end
  else
    BlockFile.write_int(b, IFLD_POPULATION, n.population)
    NODEI_CHILDREN.times do |i|
      # Use PARTITION_INTS (as decode_block does) rather than a literal 2,
      # so both directions always agree on the stride.
      off = IFLD_PARTITIONS + i * PARTITION_INTS
      slot = n.slot(i)
      BlockFile.write_int(b, off, slot.start_position)
      BlockFile.write_int(b, off + 1, slot.child_name)
    end
  end
  !db || hex_dump(b)

  b
end
|
669
|
+
|
670
|
+
# Decode a node from a block of bytes
|
671
|
+
# Bit 0 of the flags int selects the node type: clear => interior node
# (NodeI), set => leaf node (NodeL).
# @param b buffer holding the encoded node
# @param node_name name to give the decoded node
# @param vertical orientation to give the decoded node
# @param bounds bounds to give the decoded node
# @return the decoded NodeI or NodeL
def decode_block(b, node_name, vertical, bounds)
  db = false
  !db || pr("decode_block\n")
  !db || hex_dump(b)

  flags = BlockFile.read_int(b, HDR_FLAGS)

  if (flags & 1) == 0
    n = NodeI.new(node_name, vertical, bounds)
    n.population = BlockFile.read_int(b, IFLD_POPULATION)
    NODEI_CHILDREN.times do |i|
      # offset is derived from the slot index alone
      off = IFLD_PARTITIONS + i * PARTITION_INTS
      n.set_slot(i, Partition.new(BlockFile.read_int(b, off), BlockFile.read_int(b, off + 1)))
    end
  else
    n = NodeL.new(node_name, vertical, bounds)
    n.overflow = BlockFile.read_int(b, LFLD_OVERFLOW)
    n_used = BlockFile.read_int(b, LFLD_USED)
    n_used.times do |i|
      n.set_data_point(i, GeoTree.read_data_point_from(b, LFLD_DATAPOINTS + i * DATAPOINT_INTS))
    end
  end
  !db || pr("decoded to #{n}\n")
  n
end
|
706
|
+
|
707
|
+
# Delete node from tree
|
708
|
+
# Free the node's block and forget the node: it is dropped from the cache
# (dictionary and list) and from the set of modified nodes.
def delete_node(n)
  name = n.name
  @block_file.free(name)
  remove_from(n, true, true)
  @mod_nodes.delete(name)
end
|
713
|
+
|
714
|
+
# Replace one node with another within the cache (they should both have the same id)
|
715
|
+
# Drops orig from the cache dictionary and list, then caches new_node at
# the front of the list.
# @param orig node currently in the cache
# @param new_node node to cache in its place
def replace_node(orig, new_node)
|
716
|
+
remove_from(orig,true,true)
|
717
|
+
cache_node(new_node)
|
718
|
+
end
|
719
|
+
|
720
|
+
# Convert a leaf node to an internal node.
|
721
|
+
# Redistributes its data points (and those of any linked overflow nodes) to
|
722
|
+
# new child nodes.
|
723
|
+
# Returns the new internal node
|
724
|
+
# @param node first leaf of the leaf set to split
# @param path passed through to add_data_point when the points are re-added
# @return the NodeI that replaces node (same name, orientation and bounds)
def split_leaf_set(node,path)
  db = false
  pr("\nsplit_leaf_set #{node} bounds=#{node.bounds} vert=#{node.vertical}...\n") if db

  # gather the points of the leaf and of all its overflow nodes
  points = []
  current = node
  loop do
    points.concat(current.pts)

    next_id = current.overflow
    # clear this node's link to its overflow, if any
    current.overflow = 0

    # overflow nodes are deleted; the original leaf is replaced further down
    delete_node(current) if current != node

    break if next_id == 0

    current = read_node(next_id, current.bounds, current.vertical)
  end

  pr(" datapoints=#{d(points)}\n") if db

  inode = NodeI.new(node.name,node.vertical,node.bounds)

  positions = GeoTree.calc_partitions(inode.bounds,points,inode.vertical)
  pr(" partitions=#{d(positions)}\n") if db

  # every slot starts out without a child (child name 0)
  positions.each_with_index do |posn, slot|
    inode.set_slot(slot, Partition.new(posn,0))
  end

  replace_node(node,inode)

  # Add each of the data points to this new internal node
  points.each do |pt|
    add_data_point(pt,inode.name,path,inode.bounds,inode.vertical)
  end
  inode
end
|
773
|
+
|
774
|
+
# Count the datapoints stored in a leaf plus all of its overflow nodes.
# @param node first leaf of the set
# @return total of the 'used' counts along the overflow chain
def leaf_population(node)
  total = node.used
  until node.overflow == 0
    node = read_node(node.overflow,node.bounds,node.vertical)
    total += node.used
  end
  total
end
|
782
|
+
|
783
|
+
# Insert a datapoint below the named node, descending through interior
# nodes (creating leaf children for empty slots) until a leaf with spare
# capacity is found. Populations are NOT adjusted here; instead every
# interior node visited is appended to path.
# @param dp the DataPoint to add
# @param node_name name of the node to start from
# @param path array receiving each interior node visited (skipped if nil/false)
# @param b bounds of the starting node
# @param v orientation of the starting node
def add_data_point(dp, node_name, path, b, v)
  db = false
  !db || pr("\n\nadd_data_point #{dp}, node name #{node_name}\n")
  n = read_node(node_name,b,v)

  # iterate until we have found a leaf node with remaining capacity
  loop do
    !db || pr(" ...top of iteration\n")

    if n.leaf
      # If the leaf node and overflow nodes have reached a certain size,
      # create a new internal node, and continue descending from it.
      # Don't do this if the node reports it is not splittable.
      if leaf_population(n) >= SPLIT_SIZE && n.splittable
        n = split_leaf_set(n,path)
        next # do another iteration
      end

      # Move along the overflow chain to the first node with a free slot;
      # get_next_overflow creates a new overflow node if necessary
      n = get_next_overflow(n) while n.used == NODEL_CAPACITY

      n.add_data_point(dp)
      write_node(n)
      break
    end

    # An internal node
    path << n if path

    child_slot = n.slot_containing_point(dp.loc)
    child_node_id = n.slot_child(child_slot)
    b = n.slot_bounds(child_slot)

    # children have the opposite orientation
    v = !v
    if child_node_id == 0
      # Create a new child node
      child_node_id = @block_file.alloc

      child = NodeL.new(child_node_id,v,b)
      # we need to add this node to the cache since it's just been built
      cache_node(child)
      write_node(child)
      n.set_slot_child(child_slot, child_node_id)
      write_node(n)
      n = child
    else
      n = read_node(child_node_id, b,v)
    end
  end
end
|
844
|
+
|
845
|
+
# Get the next overflow node for a leaf node; create one if necessary
|
846
|
+
# @param n a leaf node
# @return the overflow node following n; a new one, with n's orientation
#   and bounds, is allocated and linked when n has none
def get_next_overflow(n)
  overflow_id = n.overflow
  if overflow_id == 0
    overflow_id = @block_file.alloc
    fresh = NodeL.new(overflow_id, n.vertical, n.bounds)
    # we need to add this node to the cache since it's just been built
    cache_node(fresh)
    write_node(fresh)

    # link it behind n
    n.overflow = overflow_id
    write_node(n)
  end
  read_node(overflow_id, n.bounds, n.vertical)
end
|
859
|
+
|
860
|
+
# Recursive helper for find.
# @param rect query rectangle
# @param dest array receiving the datapoints located inside rect
# @param name name of the node to search
# @param b bounds of that node
# @param v orientation of that node
def find_aux(rect,dest,name,b,v)
  n = read_node(name,b,v)
  if n.leaf
    dest.concat(n.pts().select { |dp| rect.contains_point(dp.loc) })

    # continue with the overflow node, which shares bounds and orientation
    overflow = n.overflow
    find_aux(rect,dest,overflow,b,v) if overflow != 0
  else
    NODEI_CHILDREN.times do |slot|
      child_name = n.slot_child(slot)
      next if child_name == 0

      # skip children whose bounds do not intersect the query
      child_bounds = n.slot_bounds(slot)
      next unless Bounds.intersect(rect,child_bounds)
      find_aux(rect,dest,child_name,child_bounds,!v)
    end
  end
end
|
885
|
+
|
886
|
+
# Build the list consisting of a leaf followed by its overflow nodes, in
# chain order.
# @param leaf_node first leaf of the set
# @return array of leaf nodes, starting with leaf_node
def build_leaf_set(leaf_node)
  chain = [leaf_node]
  until chain.last.overflow == 0
    tail = chain.last
    chain << read_node(tail.overflow,tail.bounds,tail.vertical)
  end
  chain
end
|
896
|
+
|
897
|
+
# Append indent spaces to the string s (in place).
# @return s
def tab(s, indent)
  s.concat(" " * indent)
end
|
900
|
+
|
901
|
+
# Recursive helper for dump: append a description of node n, then of its
# children (interior node) or overflow node (leaf).
# @param s string being built
# @param n node to describe
# @param indent current indentation level
# @param dc hash recording the names of the nodes dumped so far
def dump_aux(s, n, indent, dc)
  dc[n.name] = n.name
  tab(s,indent)
  s << n.to_s
  s << "\n"
  if n.leaf
    # overflow nodes are listed at the same indentation as their leaf
    ovf = n.overflow
    dump_aux(s,read_node(ovf,n.bounds,n.vertical),indent,dc) if ovf > 0
  else
    inner = indent + 1
    NODEI_CHILDREN.times do |i|
      slot = n.slot(i)
      next if slot.child_name == 0

      tab(s,inner)
      s << "Slot ##{i}:#{slot.child_name} \n"
      child = read_node(slot.child_name, n.slot_bounds(i), !n.vertical)
      dump_aux(s,child,inner+1,dc)
    end
  end
end
|
925
|
+
|
926
|
+
# Read the root node: ROOT_NODE_NAME_, with the tree's maximum bounds and
# the orientation flag set to false.
def read_root_node
|
927
|
+
read_node(ROOT_NODE_NAME_,@@start_bounds,false)
|
928
|
+
end
|
929
|
+
|
930
|
+
end
|
931
|
+
|
932
|
+
private
|
933
|
+
|
934
|
+
# Accumulates counts while a tree is traversed (one process_node call per
# node) and turns them into a summary dictionary.
class TreeStats
  attr_accessor :leaf_count, :interior_count, :overflow_count, :leaf_depth_max

  def initialize
    @leaf_count = @interior_count = @overflow_count = 0
    @leaf_used_sum = @leaf_depth_sum = 0
    @leaf_depth_max = 0
  end

  # Record one node.
  # @param n the node (its leaf flag and, for leaves, used count are read)
  # @param overflow true if the node is an overflow node
  # @param depth depth of the node within the tree
  def process_node(n, overflow, depth)
    return (@interior_count += 1) unless n.leaf

    @leaf_count += 1
    @leaf_used_sum += n.used
    @leaf_depth_sum += depth
    @overflow_count += 1 if overflow
    @leaf_depth_max = [@leaf_depth_max, depth].max
  end

  # @return hash of statistic name (string) => value; the two averages are
  #   0 when no leaf has been processed
  def summary
    any_leaves = leaf_count > 0
    {
      'leaf_nodes' => leaf_count,
      'interior_nodes' => interior_count,
      'overflow_nodes' => overflow_count,
      # average fill of a leaf, as a fraction of NODEL_CAPACITY
      'leaf_usage' => any_leaves ? (@leaf_used_sum / @leaf_count.to_f) / NODEL_CAPACITY : 0,
      'leaf_depth (avg)' => any_leaves ? @leaf_depth_sum / @leaf_count.to_f : 0,
      'leaf_depth (max)' => leaf_depth_max
    }
  end
end
|
979
|
+
|
980
|
+
end
|