geotree 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,980 @@
1
+ require_relative 'node'
2
+
3
+ req 'diskblockfile ptbuffer'
4
+
5
+ module GeoTreeModule
6
+
7
+ # A variant of a kd-tree, it is capable of maintaining sets of 2D points and efficiently
8
+ # reporting all points lying within (axis-aligned) query rectangles.
9
+ #
10
+ # Like a B+ tree, it has a large branching factor
11
+ # and the nodes are large to improve performance when the tree is stored
12
+ # on disk.
13
+ #
14
+ # A GeoTree is usually stored within a disk file, though it is also possible to
15
+ # construct a tree that exists only in memory; see the initialize(...) method.
16
+ #
17
+ # Usage:
18
+ #
19
+ # [] Open a tree. If no tree exists, a new, empty one is created.
20
+ #
21
+ # t = GeoTree.open("treepath.bin")
22
+ #
23
+ # [] Add datapoints.
24
+ #
25
+ # dp = DataPoint.new(...)
26
+ # t.add(dp)
27
+ #
28
+ # [] Remove datapoints.
29
+ #
30
+ # t.remove(dp)
31
+ #
32
+ # [] Find all points within a particular rectangle.
33
+ #
34
+ # b = Bounds.new(x,y,width,height)
35
+ #
36
+ # pts = t.find(b)
37
+ #
38
+ # [] Close tree; flush any changes.
39
+ #
40
+ # t.close()
41
+ #
42
+ #
43
+ # One of the problems with kd-trees (including this one) is that they can become
44
+ # unbalanced after a number of insertions and deletions. To deal with this,
45
+ # consider these two suggestions:
46
+ #
47
+ # 1) When constructing the initial tree, if the datapoints are given in a random
48
+ # order, the tree will (with high probability) be constructed in a balanced form.
49
+ # By contrast, consider what happens if the points (1,1), (2,2), (3,3), ... are
50
+ # added in sequence to an initially empty tree. The tree will be very unbalanced,
51
+ # with poor performance.
52
+ # To address this problem, if you are not confident that the points you initially
53
+ # provide are in a sufficiently random sequence, you can enable 'point buffering':
54
+ #
55
+ # t = GeoTree.open("treepath.bin")
56
+ #
57
+ # t.buffering = true # buffering is now active
58
+ #
59
+ # t.add(dp1)
60
+ # t.add(dp2) # these points are stored in a temporary disk file
61
+ # t.add(dp3)
62
+ # :
63
+ #
64
+ # t.buffering = false # the points will be shuffled into a random sequence and
65
+ # # added to the tree
66
+ #
67
+ #
68
+ # 2) Periodically, you can start with a new tree, and add all of the datapoints using the
69
+ # above buffering technique. This is easy to do if the datapoints are also stored
70
+ # externally to the GeoTree (for instance, as parts of larger records in some database).
71
+ # Otherwise, (i) the datapoints can be retrieved from the tree to an array
72
+ # (by using a sufficiently large query rectangle), (ii) a new tree can be constructed,
73
+ # and (iii) each of the points in the array can be added to the new tree.
74
+ #
75
+ class GeoTree
76
+
77
+ ROOT_NODE_NAME_ = BlockFile::FIRST_BLOCK_ID
78
+
79
+ privatize(self)
80
# Enable or disable point buffering.
#
# While buffering is active, added points are accumulated by the
# PtBuffer instead of being inserted directly; turning buffering off
# flushes the buffered points into the tree (see class docs).
#
# @param val true to activate buffering, false to deactivate and flush
# @raise IllegalStateException if the tree is not open
def buffering=(val)
  raise IllegalStateException if !open?
  # delegate entirely to the buffer; it performs the flush on false
  @buffer.active = val
end
90
+
91
# Construct GeoTree
# @param block_file backing store; if nil, an in-memory BlockFile is
#   created and the tree is not persisted
def initialize(block_file = nil)
  block_file ||= BlockFile.new(KDTREE_BLOCKSIZE)
  @block_file = block_file
  @buffer = PtBuffer.new(self)

  @mod_nodes = Set.new # names of modified nodes
  @cache_dict = {}

  # Sentinels bracketing the doubly-linked LRU cache list; the names
  # 555/666 are arbitrary placeholders and are never written to disk.
  @c_start = NodeI.new(555, false, Bounds.new(0, 0, 0, 0))
  @c_end = NodeI.new(666, false, Bounds.new(0, 0, 0, 0))
  GeoTree.join_nodes(@c_start, @c_end)

  @block_file.open

  # The root node, if it exists, will be in the first block.
  if @block_file.name_max <= ROOT_NODE_NAME_
    root = NodeL.new(ROOT_NODE_NAME_, false, @@start_bounds)
    # we need to add this node to the cache since it's just been built
    cache_node(root)
    # Allocate the first block for the root. The returned name is
    # ROOT_NODE_NAME_ by construction, so it need not be kept
    # (the original assigned it to an unused local).
    @block_file.alloc(encode_block(root))
    write_node(root)
  end
end
117
+
118
# True while a block file is attached (i.e. close has not been called).
def open?
  !@block_file.nil?
end
121
+
122
# Flush all pending state and close the underlying block file.
# After this call, open? is false.
# @raise IllegalStateException if the tree is already closed
def close
  raise IllegalStateException unless open?

  # Stop buffering, in case we were, to flush points to tree
  @buffer.active = false

  # Flush the block file, among other things
  done_operation

  @block_file.close
  @block_file = nil
end
134
+
135
# Insert a point that has passed through the buffer, then bump the
# population count of every internal node on the insertion path.
def add_buffered_point(data_point)
  # collects the interior nodes visited while descending to a leaf
  interior_nodes = []
  add_data_point(data_point, ROOT_NODE_NAME_, interior_nodes, @@start_bounds, false)

  interior_nodes.each do |node|
    node.adjust_population(1)
    write_node(node)
  end
end
146
+
147
private

# cache start and end nodes
# (sentinels of the doubly-linked list used as the LRU node cache)
attr_accessor :c_start, :c_end
# cache_dict: node name -> cached node; mod_nodes: Set of names of
# nodes modified since the last done_operation; block_file: backing store
attr_accessor :cache_dict, :mod_nodes, :block_file

# Bounds covering the entire addressable coordinate space; the root
# node always spans these bounds.
@@start_bounds = Bounds.new(LOC_MIN,LOC_MIN,LOC_MAX - LOC_MIN,LOC_MAX - LOC_MIN)
public
155
+
156
# @return [Bounds] the bounds spanning the entire coordinate space
#   handled by any GeoTree (the root node's bounds)
def self.max_bounds
  @@start_bounds
end
159
+
160
# Open tree from file; if it doesn't exist, creates an empty tree, one prepared to
# use that file to persist it
# @param path path of file; if nil, constructs tree in memory only
# @return [GeoTree]
def self.open(path = nil)
  bf = nil
  # only a disk-backed file gets a DiskBlockFile; GeoTree.new supplies
  # an in-memory BlockFile when bf is nil
  bf = DiskBlockFile.new(KDTREE_BLOCKSIZE, path) if path
  GeoTree.new(bf)
end
180
+
181
# Add a datapoint to the tree.
# Does not ensure that a datapoint with this name already exists in the
# tree, even if it has the same location.
#
def add(data_point)
  raise IllegalStateException unless open?
  @buffer.add(data_point)
end
189
+
190
# Remove a datapoint.  Returns the datapoint if it was found and removed,
# otherwise nil.
# A datapoint will be removed iff both its name and location match
# the sought point; the weight is ignored.
def remove(data_point)

  # Removal is not supported while points are being buffered.
  raise IllegalStateException if @buffer.active

  db = false
  !db || pr("remove #{data_point}\n")

  removed = nil
  # NOTE(review): 'block' is presumably a run-once helper from the util
  # library, used so 'break' can exit the removal logic early -- confirm.
  block do

    # construct path of interior nodes leading to the leaf node set that contains the point
    # (if one exists)
    internal_path = []

    n = read_root_node

    while !n.leaf

      !db || pr(" add #{n} to internal path\n")
      internal_path << n

      # find the child that will contain the point
      child_slot = n.slot_intersecting_line(n.vertical ? data_point.loc.y : data_point.loc.x)
      next_name = n.slot_child(child_slot)
      !db || pr(" child_slot=#{child_slot}, next_name=#{next_name}\n")
      if next_name == 0
        # no child covers the point's strip; the point cannot be in the tree
        n = nil
        break
      end
      n = read_node(next_name,n.slot_bounds(child_slot),!n.vertical)
    end
    break if !n

    # build list of overflow nodes
    leaf_set = build_leaf_set(n)
    !db || pr(" built leaf set: #{d(leaf_set)}\n")

    # We now have path containing the path of internal nodes, and leaf_set the leaf nodes

    # find the node containing this point
    found_leaf_index = found_slot = -1

    leaf_set.each_with_index do |leaf,i|
      found_slot = leaf.find_point(data_point)
      if found_slot >= 0
        found_leaf_index = i
        break
      end
    end
    break if found_leaf_index < 0

    # copy last datapoint to found location's, then delete last datapoint
    # (swap-with-last keeps the occupied slots of the leaf set contiguous)
    leaf_node = leaf_set[found_leaf_index]
    removed = leaf_node.data_point(found_slot)

    last_leaf_node = leaf_set[-1]
    lu = last_leaf_node.used

    leaf_node.set_data_point(found_slot, last_leaf_node.data_point(lu-1))
    last_leaf_node.pop_last_point

    write_node(last_leaf_node)
    write_node(leaf_node)

    # If the last leaf is now empty, remove it
    if last_leaf_node.used == 0
      leaf_set.pop
      if leaf_set.size != 0
        # detach the now-empty overflow node from its predecessor
        prev_leaf = leaf_set[-1]
        prev_leaf.overflow = 0
        write_node(prev_leaf)
        delete_node(last_leaf_node)
      else
        # It was the first leaf in the set, so we should remove it
        # from its parent (NodeI) slot (if it's not the root)
        if last_leaf_node.name != ROOT_NODE_NAME_
          parent = internal_path[-1]
          parent.remove_child_named(last_leaf_node.name)
          write_node(parent)
        end
      end
    end

    # for each internal node in the path:
    #  [] adjust population by -1
    #  [] if population has dropped below half the capacity of a leaf node,
    #     convert subtree to leaf node
    while internal_path.size != 0
      inode = internal_path.pop

      inode.adjust_population(-1)
      write_node(inode)

      if inode.population < SPLIT_SIZE/2
        collapse_internal_node(inode)
      end

    end
  end
  done_operation
  removed
end
296
+
297
# Find all points intersecting a rectangle.
# @param rect [Bounds] query rectangle
# @return array of DataPoints whose locations lie within rect
def find(rect)
  raise IllegalStateException if (!open? || @buffer.active)
  hits = []
  find_aux(rect, hits, ROOT_NODE_NAME_, @@start_bounds, false)
  done_operation
  hits
end
306
+
307
# Determine if a particular datapoint lies in the tree
# (matched by name, via a 1x1 query rectangle at its location).
def find_point(df)
  raise IllegalStateException if (!open? || @buffer.active)
  probe = Bounds.new(df.loc.x, df.loc.y, 1, 1)
  find(probe).any? { |dp| dp.name == df.name }
end
321
+
322
# Calculate some statistics about the tree
# @return dictionary of field(string) => value
def statistics
  raise IllegalStateException if !open?

  st = TreeStats.new
  # walk the whole tree starting at the root
  # (the original also assigned an unused local here; removed)
  aux_stats(ROOT_NODE_NAME_, @@start_bounds, false, false, 0, st)
  st.summary
end
333
+
334
# Dump tree in graphical form
# @param root_node node to start from; defaults to the tree root
# @return [String] a multi-line rendering of the (sub)tree
def dump(root_node = nil)
  raise IllegalStateException unless open?
  root_node ||= read_root_node

  divider = "-"*50+"\n"
  out = "KDTree (rooted at #{root_node.name})\n"
  out << divider
  dump_aux(out, root_node, 0, {})
  out << divider
  out
end
348
+
349
# Generate random test points within a fixed default region.
def self.rnd_points(count)
  region = Bounds.new(100, 100, 900, 900)
  rnd_points_within(count, region)
end
353
+
354
+ @@next_pt_id = 500
355
+
356
# Generate random test points within the given bounds. Points that fall
# outside the tree's maximum bounds are skipped, so fewer than count
# points may be returned.
def self.rnd_points_within(count, bounds)
  pts = []
  count.times do
    loc = Loc.new(bounds.x + rand(1 + bounds.w), bounds.y + rand(1 + bounds.h))
    next unless @@start_bounds.contains_point(loc)

    # rand*rand*rand biases weights toward small values
    weight = (rand * rand * rand * MAX_POINT_WEIGHT).to_i
    pts << DataPoint.create_with_name(@@next_pt_id, weight, loc)
    @@next_pt_id += 1
  end
  pts
end
369
+
370
# Deserialize a DataPoint from four consecutive ints in a block buffer:
# name, weight, x, y.
def self.read_data_point_from(b, offset)
  name = BlockFile.read_int(b, offset)
  weight = BlockFile.read_int(b, offset + 1)
  x = BlockFile.read_int(b, offset + 2)
  y = BlockFile.read_int(b, offset + 3)
  DataPoint.new(name, weight, Loc.new(x, y))
end
376
+
377
# Serialize a DataPoint as four consecutive ints in a block buffer:
# name, weight, x, y (mirror of read_data_point_from).
def self.write_data_point(dp, b, offset)
  [dp.name, dp.weight, dp.loc.x, dp.loc.y].each_with_index do |value, i|
    BlockFile.write_int(b, offset + i, value)
  end
end
383
+
384
+ private
385
+
386
# Collect every datapoint in the subtree rooted at n into dp_set, and
# every node visited below n (children and overflow nodes, but not n
# itself) into node_set.
def gather_datapoints(n, dp_set, node_set)
  if n.leaf
    # harvest the leaf and walk its overflow chain
    loop do
      dp_set.concat(n.pts)
      ov = n.overflow
      break if ov == 0
      n = read_node(ov, n.bounds, n.vertical)
      node_set << n
    end
  else
    NODEI_CHILDREN.times do |slot|
      child_name = n.slot_child(slot)
      next if child_name == 0

      child = read_node(child_name, n.slot_bounds(slot), !n.vertical)
      node_set << child
      gather_datapoints(child, dp_set, node_set)
    end
  end
end
408
+
409
# Replace an internal node with a leaf node, one containing all the
# datapoints in the internal node's subtree.
def collapse_internal_node(n)
  db = false
  !db || pr("internal node population has dropped below half leaf set capacity;\n%s\n",d(n))
  !db || puts(dump)

  # Harvest every datapoint in the subtree, and every node strictly below n
  dp_set = []
  node_set = []
  gather_datapoints(n,dp_set,node_set)

  # Sanity check: the stored population must agree with what we harvested
  if dp_set.size != n.population
    raise IllegalStateException,\
    "Interior node actual population #{dp_set.size} disagrees with stored value #{n.population};\n#{dump(n)}"
  end

  !db || pr("\ndp_set=#{d2(dp_set)}\n\n")
  !db || pr("node_set=#{d2(node_set)}\n\n")

  # Free all the old subtree nodes (n itself is reused under the same name)
  node_set.each do |n2|
    !db || pr(" removing #{n2} from mod/cache\n")
    delete_node(n2)
  end

  # Rebuild n as a leaf node with the same name/orientation/bounds
  n2 = NodeL.new(n.name,n.vertical,n.bounds)
  replace_node(n,n2)
  n = n2
  # Refill the new leaf, chaining overflow nodes as needed,
  # NODEL_CAPACITY points at a time
  while true
    j = [dp_set.size, NODEL_CAPACITY].min
    pts = n.pts()
    j.times{pts << dp_set.pop}
    if dp_set.empty?
      write_node(n)
      break
    end

    n2 = get_next_overflow(n)
    write_node(n)
    n = n2
  end

  !db || printf("After collapsing\n#{dump}\n\n")

end
453
+
454
# Recursively accumulate statistics for the subtree rooted at node_name.
# @param b bounds of the node, @param v its orientation
# @param overflow true if this node was reached via an overflow link
# @param depth recursion depth from the root
# @param st [TreeStats] accumulator
def aux_stats(node_name, b, v, overflow, depth, st)
  node = read_node(node_name, b, v)
  st.process_node(node, overflow, depth)

  if node.leaf
    # overflow nodes live at the same depth and bounds as their owner
    ov = node.overflow
    aux_stats(ov, b, v, true, depth, st) if ov != 0
  else
    NODEI_CHILDREN.times do |slot|
      child = node.slot_child(slot)
      next if child == 0
      aux_stats(child, node.slot_bounds(slot), !v, false, depth + 1, st)
    end
  end
end
472
+
473
# Link two nodes together (a before b) in the cache's doubly-linked list.
def self.join_nodes(a, b)
  b.prev_node = a
  a.next_node = b
end
477
+
478
# Detach a node from the cache dictionary and/or the LRU list.
# @param from_cache remove the entry from @cache_dict
# @param from_list unlink the node from the doubly-linked list
#   (a no-op when the node is not currently linked)
def remove_from(node, from_cache, from_list)
  @cache_dict.delete(node.name) if from_cache

  return unless from_list && node.prev_node

  before = node.prev_node
  after = node.next_node
  node.next_node = nil
  node.prev_node = nil
  # splice the list back together around the removed node
  GeoTree.join_nodes(before, after)
end
490
+
491
# Add node to cache; move to front of the LRU list (most recently used).
def cache_node(node)
  front = @c_start
  unless front.next_node == node
    # unlink from wherever it currently sits, then splice in at the front
    remove_from(node, false, true)
    old_first = front.next_node
    GeoTree.join_nodes(front, node)
    GeoTree.join_nodes(node, old_first)
  end
  @cache_dict[node.name] = node
end
502
+
503
# Calculate where partitions should go in a node
#
# If any slots end up having zero width, these are placed at the
# end of the list
#
# @param bounds bounds of node
# @param unsorted_pts array of DataPoints
# @param vertical orientation
# @return locations of partitions
#   NOTE(review): the array has exactly NODEI_CHILDREN entries (left
#   edge plus NODEI_CHILDREN-1 cuts); the original comment said
#   "1 + NODEI_CHILDREN", which appears to overcount -- confirm.
#
def self.calc_partitions(bounds, unsorted_pts, vertical)
  a = []

  # Convert inputs so we need deal only with x coordinates
  if vertical
    bounds = bounds.flip
    unsorted_pts = unsorted_pts.map { |p| p.flip }
  end

  # sort_by avoids the original comparator block whose |a,b| params
  # shadowed the outer local 'a'
  pts = unsorted_pts.sort_by { |p| p.loc.x }

  # Add location of left boundary
  a << bounds.x

  # how many zones are we cutting it into?
  n_zones = NODEI_CHILDREN

  # how many zones are the items cutting it into at present?
  n_items = pts.size + 1
  f_step = n_items / n_zones.to_f

  while a.size < n_zones
    f_pos = f_step * a.size
    left_item = f_pos.floor.to_i
    f_rem = f_pos - f_pos.floor

    # x0/x1 bracket the gap in which this partition should fall
    if left_item == 0
      x0 = bounds.x
    else
      x0 = pts[left_item - 1].loc.x
    end

    if left_item == pts.size
      x1 = bounds.x + bounds.w
      assert!(x1 >= bounds.x)
    else
      x1 = pts[left_item].loc.x
    end

    # interpolate within the gap
    x_new = (((x1 - x0) * f_rem) + x0).to_i

    # make sure we are at least one unit further than the previous value
    # (unless we've reached the right edge)
    prev = a[-1]
    x_new = [prev + 1, bounds.x + bounds.w].min if x_new <= prev

    a << x_new
  end
  a
end
577
+
578
# Fetch a node that is expected to already be in the cache, promoting it
# to the front of the LRU list.
# NOTE(review): assumes node_name is present in @cache_dict; if not,
# cache_node would receive nil and fail -- confirm that callers
# (e.g. done_operation) only pass names of cached nodes.
def read_cached_node(node_name)
  # Determine if node is in cache
  n = @cache_dict[node_name]
  cache_node(n)
  n
end
584
+
585
# Fetch a node, decoding it from the block file unless it is already
# cached; either way the node is promoted to the front of the cache.
def read_node(node_name, bounds, vertical)
  node = @cache_dict[node_name]
  if node.nil?
    raw = @block_file.read(node_name)
    node = decode_block(raw, node_name, vertical, bounds)
  end
  cache_node(node)
  node
end
597
+
598
# Serialize node to bytes and write to blockfile
# (actually, just mark it as modified so this serialization/writing
# occurs at the end of the current operation)
#
def write_node(node)
  return if node.modified

  node.modified = true
  @mod_nodes.add(node.name)
end
608
+
609
# Finish the current tree operation: flush every modified node to the
# block file, then evict least-recently-used cache entries until the
# cache is back under KD_CACHE_SIZE.
def done_operation
  s = @mod_nodes
  s.each do |name|
    # modified nodes are guaranteed to still be cached, so
    # read_cached_node is safe here
    flush_modified_node(read_cached_node(name))
  end
  s.clear
  @block_file.flush

  # While cache size is too large, remove last item
  size = @cache_dict.size
  trim = [0,size - KD_CACHE_SIZE].max

  while trim > 0
    trim -= 1
    # evict from the tail of the LRU list (least recently used)
    back = @c_end.prev_node
    remove_from(back, true, true)
  end
end
627
+
628
# Encode a node and persist it to its block, clearing its modified flag.
def flush_modified_node(node)
  @block_file.write(node.name, encode_block(node))
  node.modified = false
end
633
+
634
# Encode a node to a block of bytes.
# Layout: HDR_FLAGS int (bit 0 set = leaf), then either
#   internal: population, then NODEI_CHILDREN (start_position, child_name) pairs
#   leaf:     overflow link, used count, then the datapoints
# Must be kept in sync with decode_block.
def encode_block(n)

  db = false
  !db || pr("encode_block for #{n}\n")

  b = @block_file.alloc_buffer

  flags = 0
  flags |= 1 if n.leaf

  BlockFile.write_int(b,HDR_FLAGS,flags)

  if !n.leaf
    BlockFile.write_int(b, IFLD_POPULATION,n.population)
    off = IFLD_PARTITIONS
    NODEI_CHILDREN.times do |i|
      p = n.slot(i)
      BlockFile.write_int(b, off, p.start_position)
      BlockFile.write_int(b,off+1,p.child_name)
      # NOTE(review): advancing by literal 2 assumes PARTITION_INTS == 2
      # (decode_block uses the constant) -- confirm they agree
      off += 2
    end
  else
    BlockFile.write_int(b,LFLD_OVERFLOW,n.overflow)
    BlockFile.write_int(b,LFLD_USED,n.used)
    off = LFLD_DATAPOINTS
    n.used.times do |i|
      GeoTree.write_data_point(n.data_point(i), b, off)
      off += DATAPOINT_INTS
    end
  end
  !db || hex_dump(b)

  b
end
669
+
670
# Decode a node from a block of bytes (mirror of encode_block).
# @param b byte buffer read from the block file
# @param node_name block name to assign to the node
# @param vertical orientation to assign to the node
# @param bounds spatial bounds to assign to the node
# @return NodeI (flags bit 0 clear) or NodeL (bit 0 set)
def decode_block(b, node_name, vertical, bounds)
  flags = BlockFile.read_int(b, HDR_FLAGS)

  if (flags & 1) == 0
    # internal node: population followed by NODEI_CHILDREN partitions
    n = NodeI.new(node_name, vertical, bounds)
    n.population = BlockFile.read_int(b, IFLD_POPULATION)
    NODEI_CHILDREN.times do |i|
      # compute each partition's offset directly; the original also kept
      # a redundant running "off +=" that was immediately overwritten
      off = IFLD_PARTITIONS + i * PARTITION_INTS
      p = Partition.new(BlockFile.read_int(b, off), BlockFile.read_int(b, off + 1))
      n.set_slot(i, p)
    end
  else
    # leaf node: overflow link, used count, then the datapoints
    n = NodeL.new(node_name, vertical, bounds)
    n.overflow = BlockFile.read_int(b, LFLD_OVERFLOW)
    n_used = BlockFile.read_int(b, LFLD_USED)

    off = LFLD_DATAPOINTS
    n_used.times do |i|
      n.set_data_point(i, GeoTree.read_data_point_from(b, off))
      off += DATAPOINT_INTS
    end
  end
  n
end
706
+
707
# Delete node from tree: free its block, drop it from the cache/LRU
# list, and forget any pending modification.
def delete_node(n)
  @block_file.free(n.name)
  remove_from(n, true, true)
  @mod_nodes.delete(n.name)
end
713
+
714
# Replace one node with another within the cache (they should both have the same id)
def replace_node(orig, new_node)
  remove_from(orig, true, true)
  cache_node(new_node)
end
719
+
720
# Convert a leaf node to an internal node.
# Redistributes its data points (and those of any linked overflow nodes) to
# new child nodes.
# Returns the new internal node
def split_leaf_set(node,path)
  db = false
  # db = true
  !db || pr("\nsplit_leaf_set #{node} bounds=#{node.bounds} vert=#{node.vertical}...\n")

  # list of data points from the leaf node (and its overflow siblings)
  dp = []

  n2 = node
  while true
    # append this node's points to our buffer
    dp.concat n2.pts

    next_id = n2.overflow
    # clear this node's link to its overflow, if any
    n2.overflow = 0

    # If it's one of the overflow nodes (and not the original leaf node), delete it
    if n2 != node
      delete_node(n2)
    end

    break if (next_id == 0)

    b = n2.bounds

    n2 = read_node(next_id,b,n2.vertical)
  end

  !db || pr(" datapoints=#{d(dp)}\n")

  # The internal node reuses the leaf's name, orientation and bounds
  ni = NodeI.new(node.name,node.vertical,node.bounds)

  # Choose partition positions based on the collected point distribution
  a = GeoTree.calc_partitions(ni.bounds,dp,ni.vertical)
  !db || pr(" partitions=#{d(a)}\n")

  a.each_with_index do |posn,i|
    # child name 0 means "no child node yet"
    p = Partition.new(posn,0)
    ni.set_slot(i,p)
  end

  replace_node(node,ni)

  # Add each of the data points to this new internal node
  dp.each do |pt|
    add_data_point(pt,ni.name,path,ni.bounds,ni.vertical)
  end
  ni
end
773
+
774
# Total number of datapoints stored in a leaf node plus all of the
# overflow nodes chained to it.
def leaf_population(node)
  total = node.used
  while node.overflow != 0
    node = read_node(node.overflow, node.bounds, node.vertical)
    total += node.used
  end
  total
end
782
+
783
# Insert a datapoint into the subtree rooted at the named node.
# @param dp datapoint to insert
# @param node_name name of the subtree root
# @param path if not nil, receives every internal node visited (so the
#   caller can adjust populations afterwards)
# @param b bounds of the subtree root
# @param v orientation (vertical flag) of the subtree root
def add_data_point(dp, node_name, path, b, v)
  db = false
  # db = true

  !db || pr("\n\nadd_data_point #{dp}, node name #{node_name}\n")
  n = read_node(node_name,b,v)

  # iterate until we have found a leaf node with remaining capacity
  while true
    !db || pr(" ...top of iteration\n")

    if (n.leaf)
      # If the leaf node and overflow nodes have reached a certain size, create a new internal node,
      # and continue recursing.
      # Don't do this if the node's bounds are very small.

      cap = SPLIT_SIZE

      if (leaf_population(n) >= cap && n.splittable)
        n = split_leaf_set(n,path)
        next # do another iteration
      end

      # Add to next unused slot; create new overflow node if necessary
      # NOTE(review): leaf_set_size is tracked but never used -- candidate
      # for removal
      leaf_set_size = 1
      while n.used == NODEL_CAPACITY
        # Move to overflow node; if it doesn't exist, create one
        n = get_next_overflow(n)
        leaf_set_size += 1
      end

      n.add_data_point(dp)
      write_node(n)
      break
    end

    # An internal node
    if (path)
      path << n #n.name
    end
    # Descend into the child slot whose strip contains the point
    child_slot = n.slot_containing_point(dp.loc)
    child_node_id = n.slot_child(child_slot)
    b = n.slot_bounds(child_slot)

    # orientation alternates at each level
    v = !v
    if child_node_id == 0
      # Create a new child node
      child_node_id = @block_file.alloc

      n3 = NodeL.new(child_node_id,v,b)
      # we need to add this node to the cache since it's just been built
      cache_node(n3)
      write_node(n3)
      n.set_slot_child(child_slot, child_node_id)
      write_node(n)
      n = n3
    else
      n = read_node(child_node_id, b,v)
    end
  end
end
844
+
845
# Get the next overflow node for a leaf node; create one if necessary
def get_next_overflow(n)
  overflow_name = n.overflow
  if overflow_name == 0
    # allocate a fresh leaf with the same bounds/orientation and link it in
    overflow_name = @block_file.alloc
    fresh = NodeL.new(overflow_name, n.vertical, n.bounds)
    # we need to add this node to the cache since it's just been built
    cache_node(fresh)
    write_node(fresh)
    n.overflow = overflow_name
    write_node(n)
  end
  read_node(overflow_name, n.bounds, n.vertical)
end
859
+
860
# Recursive helper for find: append to dest every datapoint within rect
# in the subtree rooted at the named node.
def find_aux(rect, dest, name, b, v)
  n = read_node(name, b, v)
  if n.leaf
    n.pts().each do |dp|
      dest << dp if rect.contains_point(dp.loc)
    end

    # continue into the overflow chain, which shares our bounds
    ov = n.overflow
    find_aux(rect, dest, ov, b, v) if ov != 0
  else
    NODEI_CHILDREN.times do |i|
      child_name = n.slot_child(i)
      next if child_name == 0

      child_bounds = n.slot_bounds(i)
      # prune children whose bounds miss the query rectangle
      next if !Bounds.intersect(rect, child_bounds)
      find_aux(rect, dest, child_name, child_bounds, !v)
    end
  end
end
885
+
886
# Collect a leaf node together with its chain of overflow nodes, in
# chain order, into an array.
def build_leaf_set(leaf_node)
  chain = [leaf_node]
  current = leaf_node
  until current.overflow == 0
    current = read_node(current.overflow, current.bounds, current.vertical)
    chain << current
  end
  chain
end
896
+
897
# Append indentation (indent spaces) to the string buffer.
def tab(buffer, depth)
  buffer << " " * depth
end
900
+
901
# Recursive helper for dump: append a text rendering of node n at the
# given indent level to s; dc records the names of visited nodes.
def dump_aux(s, n, indent, dc)
  dc[n.name] = n.name
  tab(s, indent)
  s << n.to_s << "\n"
  if n.leaf
    # overflow nodes render at the same indent as their owner
    ovf = n.overflow
    dump_aux(s, read_node(ovf, n.bounds, n.vertical), indent, dc) if ovf > 0
  else
    indent += 1
    NODEI_CHILDREN.times do |i|
      p = n.slot(i)
      next if p.child_name == 0

      tab(s, indent)
      s << "Slot ##{i}:#{p.child_name} \n"
      dump_aux(s, read_node(p.child_name, n.slot_bounds(i), !n.vertical), indent + 1, dc)
    end
  end
end
925
+
926
# Fetch the root node; it always lives in the first block and spans the
# maximum bounds, with horizontal orientation.
def read_root_node
  read_node(ROOT_NODE_NAME_, @@start_bounds, false)
end
929
+
930
+ end
931
+
932
+ private
933
+
934
# Accumulates statistics while walking a GeoTree; see GeoTree#statistics.
class TreeStats
  attr_accessor :leaf_count, :interior_count, :overflow_count, :leaf_depth_max

  def initialize
    @leaf_count = 0
    @interior_count = 0
    @overflow_count = 0
    @leaf_used_sum = 0   # total datapoints seen across all leaves
    @leaf_depth_sum = 0  # summed leaf depths, for averaging
    @leaf_depth_max = 0
  end

  # Record one visited node.
  # @param overflow true if the node was reached through an overflow link
  # @param depth the node's distance from the root
  def process_node(n, overflow, depth)
    unless n.leaf
      @interior_count += 1
      return
    end

    @leaf_count += 1
    @leaf_used_sum += n.used
    @leaf_depth_sum += depth
    @overflow_count += 1 if overflow
    @leaf_depth_max = depth if depth > @leaf_depth_max
  end

  # Produce a dictionary of field(string) => value summarizing the walk.
  def summary
    s = {}
    s['leaf_nodes'] = leaf_count
    s['interior_nodes'] = interior_count
    s['overflow_nodes'] = overflow_count

    leaf_usage = 0
    leaf_usage = (@leaf_used_sum / @leaf_count.to_f) / NODEL_CAPACITY if leaf_count > 0
    s['leaf_usage'] = leaf_usage

    avg_depth = 0
    avg_depth = @leaf_depth_sum / @leaf_count.to_f if @leaf_count > 0
    s['leaf_depth (avg)'] = avg_depth
    s['leaf_depth (max)'] = leaf_depth_max
    s
  end
end
979
+
980
+ end