geotree 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,980 @@
1
+ require_relative 'node'
2
+
3
+ req 'diskblockfile ptbuffer'
4
+
5
+ module GeoTreeModule
6
+
7
+ # A variant of a kd-tree, it is capable of maintaining sets of 2D points and efficiently
8
+ # reporting all points lying within (axis-aligned) query rectangles.
9
+ #
10
+ # Like a B+ tree, it has a large branching factor
11
+ # and the nodes are large to improve performance when the tree is stored
12
+ # on disk.
13
+ #
14
+ # A GeoTree is usually stored within a disk file, though it is also possible to
15
+ # construct a tree that exists only in memory; see the initialize(...) method.
16
+ #
17
+ # Usage:
18
+ #
19
+ # [] Open a tree. If no tree exists, a new, empty one is created.
20
+ #
21
+ # t = GeoTree.open("treepath.bin")
22
+ #
23
+ # [] Add datapoints.
24
+ #
25
+ # dp = DataPoint.new(...)
26
+ # t.add(dp)
27
+ #
28
+ # [] Remove datapoints.
29
+ #
30
+ # t.remove(dp)
31
+ #
32
+ # [] Find all points within a particular rectangle.
33
+ #
34
+ # b = Bounds.new(x,y,width,height)
35
+ #
36
+ # pts = t.find(b)
37
+ #
38
+ # [] Close tree; flush any changes.
39
+ #
40
+ # t.close()
41
+ #
42
+ #
43
+ # One of the problems with kd-trees (including this one) is that they can become
44
+ # unbalanced after a number of insertions and deletions. To deal with this,
45
+ # consider these two suggestions:
46
+ #
47
+ # 1) When constructing the initial tree, if the datapoints are given in a random
48
+ # order, the tree will (with high probability) be constructed in a balanced form.
49
+ # By contrast, consider what happens if the points (1,1), (2,2), (3,3), ... are
50
+ # added in sequence to an initially empty tree. The tree will be very unbalanced,
51
+ # with poor performance.
52
+ # To address this problem, if you are not confident that the points you initially
53
+ # provide are in a sufficiently random sequence, you can enable 'point buffering':
54
+ #
55
+ # t = GeoTree.open("treepath.bin")
56
+ #
57
+ # t.buffering = true # buffering is now active
58
+ #
59
+ # t.add(dp1)
60
+ # t.add(dp2) # these points are stored in a temporary disk file
61
+ # t.add(dp3)
62
+ # :
63
+ #
64
+ # t.buffering = false # the points will be shuffled into a random sequence and
65
+ # # added to the tree
66
+ #
67
+ #
68
+ # 2) Periodically, you can start with a new tree, and add all of the datapoints using the
69
+ # above buffering technique. This is easy to do if the datapoints are also stored
70
+ # externally to the GeoTree (for instance, as parts of larger records in some database).
71
+ # Otherwise, (i) the datapoints can be retrieved from the tree to an array
72
+ # (by using a sufficiently large query rectangle), (ii) a new tree can be constructed,
73
+ # and (iii) each of the points in the array can be added to the new tree.
74
+ #
75
+ class GeoTree
76
+
77
+ ROOT_NODE_NAME_ = BlockFile::FIRST_BLOCK_ID
78
+
79
+ privatize(self)
80
# Enable or disable point buffering. While enabled, added points are
# spooled to a temporary store; when disabled again, the buffered points
# are shuffled into a random order and inserted into the tree (this keeps
# the tree balanced; see the class comment).
#
# @param val true to activate buffering, false to flush and deactivate
# @raise IllegalStateException if the tree is not open
def buffering=(val)
  raise IllegalStateException if !open?
  @buffer.active = val
end
90
+
91
# Construct GeoTree
# @param block_file storage for the tree's nodes; if nil, an in-memory
#   BlockFile is used and the tree is not persisted
def initialize(block_file = nil)

  block_file ||= BlockFile.new(KDTREE_BLOCKSIZE)
  @block_file = block_file
  @buffer = PtBuffer.new(self)

  @mod_nodes = Set.new # names of modified nodes
  @cache_dict = {}
  # sentinel head/tail nodes for the doubly-linked LRU cache list;
  # real nodes are spliced between them
  @c_start = NodeI.new(555,false,Bounds.new(0,0,0,0))
  @c_end = NodeI.new(666,false,Bounds.new(0,0,0,0))
  GeoTree.join_nodes(@c_start,@c_end)

  @block_file.open

  # The root node, if it exists, will be in the first block.
  if @block_file.name_max <= ROOT_NODE_NAME_
    root = NodeL.new(ROOT_NODE_NAME_,false, @@start_bounds)
    # we need to add this node to the cache since it's just been built
    cache_node(root)
    # allocate the first block for the root (the returned name was
    # previously stored in an unused local; the alloc itself is required)
    @block_file.alloc(encode_block(root))
    write_node(root)
  end

end
117
+
118
# True if the tree currently has an underlying block file (i.e. close
# has not been called).
def open?
  !@block_file.nil?
end
121
+
122
# Close the tree: flush any buffered points into the tree, write all
# modified nodes, then close the underlying block file.
# After this, open? returns false.
# @raise IllegalStateException if the tree is not open
def close
  raise IllegalStateException if !open?

  # Stop buffering, in case we were, to flush points to tree
  @buffer.active = false

  # Flush the block file, among other things
  done_operation

  @block_file.close
  @block_file = nil
end
134
+
135
# Insert a single (already unbuffered) datapoint into the tree proper.
# Records the interior nodes visited on the way down, then bumps each of
# their population counts by one and marks them as modified.
def add_buffered_point(data_point)
  visited = []
  add_data_point(data_point, ROOT_NODE_NAME_, visited, @@start_bounds, false)

  visited.each do |inode|
    inode.adjust_population(1)
    write_node(inode)
  end
end
146
+
147
+ private
148
+
149
+ # cache start and end nodes
150
+ attr_accessor :c_start, :c_end
151
+ attr_accessor :cache_dict, :mod_nodes, :block_file
152
+
153
+ @@start_bounds = Bounds.new(LOC_MIN,LOC_MIN,LOC_MAX - LOC_MIN,LOC_MAX - LOC_MIN)
154
+ public
155
+
156
# The bounds encompassing the entire legal coordinate space; this is
# also the bounds of the root node.
# @return Bounds
def self.max_bounds
  @@start_bounds
end
159
+
160
# Open tree from file; if it doesn't exist, creates an empty tree, one prepared to
# use that file to persist it
# @param path path of file; if nil, constructs tree in memory only
# @return GeoTree
def self.open(path = nil)
  bf = DiskBlockFile.new(KDTREE_BLOCKSIZE, path) if path
  GeoTree.new(bf)
end
180
+
181
# Add a datapoint to the tree.
# No check is made for an existing datapoint with the same name, even
# one at the same location.
# @raise IllegalStateException if the tree is not open
def add(data_point)
  raise IllegalStateException unless open?
  @buffer.add(data_point)
end
189
+
190
# Remove a datapoint. Returns the datapoint if it was found and removed,
# otherwise nil.
# A datapoint will be removed iff both its name and location match
# the sought point; the weight is ignored.
#
# NOTE(review): 'block do ... end' appears to rely on a project helper
# named 'block' that simply yields, giving the 'break's below a common
# escape target -- confirm against the supporting utility library.
def remove(data_point)

  raise IllegalStateException if @buffer.active

  db = false
  !db || pr("remove #{data_point}\n")

  removed = nil
  block do

    # construct path of interior nodes leading to the leaf node set that contains the point
    # (if one exists)
    internal_path = []

    n = read_root_node

    while !n.leaf

      !db || pr(" add #{n} to internal path\n")
      internal_path << n

      # find the child that will contain the point
      child_slot = n.slot_intersecting_line(n.vertical ? data_point.loc.y : data_point.loc.x)
      next_name = n.slot_child(child_slot)
      !db || pr(" child_slot=#{child_slot}, next_name=#{next_name}\n")
      if next_name == 0
        # no child covers this location; the point cannot be in the tree
        n = nil
        break
      end
      n = read_node(next_name,n.slot_bounds(child_slot),!n.vertical)
    end
    break if !n

    # build list of overflow nodes
    leaf_set = build_leaf_set(n)
    !db || pr(" built leaf set: #{d(leaf_set)}\n")

    # We now have path containing the path of internal nodes, and leaf_set the leaf nodes

    # find the node containing this point
    found_leaf_index = found_slot = -1

    leaf_set.each_with_index do |leaf,i|
      found_slot = leaf.find_point(data_point)
      if found_slot >= 0
        found_leaf_index = i
        break
      end
    end
    break if found_leaf_index < 0

    # copy last datapoint to found location's, then delete last datapoint
    leaf_node = leaf_set[found_leaf_index]
    removed = leaf_node.data_point(found_slot)

    last_leaf_node = leaf_set[-1]
    lu = last_leaf_node.used

    leaf_node.set_data_point(found_slot, last_leaf_node.data_point(lu-1))
    last_leaf_node.pop_last_point

    write_node(last_leaf_node)
    write_node(leaf_node)

    # If the last leaf is now empty, remove it
    if last_leaf_node.used == 0
      leaf_set.pop
      if leaf_set.size != 0
        # detach the now-empty overflow leaf from its predecessor
        prev_leaf = leaf_set[-1]
        prev_leaf.overflow = 0
        write_node(prev_leaf)
        delete_node(last_leaf_node)
      else
        # It was the first leaf in the set, so we should remove it
        # from its parent (NodeI) slot (if it's not the root)
        if last_leaf_node.name != ROOT_NODE_NAME_
          parent = internal_path[-1]
          parent.remove_child_named(last_leaf_node.name)
          write_node(parent)
        end
      end
    end

    # for each internal node in the path:
    # [] adjust population by -1
    # [] if population has dropped below half the capacity of a leaf node,
    #    convert subtree to leaf node
    while internal_path.size != 0
      inode = internal_path.pop

      inode.adjust_population(-1)
      write_node(inode)

      if inode.population < SPLIT_SIZE/2
        collapse_internal_node(inode)
      end

    end
  end
  done_operation
  removed
end
296
+
297
# Find all datapoints whose locations lie within an axis-aligned rectangle.
# @param rect query Bounds
# @return array of DataPoints found
# @raise IllegalStateException if the tree is closed or buffering is active
def find(rect)
  raise IllegalStateException if !open? || @buffer.active
  hits = []
  find_aux(rect, hits, ROOT_NODE_NAME_, @@start_bounds, false)
  done_operation
  hits
end
306
+
307
# Determine whether a particular datapoint lies in the tree: a 1x1
# query rectangle is issued at the point's location and the results are
# scanned for a matching name.
# @return true if found
# @raise IllegalStateException if the tree is closed or buffering is active
def find_point(df)
  raise IllegalStateException if !open? || @buffer.active
  probe = Bounds.new(df.loc.x, df.loc.y, 1, 1)
  find(probe).any? { |dp| dp.name == df.name }
end
321
+
322
# Calculate some statistics about the tree
# @return dictionary of field(string) => value
# @raise IllegalStateException if the tree is not open
def statistics
  raise IllegalStateException if !open?

  st = TreeStats.new
  aux_stats(ROOT_NODE_NAME_, @@start_bounds, false, false, 0, st)
  st.summary
end
333
+
334
# Render the tree (or the subtree rooted at root_node) as an indented
# multi-line string, for debugging.
# @param root_node node to start from; defaults to the tree's root
# @return String
def dump(root_node = nil)
  raise IllegalStateException unless open?
  root_node ||= read_root_node

  divider = "-"*50+"\n"

  out = "KDTree (rooted at #{root_node.name})\n"
  out << divider
  dump_aux(out, root_node, 0, {})
  out << divider
  out
end
348
+
349
# Generate random test datapoints within a fixed 900x900 region.
def self.rnd_points(count)
  region = Bounds.new(100, 100, 900, 900)
  rnd_points_within(count, region)
end
353
+
354
+ @@next_pt_id = 500
355
+
356
# Generate random DataPoints within the given bounds.
# Points that fall outside the legal coordinate space are skipped, so
# the result may contain fewer than +count+ entries.
# Weights are skewed toward small values (rand*rand*rand).
# @return array of DataPoints
def self.rnd_points_within(count, bounds)

  a = []
  count.times do
    w = Loc.new(bounds.x + rand(1 + bounds.w), bounds.y + rand(1 + bounds.h))
    next if !@@start_bounds.contains_point(w)

    wt = (rand * rand * rand * MAX_POINT_WEIGHT).to_i
    a << DataPoint.create_with_name(@@next_pt_id,wt,w)
    @@next_pt_id += 1
  end
  a
end
369
+
370
# Deserialize a DataPoint from four consecutive ints in a block buffer:
# name, weight, x, y.
def self.read_data_point_from(b, offset)
  name   = BlockFile.read_int(b, offset)
  weight = BlockFile.read_int(b, offset + 1)
  x = BlockFile.read_int(b, offset + 2)
  y = BlockFile.read_int(b, offset + 3)
  DataPoint.new(name, weight, Loc.new(x, y))
end
376
+
377
# Serialize a DataPoint as four consecutive ints (name, weight, x, y)
# into a block buffer.
def self.write_data_point(dp, b, offset)
  [dp.name, dp.weight, dp.loc.x, dp.loc.y].each_with_index do |val, i|
    BlockFile.write_int(b, offset + i, val)
  end
end
383
+
384
+ private
385
+
386
# Collect every datapoint in the subtree rooted at n into dp_set, and
# every node visited below n (children and overflow leaves, but not n
# itself) into node_set.
def gather_datapoints(n,dp_set,node_set)
  if !n.leaf
    NODEI_CHILDREN.times do |i|
      child = n.slot_child(i)
      next if child == 0

      b = n.slot_bounds(i)
      child_node = read_node(child, b, !n.vertical )

      node_set << child_node
      gather_datapoints(child_node, dp_set,node_set)
    end
  else
    # walk the overflow chain, gathering the points from each leaf
    while true
      dp_set.concat(n.pts)
      ov = n.overflow
      break if ov == 0
      n = read_node(ov,n.bounds,n.vertical)
      node_set << n
    end
  end
end
408
+
409
# Replace an internal node with a leaf node, one containing all the
# datapoints in the internal node's subtree.
# Called when deletions shrink a subtree's population below half a leaf
# set's capacity.
# @raise IllegalStateException if the node's stored population disagrees
#   with the number of datapoints actually found in the subtree
def collapse_internal_node(n)
  db = false
  !db || pr("internal node population has dropped below half leaf set capacity;\n%s\n",d(n))
  !db || puts(dump)

  dp_set = []
  node_set = []
  gather_datapoints(n,dp_set,node_set)

  if dp_set.size != n.population
    raise IllegalStateException,\
    "Interior node actual population #{dp_set.size} disagrees with stored value #{n.population};\n#{dump(n)}"
  end

  !db || pr("\ndp_set=#{d2(dp_set)}\n\n")
  !db || pr("node_set=#{d2(node_set)}\n\n")

  # the gathered descendant nodes are no longer reachable; delete them
  node_set.each do |n2|
    !db || pr(" removing #{n2} from mod/cache\n")
    delete_node(n2)
  end

  # build a leaf with the same name/orientation/bounds to stand in for n
  n2 = NodeL.new(n.name,n.vertical,n.bounds)
  replace_node(n,n2)
  n = n2
  # refill the leaf (creating overflow leaves as needed) with the points
  while true
    j = [dp_set.size, NODEL_CAPACITY].min
    pts = n.pts()
    j.times{pts << dp_set.pop}
    if dp_set.empty?
      write_node(n)
      break
    end

    n2 = get_next_overflow(n)
    write_node(n)
    n = n2
  end

  !db || printf("After collapsing\n#{dump}\n\n")

end
453
+
454
# Recursively accumulate tree statistics.
# @param node_name name (block id) of the node to visit
# @param b bounds of that node
# @param v true if the node is oriented vertically
# @param overflow true if this node was reached via an overflow link
# @param depth current tree depth
# @param st TreeStats accumulator
def aux_stats(node_name, b,v,overflow,depth, st)
  n = read_node(node_name,b,v)
  st.process_node(n,overflow,depth)

  if !n.leaf
    NODEI_CHILDREN.times do |i|
      child_name = n.slot_child(i)
      next if child_name == 0
      r2 = n.slot_bounds(i)
      aux_stats(child_name, r2, !v, false, depth+1, st)
    end
  else
    ov = n.overflow
    if ov != 0
      # overflow leaves are counted at the same depth as their owner
      aux_stats(ov, b, v, true, depth, st)
    end
  end
end
472
+
473
# Doubly-link two nodes: first becomes second's predecessor in the
# LRU cache list.
def self.join_nodes(first, second)
  first.next_node = second
  second.prev_node = first
end
477
+
478
# Detach a node from the cache dictionary and/or the doubly-linked
# LRU list.
# @param from_cache remove its entry from @cache_dict
# @param from_list unlink it from the prev/next chain (no-op if the
#   node is not currently linked)
def remove_from(node, from_cache, from_list)
  @cache_dict.delete(node.name) if from_cache
  return unless from_list && node.prev_node

  before = node.prev_node
  after = node.next_node
  node.next_node = nil
  node.prev_node = nil
  # splice the neighbours together (inlined GeoTree.join_nodes)
  before.next_node = after
  after.prev_node = before
end
490
+
491
# Add node to cache; move to front
# (most-recently-used nodes live just after @c_start; eviction in
# done_operation takes from just before @c_end)
def cache_node(node)
  cs = @c_start
  if cs.next_node != node
    # unlink from wherever it currently is, then splice in at the front
    remove_from(node,false,true)
    node2 = cs.next_node
    GeoTree.join_nodes(cs,node)
    GeoTree.join_nodes(node,node2)
  end
  @cache_dict[node.name] = node
end
502
+
503
# Calculate where the partitions should go in a node: NODEI_CHILDREN
# boundary positions starting with the node's left edge, chosen so the
# sorted datapoints are spread as evenly as possible among the zones.
#
# Each position is at least one unit past the previous one (unless the
# right edge has been reached), so zero-width slots end up at the end.
#
# @param bounds bounds of node
# @param unsorted_pts array of DataPoints
# @param vertical orientation
# @return array of partition positions
#
def self.calc_partitions(bounds, unsorted_pts, vertical)
  # Work along x only: flip the inputs when the node is vertical.
  if vertical
    bounds = bounds.flip
    unsorted_pts = unsorted_pts.map(&:flip)
  end

  pts = unsorted_pts.sort_by { |p| p.loc.x }

  positions = [bounds.x] # left boundary is always first

  n_zones = NODEI_CHILDREN
  # the points currently divide the span into pts.size + 1 intervals
  n_items = pts.size + 1
  f_step = n_items / (n_zones.to_f)

  while positions.size < n_zones
    f_pos = f_step * positions.size
    left_item = f_pos.floor.to_i
    f_rem = f_pos - f_pos.floor

    # interpolate between the bracketing point coordinates (or edges)
    if left_item == 0
      x0 = bounds.x
    else
      x0 = pts[left_item-1].loc.x
    end

    if left_item == pts.size
      x1 = bounds.x + bounds.w
      assert!(x1 >= bounds.x)
    else
      x1 = pts[left_item].loc.x
    end

    x_new = (((x1-x0) * f_rem) + x0).to_i

    # make sure we are at least one unit further than the previous value
    # (unless we've reached the right edge)
    prev = positions[-1]
    x_new = [prev+1, bounds.x + bounds.w].min if x_new <= prev

    positions << x_new
  end
  positions
end
577
+
578
# Fetch a node that is known to be in the cache, refreshing its
# position in the LRU list.
def read_cached_node(node_name)
  @cache_dict[node_name].tap { |node| cache_node(node) }
end
584
+
585
# Read a node: from the cache if present, otherwise deserialized from
# the block file (and then cached).
# @param bounds bounds of the node -- not stored in the block, so the
#   caller must supply it
# @param vertical orientation of the node (likewise supplied by caller)
def read_node(node_name, bounds, vertical)
  db = false
  # Determine if node is in cache
  n = @cache_dict[node_name]
  !db || pr("read_node #{node_name}, from cache=#{n}\n")
  if !n
    bp = @block_file.read(node_name)
    n = decode_block(bp, node_name, vertical, bounds)
  end
  # refresh its position in the LRU list
  cache_node(n)
  n
end
597
+
598
# Mark a node as modified so it gets serialized and written to the
# block file at the end of the current operation (see done_operation);
# nothing is written immediately.
def write_node(node)
  return if node.modified
  node.modified = true
  @mod_nodes.add(node.name)
end
608
+
609
# Finish the current operation: serialize and write every node marked
# modified, flush the block file, then evict least-recently-used cache
# entries until the cache is back under KD_CACHE_SIZE.
def done_operation
  s = @mod_nodes
  s.each do |name|
    flush_modified_node(read_cached_node(name))
  end
  s.clear
  @block_file.flush

  # While cache size is too large, remove last item
  size = @cache_dict.size
  trim = [0,size - KD_CACHE_SIZE].max

  while trim > 0
    trim -= 1
    # evict from the back of the LRU list (least recently used)
    back = @c_end.prev_node
    remove_from(back, true, true)
  end
end
627
+
628
# Serialize a node and write it to its block, clearing its modified flag.
def flush_modified_node(node)
  @block_file.write(node.name, encode_block(node))
  node.modified = false
end
633
+
634
# Encode a node to a block of bytes.
# Layout: a flags int (bit 0 set for leaf nodes), followed by either
# interior fields (population, then a position/child pair per slot) or
# leaf fields (overflow link, used count, then the datapoints).
def encode_block(n)

  db = false
  !db || pr("encode_block for #{n}\n")

  b = @block_file.alloc_buffer

  flags = 0
  flags |= 1 if n.leaf

  BlockFile.write_int(b,HDR_FLAGS,flags)

  if !n.leaf
    BlockFile.write_int(b, IFLD_POPULATION,n.population)
    off = IFLD_PARTITIONS
    NODEI_CHILDREN.times do |i|
      p = n.slot(i)
      BlockFile.write_int(b, off, p.start_position)
      BlockFile.write_int(b,off+1,p.child_name)
      off += 2
    end
  else
    BlockFile.write_int(b,LFLD_OVERFLOW,n.overflow)
    BlockFile.write_int(b,LFLD_USED,n.used)
    off = LFLD_DATAPOINTS
    n.used.times do |i|
      GeoTree.write_data_point(n.data_point(i), b, off)
      off += DATAPOINT_INTS
    end
  end
  !db || hex_dump(b)

  b
end
669
+
670
# Decode a node from a block of bytes (the inverse of encode_block).
# A node's bounds and orientation are not stored in the block, so the
# caller must supply them.
# @return NodeI or NodeL depending on the block's leaf flag
def decode_block(b, node_name, vertical, bounds)
  flags = BlockFile.read_int(b, HDR_FLAGS)
  type = (flags & 1)

  if type == 0
    # interior node: population followed by one partition record per slot
    n = NodeI.new(node_name, vertical, bounds)
    n.population = BlockFile.read_int(b, IFLD_POPULATION)
    NODEI_CHILDREN.times do |i|
      off = IFLD_PARTITIONS + i*PARTITION_INTS
      p = Partition.new(BlockFile.read_int(b, off), BlockFile.read_int(b,off+1))
      n.set_slot(i,p)
    end
  else
    # leaf node: overflow link, used count, then the datapoints
    n = NodeL.new(node_name,vertical,bounds)

    n.overflow = BlockFile.read_int(b,LFLD_OVERFLOW)
    n_used = BlockFile.read_int(b,LFLD_USED)

    off = LFLD_DATAPOINTS
    n_used.times do |i|
      n.set_data_point(i, GeoTree.read_data_point_from(b, off))
      off += DATAPOINT_INTS
    end
  end
  n
end
706
+
707
# Delete a node entirely: free its disk block and purge it from the
# cache, the LRU list, and the pending-modified set.
def delete_node(n)
  @block_file.free(n.name)
  remove_from(n, true, true)
  @mod_nodes.delete(n.name)
end
713
+
714
# Swap one node for another within the cache; both are expected to
# share the same name (block id).
def replace_node(orig, new_node)
  remove_from(orig, true, true)
  cache_node(new_node)
end
719
+
720
# Convert a leaf node to an internal node.
# Redistributes its data points (and those of any linked overflow nodes) to
# new child nodes.
# Returns the new internal node
def split_leaf_set(node,path)
  db = false
  # db = true
  !db || pr("\nsplit_leaf_set #{node} bounds=#{node.bounds} vert=#{node.vertical}...\n")

  # list of data points from the leaf node (and its overflow siblings)
  dp = []

  n2 = node
  while true
    # append this node's points to our buffer
    dp.concat n2.pts

    next_id = n2.overflow
    # clear this node's link to its overflow, if any
    n2.overflow = 0

    # If it's one of the overflow nodes (and not the original leaf node), delete it
    if n2 != node
      delete_node(n2)
    end

    break if (next_id == 0)

    b = n2.bounds

    n2 = read_node(next_id,b,n2.vertical)
  end

  !db || pr(" datapoints=#{d(dp)}\n")

  # the interior node takes over the leaf's name, orientation and bounds
  ni = NodeI.new(node.name,node.vertical,node.bounds)

  a = GeoTree.calc_partitions(ni.bounds,dp,ni.vertical)
  !db || pr(" partitions=#{d(a)}\n")

  a.each_with_index do |posn,i|
    # children start out empty (child name 0)
    p = Partition.new(posn,0)
    ni.set_slot(i,p)
  end

  replace_node(node,ni)

  # Add each of the data points to this new internal node
  dp.each do |pt|
    add_data_point(pt,ni.name,path,ni.bounds,ni.vertical)
  end
  ni
end
773
+
774
# Total number of datapoints stored in a leaf node plus all of its
# linked overflow nodes.
def leaf_population(node)
  total = node.used
  until node.overflow == 0
    node = read_node(node.overflow, node.bounds, node.vertical)
    total += node.used
  end
  total
end
782
+
783
# Descend from the node named node_name, adding datapoint dp to the
# appropriate leaf; creates child/overflow nodes and splits overfull
# leaf sets along the way as needed.
# @param path if not nil, each interior node visited is appended to it
# @param b bounds of the starting node
# @param v orientation of the starting node
def add_data_point(dp, node_name, path, b, v)
  db = false
  # db = true

  !db || pr("\n\nadd_data_point #{dp}, node name #{node_name}\n")
  n = read_node(node_name,b,v)

  # iterate until we have found a leaf node with remaining capacity
  while true
    !db || pr(" ...top of iteration\n")

    if (n.leaf)
      # If the leaf node and overflow nodes have reached a certain size, create a new internal node,
      # and continue recursing.
      # Don't do this if the node's bounds are very small.

      cap = SPLIT_SIZE

      if (leaf_population(n) >= cap && n.splittable)
        n = split_leaf_set(n,path)
        next # do another iteration
      end

      # Add to next unused slot; create new overflow node if necessary
      while n.used == NODEL_CAPACITY
        # Move to overflow node; if it doesn't exist, create one
        n = get_next_overflow(n)
      end

      n.add_data_point(dp)
      write_node(n)
      break
    end

    # An internal node
    if (path)
      path << n
    end
    child_slot = n.slot_containing_point(dp.loc)
    child_node_id = n.slot_child(child_slot)
    b = n.slot_bounds(child_slot)

    # children alternate orientation with each level
    v = !v
    if child_node_id == 0
      # Create a new child node
      child_node_id = @block_file.alloc

      n3 = NodeL.new(child_node_id,v,b)
      # we need to add this node to the cache since it's just been built
      cache_node(n3)
      write_node(n3)
      n.set_slot_child(child_slot, child_node_id)
      write_node(n)
      n = n3
    else
      n = read_node(child_node_id, b,v)
    end
  end
end
844
+
845
# Get the next overflow node for a leaf node; create one if necessary.
# A new overflow node inherits the leaf's orientation and bounds.
def get_next_overflow(n)
  ovid = n.overflow
  if ovid==0
    ovid = @block_file.alloc()
    n2 = NodeL.new(ovid,n.vertical,n.bounds)
    # we need to add this node to the cache since it's just been built
    cache_node(n2)
    write_node(n2)
    # link the leaf to its new overflow node
    n.overflow = ovid
    write_node(n)
  end
  read_node(ovid,n.bounds,n.vertical)
end
859
+
860
# Recursive workhorse for find: append to dest every datapoint lying
# within rect in the subtree rooted at the node named 'name'.
# @param b bounds of the node
# @param v orientation of the node
def find_aux(rect,dest,name,b,v)
  n = read_node(name,b,v)
  if !n.leaf

    NODEI_CHILDREN.times do |i|
      child_name = n.slot_child(i)
      next if child_name == 0

      r2 = n.slot_bounds(i)
      # prune children whose bounds don't intersect the query rect
      next if !Bounds.intersect(rect,r2)
      find_aux(rect,dest,child_name,r2,!v)
    end

  else
    n.pts().each do |dp|
      next if !rect.contains_point(dp.loc)
      dest << dp
    end

    # continue through the overflow chain
    overflow = n.overflow
    if overflow != 0
      find_aux(rect,dest,overflow,b,v)
    end
  end
end
885
+
886
# Build the full leaf set for a leaf node: the node itself followed by
# every node in its overflow chain, in order.
def build_leaf_set(leaf_node)
  set = [leaf_node]
  n = leaf_node
  until n.overflow == 0
    n = read_node(n.overflow, n.bounds, n.vertical)
    set << n
  end
  set
end
896
+
897
# Append indentation (one space per level) to string s.
def tab(s, indent)
  s.concat(" " * indent)
end
900
+
901
# Recursive helper for dump: append a text rendering of node n and its
# subtree to string s at the given indent level.
# @param dc dictionary of node names already visited (cycle guard)
def dump_aux(s, n, indent, dc)
  # assert!(!(dc.member? n.name))
  dc[n.name] = n.name
  tab(s,indent)
  s << n.to_s
  s << "\n"
  if !n.leaf
    indent += 1
    NODEI_CHILDREN.times do |i|
      p = n.slot(i)
      if p.child_name != 0
        tab(s,indent)
        s << "Slot ##{i}:#{p.child_name} \n"
        cb = n.slot_bounds(i)
        dump_aux(s,read_node(p.child_name,cb,!n.vertical),indent+1,dc)
      end
    end
  else
    # overflow leaves are printed at the same indent level as their owner
    ovf = n.overflow
    if ovf > 0
      dump_aux(s,read_node(ovf,n.bounds,n.vertical),indent,dc)
    end
  end
end
925
+
926
# Read the root node; it always lives in the first block of the file
# and covers the entire coordinate space.
def read_root_node
  read_node(ROOT_NODE_NAME_,@@start_bounds,false)
end
929
+
930
+ end
931
+
932
+ private
933
+
934
# Accumulates node counts and depth statistics while walking a GeoTree
# (see GeoTree#statistics / aux_stats).
class TreeStats
  attr_accessor :leaf_count, :interior_count, :overflow_count, :leaf_depth_max

  def initialize
    @leaf_count = 0
    @interior_count = 0
    @overflow_count = 0
    @leaf_used_sum = 0   # total datapoints seen across all leaves
    @leaf_depth_sum = 0  # total depth across leaves (for the average)
    @leaf_depth_max = 0
  end

  # Record one visited node.
  # @param n node (leaf or interior)
  # @param overflow true if n was reached via an overflow link
  # @param depth depth of n within the tree
  def process_node(n, overflow, depth)
    unless n.leaf
      @interior_count += 1
      return
    end

    @leaf_count += 1
    @leaf_used_sum += n.used
    @leaf_depth_sum += depth
    @overflow_count += 1 if overflow
    @leaf_depth_max = depth if depth > @leaf_depth_max
  end

  # Produce a dictionary of statistic name (string) => value.
  def summary
    leaf_usage = 0
    if (leaf_count > 0)
      leaf_usage = (@leaf_used_sum / @leaf_count.to_f) / NODEL_CAPACITY
    end
    avg_depth = 0
    if @leaf_count > 0
      avg_depth = @leaf_depth_sum / @leaf_count.to_f
    end

    {
      'leaf_nodes' => leaf_count,
      'interior_nodes' => interior_count,
      'overflow_nodes' => overflow_count,
      'leaf_usage' => leaf_usage,
      'leaf_depth (avg)' => avg_depth,
      'leaf_depth (max)' => leaf_depth_max,
    }
  end

end
979
+
980
+ end