metrocot 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/metrocot.rb ADDED
@@ -0,0 +1,1112 @@
1
+
2
+ #############################################################################
3
+ #
4
+ # Copyright (c) 2009 Metro Cascade Media Inc
5
+ #
6
+ # Permission is hereby granted, free of charge, to any person obtaining
7
+ # a copy of this software and associated documentation files (the
8
+ # 'Software'), to deal in the Software without restriction, including
9
+ # without limitation the rights to use, copy, modify, merge, publish,
10
+ # distribute, sublicense, and/or sell copies of the Software, and to
11
+ # permit persons to whom the Software is furnished to do so, subject to
12
+ # the following conditions:
13
+ #
14
+ # The above copyright notice and this permission notice shall be
15
+ # included in all copies or substantial portions of the Software.
16
+ #
17
+ # THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
18
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
20
+ # IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
21
+ # CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
22
+ # TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
23
+ # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
24
+ #
25
+ #############################################################################
26
+ #
27
+ # Helmut Hissen <helmut@zeebar.com> (Metro Cascade Media Inc)
28
+ # January 1 2009
29
+ #
30
+ #############################################################################
31
+ #
32
+ # We are like tiny pleasantly chirping hex bugs coding away on the
33
+ # shoulders of Why so that we can create more, and do more with less
34
+ # code, not by virtue of any sharpness of mind on our part, or any
35
+ # other distinction, but because we are carried high and raised up
36
+ # by his giant size.
37
+ #
38
+ #############################################################################
39
+ #
40
+
41
+ class Metrocot < Object
42
+
43
+ VERSION = '1.0.0'
44
+
45
# A span of the scraper's flattened node list. Each end of the span is a
# (node index, offset) pair; the pattern classes use the offset as a
# character position inside a text node.
class MatchRange

  attr_accessor :node_scraper, :start_index, :start_offset, :end_index, :end_offset, :verbose

  # node_scraper:: object that answers #flattened_hnodes for this range
  # start_index, start_offset:: where the range begins
  # end_index, end_offset:: where the range ends
  def initialize( node_scraper, start_index, start_offset, end_index, end_offset )
    @node_scraper = node_scraper
    @start_index = start_index
    @start_offset = start_offset
    @end_index = end_index
    @end_offset = end_offset
    @verbose = false
  end


  # The flattened node list the indexes of this range refer to.
  def hnodes
    @node_scraper.flattened_hnodes
  end


  # A new range on the same scraper with all four coordinates replaced.
  def crop( crop_start_index, crop_start_offset, crop_end_index, crop_end_offset )
    MatchRange.new( node_scraper, crop_start_index, crop_start_offset, crop_end_index, crop_end_offset )
  end


  # True when the range covers nothing: the start lies past the end, or both
  # ends are in the same node and the start offset is not before the end
  # offset. (The index comparison used to be >=, which declared every
  # same-node range empty and made the offset check unreachable.)
  def empty?
    @start_index > @end_index || (@start_index == @end_index && @start_offset >= @end_offset)
  end


  # The part of this range that lies after other_range.
  def following( other_range )
    MatchRange.new( node_scraper, other_range.end_index, other_range.end_offset, end_index, end_offset )
  end


  # Same end as this range, but starting at the given position.
  def tail( tail_start_index, tail_start_offset )
    MatchRange.new( @node_scraper, tail_start_index, tail_start_offset, @end_index, @end_offset )
  end


  # Same start as this range, but ending at the given position.
  def head( head_end_index, head_end_offset )
    MatchRange.new( @node_scraper, @start_index, @start_offset, head_end_index, head_end_offset )
  end


  # Splits around +middle+ and returns an Array of ranges: an optional leading
  # part, +middle+ itself, and an optional trailing part that runs to the end
  # of this range.
  #
  # NOTE(review): the leading part is measured from position 0+0 rather than
  # from this range's own start; that is kept as originally written.
  def split_at( middle )
    parts = []
    if middle.start_index > 0 || middle.start_offset > 0
      parts << crop( 0, 0, middle.start_index, middle.start_offset )
    end
    parts << middle
    if middle.end_index < end_index || (middle.end_index == end_index && middle.end_offset < end_offset)
      parts << crop( middle.end_index, middle.end_offset, end_index, end_offset )
    end
    parts
  end


  # Returns a new range that covers both this range and +other+.
  #
  # When an end moves to a different node its offset has to move with it,
  # otherwise the result pairs the other range's node index with this range's
  # offset.
  def extend( other )

    extended_range = MatchRange.new( @node_scraper, @start_index, @start_offset, @end_index, @end_offset )

    if other.start_index < extended_range.start_index
      extended_range.start_index = other.start_index
      extended_range.start_offset = other.start_offset
    elsif other.start_index == extended_range.start_index && other.start_offset < extended_range.start_offset
      extended_range.start_offset = other.start_offset
    end

    if other.end_index > extended_range.end_index
      extended_range.end_index = other.end_index
      extended_range.end_offset = other.end_offset
    elsif other.end_index == extended_range.end_index && other.end_offset > extended_range.end_offset
      extended_range.end_offset = other.end_offset
    end

    extended_range

  end


  # Short human readable form, e.g. "[3+0 ... 5+2]".
  def describe
    "[#{start_index}+#{start_offset} ... #{end_index}+#{end_offset}]"
  end

end
127
+
128
+
129
+
130
# Behaviour shared by all pattern kinds: neighbour links (pred/succ), an
# optional capture name, logging helpers and the bookkeeping that puts a
# captured value into the match map while a continuation block runs.
class BasePattern

  attr_accessor :pred, :succ, :source, :name, :matched, :node_scraper, :metrocot, :pattern_no

  # Number of patterns created so far; each new pattern takes the current
  # value as its pattern_no.
  @@instance_count = 0

  # source:: the piece of pattern text this pattern was built from
  def initialize( source )
    @source = source
    @pattern_no = @@instance_count
    @@instance_count = @pattern_no + 1
  end

  # Whether a composite may skip this pattern; the base answer is no.
  def optional
    false
  end

  # Sends a message, prefixed with this pattern's description, to the owner's log.
  def log( s )
    metrocot.log("#{self.description}: #{s}")
  end

  # Appends a one line description of this pattern to +out+.
  def dump( level, out )
    out << "#{" " * level}#{description} p=#{priority}\n"
  end

  # Subclasses that can be written in pattern text override this.
  def self.parse(s)
    raise "not supported"
  end

  # Label used in log output; defaults to the class name.
  def description
    self.class.name
  end


  # Writes a nested, indented rendering of +match_map+ to +out+.
  # Hashes and arrays are walked recursively, strings are quoted, Hpricot
  # elements are shown by tag name and anything else by its class.
  def dump_match_map( out, level, match_map )
    case match_map
    when Hash
      out << "{\n"
      inner = level + 1
      match_map.each do |key, value|
        out << " " * inner + "#{key} => "
        dump_match_map( out, inner, value )
      end
      out << " " * level + "}\n"
    when Array
      out << "[\n"
      inner = level + 1
      match_map.each do |value|
        out << " " * inner
        dump_match_map( out, inner, value )
      end
      out << " " * level + "]\n"
    when String
      out << "\"" + match_map + "\"\n"
    when Hpricot::Elem
      out << "<" + match_map.stag.name + ">\n"
    else
      out << match_map.class.to_s + "\n"
    end
  end


  # Logs +msg+ together with the range, and dumps a non-empty match map to
  # STDOUT when the scraper is verbose.
  def log_match_data( msg, match_range, match_map )
    log("#{msg} #{match_range.describe} map:")
    return nil unless @node_scraper.verbose
    return nil if match_map.nil? || match_map == {}
    dump_match_map( STDOUT, 0, match_map )
  end


  # Base implementation only logs; subclasses call it via super and then
  # yield (range, map) for each match they find.
  def each_match( match_range, match_map )
    log_match_data("each_match", match_range, match_map)
  end

  # Higher priorities are matched first by a composite.
  def priority
    0
  end

  # Records +match_data+ in +match_map+, yields the map and returns whatever
  # the block returned.
  #
  # * with a scanner: the scanner's result is stored under +name+
  # * named, no scanner: the raw data is stored under +name+
  # * unnamed composite carrying a Hash: its entries are merged into the map
  # * otherwise nothing is stored
  #
  # An entry stored under +name+ is removed again once the block returns.
  def with_scanned_match_data( match_map, match_data )

    scanner = name ? @node_scraper.scanner_by_name(name) : default_scanner

    if scanner
      # scanner errors are deliberately not rescued here
      match_map[name] = scanner.scan(match_data)
    elsif name
      match_map[name] = match_data
    elsif self.is_a?(CompositePattern) && match_data.is_a?(Hash)
      log("copying #{match_data.class.name} match data from #{self.description}")
      match_data.each do |key, value|
        match_map[key] = value
      end
    else
      log("not carrying #{match_data.class.name} match data from #{self.description}")
    end

    outcome = yield( match_map )
    match_map.delete(name) if name
    outcome

  end

  # Scanner used when the pattern has no name; none by default.
  def default_scanner
    nil
  end


end
248
+
249
+
250
# Matches nodes found by running a search path (the pattern text after the
# leading ".") against the scraper's root node.
class PathPattern < BasePattern

  # source:: the pattern text including the leading "."
  # path:: the search expression handed to the node's #search
  def initialize( source, path )
    super(source)
    @path = path
  end

  # Recognises ".something" up to the next blank (or the end of the text).
  # Returns nil for text that does not start with ".", that starts with "..",
  # or where the "." is directly followed by a blank.
  def self.parse( s )
    return nil unless s.start_with?(".")
    return nil if s.start_with?("..")
    stop = s.index(" ") || s.size
    return nil if stop == 1
    new( s[0, stop], s[1, stop - 1] )
  end


  def source
    @source
  end


  def description
    "path \"#{@path}\""
  end


  # Runs the search and offers every hit whose index lies inside
  # +match_range+ to the block as (range covering the hit and its
  # descendants, match map). Stops at the first hit for which the block
  # returns a truthy value and returns that value, otherwise nil.
  def each_match( match_range, match_map )
    super(match_range, match_map)
    outcome = nil
    root = match_range.node_scraper.hnode
    root.search( @path ).each do |found|
      found_ix = node_scraper.hnode_index[found]
      after_ix = node_scraper.hnode_succ_index[found]

      unless found_ix
        # the hit is unknown to the index: log the whole index, then give up
        @node_scraper.flattened_hnodes.each do |node|
          log( "#{@node_scraper.hnode_index[node]}: #{node}" )
        end
        raise "no node index for #{found.class} #{found}"
      end

      if found_ix < match_range.start_index
        log( "too far left: #{found_ix}" )
        next
      end

      if found_ix >= match_range.end_index
        log( "too far right: #{found_ix}" )
        break
      end

      log( "matched path at node #{found_ix}" )
      outcome = with_scanned_match_data( match_map, found ) do |scanned_map|
        yield( match_range.crop(found_ix, 0, after_ix, 0) , scanned_map )
      end
      break if outcome
    end
    outcome
  end


  def priority
    1
  end


end
313
+
314
+
315
# Optional run of whitespace at the start of the range. It always yields
# once: with the leading whitespace of the first text node consumed when
# there is any, and with the range untouched otherwise.
class OptSpacePattern < BasePattern

  def initialize
    super(" ")
  end

  def description
    "spaces"
  end

  # A composite is allowed to skip this pattern.
  def optional
    true
  end

  # Recognised when the pattern text starts with a blank.
  def self.parse( s )
    return nil unless s[0..0] == " "
    OptSpacePattern.new
  end

  def priority
    -7
  end

  # Yields (rest of the range after any leading whitespace, match map) and
  # returns the block's result.
  def each_match( match_range, match_map )
    super(match_range, match_map)
    match_start_index = match_range.start_index
    match_start_offset = match_range.start_offset
    match_end_index = match_range.start_index
    match_end_offset = match_range.start_offset

    hnodes = match_range.hnodes

    if hnodes[match_start_index] && hnodes[match_start_index].text?
      hnode_text = hnodes[match_start_index].inner_text

      # Advance one character at a time while that character is whitespace.
      # The test used to be applied to the whole substring collected so far,
      # so a single leading blank made the loop run to the end of the node.
      while match_end_offset < hnode_text.size && hnode_text[match_end_offset, 1] =~ /\s/
        match_end_offset += 1
      end

      if match_end_offset > match_start_offset
        if match_end_offset >= hnode_text.size
          # whole remainder of the node was whitespace: continue at the next node
          match_range = match_range.tail( match_end_index + 1, 0 )
          log( "matched entire string of #{match_end_offset - match_start_offset} spaces" )
        else
          match_range = match_range.tail( match_end_index, match_end_offset )
          log( "matched first #{match_end_offset - match_start_offset} leading spaces" )
        end
      end
    end

    # start and end index are always equal here, so the match data is an empty list
    with_scanned_match_data( match_map, hnodes[match_start_index ... match_end_index] ) { |scanned_map|
      yield( match_range, scanned_map )
    }

  end

end
374
+
375
+
376
# Wraps another pattern (the repeatee) and matches it one or more times in
# a row.
class OneOrMorePattern < BasePattern

  # The wrapped pattern.
  attr_reader :repeatee

  def initialize(repeatee)
    super( nil )
    @repeatee = repeatee
  end


  # Never parsed directly; the compiler creates it when it sees "+".
  def self.parse(s)
    raise "not implemented"
  end


  def description
    "one+ ##{pattern_no}"
  end


  def dump( level, out )
    out << " " * level + "one or more p=#{priority}\n"
    @repeatee.dump( level + 1, out )
  end


  # Takes over the priority of the wrapped pattern.
  def priority
    @repeatee.priority
  end


  # Recursively matches the repeatee in +match_range+, appending the node
  # slice of every match to +matches+. Returns the range of the last match
  # found, or nil when the repeatee does not match at all.
  def consume_remaining_matches( match_range, match_map, matches )

    log("consuming remaining matches in #{match_range}")
    @repeatee.each_match( match_range, match_map ) do |found_range, _found_map|
      matches << found_range.hnodes[found_range.start_index ... found_range.end_index]
      remainder = match_range.tail( found_range.end_index, found_range.end_offset )
      deeper = consume_remaining_matches( remainder, match_map, matches )
      # only the first match at each level is followed
      return deeper || found_range
    end
    nil

  end


  # For each first match of the repeatee, greedily consumes the following
  # matches and yields the combined range. Returns the first truthy block
  # result, or nil.
  #
  # NOTE(review): the second value yielded is the list of follow-up matches,
  # not the match map, and that list does not contain the first match.
  # Kept as is; confirm whether this is intended.
  def each_match( match_range, match_map )

    super(match_range, match_map)

    log("looking for first match in #{match_range}")
    @repeatee.each_match( match_range, match_map ) do |first_range, _first_map|
      results = []
      after_first = match_range.tail( first_range.end_index, first_range.end_offset )
      last_range = consume_remaining_matches( after_first, match_map, results )

      combined_range = first_range
      if last_range
        combined_range = match_range.crop(
          first_range.start_index, first_range.start_offset,
          last_range.end_index, last_range.end_offset
        )
      end

      log("combined match in #{combined_range}")

      outcome = with_scanned_match_data( match_map, results ) do |_scanned_map|
        yield( combined_range, results )
      end

      if outcome
        log("one+ match done with #{outcome}")
        return outcome
      end

      log("one+ match not done")
    end

    nil
  end

end
459
+
460
+
461
# The "..." wildcard: accepts whatever range it is handed.
class AnythingPattern < BasePattern

  def description
    "anything"
  end

  # Recognised when the pattern text starts with "...".
  def self.parse( s )
    s.index("...") == 0 ? self.new("...") : nil
  end


  # Always matches the whole range it is given: it has the lowest priority,
  # so by the time it runs the range is the gap left between its neighbours.
  #
  # NOTE(review): the node slice used as match data includes the node at
  # end_index (inclusive range), unlike the exclusive slices used by the
  # other patterns. Kept as is; confirm whether this is intended.
  def each_match( match_range, match_map )
    covered_nodes = match_range.hnodes[match_range.start_index .. match_range.end_index]
    with_scanned_match_data( match_map, covered_nodes ) do |scanned_map|
      yield( match_range, scanned_map )
    end
  end


  def priority
    -7
  end

end
488
+
489
+
490
# Matches literal text ("...") or a regular expression (/.../) inside the
# text nodes of a range. "$" is shorthand for a line break.
class TextPattern < BasePattern

  # source:: the pattern text including its delimiters
  # text:: a String for literal matching or a Regexp
  def initialize( source, text )
    super(source)
    @text = text
  end

  def description
    "text \"#{@text}\""
  end

  # Recognises "$", /regexp/ and "quoted text" at the start of +s+.
  # Inside the delimiters a backslash escapes the delimiter itself.
  # Returns nil when +s+ starts with none of them.
  def self.parse( s )

    return self.new( "$", /[\r\n]/ ) if s.index("$") == 0

    if s.index("/") == 0
      src, body = read_delimited( s, "/" )
      return self.new( src, Regexp.compile( body ) )
    end

    if s.index("\"") == 0
      src, body = read_delimited( s, "\"" )
      return self.new( src, body )
    end

    nil

  end

  # Reads a delimited token from the start of +s+ (which begins with
  # +delimiter+). Returns [consumed source text, unescaped body]. A missing
  # closing delimiter is tolerated: the token then runs to the end of +s+.
  def self.read_delimited( s, delimiter )
    escaped = "\\" + delimiter
    body = String.new
    src = delimiter.dup
    rest = s[1..-1]
    until rest.empty?
      if rest.index(delimiter) == 0
        src << delimiter
        break
      elsif rest.index(escaped) == 0
        body << delimiter
        src << escaped
        rest = rest[2..-1]
      else
        body << rest[0..0]
        src << rest[0..0]
        rest = rest[1..-1]
      end
    end
    [src, body]
  end
  private_class_method :read_delimited

  # Literal text outranks a regexp, and unnamed patterns outrank named ones.
  def priority
    if name
      @text.is_a?(String) ? -4 : -5
    else
      @text.is_a?(String) ? -2 : -3
    end
  end

  # Looks for the first occurrence of the text inside +match_range+ and
  # yields (range of exactly the matched characters, match map). Returns the
  # block's result, or nil when nothing in the range matches.
  #
  # Changes against the previous version:
  # * the yielded range now starts at the position where the text was found;
  #   it used to be a zero-width range placed relative to the search start
  # * when no text node matches, nil is returned instead of yielding a bogus
  #   match with nil data
  # * when the range ends inside a node (end_offset > 0) that node is
  #   searched too, up to end_offset
  def each_match( match_range, match_map )

    super(match_range, match_map)

    hnodes = match_range.hnodes

    # the node at end_index only belongs to the range when end_offset > 0
    last_index = match_range.end_offset > 0 ? match_range.end_index : match_range.end_index - 1

    match_index = nil
    match_start_offset = nil
    match_end_offset = nil
    actual_match = nil

    (match_range.start_index .. last_index).each { |node_index|

      hnode = hnodes[node_index]
      break if hnode.nil?

      unless hnode.text?
        log( "not text: ##{node_index} #{hnode.class}" )
        next
      end

      hnode_text = hnode.inner_text
      search_from = node_index == match_range.start_index ? match_range.start_offset : 0
      search_limit = node_index == match_range.end_index ? match_range.end_offset : hnode_text.size

      # searching a truncated copy keeps the match inside the range
      window = hnode_text[0, search_limit]

      log( "trying text match on: #{window[search_from .. -1]}" )

      match_offset = window.index( @text, search_from )
      next unless match_offset

      # String#index with a Regexp leaves the match in $~
      actual_match = @text.is_a?(Regexp) ? Regexp.last_match(0) : @text

      match_index = node_index
      match_start_offset = match_offset
      match_end_offset = match_offset + actual_match.size

      if match_end_offset >= hnode_text.size
        log( "matched entire string of #{match_end_offset - match_start_offset} chars" )
      else
        log( "matched first #{match_end_offset - match_start_offset} chars" )
      end
      break
    }

    unless actual_match
      log( "no match found" )
      return nil
    end

    with_scanned_match_data( match_map, actual_match ) { |scanned_map|
      yield( match_range.crop( match_index, match_start_offset, match_index, match_end_offset ), scanned_map )
    }

  end

end
648
+
649
+
650
# A sequence of patterns that must match next to each other. Parts are
# matched in order of priority; each part is confined to the gap between its
# nearest neighbours that have already matched.
class CompositePattern < BasePattern

  attr_reader :parts

  # parts:: the sub-patterns in written order (copied); they get linked to
  #         their neighbours through pred and succ
  def initialize( parts = nil )
    super(nil)
    @parts = parts.nil? ? [] : parts.clone
    succ_pred = nil
    @parts.each { |part|
      part.pred = succ_pred
      succ_pred.succ = part unless succ_pred.nil?
      succ_pred = part
    }
  end


  def dump( level, out )
    out << " " * level + "composite p=#{priority} [\n"
    @parts.each { |part|
      part.dump( level + 1, out )
    }
    out << " " * level + "]\n"
  end


  # Matches parts_by_priority[ppx] and, recursively, all parts after it.
  # Once every part is placed, yields (match_range, match_map) and returns
  # the block's result. Returns the first truthy result, or nil.
  #
  # match_range:: the range of the whole composite; every level derives its
  #               own gap from it
  # part_matches:: unused, kept for interface compatibility
  def each_split_match( match_range, match_map, parts_by_priority, ppx, part_matches )

    pattern = nil

    # an optional part is skipped while its left neighbour is still unmatched
    while ppx < parts_by_priority.size
      pattern = parts_by_priority[ppx]
      break unless pattern.optional
      break unless pattern.pred && ! pattern.pred.matched
      log("skipping optional #{pattern.description}")
      ppx += 1
    end

    if ppx >= parts_by_priority.size
      log("comp nothing left to do")
      return yield( match_range, match_map )
    end

    #
    # figure out which gap this pattern is supposed to fill
    #
    # The gap goes into its own variable. The recursion below must keep
    # working from the composite's range: previously the narrowed gap was
    # passed down, so a later part lying on the other side of an already
    # matched neighbour ended up with an inverted range.
    #

    gap_range = match_range

    matched_on_right = pattern.succ
    while matched_on_right && ! matched_on_right.matched
      matched_on_right = matched_on_right.succ
    end

    if matched_on_right
      log("comp matching must be left of #{matched_on_right.description}")
      gap_range = gap_range.head(matched_on_right.matched.start_index, matched_on_right.matched.start_offset)
    end

    matched_on_left = pattern.pred
    while matched_on_left && ! matched_on_left.matched
      matched_on_left = matched_on_left.pred
    end

    if matched_on_left
      log("comp matching must be right of #{matched_on_left.description}")
      gap_range = gap_range.tail(matched_on_left.matched.end_index, matched_on_left.matched.end_offset)
    end

    log("comp matching sub-pattern: #{pattern.description} at #{gap_range.describe}")

    pattern.each_match( gap_range, match_map ) { |part_match_range, part_match_map|

      # visible to the other parts while the rest of the sequence is tried
      pattern.matched = part_match_range

      result = each_split_match( match_range, part_match_map, parts_by_priority, ppx + 1, part_matches ) { |sub_match_range, _sub_match_map|
        yield( sub_match_range, part_match_map )
      }

      pattern.matched = nil

      if result
        log("comp done, returning: #{result}")
        return result
      else
        log("comp not done")
      end
    }

    return nil

  end


  # Tries to place all parts inside +match_range+. For every complete
  # placement yields (range spanned by the matched parts, match map) and
  # returns the first truthy block result.
  def each_match( match_range, match_map )

    @parts.each { |part| part.matched = nil }

    super(match_range, match_map)

    # find the highest priority part and divide up the children;
    # parts with equal priority keep their written order (sort_by alone
    # does not guarantee that)
    parts_by_priority = @parts.each_with_index.sort_by { |part, position|
      [0 - part.priority, position]
    }.map { |part, _position| part }

    each_split_match( match_range, {}, parts_by_priority, 0, {} ) { |_last_match_range, last_match_map|
      comp_match_range = nil
      @parts.each { |part|
        next unless part.matched
        if ! comp_match_range
          comp_match_range = part.matched
          next
        end
        comp_match_range = comp_match_range.extend(part.matched)
      }
      result = with_scanned_match_data( match_map, last_match_map ) { |scanned_map|
        log_match_data("comp match trying", comp_match_range, scanned_map)
        yield( comp_match_range, scanned_map )
      }
      if result
        log("comp match done, returning: #{result}")
        return result
      else
        log("comp match not done")
      end
    }

  end

  def description
    "comp ##{pattern_no}"
  end

end
786
+
787
+
788
# Scrapes one node of a parsed document: keeps a flattened index of the
# node's subtree and runs compiled patterns over it.
class NodeScraper

  attr_accessor :mcot, :root, :parent, :hnode, :pattern_classes, :top_part_names, :verbose


  # mcot:: the owner; supplies verbose, log, compile_pattern and scanner_by_name
  # parent:: the scraper this one was descended from (nil at the top)
  # root:: the top-most scraper; defaults to this scraper
  # hnode:: the document node to scrape
  def initialize( mcot, parent, root, hnode )
    @mcot = mcot
    @parent = parent
    @root = root || self
    @hnode = hnode
    @verbose = mcot.verbose
  end


  def log( s )
    mcot.log( s ) if @verbose
  end


  # Yields a child scraper for every node found by +path+ and returns a Hash
  # mapping each found node to the block's result.
  def descend( path )

    results = {}
    @hnode.search( path ).each { |hchild|
      results[hchild] = yield( NodeScraper.new( @mcot, self, @root, hchild ) )
    }
    results

  end


  # Adds +node+ and (for elements) its descendants to the flattened list in
  # document order, starting at index +ix+. Records each node's own index and
  # the index that follows its subtree. Returns the next free index.
  def flatten_hnodes( ix, node )
    @flattened_hnodes << node
    @hnode_index[node] = ix
    ix += 1
    if node.elem?
      node.children.each { |child|
        ix = flatten_hnodes(ix, child)
      }
    end
    @hnode_succ_index[node] = ix
    ix
  end


  # (Re)builds the flattened list and both index hashes from hnode.
  def build_hnode_index

    @flattened_hnodes = []
    @hnode_index = {}
    @hnode_succ_index = {}

    n = flatten_hnodes( 0, hnode )

    log( "built index for #{n} hnodes" )

  end


  # All nodes of the subtree in document order (built on first use).
  def flattened_hnodes
    build_hnode_index unless @flattened_hnodes
    @flattened_hnodes
  end


  # node => position in flattened_hnodes (built on first use).
  def hnode_index
    build_hnode_index unless @hnode_index
    @hnode_index
  end


  # node => position just after the node's subtree (built on first use).
  # This used to call build_hnode_succ_index, which does not exist; all three
  # structures are built by build_hnode_index.
  def hnode_succ_index
    build_hnode_index unless @hnode_succ_index
    @hnode_succ_index
  end


  # Compiles +pattern_s+, matches it repeatedly over the subtree and calls
  # the block for every match. Truthy block results are collected and the
  # search resumes after that match; the first falsy result (or the absence
  # of a match) ends the search. Returns the collected results.
  #
  # call_with:: :positional passes the values of the named top level parts
  #             as separate arguments; any other value passes the match map
  #             as one argument.
  #
  # The map used to be splatted and joined directly, which raised
  # NoMethodError for a Hash, so the map form never worked. A non-Hash map
  # (as produced by a top level "one or more" pattern) is still splatted.
  def collect_gen( pattern_s, call_with, &block )
    pattern = @mcot.compile_pattern( pattern_s, self )
    top_part_names = []
    if pattern.is_a? CompositePattern
      pattern.parts.each { |part|
        top_part_names << part.name if part.name
      }
    end
    log("top part names: #{top_part_names.join(", ")}")
    build_hnode_index
    pattern.dump( 0, $stdout ) if @verbose
    results = []
    match_range = MatchRange.new( self, 0, 0, flattened_hnodes.size, 0)
    while ! match_range.empty?
      result = nil
      pattern.each_match( match_range, {} ) { |sub_match_range, match_map|
        block_args = if (call_with == :positional) && top_part_names.size > 0
          top_part_names.collect { |top_name|
            match_map[top_name]
          }
        elsif match_map.is_a?(Hash)
          [match_map]
        else
          [*match_map]
        end
        log("calling scan block with: #{block_args.join(", ")}")
        result = block.call( *block_args )
        if result
          results << result
          match_range = match_range.following( sub_match_range )
        end

        result
      }

      break unless result
    end
    results
  end


  # collect_gen with the named parts passed as positional block arguments.
  def collect( pattern_s, &block )
    collect_gen( pattern_s, :positional, &block )
  end


  # collect_gen with the match map passed to the block as a single Hash.
  def collect_hashed( pattern_s, &block )
    collect_gen( pattern_s, :map, &block )
  end

  def scanner_by_name( name )
    return mcot.scanner_by_name(name)
  end

end
923
+
924
+
925
# Prints +s+ to standard output, but only in verbose mode.
def log( s )
  return unless @verbose
  puts( s )
end
928
+
929
+
930
+
931
# Returns the scanner registered under +name+, or nil when there is none.
def scanner_by_name( name )
  registry = @scanners
  registry[name]
end
934
+
935
+
936
# Compiles pattern text into a pattern object.
#
# The text is a sequence of parts, each optionally prefixed with "name=":
# a path (".h1"), text ("..." in quotes, /regexp/ or $), the wildcard "...",
# a blank (optional whitespace), a parenthesised group, or "+" which turns
# the preceding part into "one or more". Compilation stops at an unmatched
# ")" so that the method can call itself for groups.
#
# pattern_s:: the pattern text
# node_scraper:: the scraper the resulting patterns are attached to
#
# Returns a single pattern, a CompositePattern when there are several parts,
# or nil for empty text. The result's +source+ is the text that was actually
# consumed. Raises RuntimeError on text that cannot be parsed.
#
# The result is stored in @compiled_patterns but never read back from there.
def compile_pattern( pattern_s, node_scraper )

  s = pattern_s
  patterns = []

  log("compiling: #{s}")

  while !s.empty?

    log("left: #{s}")

    break if s.index(")") == 0

    # optional "name=" prefix; anchored at the very start of the remaining
    # text (the former scan with ^ matched at every line start and kept the
    # last hit)
    name = s[/\A(\w+)=/, 1]
    s = s[(name.length + 1) .. -1] if name

    log("after name #{name}: #{s}")

    pattern = nil

    [PathPattern, TextPattern, AnythingPattern, OptSpacePattern].each { |pattern_class|
      pattern = pattern_class.parse(s)
      if pattern
        pattern.metrocot = self
        pattern.node_scraper = node_scraper
        break
      end
      log "not a #{pattern_class}"
    }

    if pattern
      s = s[pattern.source.size .. -1]
      patterns << pattern
      log("found: #{pattern.description}")
      if name
        log("scanned as: #{name}")
        pattern.name = name.to_sym
      end
      next
    end

    if s[0..0] == "+"
      raise "+ must follow pattern" unless patterns.size > 0
      raise "+ applied twice does not make sense" if patterns[-1].is_a? OneOrMorePattern
      pattern = OneOrMorePattern.new( patterns[-1] )
      pattern.metrocot = self
      pattern.node_scraper = node_scraper
      patterns[-1] = pattern
      log("now one or more: #{pattern.repeatee}")
      s = s[1 .. -1]
      next
    end

    if s[0..0] == "("
      pattern = compile_pattern( s[1 .. -1], node_scraper )
      # "()" compiles to nil; report it instead of failing on nil.source
      raise "empty group \"()\" in pattern" if pattern.nil?
      close_par_index = pattern.source.size + 1
      found = s[close_par_index..close_par_index]
      raise "expected ')' found '#{found}'" unless found == ")"
      s = s[close_par_index + 1 .. -1]
      log("found nested: #{pattern.description} \"#{pattern.source}\"")
      patterns << pattern
      if name
        pattern.name = name.to_sym
      end
      next
    end

    raise "unrecognizable pattern: \"#{s[0..10]}...\""

  end

  pattern = if patterns.size > 1
    CompositePattern.new( patterns )
  elsif patterns.size == 1
    patterns[0]
  else
    nil
  end

  if pattern
    pattern.metrocot = self
    pattern.node_scraper = node_scraper
    # everything of pattern_s except the unconsumed rest
    pattern.source = pattern_s[0 .. (0 - (1 + s.size))]
  end

  return @compiled_patterns[ pattern_s ] = pattern

end
1033
+
1034
+
1035
# When true, log messages are printed.
attr_accessor :verbose


# scanners:: name => scanner pairs. A value that is a Class is instantiated
#            (with no arguments); any other value is used as given.
def initialize( scanners )

  @scanners = {}
  @compiled_patterns = {}

  scanners.each do |name, value|
    @scanners[name] = value.is_a?(Class) ? value.new : value
  end

  @verbose = false

  log("scanners: #{@scanners.inspect}")

end
1056
+
1057
+
1058
# Creates a top level NodeScraper (no parent, no explicit root) for +doc+.
def scrape(doc)
  parent = nil
  root = nil
  NodeScraper.new( self, parent, root, doc )
end
1061
+
1062
+
1063
# Scanners turn raw match data into the value stored in the match map.
# Every scanner answers #scan(data).
module Scanners

  # Time.parse is defined by the 'time' standard library, which the file
  # does not load anywhere else.
  require 'time'

  # Default behaviour: the data converted to a String.
  class BaseScanner
    def scan(data)
      data.to_s
    end
  end

  # Parses the text (an element's inner text, or the data itself) as a Time.
  class DateTimeScanner < BaseScanner
    def scan( data )
      if data.is_a? Hpricot::Elem
        data = data.inner_text
      end
      Time.parse(data)
    end
  end

  class TextLookupScanner < BaseScanner
  end

  # Returns an element's inner text, or the data converted to a String.
  # (Used to return nil for anything that was not an element, because the
  # value of the bare +if+ was the method's result.)
  #
  # NOTE(review): no Textile conversion is performed here.
  class TextileScanner < BaseScanner
    def scan( data )
      if data.is_a? Hpricot::Elem
        data.inner_text
      else
        data.to_s
      end
    end
  end

  # Returns an element's inner text, or the data converted to a String.
  class TextScanner < BaseScanner
    def scan( data )
      if data.is_a? Hpricot::Elem
        data = data.inner_text
      else
        data = data.to_s
      end
      data
    end
  end

  class LineScanner < BaseScanner
  end


end
1107
+
1108
+ end
1109
+
1110
+ #
1111
+ #############################################################################
1112
+ #