metrocot 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
data/lib/metrocot.rb ADDED
@@ -0,0 +1,1112 @@
1
+
2
+ #############################################################################
3
+ #
4
+ # Copyright (c) 2009 Metro Cascade Media Inc
5
+ #
6
+ # Permission is hereby granted, free of charge, to any person obtaining
7
+ # a copy of this software and associated documentation files (the
8
+ # 'Software'), to deal in the Software without restriction, including
9
+ # without limitation the rights to use, copy, modify, merge, publish,
10
+ # distribute, sublicense, and/or sell copies of the Software, and to
11
+ # permit persons to whom the Software is furnished to do so, subject to
12
+ # the following conditions:
13
+ #
14
+ # The above copyright notice and this permission notice shall be
15
+ # included in all copies or substantial portions of the Software.
16
+ #
17
+ # THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
18
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
20
+ # IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
21
+ # CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
22
+ # TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
23
+ # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
24
+ #
25
+ #############################################################################
26
+ #
27
+ # Helmut Hissen <helmut@zeebar.com> (Metro Cascade Media Inc)
28
+ # January 1 2009
29
+ #
30
+ #############################################################################
31
+ #
32
+ # We are like tiny pleasantly chirping hex bugs coding away on the
33
+ # shoulders of Why so that we can create more, and do more with less
34
+ # code, not by virtue of any sharpness of mind on our part, or any
35
+ # other distinction, but because we are carried high and raised up
36
+ # by his giant size.
37
+ #
38
+ #############################################################################
39
+ #
40
+
41
+ class Metrocot < Object
42
+
43
+ VERSION = '1.0.0'
44
+
45
# A span over the scraper's flattened node list, delimited by two positions.
#
# A position is a (node index, character offset) pair. The span starts at
# (start_index, start_offset) and stops just before (end_index, end_offset).
class MatchRange

  attr_accessor :node_scraper, :start_index, :start_offset, :end_index, :end_offset, :verbose

  # node_scraper - object whose flattened_hnodes the indexes refer to
  # start_*/end_* - the two delimiting positions
  def initialize( node_scraper, start_index, start_offset, end_index, end_offset )
    @node_scraper = node_scraper
    @start_index = start_index
    @start_offset = start_offset
    @end_index = end_index
    @end_offset = end_offset
    @verbose = false
  end

  # The flattened node list of the owning scraper.
  def hnodes
    @node_scraper.flattened_hnodes
  end

  # A new range on the same scraper with all four bounds replaced.
  def crop( crop_start_index, crop_start_offset, crop_end_index, crop_end_offset )
    MatchRange.new( node_scraper, crop_start_index, crop_start_offset, crop_end_index, crop_end_offset )
  end

  # True when the start position is at or past the end position.
  #
  # A range that starts and ends in the same node is only empty when the
  # offsets say so; comparing the indexes with >= alone would wrongly report
  # every same-node range as empty.
  def empty?
    return true if @start_index > @end_index
    @start_index == @end_index && @start_offset >= @end_offset
  end

  # The part of this range that lies after other_range.
  def following( other_range )
    MatchRange.new( node_scraper, other_range.end_index, other_range.end_offset, end_index, end_offset )
  end

  # This range with its start moved to the given position.
  def tail( tail_start_index, tail_start_offset )
    MatchRange.new( @node_scraper, tail_start_index, tail_start_offset, @end_index, @end_offset )
  end

  # This range with its end moved to the given position.
  def head( head_end_index, head_end_offset )
    MatchRange.new( @node_scraper, @start_index, @start_offset, head_end_index, head_end_offset )
  end

  # Splits this range around middle.
  #
  # Returns an array holding, in order: the part of this range before middle
  # (only when middle starts after this range does), middle itself, and the
  # part after middle (only when middle ends before this range does).
  def split_at( middle )
    parts = []
    starts_later = ([middle.start_index, middle.start_offset] <=> [start_index, start_offset]) > 0
    parts << head( middle.start_index, middle.start_offset ) if starts_later
    parts << middle
    ends_earlier = ([middle.end_index, middle.end_offset] <=> [end_index, end_offset]) < 0
    parts << tail( middle.end_index, middle.end_offset ) if ends_earlier
    parts
  end

  # Returns a new range covering both this range and other; the receiver is
  # left untouched.
  #
  # Index and offset are always moved together: an offset only means
  # something relative to its own node index.
  #
  # NOTE: this shadows Object#extend for MatchRange instances; the name is
  # kept because callers rely on it.
  def extend( other )
    extended_range = MatchRange.new( @node_scraper, @start_index, @start_offset, @end_index, @end_offset )

    if ([other.start_index, other.start_offset] <=> [@start_index, @start_offset]) < 0
      extended_range.start_index = other.start_index
      extended_range.start_offset = other.start_offset
    end

    if ([other.end_index, other.end_offset] <=> [@end_index, @end_offset]) > 0
      extended_range.end_index = other.end_index
      extended_range.end_offset = other.end_offset
    end

    extended_range
  end

  # Short human readable form used in log output.
  def describe
    "[#{start_index}+#{start_offset} ... #{end_index}+#{end_offset}]"
  end

end
127
+
128
+
129
+
130
# Shared behaviour of all pattern nodes: neighbour links (pred/succ), a
# running instance number, logging through the owning Metrocot and the
# bookkeeping that stores match data in the match map while a block runs.
class BasePattern

  attr_accessor :pred, :succ, :source, :name, :matched, :node_scraper, :metrocot, :pattern_no

  @@instance_count = 0

  # source - the pattern text this node was built from (may be nil)
  def initialize( source )
    @source = source
    @pattern_no = @@instance_count
    @@instance_count = @pattern_no + 1
  end

  # Patterns are mandatory unless a subclass says otherwise.
  def optional
    false
  end

  # Forwards a message, prefixed with this pattern's description, to the
  # owning Metrocot's log.
  def log( s )
    metrocot.log("#{self.description}: #{s}")
  end

  # Appends a one-line summary of this pattern to out, indented by level.
  def dump( level, out )
    indent = " " * level
    out << (indent + description + " p=#{priority}\n")
  end

  # Subclasses that can be written in a pattern string override this.
  def self.parse(s)
    raise "not supported"
  end

  def description
    self.class.name
  end

  # Writes a nested, indented rendering of match_map to out. Hashes and
  # arrays are walked recursively, strings are quoted, Hpricot elements are
  # shown by tag name and anything else by its class name.
  def dump_match_map( out, level, match_map )
    case match_map
    when Hash
      out << "{\n"
      inner = level + 1
      match_map.each do |key, value|
        out << " " * inner + "#{key} => "
        dump_match_map( out, inner, value )
      end
      out << " " * level + "}\n"
    when Array
      out << "[\n"
      inner = level + 1
      match_map.each do |value|
        out << " " * inner
        dump_match_map( out, inner, value )
      end
      out << " " * level + "]\n"
    when String
      out << "\"" + match_map + "\"\n"
    when Hpricot::Elem
      out << "<" + match_map.stag.name + ">\n"
    else
      out << match_map.class.to_s + "\n"
    end
  end

  # Logs msg together with the range; when the scraper is verbose and the
  # map is neither nil nor empty, the map is also dumped to STDOUT.
  def log_match_data( msg, match_range, match_map )
    log("#{msg} #{match_range.describe} map:")
    return nil unless @node_scraper.verbose
    return nil if match_map.nil? || match_map == {}
    dump_match_map( STDOUT, 0, match_map )
  end

  # Base implementation only logs; subclasses call it via super and then
  # do the actual matching.
  def each_match( match_range, match_map )
    log_match_data("each_match", match_range, match_map)
  end

  def priority
    0
  end

  # Records match_data in match_map, yields the map, then removes this
  # pattern's own entry again and returns whatever the block returned.
  #
  # - With a scanner (looked up by name, or default_scanner for unnamed
  #   patterns) the scanned value is stored under name; scanner errors are
  #   not rescued.
  # - A named pattern without scanner stores the raw data under name.
  # - An unnamed composite copies the entries of a Hash result into the map.
  # - Otherwise the data is dropped (and that fact logged).
  def with_scanned_match_data( match_map, match_data )
    scanner = name ? @node_scraper.scanner_by_name(name) : default_scanner

    if scanner
      match_map[name] = scanner.scan(match_data)
    elsif name
      match_map[name] = match_data
    elsif self.is_a?(CompositePattern) && match_data.is_a?(Hash)
      log("copying #{match_data.class.name} match data from #{self.description}")
      match_data.each { |key, value| match_map[key] = value }
    else
      log("not carrying #{match_data.class.name} match data from #{self.description}")
    end

    outcome = yield( match_map )
    match_map.delete(name) if name
    outcome
  end

  # Scanner used for unnamed patterns; none by default.
  def default_scanner
    nil
  end

end
248
+
249
+
250
# Matches nodes found by running a search path (written as ".path" in the
# pattern string) against the scraper's root node.
class PathPattern < BasePattern

  # source - the pattern text including the leading "."
  # path   - the search path handed to the node's search method
  def initialize( source, path )
    super(source)
    @path = path
  end

  # Recognises ".something" at the start of s, up to the first space or the
  # end of the string. Returns nil for anything else, including ".." and a
  # lone ".".
  def self.parse( s )
    return nil unless s.start_with?(".")
    return nil if s.start_with?("..")
    token_end = s.index(" ") || s.size
    return nil if token_end == 1
    self.new( s[0...token_end], s[1...token_end] )
  end

  def source
    @source
  end

  def description
    "path \"#{@path}\""
  end

  # Walks the search results in order and yields a range covering each hit
  # (from the node's own index to its successor index) that starts inside
  # match_range. Hits before the range are skipped, the first hit at or past
  # the range end stops the walk, and so does the first truthy block result,
  # which is then returned.
  #
  # Raises when a search hit has no entry in the scraper's node index.
  def each_match( match_range, match_map )
    super(match_range, match_map)
    found = nil
    root_node = match_range.node_scraper.hnode

    root_node.search( @path ).each do |descendent|
      nix = node_scraper.hnode_index[descendent]
      next_node_nix = node_scraper.hnode_succ_index[descendent]

      if nix.nil?
        # dump the whole index to the log before giving up
        @node_scraper.flattened_hnodes.each do |node|
          log( "#{@node_scraper.hnode_index[node]}: #{node}" )
        end
        raise "no node index for #{descendent.class} #{descendent}"
      end

      if nix < match_range.start_index
        log( "too far left: #{nix}" )
        next
      end

      if nix >= match_range.end_index
        log( "too far right: #{nix}" )
        break
      end

      log( "matched path at node #{nix}" )
      found = with_scanned_match_data( match_map, descendent ) do |scanned_map|
        yield( match_range.crop(nix, 0, next_node_nix, 0) , scanned_map )
      end
      break if found
    end

    found
  end

  def priority
    1
  end

end
313
+
314
+
315
# Optional run of whitespace at the start of the range (a space in the
# pattern string).
class OptSpacePattern < BasePattern

  def initialize
    super(" ")
  end

  def description
    "spaces"
  end

  # Whitespace never has to be present.
  def optional
    true
  end

  # Recognises a single leading space in the pattern string.
  def self.parse( s )
    return nil unless s[0..0] == " "
    OptSpacePattern.new
  end

  def priority
    -7
  end

  # Skips leading whitespace in the text node the range starts in and
  # yields what remains of the range. When the start node is not a text
  # node, or has no whitespace at the start offset, the range is yielded
  # unchanged.
  #
  # Returns whatever the block returns.
  def each_match( match_range, match_map )
    super(match_range, match_map)
    match_start_index = match_range.start_index
    match_start_offset = match_range.start_offset
    match_end_index = match_start_index
    match_end_offset = match_start_offset

    hnodes = match_range.hnodes
    first_node = hnodes[match_start_index]

    if first_node && first_node.text?
      hnode_text = first_node.inner_text

      # Test one character at a time. Testing the whole growing substring
      # against an unanchored /\s+/ keeps succeeding once it contains a
      # single blank and would swallow the rest of the node.
      while match_end_offset < hnode_text.size && hnode_text[match_end_offset, 1] =~ /\s/
        match_end_offset += 1
      end

      if match_end_offset > match_start_offset
        if match_end_offset >= hnode_text.size
          # the node is used up, continue at the start of the next one
          match_range = match_range.tail( match_end_index + 1, 0 )
          log( "matched entire string of #{match_end_offset - match_start_offset} spaces" )
        else
          match_range = match_range.tail( match_end_index, match_end_offset )
          log( "matched first #{match_end_offset - match_start_offset} leading spaces" )
        end
      end
    end

    with_scanned_match_data( match_map, hnodes[match_start_index ... match_end_index] ) { |scanned_map|
      yield( match_range, scanned_map )
    }
  end

end
374
+
375
+
376
# Wraps another pattern (the repeatee) and matches one or more consecutive
# occurrences of it.
class OneOrMorePattern < BasePattern

  # repeatee - the pattern to repeat
  def initialize(repeatee)
    super( nil )
    @repeatee = repeatee
  end

  # Not parsed on its own; compile_pattern builds it when it sees "+".
  def self.parse(s)
    raise "not implemented"
  end

  def repeatee
    @repeatee
  end

  def description
    "one+ ##{pattern_no}"
  end

  def dump( level, out )
    out << " " * level + "one or more p=#{priority}\n"
    @repeatee.dump( level + 1, out )
  end

  # Same priority as the repeated pattern.
  def priority
    @repeatee.priority
  end

  # Greedily matches the repeatee again and again inside match_range,
  # appending the nodes of each repetition to matches.
  #
  # Returns the range of the last repetition found, or nil when the
  # repeatee does not match in match_range at all.
  def consume_remaining_matches( match_range, match_map, matches )
    log("consuming remaining matches in #{match_range.describe}")
    @repeatee.each_match( match_range, match_map ) { |r_match_range, r_match_map|
      matches << r_match_range.hnodes[r_match_range.start_index ... r_match_range.end_index]
      rest = match_range.tail( r_match_range.end_index, r_match_range.end_offset )
      last_match_range = consume_remaining_matches( rest, match_map, matches )
      # only the first candidate of every repetition is followed
      return last_match_range || r_match_range
    }
    return nil
  end

  # For every candidate first match, consumes the repetitions that follow
  # and yields one range spanning from the start of the first to the end of
  # the last repetition, together with the match map.
  #
  # The data handed to with_scanned_match_data is the list of node slices of
  # all repetitions, the first one included.
  #
  # Returns the first truthy block result, or nil.
  def each_match( match_range, match_map )
    super(match_range, match_map)

    log("looking for first match in #{match_range.describe}")
    @repeatee.each_match( match_range, match_map ) { |first_match_range, first_match_map|
      results = [ first_match_range.hnodes[first_match_range.start_index ... first_match_range.end_index] ]
      rest = match_range.tail( first_match_range.end_index, first_match_range.end_offset )
      last_match_range = consume_remaining_matches( rest, match_map, results )

      combined_match_range = if last_match_range
        match_range.crop(
          first_match_range.start_index, first_match_range.start_offset,
          last_match_range.end_index, last_match_range.end_offset
        )
      else
        first_match_range
      end

      log("combined match in #{combined_match_range.describe}")

      # Callers expect the match map as second block argument, exactly as
      # every other pattern passes it; the results list travels through
      # with_scanned_match_data instead.
      result = with_scanned_match_data( match_map, results ) { |scanned_map|
        yield( combined_match_range, scanned_map )
      }

      if result
        log("one+ match done with #{result}")
        return result
      else
        log("one+ match not done")
      end
    }

    return nil
  end

end
459
+
460
+
461
# The "..." wildcard: fills whatever gap it is offered.
class AnythingPattern < BasePattern

  def description
    "anything"
  end

  # Recognises a leading "..." in the pattern string.
  def self.parse( s )
    s.start_with?("...") ? self.new("...") : nil
  end

  # Always matches the complete range it is given: it has the lowest
  # priority, so it is tried last and simply expands into the remaining gap.
  # The match data are the nodes from start_index through end_index.
  def each_match( match_range, match_map )
    covered = match_range.hnodes[match_range.start_index .. match_range.end_index]
    with_scanned_match_data( match_map, covered ) do |scanned_map|
      yield( match_range, scanned_map )
    end
  end

  def priority
    -7
  end

end
488
+
489
+
490
# Matches literal text ("...") or a regular expression (/.../) inside text
# nodes; "$" is shorthand for a line break.
class TextPattern < BasePattern

  # source - the pattern text as written, delimiters included
  # text   - String or Regexp to look for
  def initialize( source, text )
    super(source)
    @text = text
  end

  def description
    "text \"#{@text}\""
  end

  # Parses "$", /regexp/ or "string" at the start of s.
  # Returns a TextPattern, or nil when s starts with none of them.
  def self.parse( s )
    return self.new( "$", /[\r\n]/ ) if s.index("$") == 0

    if s.index("/") == 0
      src, body = read_delimited( s, "/" )
      return self.new( src, Regexp.compile( body ) )
    end

    if s.index("\"") == 0
      src, body = read_delimited( s, "\"" )
      return self.new( src, body )
    end

    nil
  end

  # Reads a token that opens with delim at the start of s and runs to the
  # next unescaped delim (or the end of s). A backslash directly in front of
  # delim escapes it.
  #
  # Returns [source text consumed, body with escaped delimiters unescaped].
  def self.read_delimited( s, delim )
    escaped = "\\" + delim
    body = String.new
    src = delim.dup
    rest = s[1..-1]

    until rest.empty?
      if rest.index(delim) == 0
        src << delim
        break
      elsif rest.index(escaped) == 0
        body << delim
        src << escaped
        rest = rest[2..-1]
      else
        body << rest[0..0]
        src << rest[0..0]
        rest = rest[1..-1]
      end
    end

    [src, body]
  end
  private_class_method :read_delimited

  # Named patterns rank below unnamed ones, regular expressions below
  # plain strings.
  def priority
    base = name ? -4 : -2
    @text.is_a?(String) ? base : base - 1
  end

  # Looks for the first occurrence of the text in the text nodes of
  # match_range, starting at the range's start position, and yields the
  # range covering exactly that occurrence.
  #
  # Returns the block result, or nil without yielding when nothing matches.
  #
  # NOTE(review): only nodes with an index below end_index are searched, so
  # text in the node at end_index (in front of end_offset) is never looked
  # at, same as before.
  def each_match( match_range, match_map )
    super(match_range, match_map)

    hnodes = match_range.hnodes
    node_index = match_range.start_index
    search_from = match_range.start_offset

    actual_match = nil
    match_start_offset = nil
    match_end_offset = nil

    while node_index < match_range.end_index
      node = hnodes[node_index]

      unless node.text?
        log( "not text: ##{node_index} #{node.class}" )
        node_index += 1
        search_from = 0
        next
      end

      hnode_text = node.inner_text
      log( "trying text match on: #{hnode_text[search_from .. -1]}" )

      match_offset = hnode_text.index( @text, search_from )

      if match_offset
        actual_match = if @text.is_a? Regexp
          hnode_text[match_offset..-1][@text]
        else
          @text
        end

        # The match sits where index found it, not at the offset the search
        # started from.
        match_start_offset = match_offset
        match_end_offset = match_offset + actual_match.size

        if match_end_offset >= hnode_text.size
          log( "matched entire string of #{match_end_offset - match_start_offset} chars" )
        else
          log( "matched first #{match_end_offset - match_start_offset} chars" )
        end
        break
      end

      node_index += 1
      search_from = 0
    end

    # Running out of nodes is a failed match, not a match of nil.
    unless actual_match
      log( "no match found" )
      return nil
    end

    with_scanned_match_data( match_map, actual_match ) { |scanned_map|
      yield( match_range.crop( node_index, match_start_offset, node_index, match_end_offset ), scanned_map )
    }
  end

end
648
+
649
+
650
# A sequence of sub-patterns. Parts are matched in order of priority
# (highest first); every part is confined to the gap between its already
# matched neighbours.
class CompositePattern < BasePattern

  attr_reader :parts

  # parts - the sub-patterns in textual order; the list is copied and the
  #         parts are linked to each other through pred/succ
  def initialize( parts = nil )
    super(nil)
    @parts = parts.nil? ? [] : parts.clone

    previous = nil
    @parts.each do |part|
      part.pred = previous
      previous.succ = part unless previous.nil?
      previous = part
    end
  end

  def dump( level, out )
    out << " " * level + "composite p=#{priority} [\n"
    @parts.each do |part|
      part.dump( level + 1, out )
    end
    out << " " * level + "]\n"
  end

  # Matches parts_by_priority[ppx] and, recursively, all parts after it.
  #
  # Optional parts whose predecessor exists and has not been matched are
  # skipped. Once no part is left the block is called with the current range
  # and map. Each part's match is stored in its matched attribute for the
  # duration of the recursion below it.
  #
  # Returns the first truthy block result, or nil.
  def each_split_match( match_range, match_map, parts_by_priority, ppx, part_matches )
    pattern = nil

    while ppx < parts_by_priority.size
      pattern = parts_by_priority[ppx]
      skippable = pattern.optional && pattern.pred && !pattern.pred.matched
      break unless skippable
      log("skipping optional #{pattern.description}")
      ppx += 1
    end

    if ppx >= parts_by_priority.size
      log("comp nothing left to do")
      return yield( match_range, match_map )
    end

    # Work out which gap this pattern has to fill: between the nearest
    # matched neighbour on the right ...
    right_neighbour = pattern.succ
    right_neighbour = right_neighbour.succ while right_neighbour && !right_neighbour.matched

    if right_neighbour
      log("comp matching must be left of #{right_neighbour.description}")
      match_range = match_range.head(right_neighbour.matched.start_index, right_neighbour.matched.start_offset)
    end

    # ... and the nearest matched neighbour on the left.
    left_neighbour = pattern.pred
    left_neighbour = left_neighbour.pred while left_neighbour && !left_neighbour.matched

    if left_neighbour
      log("comp matching must be right of #{left_neighbour.description}")
      match_range = match_range.tail(left_neighbour.matched.end_index, left_neighbour.matched.end_offset)
    end

    log("comp matching sub-pattern: #{pattern.description} at #{match_range.describe}")

    pattern.each_match( match_range, match_map ) do |part_match_range, part_map|
      pattern.matched = part_match_range

      result = each_split_match( match_range, part_map, parts_by_priority, ppx + 1, part_matches ) do |sub_match_range, _sub_match_map|
        yield( sub_match_range, part_map )
      end

      pattern.matched = nil

      if result
        log("comp done, returning: #{result}")
        return result
      end

      log("comp not done")
    end

    nil
  end

  # Clears all part matches, then tries to match every part. For each
  # complete assignment the block is called with the range spanning all
  # matched parts and the match map.
  #
  # Returns the first truthy block result, or nil.
  def each_match( match_range, match_map )
    @parts.each { |part| part.matched = nil }

    super(match_range, match_map)

    # take the highest priority part first and divide up the rest around it
    parts_by_priority = @parts.sort_by { |part| -part.priority }

    each_split_match( match_range, {}, parts_by_priority, 0, {} ) do |last_match_range, last_match_map|
      matched_ranges = @parts.select { |part| part.matched }.map { |part| part.matched }
      comp_match_range = matched_ranges.inject { |span, range| span.extend(range) }

      result = with_scanned_match_data( match_map, last_match_map ) do |scanned_map|
        log_match_data("comp match trying", comp_match_range, scanned_map)
        yield( comp_match_range, scanned_map )
      end

      if result
        log("comp match done, returning: #{result}")
        return result
      end

      log("comp match not done")
    end
  end

  def description
    "comp ##{pattern_no}"
  end

end
786
+
787
+
788
# Scrapes one node (and everything below it): keeps a flattened index of the
# node tree and runs compiled patterns over it.
class NodeScraper

  attr_accessor :mcot, :root, :parent, :hnode, :pattern_classes, :top_part_names, :verbose

  # mcot   - the owning Metrocot (supplies verbose, log, compile_pattern
  #          and scanner_by_name)
  # parent - the scraper this one was descended from, if any
  # root   - the top-most scraper; defaults to this scraper
  # hnode  - the node to scrape
  def initialize( mcot, parent, root, hnode )
    @mcot = mcot
    @parent = parent
    @root = root || self
    @hnode = hnode
    @verbose = mcot.verbose
  end

  def log( s )
    mcot.log( s ) if @verbose
  end

  # Yields a child scraper for every node found under path.
  # Returns a hash mapping each found node to its block result.
  def descend( path )
    results = {}
    @hnode.search( path ).each { |hchild|
      results[hchild] = yield( NodeScraper.new( @mcot, self, @root, hchild ) )
    }
    results
  end

  # Adds node and, recursively, its children to the flattened list in
  # document order. Records for every node its own index and the index of
  # the first node after its subtree.
  #
  # Returns the next free index.
  def flatten_hnodes( ix, node )
    @flattened_hnodes << node
    @hnode_index[node] = ix
    ix += 1
    if node.elem?
      node.children.each { |child|
        ix = flatten_hnodes(ix, child)
      }
    end
    @hnode_succ_index[node] = ix
    ix
  end

  # (Re)builds the flattened node list and both index tables.
  def build_hnode_index
    @flattened_hnodes = []
    @hnode_index = {}
    @hnode_succ_index = {}

    n = flatten_hnodes( 0, hnode )

    log( "built index for #{n} hnodes" )
  end

  # All nodes in document order; built on first use.
  def flattened_hnodes
    build_hnode_index unless @flattened_hnodes
    @flattened_hnodes
  end

  # node => position in flattened_hnodes; built on first use.
  def hnode_index
    build_hnode_index unless @hnode_index
    @hnode_index
  end

  # node => index of the first node after the node's subtree; built on first
  # use by the same builder as the other two tables.
  def hnode_succ_index
    build_hnode_index unless @hnode_succ_index
    @hnode_succ_index
  end

  # Compiles pattern_s and calls block for successive matches.
  #
  # call_with - :positional passes the values of the named top-level parts
  #             as separate arguments (when there are any); otherwise the
  #             match map is passed as a single Hash argument
  #
  # A truthy block result is collected and matching resumes after that
  # match; a falsy result lets the pattern try its next candidate. Scanning
  # ends when the remaining range is empty or no further match is accepted.
  #
  # Returns the collected block results.
  def collect_gen( pattern_s, call_with, &block )
    pattern = @mcot.compile_pattern( pattern_s, self )

    top_part_names = []
    if pattern.is_a? CompositePattern
      pattern.parts.each { |part|
        top_part_names << part.name if part.name
      }
    end
    log("top part names: #{top_part_names.join(", ")}")

    build_hnode_index
    pattern.dump( 0, $stdout ) if @verbose

    results = []
    match_range = MatchRange.new( self, 0, 0, flattened_hnodes.size, 0)

    while ! match_range.empty?
      result = nil
      pattern.each_match( match_range, {} ) { |sub_match_range, match_map|
        # Always an Array, so the splat below never takes a Hash apart into
        # key/value pairs.
        block_args = if (call_with == :positional) && top_part_names.size > 0
          top_part_names.collect { |top_name| match_map[top_name] }
        else
          [match_map]
        end
        log("calling scan block with: #{block_args.join(", ")}")
        result = block.call( *block_args )
        if result
          results << result
          match_range = match_range.following( sub_match_range )
        end

        result
      }

      break unless result
    end

    results
  end

  # collect_gen with the named top-level parts as positional block arguments.
  def collect( pattern_s, &block )
    collect_gen( pattern_s, :positional, &block )
  end

  # collect_gen with the whole match map as the single block argument.
  def collect_hashed( pattern_s, &block )
    collect_gen( pattern_s, :map, &block )
  end

  def scanner_by_name( name )
    return mcot.scanner_by_name(name)
  end

end
923
+
924
+
925
# Prints s on standard output, but only in verbose mode.
def log( s )
  return unless @verbose
  puts( s )
end
928
+
929
+
930
+
931
# Looks up the scanner registered under name; nil when there is none.
def scanner_by_name( name )
  scanner = @scanners[name]
  scanner
end
934
+
935
+
936
# Compiles a pattern string into a pattern object.
#
# The string is consumed left to right. Each element may be prefixed with
# "name=" and is one of:
#   - a leaf pattern (tried in this order: PathPattern, TextPattern,
#     AnythingPattern, OptSpacePattern)
#   - "+", which turns the preceding pattern into a OneOrMorePattern
#   - "(...)", a nested pattern compiled recursively
# A ")" at the current position ends the (sub-)pattern.
#
# pattern_s    - the pattern text
# node_scraper - the scraper every created pattern is bound to
#
# Returns a CompositePattern for several elements, the element itself for
# exactly one, nil for none. The result's source is set to the part of
# pattern_s that was consumed, and the result is stored in
# @compiled_patterns under pattern_s.
#
# Raises RuntimeError for misplaced or doubled "+", an empty or unclosed
# group, and for text no pattern class recognises.
def compile_pattern( pattern_s, node_scraper )

  s = pattern_s
  patterns = []

  log("compiling: #{s}")

  while !s.empty?

    log("left: #{s}")

    break if s.index(")") == 0

    # \A rather than ^: a name is only a name at the very start of what is
    # left, not at the start of some later line of a multi-line pattern.
    name = s[/\A(\w+)=/, 1]
    s = s[(name.length + 1) .. -1] if name

    log("after name #{name}: #{s}")

    pattern = nil

    [PathPattern, TextPattern, AnythingPattern, OptSpacePattern].each { |pattern_class|
      pattern = pattern_class.parse(s)
      if pattern
        pattern.metrocot = self
        pattern.node_scraper = node_scraper
        break
      end
      log "not a #{pattern_class}"
    }

    if pattern
      s = s[pattern.source.size .. -1]
      patterns << pattern
      log("found: #{pattern.description}")
      if name
        log("scanned as: #{name}")
        pattern.name = name.to_sym
      end
      next
    end

    if s[0..0] == "+"
      raise "+ must follow pattern" unless patterns.size > 0
      raise "+ applied twice does not make sense" if patterns[-1].is_a? OneOrMorePattern
      pattern = OneOrMorePattern.new( patterns[-1] )
      pattern.metrocot = self
      pattern.node_scraper = node_scraper
      patterns[-1] = pattern
      log("now one or more: #{pattern.repeatee}")
      s = s[1 .. -1]
      next
    end

    if s[0..0] == "("
      pattern = compile_pattern( s[1 .. -1], node_scraper )
      # "()" compiles to nil; report that instead of failing on nil.source
      raise "empty group in pattern: \"#{s[0..10]}...\"" unless pattern
      close_par_index = pattern.source.size + 1
      found = s[close_par_index..close_par_index]
      raise "expected ')' found '#{found}'" unless found == ")"
      s = s[close_par_index + 1 .. -1]
      log("found nested: #{pattern.description} \"#{pattern.source}\"")
      patterns << pattern
      pattern.name = name.to_sym if name
      next
    end

    raise "unrecognizable pattern: \"#{s[0..10]}...\""

  end

  pattern = if patterns.size > 1
    CompositePattern.new( patterns )
  elsif patterns.size == 1
    patterns[0]
  else
    nil
  end

  if pattern
    pattern.metrocot = self
    pattern.node_scraper = node_scraper
    # everything of pattern_s except the unconsumed remainder
    pattern.source = pattern_s[0 .. (0 - (1 + s.size))]
  end

  return @compiled_patterns[ pattern_s ] = pattern

end
1033
+
1034
+
1035
attr_accessor :verbose

# scanners - name => scanner; a value that is a Class is instantiated
#            (without arguments), any other value is registered as it is
#
# Starts with an empty pattern cache and verbose switched off.
def initialize( scanners )

  @scanners = {}
  @compiled_patterns = {}

  scanners.each do |name, value|
    @scanners[name] = value.is_a?(Class) ? value.new : value
  end

  @verbose = false

  log("scanners: #{@scanners.inspect}")

end
1056
+
1057
+
1058
# Creates the top-level scraper for doc: it has no parent and acts as its
# own root.
def scrape(doc)
  top_scraper = NodeScraper.new( self, nil, nil, doc )
  top_scraper
end
1061
+
1062
+
1063
# Time.parse lives in the 'time' standard library, not in core Time.
require 'time'

# Scanners turn raw match data into the value stored in the match map.
module Scanners

  # Default behaviour: the string form of the data.
  class BaseScanner
    def scan(data)
      data.to_s
    end
  end

  # Parses the text (of an element, or the data itself) into a Time.
  class DateTimeScanner < BaseScanner
    def scan( data )
      if data.is_a? Hpricot::Elem
        data = data.inner_text
      end
      Time.parse(data)
    end
  end

  class TextLookupScanner < BaseScanner
  end

  # Returns the inner text of an element, the string form of anything else.
  class TextileScanner < BaseScanner
    def scan( data )
      if data.is_a? Hpricot::Elem
        data.inner_text
      else
        # without this branch every non-element input scanned to nil
        data.to_s
      end
    end
  end

  # Returns the inner text of an element, the string form of anything else.
  class TextScanner < BaseScanner
    def scan( data )
      if data.is_a? Hpricot::Elem
        data = data.inner_text
      else
        data = data.to_s
      end
      data
    end
  end

  class LineScanner < BaseScanner
  end

end
1107
+
1108
+ end
1109
+
1110
+ #
1111
+ #############################################################################
1112
+ #