metrocot 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +7 -0
- data/Manifest.txt +7 -0
- data/README.txt +98 -0
- data/Rakefile +12 -0
- data/bin/metrocot +109 -0
- data/lib/metrocot.rb +1112 -0
- data/test/test_metrocot.rb +70 -0
- metadata +72 -0
data/lib/metrocot.rb
ADDED
@@ -0,0 +1,1112 @@
|
|
1
|
+
|
2
|
+
#############################################################################
|
3
|
+
#
|
4
|
+
# Copyright (c) 2009 Metro Cascade Media Inc
|
5
|
+
#
|
6
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
7
|
+
# a copy of this software and associated documentation files (the
|
8
|
+
# 'Software'), to deal in the Software without restriction, including
|
9
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
10
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
11
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
12
|
+
# the following conditions:
|
13
|
+
#
|
14
|
+
# The above copyright notice and this permission notice shall be
|
15
|
+
# included in all copies or substantial portions of the Software.
|
16
|
+
#
|
17
|
+
# THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
|
18
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
19
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
20
|
+
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
21
|
+
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
22
|
+
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
23
|
+
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
24
|
+
#
|
25
|
+
#############################################################################
|
26
|
+
#
|
27
|
+
# Helmut Hissen <helmut@zeebar.com> (Metro Cascade Media Inc)
|
28
|
+
# January 1 2009
|
29
|
+
#
|
30
|
+
#############################################################################
|
31
|
+
#
|
32
|
+
# We are like tiny pleasantly chirping hex bugs coding away on the
|
33
|
+
# shoulders of Why so that we can create more, and do more with less
|
34
|
+
# code, not by virtue of any sharpness of mind on our part, or any
|
35
|
+
# other distinction, but because we are carried high and raised up
|
36
|
+
# by his giant size.
|
37
|
+
#
|
38
|
+
#############################################################################
|
39
|
+
#
|
40
|
+
|
41
|
+
class Metrocot < Object
|
42
|
+
|
43
|
+
VERSION = '1.0.0'
|
44
|
+
|
45
|
+
# A span over the flattened node list of a NodeScraper.
#
# A position is an (index, offset) pair: +index+ selects a node in
# +node_scraper.flattened_hnodes+ and +offset+ is a position inside that
# node (the text patterns use it as a character offset). The span runs
# from (start_index, start_offset) to (end_index, end_offset); a span whose
# end is not after its start is considered empty.
class MatchRange

  attr_accessor :node_scraper, :start_index, :start_offset, :end_index, :end_offset, :verbose


  def initialize( node_scraper, start_index, start_offset, end_index, end_offset )
    @node_scraper = node_scraper
    @start_index = start_index
    @start_offset = start_offset
    @end_index = end_index
    @end_offset = end_offset
    @verbose = false
  end


  # The flattened node list this range indexes into.
  def hnodes
    @node_scraper.flattened_hnodes
  end


  # New range on the same scraper with both ends replaced.
  def crop( crop_start_index, crop_start_offset, crop_end_index, crop_end_offset )
    MatchRange.new( node_scraper, crop_start_index, crop_start_offset, crop_end_index, crop_end_offset )
  end


  # True when the end position is not after the start position.
  #
  # Indices are compared first and offsets only break a tie, so a span that
  # starts and ends in the same node is empty only when its offsets say so.
  # (The previous version compared indices with >= and therefore treated
  # every same-node span as empty.)
  def empty?
    return true if @start_index > @end_index
    @start_index == @end_index && @start_offset >= @end_offset
  end


  # The part of this range that lies after +other_range+.
  def following( other_range )
    MatchRange.new( node_scraper, other_range.end_index, other_range.end_offset, end_index, end_offset )
  end


  # Same end as this range, starting at the given position.
  def tail( tail_start_index, tail_start_offset )
    MatchRange.new( @node_scraper, tail_start_index, tail_start_offset, @end_index, @end_offset )
  end


  # Same start as this range, ending at the given position.
  def head( head_end_index, head_end_offset )
    MatchRange.new( @node_scraper, @start_index, @start_offset, head_end_index, head_end_offset )
  end


  # Splits around +middle+ and returns the pieces in document order:
  # an optional leading range, +middle+ itself, and an optional trailing
  # range that reaches to the end of this range.
  #
  # NOTE(review): the leading piece starts at position 0+0 rather than at
  # this range's own start; that is what the original code spelled out and
  # it is kept here -- confirm whether the range's own start was intended.
  def split_at( middle )
    parts = []
    if middle.start_index > 0 || middle.start_offset > 0
      parts << crop( 0, 0, middle.start_index, middle.start_offset )
    end
    parts << middle
    if middle.end_index < end_index || (middle.end_index == end_index && middle.end_offset < end_offset)
      parts << crop( middle.end_index, middle.end_offset, end_index, end_offset )
    end
    parts
  end


  # Returns a new range covering both this range and +other+.
  #
  # A position is an (index, offset) pair, so whenever the other range
  # supplies the more extreme index its offset has to come along with it.
  def extend( other )

    extended_range = MatchRange.new( @node_scraper, @start_index, @start_offset, @end_index, @end_offset )

    other_starts_earlier = other.start_index < @start_index ||
      (other.start_index == @start_index && other.start_offset < @start_offset)
    if other_starts_earlier
      extended_range.start_index = other.start_index
      extended_range.start_offset = other.start_offset
    end

    other_ends_later = other.end_index > @end_index ||
      (other.end_index == @end_index && other.end_offset > @end_offset)
    if other_ends_later
      extended_range.end_index = other.end_index
      extended_range.end_offset = other.end_offset
    end

    extended_range

  end


  # Short printable form used in log messages.
  def describe
    "[#{start_index}+#{start_offset} ... #{end_index}+#{end_offset}]"
  end

end
|
127
|
+
|
128
|
+
|
129
|
+
|
130
|
+
# Behaviour shared by all pattern elements: identity (source text, name,
# running pattern number), links to neighbouring patterns (pred/succ),
# logging helpers, and the hand-off of matched data to scanners.
class BasePattern

  attr_accessor :pred, :succ, :source, :name, :matched, :node_scraper, :metrocot, :pattern_no

  # number of patterns created so far; the next pattern gets this as pattern_no
  @@instance_count = 0

  def initialize( source )
    @source, @pattern_no = source, @@instance_count
    @@instance_count += 1
  end

  # Subclasses return true when the pattern may be skipped.
  def optional
    false
  end

  # Logs through the owning Metrocot, prefixed with this pattern's description.
  def log( s )
    metrocot.log("#{self.description}: #{s}")
  end

  # Appends a one-line description, indented by +level+, to +out+.
  def dump( level, out )
    indent = " " * level
    out << indent + description + " p=#{priority}\n"
  end

  # Subclasses return a pattern when +s+ starts with their syntax, else nil.
  def self.parse(s)
    raise "not supported"
  end

  def description
    self.class.name
  end


  # Appends a printable rendering of +match_map+ (nested hashes, arrays,
  # strings, Hpricot elements; anything else by class name) to +out+.
  def dump_match_map( out, level, match_map )
    case match_map
    when Hash
      out << "{\n"
      inner = level + 1
      match_map.each do |key, value|
        out << " " * inner + "#{key} => "
        dump_match_map( out, inner, value )
      end
      out << " " * level + "}\n"
    when Array
      out << "[\n"
      inner = level + 1
      match_map.each do |value|
        out << " " * inner
        dump_match_map( out, inner, value )
      end
      out << " " * level + "]\n"
    when String
      out << "\"" + match_map + "\"\n"
    when Hpricot::Elem
      out << "<" + match_map.stag.name + ">\n"
    else
      out << match_map.class.to_s + "\n"
    end
  end


  # Logs the range being matched; when the scraper is verbose and the map
  # is non-empty the map itself is dumped to STDOUT.
  def log_match_data( msg, match_range, match_map )
    log("#{msg} #{match_range.describe} map:")
    return unless @node_scraper.verbose
    dump_match_map( STDOUT, 0, match_map ) unless match_map.nil? || match_map == {}
  end


  # Base implementation only logs; subclasses call it via super and then
  # yield (match_range, match_map) for each candidate match.
  def each_match( match_range, match_map )
    log_match_data("each_match", match_range, match_map)
  end

  # Higher priority patterns are matched first by CompositePattern.
  def priority
    0
  end

  # Records +match_data+ in +match_map+ for the duration of the block and
  # returns the block's result.
  #
  # The scanner is looked up by this pattern's name (or default_scanner when
  # unnamed). With a scanner the scanned value is stored under the name;
  # a named pattern without scanner stores the raw data; an unnamed
  # CompositePattern copies the entries of a Hash; anything else is dropped.
  # The entry stored under the name is removed again after the block.
  # Scanner errors are not rescued here.
  def with_scanned_match_data( match_map, match_data )

    scanner = name ? @node_scraper.scanner_by_name(name) : default_scanner

    if scanner
      match_map[name] = scanner.scan(match_data)
    elsif name
      match_map[name] = match_data
    elsif self.is_a?(CompositePattern) && match_data.is_a?(Hash)
      log("copying #{match_data.class.name} match data from #{self.description}")
      match_data.each do |key, value|
        match_map[key] = value
      end
    else
      log("not carrying #{match_data.class.name} match data from #{self.description}")
    end

    outcome = yield( match_map )
    match_map.delete(name) if name
    outcome

  end

  # Scanner used for unnamed patterns; none by default.
  def default_scanner
    nil
  end


end
|
248
|
+
|
249
|
+
|
250
|
+
# Matches elements found by searching the scraper's root node with a path
# expression. Written in pattern strings as ".<path>", up to the next space.
class PathPattern < BasePattern

  def initialize( source, path )
    super(source)
    @path = path
  end

  # Recognises ".<path>": a single leading dot followed by at least one
  # non-space character. Returns nil for anything else (including "..").
  def self.parse( s )
    return nil if s.index(".") != 0 || s.index("..") == 0
    stop = s.index(" ") || s.size
    return nil if stop == 1
    self.new( s[0, stop], s[1, stop - 1] )
  end


  def source
    @source
  end


  def description
    "path \"#{@path}\""
  end


  # Yields (range, match_map) for each search hit inside +match_range+, in
  # search order, until the block returns a truthy value; that value is
  # returned. The yielded range runs from the hit's node index to the index
  # recorded for it in hnode_succ_index.
  def each_match( match_range, match_map )
    super(match_range, match_map)
    outcome = nil
    root_node = match_range.node_scraper.hnode
    root_node.search( @path ).each do |descendent|
      nix = node_scraper.hnode_index[descendent]
      next_node_nix = node_scraper.hnode_succ_index[descendent]
      if !nix
        # a search hit that is missing from the index: log the whole index, then fail
        @node_scraper.flattened_hnodes.each do |node|
          log( "#{@node_scraper.hnode_index[node]}: #{node}" )
        end
        raise "no node index for #{descendent.class} #{descendent}"
      end
      if nix < match_range.start_index
        log( "too far left: #{nix}" )
        next
      end
      if nix >= match_range.end_index
        log( "too far right: #{nix}" )
        break
      end
      log( "matched path at node #{nix}" )
      outcome = with_scanned_match_data( match_map, descendent ) do |scanned_map|
        yield( match_range.crop(nix, 0, next_node_nix, 0) , scanned_map )
      end
      break if outcome
    end
    outcome
  end


  def priority
    1
  end


end
|
313
|
+
|
314
|
+
|
315
|
+
# Optional run of whitespace, written as a single space in pattern strings.
class OptSpacePattern < BasePattern

  def initialize
    super(" ")
  end

  def description
    "spaces"
  end

  def optional
    true
  end

  # Recognises a leading space; returns nil otherwise.
  def self.parse( s )
    return nil unless s[0..0] == " "
    OptSpacePattern.new
  end

  def priority
    -7
  end

  # Skips whitespace at the start of +match_range+ (only when the range
  # starts in a text node) and yields once. The range that is yielded is
  # what remains of +match_range+ after the skipped whitespace, or
  # +match_range+ itself when there was nothing to skip. Returns the
  # block's result.
  def each_match( match_range, match_map )
    super(match_range, match_map)
    match_start_index = match_range.start_index
    match_start_offset = match_range.start_offset
    match_end_index = match_range.start_index
    match_end_offset = match_range.start_offset

    # consume leading whitespace of first text node

    hnodes = match_range.hnodes
    first_node = hnodes[match_start_index]

    if first_node && first_node.text?
      hnode_text = first_node.inner_text

      # Test one character at a time. The previous version tested the whole
      # substring consumed so far against /\s+/, which stays true once a
      # single whitespace character is in it and so swallowed the rest of
      # the text node.
      while match_end_offset < hnode_text.size && hnode_text[match_end_offset, 1] =~ /\s/
        match_end_offset += 1
      end

      if match_end_offset > match_start_offset
        if match_end_offset >= hnode_text.size
          # whitespace ran to the end of the node: continue at the next node
          match_range = match_range.tail( match_end_index + 1, 0 )
          log( "matched entire string of #{match_end_offset - match_start_offset} spaces" )
        else
          match_range = match_range.tail( match_end_index, match_end_offset )
          log( "matched first #{match_end_offset - match_start_offset} leading spaces" )
        end
      end
    end

    # match_end_index is never advanced, so the match data is an empty slice
    with_scanned_match_data( match_map, hnodes[match_start_index ... match_end_index] ) { |scanned_map|
      yield( match_range, scanned_map )
    }

  end

end
|
374
|
+
|
375
|
+
|
376
|
+
# Repetition wrapper created for a trailing "+" in a pattern string:
# matches the wrapped pattern one or more times in a row.
class OneOrMorePattern < BasePattern

  def initialize(repeatee)
    super( nil )
    @repeatee = repeatee
  end


  # Never parsed directly; compile_pattern builds it when it sees "+".
  def self.parse(s)
    raise "not implemented"
  end


  def repeatee
    @repeatee
  end


  def description
    "one+ ##{pattern_no}"
  end


  def dump( level, out )
    out << " " * level + "one or more p=#{priority}\n"
    @repeatee.dump( level + 1, out )
  end


  def priority
    @repeatee.priority
  end


  # Takes the first match the repeatee offers in +match_range+, records its
  # nodes in +matches+, and recurses on what follows it. Returns the range
  # of the last repetition found, or nil when the repeatee offers no match.
  def consume_remaining_matches( match_range, match_map, matches )

    log("consuming remaining matches in #{match_range}")
    @repeatee.each_match( match_range, match_map ) { |r_match_range, r_match_map|
      matches << r_match_range.hnodes[r_match_range.start_index ... r_match_range.end_index]
      last_match_range = consume_remaining_matches( match_range.tail( r_match_range.end_index, r_match_range.end_offset ), match_map, matches )
      # return from the method (not just the block): only the first candidate is followed
      return last_match_range || r_match_range
    }
    return nil

  end


  # For each first match of the repeatee, consumes the following
  # repetitions and yields (combined_range, match_map) where the combined
  # range spans from the first to the last repetition. Returns the first
  # truthy block result, or nil.
  def each_match( match_range, match_map )

    super(match_range, match_map)

    log("looking for first match in #{match_range}")
    @repeatee.each_match( match_range, match_map ) { |first_match_range, first_match_map|

      # The first repetition belongs to the collected results as well; it
      # used to be left out.
      results = [ first_match_range.hnodes[first_match_range.start_index ... first_match_range.end_index] ]
      last_match_range = consume_remaining_matches( match_range.tail( first_match_range.end_index, first_match_range.end_offset ), match_map, results )

      combined_match_range = if last_match_range
        match_range.crop(
          first_match_range.start_index, first_match_range.start_offset,
          last_match_range.end_index, last_match_range.end_offset
        )
      else
        first_match_range
      end

      log("combined match in #{combined_match_range}")

      # Yield the match map, like every other pattern does. The results
      # array used to be passed in its place, which callers then treated as
      # a map.
      result = with_scanned_match_data( match_map, results ) { |scanned_map|
        yield( combined_match_range, scanned_map )
      }

      if result
        log("one+ match done with #{result}")
        return result
      else
        log("one+ match not done")
      end
    }

    return nil
  end

end
|
459
|
+
|
460
|
+
|
461
|
+
# Wildcard, written "..." in pattern strings.
class AnythingPattern < BasePattern

  def description
    "anything"
  end

  # Recognises a leading "..."; returns nil otherwise.
  def self.parse( s )
    s.index("...") == 0 ? self.new("...") : nil
  end


  # Accepts whatever range it is handed: with its low priority it is
  # matched after its neighbours and simply fills the gap left for it.
  # Yields once and returns the block's result.
  def each_match( match_range, match_map )
    covered_nodes = match_range.hnodes[match_range.start_index .. match_range.end_index]
    with_scanned_match_data( match_map, covered_nodes ) do |scanned_map|
      yield( match_range, scanned_map )
    end
  end


  def priority
    -7
  end

end
|
488
|
+
|
489
|
+
|
490
|
+
# Matches literal text ("...") or a regular expression (/.../) inside text
# nodes. "$" is shorthand for a line break (/[\r\n]/).
class TextPattern < BasePattern

  # +text+ is a String (literal) or a Regexp.
  def initialize( source, text )
    super(source)
    @text = text
  end

  def description
    "text \"#{@text}\""
  end

  # Recognises "$", /regexp/ and "literal" at the start of +s+.
  # Returns a TextPattern whose source is the consumed text, or nil when
  # +s+ starts with none of these. Inside the delimiters a backslash
  # followed by the delimiter stands for the delimiter itself. A missing
  # closing delimiter is not reported; the literal then runs to the end
  # of +s+.
  def self.parse( s )

    return self.new( "$", /[\r\n]/ ) if s.index("$") == 0

    if s.index("/") == 0
      src, body = read_delimited( s, "/" )
      return self.new( src, Regexp.compile( body ) )
    end

    if s.index("\"") == 0
      src, body = read_delimited( s, "\"" )
      return self.new( src, body )
    end

    nil

  end

  # Reads a delimited literal from the start of +s+ (which begins with
  # +delimiter+). Returns [consumed source text, unescaped body].
  def self.read_delimited( s, delimiter )
    escaped = "\\" + delimiter
    src = delimiter.dup
    body = String.new
    rest = s[1..-1]
    until rest.empty?
      if rest.index(delimiter) == 0
        src << delimiter
        break
      elsif rest.index(escaped) == 0
        body << delimiter
        src << escaped
        rest = rest[2..-1]
      else
        body << rest[0..0]
        src << rest[0..0]
        rest = rest[1..-1]
      end
    end
    [src, body]
  end
  private_class_method :read_delimited

  # Literals outrank regexps; unnamed patterns outrank named ones.
  def priority
    base = @text.is_a?(String) ? -2 : -3
    name ? base - 2 : base
  end

  # Finds the first occurrence of the text in the text nodes of
  # +match_range+ (searching the first node from the range's start offset,
  # later nodes from their beginning) and yields (range, match_map) with
  # the range covering exactly the matched characters. Returns the block's
  # result, or nil when the text does not occur.
  #
  # Two defects of the previous version are fixed here: the yielded range
  # was computed from the search start offset instead of the offset at
  # which the text was found (and was always zero-width), and a search
  # that ran off the end of the range still yielded, with a nil match.
  def each_match( match_range, match_map )

    super(match_range, match_map)

    hnodes = match_range.hnodes
    node_index = match_range.start_index
    search_offset = match_range.start_offset
    hnode_text = nil
    found_offset = nil
    actual_match = nil

    while node_index < match_range.end_index

      hnode = hnodes[node_index]

      if hnode.text?
        hnode_text = hnode.inner_text
        log( "trying text match on: #{hnode_text[search_offset .. -1]}" )
        found_offset = hnode_text.index( @text, search_offset )
        if found_offset
          actual_match = if @text.is_a? Regexp
            # String#index only gives the position; re-apply to get the matched text
            hnode_text[found_offset..-1][@text]
          else
            @text
          end
          break
        end
      else
        log( "not text: ##{node_index} #{hnode.class}" )
      end

      node_index += 1
      search_offset = 0

    end

    unless actual_match
      log( "no match found" )
      return nil
    end

    match_end_offset = found_offset + actual_match.size

    if match_end_offset >= hnode_text.size
      log( "matched entire string of #{actual_match.size} chars" )
    else
      log( "matched first #{actual_match.size} chars" )
    end

    with_scanned_match_data( match_map, actual_match ) { |scanned_map|
      yield( match_range.crop( node_index, found_offset, node_index, match_end_offset ), scanned_map )
    }

  end

end
|
648
|
+
|
649
|
+
|
650
|
+
# A sequence of patterns that must match side by side. Parts are matched
# in order of descending priority; each part is then confined to the gap
# between its nearest already-matched neighbours.
class CompositePattern < BasePattern

  attr_reader :parts

  # Copies +parts+ and links each part to its predecessor and successor.
  def initialize( parts = nil )
    super(nil)
    @parts = parts.nil? ? [] : parts.clone
    previous = nil
    @parts.each do |part|
      part.pred = previous
      previous.succ = part unless previous.nil?
      previous = part
    end
  end


  def dump( level, out )
    out << " " * level + "composite p=#{priority} [\n"
    @parts.each do |part|
      part.dump( level + 1, out )
    end
    out << " " * level + "]\n"
  end


  # Matches parts_by_priority[ppx] and, recursively, all later parts.
  # Yields (match_range, match_map) once every part has been handled and
  # returns the first truthy block result, or nil.
  def each_split_match( match_range, match_map, parts_by_priority, ppx, part_matches )

    pattern = nil

    # skip optional parts whose predecessor exists but has not matched
    while ppx < parts_by_priority.size
      pattern = parts_by_priority[ppx]
      break unless pattern.optional && pattern.pred && ! pattern.pred.matched
      log("skipping optional #{pattern.description}")
      ppx += 1
    end

    if ppx >= parts_by_priority.size
      log("comp nothing left to do")
      return yield( match_range, match_map )
    end


    #
    # figure out which gap this pattern is supposed to fill
    #
    # NOTE(review): the narrowed range is also what gets passed down to the
    # recursive call below -- confirm that later parts are meant to start
    # from this part's gap rather than from the composite's full range.
    #

    matched_on_right = pattern.succ
    matched_on_right = matched_on_right.succ while matched_on_right && ! matched_on_right.matched

    if matched_on_right
      log("comp matching must be left of #{matched_on_right.description}")
      match_range = match_range.head(matched_on_right.matched.start_index, matched_on_right.matched.start_offset)
    end

    matched_on_left = pattern.pred
    matched_on_left = matched_on_left.pred while matched_on_left && ! matched_on_left.matched

    if matched_on_left
      log("comp matching must be right of #{matched_on_left.description}")
      match_range = match_range.tail(matched_on_left.matched.end_index, matched_on_left.matched.end_offset)
    end

    log("comp matching sub-pattern: #{pattern.description} at #{match_range.describe}")

    pattern.each_match( match_range, match_map ) do |part_match_range, part_map|

      # mark the part as matched only while the remaining parts are tried
      pattern.matched = part_match_range

      result = each_split_match( match_range, part_map, parts_by_priority, ppx + 1, part_matches ) do |sub_match_range, sub_match_map|
        yield( sub_match_range, part_map )
      end

      pattern.matched = nil

      if result
        log("comp done, returning: #{result}")
        return result
      else
        log("comp not done")
      end
    end

    return nil

  end


  # Yields (range, match_map) where the range covers all matched parts.
  # Returns the first truthy block result.
  def each_match( match_range, match_map )

    @parts.each { |part| part.matched = nil }

    super(match_range, match_map)

    # find the highest priority part and divide up the children

    parts_by_priority = @parts.sort_by { |part| 0 - part.priority }
    each_split_match( match_range, {}, parts_by_priority, 0, {} ) do |last_match_range, last_match_map|
      # union of the ranges of all parts that matched
      matched_ranges = @parts.map { |part| part.matched }.select { |range| range }
      comp_match_range = matched_ranges.inject { |covered, range| covered.extend(range) }

      result = with_scanned_match_data( match_map, last_match_map ) do |scanned_map|
        log_match_data("comp match trying", comp_match_range, scanned_map)
        yield( comp_match_range, scanned_map )
      end
      if result
        log("comp match done, returning: #{result}")
        return result
      else
        log("comp match not done")
      end
    end

  end

  def description
    "comp ##{pattern_no}"
  end

end
|
786
|
+
|
787
|
+
|
788
|
+
# Scrapes one document node: flattens the node tree into an indexed list
# and runs compiled patterns over it.
class NodeScraper

  attr_accessor :mcot, :root, :parent, :hnode, :pattern_classes, :top_part_names, :verbose


  # +mcot+ is the owning Metrocot, +hnode+ the node to scrape. +root+
  # defaults to this scraper when nil.
  def initialize( mcot, parent, root, hnode )
    @mcot = mcot
    @parent = parent
    @root = root || self
    @hnode = hnode
    @verbose = mcot.verbose
  end


  def log( s )
    mcot.log( s ) if @verbose
  end


  # Yields a child scraper for every node found under +path+ and returns
  # a hash from found node to block result.
  def descend( path )

    results = {}
    @hnode.search( path ).each { |hchild|
      results[hchild] = yield( NodeScraper.new( @mcot, self, @root, hchild ) )
    }
    results

  end


  # Appends +node+ and, for elements, its children (depth first) to the
  # flattened list. Records each node's own index and the index that
  # follows its last descendant. Returns that following index.
  def flatten_hnodes( ix, node )
    @flattened_hnodes << node
    @hnode_index[node] = ix
    ix += 1
    if node.elem?
      node.children.each { |child|
        ix = flatten_hnodes(ix, child)
      }
    end
    @hnode_succ_index[node] = ix
    ix
  end


  # (Re)builds the flattened list and both index tables from +hnode+.
  def build_hnode_index

    @flattened_hnodes = []
    @hnode_index = {}
    @hnode_succ_index = {}

    n = flatten_hnodes( 0, hnode )

    log( "built index for #{n} hnodes" )

  end


  def flattened_hnodes
    unless @flattened_hnodes
      build_hnode_index
    end
    return @flattened_hnodes
  end


  def hnode_index
    unless @hnode_index
      build_hnode_index
    end
    return @hnode_index
  end


  # All three tables are built together by build_hnode_index (this used
  # to call a build_hnode_succ_index method that does not exist).
  def hnode_succ_index
    unless @hnode_succ_index
      build_hnode_index
    end
    return @hnode_succ_index
  end


  # Compiles +pattern_s+ and repeatedly matches it against the document.
  # For every match the block is called; each truthy block result is
  # collected and matching resumes after that match. Stops at the first
  # round in which the block accepted nothing. Returns the collected results.
  #
  # With +call_with+ == :positional and named top-level parts, the block
  # receives the values of those parts in pattern order; otherwise it
  # receives the match map as its single argument.
  def collect_gen( pattern_s, call_with, &block )
    pattern = @mcot.compile_pattern( pattern_s, self )
    top_part_names = []
    if pattern.is_a? CompositePattern
      pattern.parts.each { |part|
        top_part_names << part.name if part.name
      }
    end
    log("top part names: #{top_part_names.join(", ")}")
    build_hnode_index
    pattern.dump( 0, $stdout ) if @verbose
    results = []
    match_range = MatchRange.new( self, 0, 0, flattened_hnodes.size, 0)
    while ! match_range.empty?
      result = nil
      pattern.each_match( match_range, {} ) { |sub_match_range, match_map|
        block_args = if (call_with == :positional) && top_part_names.size > 0
          top_part_names.collect { |top_name|
            match_map[top_name]
          }
        else
          # Wrap the map so that the join and splat below treat it as one
          # argument; a bare Hash has no #join and would be splatted into
          # key/value pairs.
          [ match_map ]
        end
        log("calling scan block with: #{block_args.join(", ")}")
        result = block.call( *block_args )
        if result
          results << result
          match_range = match_range.following( sub_match_range )
        end

        # a truthy value tells the pattern to stop offering further candidates
        result
      }

      break unless result
    end
    results
  end


  # collect_gen with the named top-level parts passed as positional arguments.
  def collect( pattern_s, &block )
    collect_gen( pattern_s, :positional, &block )
  end


  # collect_gen with the match map passed as the single block argument.
  def collect_hashed( pattern_s, &block )
    collect_gen( pattern_s, :map, &block )
  end

  def scanner_by_name( name )
    return mcot.scanner_by_name(name)
  end

end
|
923
|
+
|
924
|
+
|
925
|
+
# Prints +s+ to standard output, but only in verbose mode.
def log( s )
  return unless @verbose
  puts( s )
end
|
928
|
+
|
929
|
+
|
930
|
+
|
931
|
+
# Looks up the scanner registered under +name+; nil when there is none.
def scanner_by_name( name )
  return @scanners[name]
end
|
934
|
+
|
935
|
+
|
936
|
+
# Compiles a pattern string into a pattern object for +node_scraper+.
#
# The string is consumed left to right. Each element may be prefixed with
# "name=" and is one of: a path (".path"), text ("literal", /regexp/ or
# "$"), "..." (anything), a space (optional whitespace), a parenthesised
# sub-pattern, or "+" which turns the preceding element into one-or-more.
# Compilation stops at an unconsumed ")" so that the method can call
# itself for the contents of a group.
#
# Returns the single element, a CompositePattern when there are several,
# or nil for an empty pattern. The result's +source+ is set to the part of
# +pattern_s+ that was consumed. Raises RuntimeError on input it cannot
# parse.
def compile_pattern( pattern_s, node_scraper )

  # NOTE: results are stored in @compiled_patterns but never reused;
  # compiled patterns carry per-scraper state.

  s = pattern_s
  patterns = []

  log("compiling: #{s}")

  while !s.empty?

    log("left: #{s}")

    if s.index(")") == 0
      break
    end

    # Optional "name=" prefix. Anchored with \A: the prefix must be at the
    # very start of what is left, and ^ would also match after a line break
    # further into the string.
    name = nil
    s.scan( /\A(\w+)=/ ) { |match|
      log "got name #{match.class} (#{match.size})"
      name = match[0]
    }
    s = s[(name.length + 1) .. -1] if name

    log("after name #{name}: #{s}")

    pattern = nil

    [PathPattern, TextPattern, AnythingPattern, OptSpacePattern].each { |pattern_class|
      pattern = pattern_class.parse(s)
      if pattern
        pattern.metrocot = self
        pattern.node_scraper = node_scraper
        break
      end
      log "not a #{pattern_class}"
    }

    if pattern
      s = s[pattern.source.size .. -1]
      patterns << pattern
      log("found: #{pattern.description}")
      if name
        log("scanned as: #{name}")
        pattern.name = name.to_sym
      end
      next
    end

    if s[0..0] == "+"
      raise "+ must follow pattern" unless patterns.size > 0
      raise "+ applied twice does not make sense" if patterns[-1].is_a? OneOrMorePattern
      pattern = OneOrMorePattern.new( patterns[-1] )
      pattern.metrocot = self
      pattern.node_scraper = node_scraper
      patterns[-1] = pattern
      log("now one or more: #{pattern.repeatee}")
      s = s[1 .. -1]
      next
    end

    if s[0..0] == "("
      pattern = compile_pattern( s[1 .. -1], node_scraper )
      # an empty group compiles to nil and used to fail with NoMethodError
      raise "empty sub-pattern: \"#{s[0..10]}...\"" if pattern.nil?
      close_par_index = pattern.source.size + 1
      # report the character actually found where ")" was expected
      raise "expected ')' found '#{s[close_par_index..close_par_index]}'" unless s[close_par_index..close_par_index] == ")"
      s = s[close_par_index + 1 .. -1]
      log("found nested: #{pattern.description} \"#{pattern.source}\"")
      patterns << pattern
      if name
        pattern.name = name.to_sym
      end
      next
    end

    raise "unrecognizable pattern: \"#{s[0..10]}...\""

  end

  pattern = if patterns.size > 1
    CompositePattern.new( patterns )
  elsif patterns.size == 1
    patterns[0]
  else
    nil
  end

  if pattern
    pattern.metrocot = self
    pattern.node_scraper = node_scraper
    # everything of pattern_s except the unconsumed remainder
    pattern.source = pattern_s[0 .. (0 - (1 + s.size))]
  end

  return @compiled_patterns[ pattern_s ] = pattern

end
|
1033
|
+
|
1034
|
+
|
1035
|
+
attr_accessor :verbose
|
1036
|
+
|
1037
|
+
|
1038
|
+
# +scanners+ maps names to scanners. A value given as a Class is
# instantiated (without arguments); any other value is used as is.
# Verbose mode starts switched off.
def initialize( scanners )

  @scanners = {}
  @compiled_patterns = {}

  scanners.each do |name, value|
    @scanners[name] = value.is_a?(Class) ? value.new : value
  end

  @verbose = false

  log("scanners: #{@scanners.inspect}")

end
|
1056
|
+
|
1057
|
+
|
1058
|
+
# Returns a top-level NodeScraper (no parent, itself as root) for +doc+.
def scrape(doc)
  parent = root = nil
  NodeScraper.new( self, parent, root, doc )
end
|
1061
|
+
|
1062
|
+
|
1063
|
+
# Scanners turn raw match data into the value handed to the caller.
# Every scanner responds to scan(data).
module Scanners

  # Time.parse (used by DateTimeScanner) is defined by the stdlib 'time' library
  require 'time'

  # Default behaviour: the string form of the data.
  class BaseScanner
    def scan(data)
      data.to_s
    end

    private

    # Returns the inner text of an Hpricot element; any other data is
    # returned unchanged. Does not require Hpricot to be loaded.
    def element_text( data )
      if defined?(Hpricot::Elem) && data.is_a?(Hpricot::Elem)
        data.inner_text
      else
        data
      end
    end
  end

  # Parses the data (or an element's inner text) with Time.parse.
  class DateTimeScanner < BaseScanner
    def scan( data )
      Time.parse( element_text( data ) )
    end
  end

  class TextLookupScanner < BaseScanner
  end

  # Returns the text of the data. No Textile conversion is performed.
  # (This used to return nil for anything but an element.)
  class TextileScanner < BaseScanner
    def scan( data )
      element_text( data ).to_s
    end
  end

  # An element's inner text, or the string form of anything else.
  class TextScanner < BaseScanner
    def scan( data )
      element_text( data ).to_s
    end
  end

  class LineScanner < BaseScanner
  end


end
|
1107
|
+
|
1108
|
+
end
|
1109
|
+
|
1110
|
+
#
|
1111
|
+
#############################################################################
|
1112
|
+
#
|