metrocot 1.0.0 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +10 -0
- data/README.txt +27 -27
- data/Rakefile +4 -1
- data/lib/metrocot.rb +313 -145
- data/test/test_metrocot.rb +71 -7
- metadata +4 -4
data/History.txt
CHANGED
@@ -1,3 +1,13 @@
|
|
1
|
+
=== 1.0.2 / 2009-01-08
|
2
|
+
|
3
|
+
* added docs and examples
|
4
|
+
* created tests for examples and made those work as well
|
5
|
+
* added StrippingTextScanner
|
6
|
+
|
7
|
+
=== 1.0.1 / 2009-01-03
|
8
|
+
|
9
|
+
* checked in with some initial docs
|
10
|
+
|
1
11
|
=== 1.0.0 / 2009-01-02
|
2
12
|
|
3
13
|
* First working version
|
data/README.txt
CHANGED
@@ -4,9 +4,9 @@
|
|
4
4
|
|
5
5
|
== DESCRIPTION:
|
6
6
|
|
7
|
-
Metrocot builds on
|
8
|
-
with a minimum of code and page specific information. The specification is
|
9
|
-
|
7
|
+
Metrocot builds on Hpricot to allow scraping of list data from HTML pages
|
8
|
+
with a minimum of code and page specific information. The specification is
|
9
|
+
done in a very compact readable format.
|
10
10
|
|
11
11
|
|
12
12
|
== FEATURES/PROBLEMS:
|
@@ -19,39 +19,39 @@ is a very compact readable format.
|
|
19
19
|
|
20
20
|
== SYNOPSIS:
|
21
21
|
|
22
|
-
require 'rubygems'
|
23
|
-
require 'metrocot'
|
22
|
+
require 'rubygems'
|
23
|
+
require 'metrocot'
|
24
24
|
|
25
|
-
class Event < Object
|
25
|
+
class Event < Object
|
26
26
|
|
27
|
-
|
27
|
+
attr_accessor :starts_at, :title, :description, :url
|
28
28
|
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
29
|
+
def initialize( starts_at, title, description, url )
|
30
|
+
@starts_at = starts_at
|
31
|
+
@title = title
|
32
|
+
@description = description
|
33
|
+
@url = url
|
34
|
+
end
|
35
|
+
|
34
36
|
end
|
35
|
-
|
36
|
-
end
|
37
37
|
|
38
|
-
mce_url = "http://www.musiccorner.ca/calendar.html"
|
39
|
-
mce_doc = open(URI.parse(mce_url)) { |data| Hpricot(data) }
|
38
|
+
mce_url = "http://www.musiccorner.ca/calendar.html"
|
39
|
+
mce_doc = open(URI.parse(mce_url)) { |data| Hpricot(data) }
|
40
40
|
|
41
|
-
scraper = Metrocot.new(
|
41
|
+
scraper = Metrocot.new(
|
42
42
|
:starts_at => Metrocot::Scanners::DateTimeScanner,
|
43
43
|
:description => Metrocot::Scanners::TextScanner,
|
44
|
-
:title => Metrocot::Scanners::
|
45
|
-
)
|
44
|
+
:title => Metrocot::Scanners::StrippingTextScanner
|
45
|
+
)
|
46
46
|
|
47
|
-
mce_events = scraper.scrape(mce_doc).descend("//div[@id='content']/table/tr/td") { |td|
|
48
|
-
|
49
|
-
}.values.flatten
|
50
|
-
|
51
|
-
puts "Found #{mce_events.size} mce events:"
|
52
|
-
mce_events.each_with_index { |event, event_index|
|
53
|
-
|
54
|
-
}
|
47
|
+
mce_events = scraper.scrape(mce_doc).descend("//div[@id='content']/table/tr/td") { |td|
|
48
|
+
td.collect( "starts_at=.//h3 ... title=.//h2 ... description=((.//p )+)" ) { |starts_at, title, description| Event.new( starts_at, title, description, mce_url ) }
|
49
|
+
}.values.flatten
|
50
|
+
|
51
|
+
puts "Found #{mce_events.size} mce events:"
|
52
|
+
mce_events.each_with_index { |event, event_index|
|
53
|
+
puts "%3d %20s %s" % [event_index, event.starts_at, event.title]
|
54
|
+
}
|
55
55
|
|
56
56
|
|
57
57
|
== REQUIREMENTS:
|
data/Rakefile
CHANGED
@@ -5,8 +5,11 @@ require 'hoe'
|
|
5
5
|
require './lib/metrocot.rb'
|
6
6
|
|
7
7
|
Hoe.new('metrocot', Metrocot::VERSION) do |p|
|
8
|
-
p.rubyforge_name = 'metrocot'
|
8
|
+
p.rubyforge_name = 'metrocot'
|
9
9
|
p.developer('Helmut Hissen', 'helmut@zeebar.com')
|
10
|
+
p.summary = "An Hpricot based tool for harvesting list-like data from HTML pages."
|
11
|
+
p.remote_rdoc_dir = '' # Release to root
|
10
12
|
end
|
11
13
|
|
14
|
+
|
12
15
|
# vim: syntax=Ruby
|
data/lib/metrocot.rb
CHANGED
@@ -1,4 +1,8 @@
|
|
1
1
|
|
2
|
+
# Author:: Helmut Hissen (mailto:helmut@zeebar.com)
|
3
|
+
# Copyright:: Copyright(c) 2009 Metro Cascade Media, Inc.
|
4
|
+
# License:: Distributed under the BSD open source license
|
5
|
+
#
|
2
6
|
#############################################################################
|
3
7
|
#
|
4
8
|
# Copyright (c) 2009 Metro Cascade Media Inc
|
@@ -24,9 +28,6 @@
|
|
24
28
|
#
|
25
29
|
#############################################################################
|
26
30
|
#
|
27
|
-
# Helmut Hissen <helmut@zeebar.com> (Metro Cascade Media Inc)
|
28
|
-
# January 1 2009
|
29
|
-
#
|
30
31
|
#############################################################################
|
31
32
|
#
|
32
33
|
# We are like tiny pleasantly chirping hex bugs coding away on the
|
@@ -38,9 +39,39 @@
|
|
38
39
|
#############################################################################
|
39
40
|
#
|
40
41
|
|
42
|
+
# == Purpose
|
43
|
+
# This class implements the main Metrocot HTML scanner and a number of handy
|
44
|
+
# input scanners (for grabbing time, numbers, or text from HTML). The purpose
|
45
|
+
# of the Metrocot is to scan an XML dom for the patterns specified in the
|
46
|
+
# Metrocot pattern language.
|
47
|
+
#
|
48
|
+
# == Pattern Language
|
49
|
+
# The Metrocot pattern language allows for the following types of patterns:
|
50
|
+
#
|
51
|
+
# +...+:: matches anything
|
52
|
+
# +"some string"+:: matches that string
|
53
|
+
# +/(some|pattern)/+:: matches that regexp pattern
|
54
|
+
# +./HPRICOT_PATH+:: matches a certain type of dom subtree
|
55
|
+
# +SPACE+:: matches zero or more white spaces
|
56
|
+
# +(PATTERN_A PATTERN_B)+:: matches PATTERN_A followed by PATTERN_B
|
57
|
+
# +PATTERN\++:: matches one or more occurrences of PATTERN
|
58
|
+
#
|
59
|
+
# == Usage
|
60
|
+
# 0) create a Metrocot and define the types of fields you want to extract (and their names).
|
61
|
+
# 1) use Hpricot to get the doc's dom
|
62
|
+
# 2) use descend(xpath) to create a NodeScraper rooted at the Hpricot node(s) matching the xpath
|
63
|
+
# 3) use collect(pattern) to collect all entries found in the HTML which match the Metrocot pattern
|
64
|
+
|
41
65
|
class Metrocot < Object
|
42
66
|
|
43
|
-
VERSION = '1.0.
|
67
|
+
VERSION = '1.0.2'
|
68
|
+
|
69
|
+
|
70
|
+
# represents a subtree within a metrocot dom. the semantics are roughly equivalent
|
71
|
+
# to what you get from select-dragging your mouse pointer through a section of an html
|
72
|
+
# doc in your web browser. That is, a range specifies the first and last node in the
|
73
|
+
# pre-fix traversal of the dom. Additionally, the first and last node may be truncated
|
74
|
+
# (at their tail and head respectively) if they are text nodes.
|
44
75
|
|
45
76
|
class MatchRange
|
46
77
|
|
@@ -126,6 +157,9 @@ class Metrocot < Object
|
|
126
157
|
end
|
127
158
|
|
128
159
|
|
160
|
+
#
|
161
|
+
# base class for all other patterns. Provides some reasonable default behaviours.
|
162
|
+
#
|
129
163
|
|
130
164
|
class BasePattern
|
131
165
|
|
@@ -156,7 +190,11 @@ class Metrocot < Object
|
|
156
190
|
end
|
157
191
|
|
158
192
|
def description
|
159
|
-
|
193
|
+
if name
|
194
|
+
"#{self.class.name} \"#{name}\""
|
195
|
+
else
|
196
|
+
self.class.name
|
197
|
+
end
|
160
198
|
end
|
161
199
|
|
162
200
|
|
@@ -247,6 +285,8 @@ class Metrocot < Object
|
|
247
285
|
end
|
248
286
|
|
249
287
|
|
288
|
+
# matches a certain Hpricot path
|
289
|
+
|
250
290
|
class PathPattern < BasePattern
|
251
291
|
|
252
292
|
def initialize( source, path )
|
@@ -311,6 +351,8 @@ class Metrocot < Object
|
|
311
351
|
|
312
352
|
end
|
313
353
|
|
354
|
+
|
355
|
+
# matches zero or more white spaces
|
314
356
|
|
315
357
|
class OptSpacePattern < BasePattern
|
316
358
|
|
@@ -332,47 +374,58 @@ class Metrocot < Object
|
|
332
374
|
end
|
333
375
|
|
334
376
|
def priority
|
335
|
-
-
|
377
|
+
-8
|
336
378
|
end
|
337
379
|
|
338
380
|
def each_match( match_range, match_map )
|
381
|
+
|
382
|
+
result = nil
|
383
|
+
|
339
384
|
super(match_range, match_map)
|
340
|
-
|
385
|
+
|
386
|
+
match_start_index = match_range.start_index
|
341
387
|
match_start_offset = match_range.start_offset
|
342
|
-
match_end_index
|
343
|
-
match_end_offset
|
388
|
+
match_end_index = match_range.start_index
|
389
|
+
match_end_offset = match_range.start_offset
|
390
|
+
|
391
|
+
raise "negative range #{match_range}" if match_start_index > match_end_index
|
392
|
+
raise "negative range #{match_range}" if match_start_index == match_end_index && match_start_offset > match_end_offset
|
344
393
|
|
345
394
|
# consume rest of first text node
|
346
395
|
|
347
396
|
hnodes = match_range.hnodes
|
397
|
+
hnode_text = nil
|
348
398
|
|
349
399
|
if hnodes[match_start_index] && hnodes[match_start_index].text?
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
400
|
+
|
401
|
+
hnode_text = hnodes[match_start_index].inner_text
|
402
|
+
while match_end_offset < hnode_text.size && (/^\s+$/.=== hnode_text[match_start_offset .. match_end_offset])
|
403
|
+
match_end_offset += 1
|
404
|
+
end
|
354
405
|
|
355
|
-
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
match_range = match_range.tail( match_end_index, match_end_offset )
|
361
|
-
log( "matched first #{match_end_offset - match_start_offset} leading spaces" )
|
362
|
-
end
|
406
|
+
if match_end_offset >= match_start_offset
|
407
|
+
if match_end_offset >= hnode_text.size
|
408
|
+
log( "matched entire string of #{match_end_offset - match_start_offset} spaces" )
|
409
|
+
else
|
410
|
+
log( "matched first #{match_end_offset - match_start_offset} leading spaces" )
|
363
411
|
end
|
412
|
+
end
|
364
413
|
end
|
365
414
|
|
415
|
+
sp_match_range = match_range.crop( match_start_index, match_start_offset, match_end_index, match_end_offset )
|
416
|
+
|
366
417
|
result = with_scanned_match_data( match_map, hnodes[match_start_index ... match_end_index] ) { |match_map|
|
367
|
-
yield(
|
418
|
+
yield( sp_match_range, match_map )
|
368
419
|
}
|
369
|
-
result
|
370
420
|
|
421
|
+
result
|
371
422
|
end
|
372
423
|
|
373
424
|
end
|
374
425
|
|
375
426
|
|
427
|
+
# matches one or more occurrences of some other pattern
|
428
|
+
|
376
429
|
class OneOrMorePattern < BasePattern
|
377
430
|
|
378
431
|
def initialize(repeatee)
|
@@ -458,6 +511,8 @@ class Metrocot < Object
|
|
458
511
|
end
|
459
512
|
|
460
513
|
|
514
|
+
# Matches anything.
|
515
|
+
|
461
516
|
class AnythingPattern < BasePattern
|
462
517
|
|
463
518
|
def description
|
@@ -474,9 +529,25 @@ class Metrocot < Object
|
|
474
529
|
# it just expands to fill whatever gap
|
475
530
|
|
476
531
|
def each_match( match_range, match_map )
|
477
|
-
|
478
|
-
|
479
|
-
|
532
|
+
|
533
|
+
result = if match_range.start_index > match_range.end_index
|
534
|
+
log( "empty range" )
|
535
|
+
nil
|
536
|
+
elsif match_range.start_index > match_range.end_index && match_range.start_offset > match_range.end_offset
|
537
|
+
log( "empty node" )
|
538
|
+
nil
|
539
|
+
elsif match_range.start_index == match_range.end_index || (match_range.start_index + 1 == match_range.end_index && match_range.end_offset == 0) && match_range.hnodes[match_range.start_index].text?
|
540
|
+
log( "single text node range #{match_range.describe}" )
|
541
|
+
raise "bad range #{match_range.describe}" if match_range.hnodes[match_range.start_index].nil?
|
542
|
+
with_scanned_match_data( match_map, match_range.hnodes[match_range.start_index].inner_text[match_range.start_offset...match_range.end_offset] ) { |match_map|
|
543
|
+
yield( match_range, match_map )
|
544
|
+
}
|
545
|
+
else
|
546
|
+
log( "multi node range #{match_range.describe}" )
|
547
|
+
with_scanned_match_data( match_map, match_range.hnodes[match_range.start_index ... match_range.end_index] ) { |match_map|
|
548
|
+
yield( match_range, match_map )
|
549
|
+
}
|
550
|
+
end
|
480
551
|
end
|
481
552
|
|
482
553
|
|
@@ -487,6 +558,8 @@ class Metrocot < Object
|
|
487
558
|
end
|
488
559
|
|
489
560
|
|
561
|
+
# Matches a certain text string or regex pattern
|
562
|
+
|
490
563
|
class TextPattern < BasePattern
|
491
564
|
|
492
565
|
def initialize( source, text )
|
@@ -580,10 +653,9 @@ class Metrocot < Object
|
|
580
653
|
|
581
654
|
super(match_range, match_map)
|
582
655
|
|
583
|
-
|
584
|
-
|
585
|
-
|
586
|
-
match_end_offset = match_range.start_offset
|
656
|
+
match_index = match_range.start_index
|
657
|
+
match_offset = match_range.start_offset
|
658
|
+
|
587
659
|
|
588
660
|
# consume rest of first text node
|
589
661
|
|
@@ -591,62 +663,72 @@ class Metrocot < Object
|
|
591
663
|
|
592
664
|
actual_match = nil
|
593
665
|
|
594
|
-
while
|
666
|
+
while match_index < match_range.end_index || (match_index == match_range.end_index && match_offset < match_range.end_offset)
|
595
667
|
|
596
|
-
while
|
597
|
-
log( "not text: ##{
|
598
|
-
|
599
|
-
|
668
|
+
while (match_index < match_range.end_index || (match_index == match_range.end_index && match_offset < match_range.end_offset)) && ! hnodes[match_index].text?
|
669
|
+
log( "not text: ##{match_index} #{hnodes[match_index].class}" )
|
670
|
+
match_index += 1
|
671
|
+
match_offset = 0
|
600
672
|
end
|
601
673
|
|
602
|
-
unless
|
674
|
+
unless (match_index < match_range.end_index || (match_index == match_range.end_index && match_offset < match_range.end_offset)) && hnodes[match_index].text?
|
603
675
|
log( "no match found" )
|
604
676
|
return nil
|
605
677
|
end
|
606
678
|
|
607
|
-
hnode_text =
|
679
|
+
hnode_text = if match_index == match_range.end_index
|
680
|
+
hnodes[match_index].inner_text[0...match_range.end_offset]
|
681
|
+
else
|
682
|
+
hnodes[match_index].inner_text
|
683
|
+
end
|
608
684
|
|
609
|
-
log( "trying text match on: #{hnode_text[
|
685
|
+
log( "trying text match on: #{hnode_text[match_offset .. -1]}" )
|
610
686
|
|
611
|
-
|
687
|
+
next_match_offset = hnode_text.index( @text, match_offset )
|
612
688
|
|
613
|
-
if
|
689
|
+
if next_match_offset.nil?
|
690
|
+
log( "no match found for #{@text}" )
|
691
|
+
match_index += 1
|
692
|
+
match_offset = 0
|
693
|
+
next
|
694
|
+
end
|
614
695
|
|
615
|
-
|
616
|
-
|
617
|
-
|
618
|
-
|
619
|
-
|
696
|
+
actual_match = if @text.is_a? Regexp
|
697
|
+
hnode_text[next_match_offset..-1][@text]
|
698
|
+
else
|
699
|
+
@text
|
700
|
+
end
|
701
|
+
|
702
|
+
log( "next text match at #{match_index}.#{next_match_offset}: #{actual_match}" )
|
620
703
|
|
621
|
-
|
622
|
-
|
704
|
+
match_start_offset = next_match_offset
|
705
|
+
match_end_offset = match_start_offset + actual_match.size
|
623
706
|
|
624
|
-
|
625
|
-
|
626
|
-
|
627
|
-
|
628
|
-
log( "matched first #{match_end_offset - match_start_offset} chars" )
|
629
|
-
end
|
630
|
-
break
|
631
|
-
end
|
707
|
+
if match_end_offset >= hnode_text.size
|
708
|
+
log( "matched entire string of #{match_end_offset - match_start_offset} chars" )
|
709
|
+
else
|
710
|
+
log( "matched first #{match_end_offset - match_start_offset} chars" )
|
632
711
|
end
|
633
712
|
|
634
|
-
|
635
|
-
|
713
|
+
result = with_scanned_match_data( match_map, actual_match ) { |match_map|
|
714
|
+
yield( match_range.crop( match_index, match_start_offset, match_index, match_end_offset), match_map )
|
715
|
+
}
|
716
|
+
|
717
|
+
return result if result
|
718
|
+
|
719
|
+
match_offset = match_end_offset
|
636
720
|
|
637
721
|
end
|
638
722
|
|
639
|
-
|
640
|
-
result = with_scanned_match_data( match_map, actual_match ) { |match_map|
|
641
|
-
yield( match_range.crop( match_start_index, match_start_offset, match_start_index, match_end_offset), match_map )
|
642
|
-
}
|
643
|
-
result
|
723
|
+
return nil
|
644
724
|
|
645
725
|
end
|
646
726
|
|
647
727
|
end
|
648
728
|
|
649
729
|
|
730
|
+
# Matches a series of patterns
|
731
|
+
|
650
732
|
class CompositePattern < BasePattern
|
651
733
|
|
652
734
|
attr_reader :parts
|
@@ -676,7 +758,7 @@ class Metrocot < Object
|
|
676
758
|
end
|
677
759
|
|
678
760
|
|
679
|
-
def each_split_match(
|
761
|
+
def each_split_match( comp_match_range, match_map, parts_by_priority, ppx, part_matches )
|
680
762
|
|
681
763
|
pattern = nil
|
682
764
|
|
@@ -690,10 +772,13 @@ class Metrocot < Object
|
|
690
772
|
|
691
773
|
if ppx >= parts_by_priority.size
|
692
774
|
log("comp nothing left to do")
|
693
|
-
return yield(
|
775
|
+
return yield( comp_match_range, match_map )
|
694
776
|
end
|
695
777
|
|
696
778
|
|
779
|
+
match_range = comp_match_range
|
780
|
+
log("comp matching sub-pattern #{pattern.description} within #{match_range.describe}")
|
781
|
+
|
697
782
|
#
|
698
783
|
# figure out which gap this pattern is supposed to fill
|
699
784
|
#
|
@@ -718,15 +803,25 @@ class Metrocot < Object
|
|
718
803
|
if matched_on_left
|
719
804
|
log("comp matching must be right of #{matched_on_left.description}")
|
720
805
|
match_range = match_range.tail(matched_on_left.matched.end_index, matched_on_left.matched.end_offset)
|
806
|
+
elsif matched_on_right
|
807
|
+
right_node = match_range.hnodes[matched_on_right.matched.start_index]
|
808
|
+
parent_of_right_node = right_node && right_node.parent
|
809
|
+
parent_ix_of_right_node = parent_of_right_node && node_scraper.hnode_index[parent_of_right_node]
|
810
|
+
if parent_ix_of_right_node && parent_ix_of_right_node >= match_range.start_index
|
811
|
+
match_range = match_range.tail(parent_ix_of_right_node + 1, 0)
|
812
|
+
log("restricting left boundary to #{match_range} because would otherwise include subtree with right peer")
|
813
|
+
end
|
721
814
|
end
|
722
815
|
|
723
|
-
log("comp matching sub-pattern
|
816
|
+
log("comp matching sub-pattern #{pattern.description} at #{match_range.describe}")
|
724
817
|
|
725
818
|
pattern.each_match( match_range, match_map ) { |part_match_range, match_map|
|
726
819
|
|
727
820
|
pattern.matched = part_match_range
|
728
821
|
|
729
|
-
|
822
|
+
log("found sub-pattern #{pattern.description} at #{part_match_range.describe}")
|
823
|
+
|
824
|
+
result = each_split_match( comp_match_range, match_map, parts_by_priority, ppx + 1, part_matches ) { |sub_match_range, sub_match_map|
|
730
825
|
yield( sub_match_range, match_map )
|
731
826
|
}
|
732
827
|
|
@@ -785,6 +880,9 @@ class Metrocot < Object
|
|
785
880
|
end
|
786
881
|
|
787
882
|
|
883
|
+
# rooted at a node in the dom, the node scraper is used to collect all matches of
|
884
|
+
# patterns.
|
885
|
+
|
788
886
|
class NodeScraper
|
789
887
|
|
790
888
|
attr_accessor :mcot, :root, :parent, :hnode, :pattern_classes, :top_part_names, :verbose
|
@@ -883,15 +981,16 @@ class Metrocot < Object
|
|
883
981
|
result = nil
|
884
982
|
pattern.each_match( match_range, {} ) { |sub_match_range, match_map|
|
885
983
|
match_list = []
|
886
|
-
|
887
|
-
top_part_names.collect { |top_name|
|
984
|
+
result = if (call_with == :positional) && top_part_names.size > 0
|
985
|
+
block_args = top_part_names.collect { |top_name|
|
888
986
|
match_map[top_name]
|
889
987
|
}
|
988
|
+
log("calling pos scan block with: #{block_args.inspect}")
|
989
|
+
result = block.call( *block_args )
|
890
990
|
else
|
891
|
-
match_map
|
991
|
+
log("calling hash scan block with: #{match_map.inspect}")
|
992
|
+
result = block.call( match_map )
|
892
993
|
end
|
893
|
-
log("calling scan block with: #{block_args.join(", ")}")
|
894
|
-
result = block.call( *block_args )
|
895
994
|
if result
|
896
995
|
results << result
|
897
996
|
match_range = match_range.following( sub_match_range )
|
@@ -905,16 +1004,51 @@ class Metrocot < Object
|
|
905
1004
|
results
|
906
1005
|
end
|
907
1006
|
|
1007
|
+
# collects all occurrences of the data matching the pattern by calling the
|
1008
|
+
# yield block for everything part of the dom subtree matching the pattern.
|
1009
|
+
# The block can reject the dom match by returning nil. Anything other than
|
1010
|
+
# nil will be appended to the list returned at the end.
|
1011
|
+
#
|
1012
|
+
# Unlike collect_hashed(), the block will be given a list of parameter
|
1013
|
+
# values matching the list of named fields in the pattern.
|
1014
|
+
#
|
1015
|
+
# === Example
|
1016
|
+
#
|
1017
|
+
# mcot.scrape(doc).descend( "//ul/li" ) { |li|
|
1018
|
+
# li.collect( "liker=... \"likes\" likee=..." ) { |likes, liked|
|
1019
|
+
# [ likes, liked ]
|
1020
|
+
# }
|
1021
|
+
# }
|
1022
|
+
#
|
908
1023
|
|
909
1024
|
def collect( pattern_s, &block )
|
910
1025
|
collect_gen( pattern_s, :positional, &block )
|
911
1026
|
end
|
912
1027
|
|
913
1028
|
|
1029
|
+
# collects all occurrences of the data matching the pattern by calling the
|
1030
|
+
# yield block for everything part of the dom subtree matching the pattern.
|
1031
|
+
# The block can reject the dom match by returning nil. Anything other than
|
1032
|
+
# nil will be appended to the list returned at the end.
|
1033
|
+
#
|
1034
|
+
# Unlike collect(), the block will be given a map of parameter values
|
1035
|
+
# keyed by the names of the named fields in the pattern.
|
1036
|
+
#
|
1037
|
+
# === Example
|
1038
|
+
#
|
1039
|
+
# mcot.scrape(doc).descend( "//ul/li" ) { |li|
|
1040
|
+
# li.collect_hashed( "killer=... verb=/(stabbed|shot|strangled)/ victim=... /(with|using)/ weapon=..." ) { |map|
|
1041
|
+
# Murder.new( map )
|
1042
|
+
# }
|
1043
|
+
# }
|
1044
|
+
|
914
1045
|
def collect_hashed( pattern_s, &block )
|
915
1046
|
collect_gen( pattern_s, :map, &block )
|
916
1047
|
end
|
917
1048
|
|
1049
|
+
|
1050
|
+
# returns the scanner declared with that name when the metrocot was created
|
1051
|
+
|
918
1052
|
def scanner_by_name( name )
|
919
1053
|
return mcot.scanner_by_name(name)
|
920
1054
|
end
|
@@ -922,17 +1056,125 @@ class Metrocot < Object
|
|
922
1056
|
end
|
923
1057
|
|
924
1058
|
|
1059
|
+
# Some useful scanners which should cover a good number of common
|
1060
|
+
# parsing scenarios.
|
1061
|
+
|
1062
|
+
module Scanners
|
1063
|
+
|
1064
|
+
class BaseScanner
|
1065
|
+
def scan(data)
|
1066
|
+
data.to_s
|
1067
|
+
end
|
1068
|
+
end
|
1069
|
+
|
1070
|
+
|
1071
|
+
# Scans the hpricot element or text for a date in one of the
|
1072
|
+
# various formats accepted by Time. Depending on the kinds of
|
1073
|
+
# dates expected in the doc, this may not be sufficient and
|
1074
|
+
# you may have to create a custom scanner.
|
1075
|
+
|
1076
|
+
class DateTimeScanner < BaseScanner
|
1077
|
+
def scan( data )
|
1078
|
+
if data.is_a? Hpricot::Elem
|
1079
|
+
data = data.inner_text
|
1080
|
+
end
|
1081
|
+
Time.parse(data)
|
1082
|
+
end
|
1083
|
+
end
|
1084
|
+
|
1085
|
+
|
1086
|
+
# Scans the dom subtree and converts it to textile where possible
|
1087
|
+
# returning a string containing the Textile version
|
1088
|
+
|
1089
|
+
class TextileScanner < BaseScanner
|
1090
|
+
def scan( data )
|
1091
|
+
if data.is_a? Hpricot::Elem
|
1092
|
+
data = data.inner_text
|
1093
|
+
end
|
1094
|
+
end
|
1095
|
+
end
|
1096
|
+
|
1097
|
+
|
1098
|
+
# just pulls out the plain text. This will probably be the most
|
1099
|
+
# commonly used scanner.
|
1100
|
+
|
1101
|
+
class TextScanner < BaseScanner
|
1102
|
+
def scan( data )
|
1103
|
+
if data.is_a? Hpricot::Elem
|
1104
|
+
data = data.inner_text
|
1105
|
+
else
|
1106
|
+
data = data.to_s
|
1107
|
+
end
|
1108
|
+
data
|
1109
|
+
end
|
1110
|
+
end
|
1111
|
+
|
1112
|
+
|
1113
|
+
# just pulls out the plain text and strips it. This will probably be one of the
|
1114
|
+
# most commonly used scanners.
|
1115
|
+
|
1116
|
+
class StrippingTextScanner < BaseScanner
|
1117
|
+
def scan( data )
|
1118
|
+
if data.is_a? Hpricot::Elem
|
1119
|
+
data = data.inner_text
|
1120
|
+
else
|
1121
|
+
data = data.to_s
|
1122
|
+
end
|
1123
|
+
data && data.strip
|
1124
|
+
end
|
1125
|
+
end
|
1126
|
+
|
1127
|
+
# pulls out text and then discards everything up to the end of line
|
1128
|
+
|
1129
|
+
class LineScanner < BaseScanner
|
1130
|
+
end
|
1131
|
+
|
1132
|
+
end
|
1133
|
+
|
925
1134
|
def log( s )
|
926
1135
|
puts( s ) if @verbose
|
927
1136
|
end
|
928
1137
|
|
929
1138
|
|
1139
|
+
# given a Symbol of a scanner (name), return a handle for that scanner
|
1140
|
+
# declared when the metrocot was created.
|
930
1141
|
|
931
1142
|
def scanner_by_name( name )
|
932
1143
|
@scanners[name]
|
933
1144
|
end
|
934
1145
|
|
935
1146
|
|
1147
|
+
|
1148
|
+
attr_accessor :verbose
|
1149
|
+
|
1150
|
+
|
1151
|
+
def initialize( scanners )
|
1152
|
+
|
1153
|
+
@scanners = {}
|
1154
|
+
@compiled_patterns = {}
|
1155
|
+
|
1156
|
+
scanners.each { |name, value|
|
1157
|
+
if value.is_a? Class
|
1158
|
+
@scanners[name] = value.new
|
1159
|
+
else
|
1160
|
+
@scanners[name] = value
|
1161
|
+
end
|
1162
|
+
}
|
1163
|
+
|
1164
|
+
@verbose = false
|
1165
|
+
|
1166
|
+
log("scanners: #{@scanners.inspect}")
|
1167
|
+
|
1168
|
+
end
|
1169
|
+
|
1170
|
+
#
|
1171
|
+
#
|
1172
|
+
|
1173
|
+
def scrape(doc)
|
1174
|
+
NodeScraper.new( self, nil, nil, doc )
|
1175
|
+
end
|
1176
|
+
|
1177
|
+
|
936
1178
|
def compile_pattern( pattern_s, node_scraper )
|
937
1179
|
|
938
1180
|
# if @compiled_patterns.key? pattern_s
|
@@ -1031,80 +1273,6 @@ class Metrocot < Object
|
|
1031
1273
|
|
1032
1274
|
end
|
1033
1275
|
|
1034
|
-
|
1035
|
-
attr_accessor :verbose
|
1036
|
-
|
1037
|
-
|
1038
|
-
def initialize( scanners )
|
1039
|
-
|
1040
|
-
@scanners = {}
|
1041
|
-
@compiled_patterns = {}
|
1042
|
-
|
1043
|
-
scanners.each { |name, value|
|
1044
|
-
if value.is_a? Class
|
1045
|
-
@scanners[name] = value.new
|
1046
|
-
else
|
1047
|
-
@scanners[name] = value
|
1048
|
-
end
|
1049
|
-
}
|
1050
|
-
|
1051
|
-
@verbose = false
|
1052
|
-
|
1053
|
-
log("scanners: #{@scanners.inspect}")
|
1054
|
-
|
1055
|
-
end
|
1056
|
-
|
1057
|
-
|
1058
|
-
def scrape(doc)
|
1059
|
-
NodeScraper.new( self, nil, nil, doc )
|
1060
|
-
end
|
1061
|
-
|
1062
|
-
|
1063
|
-
module Scanners
|
1064
|
-
|
1065
|
-
class BaseScanner
|
1066
|
-
def scan(data)
|
1067
|
-
data.to_s
|
1068
|
-
end
|
1069
|
-
end
|
1070
|
-
|
1071
|
-
class DateTimeScanner < BaseScanner
|
1072
|
-
def scan( data )
|
1073
|
-
if data.is_a? Hpricot::Elem
|
1074
|
-
data = data.inner_text
|
1075
|
-
end
|
1076
|
-
Time.parse(data)
|
1077
|
-
end
|
1078
|
-
end
|
1079
|
-
|
1080
|
-
class TextLookupScanner < BaseScanner
|
1081
|
-
end
|
1082
|
-
|
1083
|
-
class TextileScanner < BaseScanner
|
1084
|
-
def scan( data )
|
1085
|
-
if data.is_a? Hpricot::Elem
|
1086
|
-
data = data.inner_text
|
1087
|
-
end
|
1088
|
-
end
|
1089
|
-
end
|
1090
|
-
|
1091
|
-
class TextScanner < BaseScanner
|
1092
|
-
def scan( data )
|
1093
|
-
if data.is_a? Hpricot::Elem
|
1094
|
-
data = data.inner_text
|
1095
|
-
else
|
1096
|
-
data = data.to_s
|
1097
|
-
end
|
1098
|
-
data
|
1099
|
-
end
|
1100
|
-
end
|
1101
|
-
|
1102
|
-
class LineScanner < BaseScanner
|
1103
|
-
end
|
1104
|
-
|
1105
|
-
|
1106
|
-
end
|
1107
|
-
|
1108
1276
|
end
|
1109
1277
|
|
1110
1278
|
#
|
data/test/test_metrocot.rb
CHANGED
@@ -20,21 +20,32 @@ class TestMetrocot < Test::Unit::TestCase
|
|
20
20
|
end
|
21
21
|
end
|
22
22
|
|
23
|
+
class Murder < Object
|
24
|
+
attr_accessor :killer, :mod, :weapon, :victim
|
25
|
+
def initialize( map )
|
26
|
+
@killer = map[:killer]
|
27
|
+
@mod = map[:mod]
|
28
|
+
@weapon = map[:weapon]
|
29
|
+
@victim = map[:victim]
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
|
23
34
|
def test_nothing
|
24
35
|
|
25
36
|
html = "<html><head></head><body><h1>hello!</h1><h2>A</h2><p>a 1</p><p>a 2</p><h2>B</h2><p>b 1</p><p>b 2</p><h2>C</h2><p>c 1</p><p>c 2</p></body></html>"
|
26
37
|
|
27
38
|
doc = Hpricot(html)
|
28
39
|
|
29
|
-
|
40
|
+
mcot = Metrocot.new(
|
30
41
|
:a => Metrocot::Scanners::TextScanner,
|
31
42
|
:b => Metrocot::Scanners::TextScanner,
|
32
43
|
:c => Metrocot::Scanners::TextScanner
|
33
44
|
)
|
34
45
|
|
35
|
-
#
|
46
|
+
# mcot.verbose = true
|
36
47
|
|
37
|
-
assert_equal( [[]],
|
48
|
+
assert_equal( [[]], mcot.scrape(doc).descend("//html/body") { |td|
|
38
49
|
td.collect( "a=.//h3 b=.//p c=.//p" ) { |a, b, c|
|
39
50
|
Abc.new( a, b, c )
|
40
51
|
}
|
@@ -42,21 +53,22 @@ class TestMetrocot < Test::Unit::TestCase
|
|
42
53
|
|
43
54
|
end
|
44
55
|
|
56
|
+
|
45
57
|
def test_abc
|
46
58
|
|
47
|
-
html = "<html><head></head><body><h1>hello!</h1><h2>A</h2><p>a 1</p><p>a 2</p><h2>B</h2><p>b 1</p><p>b 2</p><h2>C</h2><p>c 1</p><p>c 2</p></body></html>"
|
59
|
+
html = "<html><head></head><body><h1>hello!</h1><h2>A</h2><h3>where are the cartoon foxes?</h3><p>a 1</p><p>a 2</p><h2>B</h2><p>b 1</p><p>b 2</p><h2>C</h2><p>c 1</p><p>c 2</p><p>Inquiring minds want to know!</p></body></html>"
|
48
60
|
|
49
61
|
doc = Hpricot(html)
|
50
62
|
|
51
|
-
|
63
|
+
mcot = Metrocot.new(
|
52
64
|
:a => Metrocot::Scanners::TextScanner,
|
53
65
|
:b => Metrocot::Scanners::TextScanner,
|
54
66
|
:c => Metrocot::Scanners::TextScanner
|
55
67
|
)
|
56
68
|
|
57
|
-
#
|
69
|
+
# mcot.verbose = true
|
58
70
|
|
59
|
-
abcs =
|
71
|
+
abcs = mcot.scrape(doc).descend("//html/body") { |td|
|
60
72
|
td.collect( "a=.//h2 b=.//p c=.//p" ) { |a, b, c|
|
61
73
|
Abc.new( a, b, c )
|
62
74
|
}
|
@@ -66,5 +78,57 @@ class TestMetrocot < Test::Unit::TestCase
|
|
66
78
|
|
67
79
|
end
|
68
80
|
|
81
|
+
|
82
|
+
def test_murder
|
83
|
+
|
84
|
+
html = %{
|
85
|
+
<html>
|
86
|
+
<head>
|
87
|
+
<title>Murder by Numbers</title>
|
88
|
+
</head>
|
89
|
+
<body>
|
90
|
+
<h1>The Who Killed Who Hoedown</h1>
|
91
|
+
<ul>
|
92
|
+
<li>Bob strangled the plumber with piano wire.</li>
|
93
|
+
<li>Collonel Clinket stabbed Aunt Elizabeth with a screw driver, while Dick shot Harry with a hunting rifle.</li>
|
94
|
+
<li>Other than that, not much happened here today. Honestly!</li>
|
95
|
+
<li>Accidents do and will happen and that's what happened last Friday.</li>
|
96
|
+
</ul>
|
97
|
+
<p>
|
98
|
+
ps: Collonel Clinket would have stabbed Aunt Elizabeth with an ice pick if that had been available.
|
99
|
+
</p>
|
100
|
+
</body>
|
101
|
+
</html>
|
102
|
+
}
|
103
|
+
|
104
|
+
doc = Hpricot(html)
|
105
|
+
|
106
|
+
mcot = Metrocot.new(
|
107
|
+
:killer => Metrocot::Scanners::StrippingTextScanner,
|
108
|
+
:victim => Metrocot::Scanners::StrippingTextScanner,
|
109
|
+
:mod => Metrocot::Scanners::StrippingTextScanner,
|
110
|
+
:weapon => Metrocot::Scanners::StrippingTextScanner
|
111
|
+
)
|
112
|
+
|
113
|
+
# mcot.verbose = true
|
114
|
+
|
115
|
+
murder_strings = []
|
116
|
+
mcot.scrape(doc).descend( "//ul/li" ) { |li|
|
117
|
+
li.collect_hashed( "killer=... mod=/(stabbed|shot|strangled)/ victim=... /(with|using)/ weapon=... /[.,]/" ) { |map| Murder.new( map ) }
|
118
|
+
}.each { |node, murders|
|
119
|
+
murders.each { |murder|
|
120
|
+
ms = " %-20s %-20s %-10s %-20s" % [murder.killer, murder.victim, murder.mod, murder.weapon]
|
121
|
+
# puts ms
|
122
|
+
murder_strings << ms
|
123
|
+
}
|
124
|
+
}
|
125
|
+
|
126
|
+
assert_equal([
|
127
|
+
" Bob the plumber strangled piano wire ",
|
128
|
+
" Collonel Clinket Aunt Elizabeth stabbed a screw driver ",
|
129
|
+
" while Dick Harry shot a hunting rifle "].sort, murder_strings.sort)
|
130
|
+
|
131
|
+
end
|
132
|
+
|
69
133
|
end
|
70
134
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: metrocot
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.
|
4
|
+
version: 1.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Helmut Hissen
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-01-
|
12
|
+
date: 2009-01-08 00:00:00 -08:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -22,7 +22,7 @@ dependencies:
|
|
22
22
|
- !ruby/object:Gem::Version
|
23
23
|
version: 1.8.2
|
24
24
|
version:
|
25
|
-
description: Metrocot builds on
|
25
|
+
description: Metrocot builds on Hpricot to allow scraping of list data from HTML pages with a minimum of code and page specific information. The specification is done in a very compact readable format.
|
26
26
|
email:
|
27
27
|
- helmut@zeebar.com
|
28
28
|
executables:
|
@@ -67,6 +67,6 @@ rubyforge_project: metrocot
|
|
67
67
|
rubygems_version: 1.2.0
|
68
68
|
signing_key:
|
69
69
|
specification_version: 2
|
70
|
-
summary:
|
70
|
+
summary: An Hpricot based tool for harvesting list-like data from HTML pages.
|
71
71
|
test_files:
|
72
72
|
- test/test_metrocot.rb
|