metrocot 1.0.0 → 1.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +10 -0
- data/README.txt +27 -27
- data/Rakefile +4 -1
- data/lib/metrocot.rb +313 -145
- data/test/test_metrocot.rb +71 -7
- metadata +4 -4
data/History.txt
CHANGED
@@ -1,3 +1,13 @@
|
|
1
|
+
=== 1.0.2 / 2009-01-08
|
2
|
+
|
3
|
+
* added docs and examples
|
4
|
+
* created tests for examples and made those work as well
|
5
|
+
* added StrippingTextScanner
|
6
|
+
|
7
|
+
=== 1.0.1 / 2009-01-03
|
8
|
+
|
9
|
+
* checked in with some initial docs
|
10
|
+
|
1
11
|
=== 1.0.0 / 2009-01-02
|
2
12
|
|
3
13
|
* First working version
|
data/README.txt
CHANGED
@@ -4,9 +4,9 @@
|
|
4
4
|
|
5
5
|
== DESCRIPTION:
|
6
6
|
|
7
|
-
Metrocot builds on
|
8
|
-
with a minimum of code and page specific information. The specification is
|
9
|
-
|
7
|
+
Metrocot builds on Hpricot to allow scraping of list data from HTML pages
|
8
|
+
with a minimum of code and page specific information. The specification is
|
9
|
+
done in a very compact readable format.
|
10
10
|
|
11
11
|
|
12
12
|
== FEATURES/PROBLEMS:
|
@@ -19,39 +19,39 @@ is a very compact readable format.
|
|
19
19
|
|
20
20
|
== SYNOPSIS:
|
21
21
|
|
22
|
-
require 'rubygems'
|
23
|
-
require 'metrocot'
|
22
|
+
require 'rubygems'
|
23
|
+
require 'metrocot'
|
24
24
|
|
25
|
-
class Event < Object
|
25
|
+
class Event < Object
|
26
26
|
|
27
|
-
|
27
|
+
attr_accessor :starts_at, :title, :description, :url
|
28
28
|
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
29
|
+
def initialize( starts_at, title, description, url )
|
30
|
+
@starts_at = starts_at
|
31
|
+
@title = title
|
32
|
+
@description = description
|
33
|
+
@url = url
|
34
|
+
end
|
35
|
+
|
34
36
|
end
|
35
|
-
|
36
|
-
end
|
37
37
|
|
38
|
-
mce_url = "http://www.musiccorner.ca/calendar.html"
|
39
|
-
mce_doc = open(URI.parse(mce_url)) { |data| Hpricot(data) }
|
38
|
+
mce_url = "http://www.musiccorner.ca/calendar.html"
|
39
|
+
mce_doc = open(URI.parse(mce_url)) { |data| Hpricot(data) }
|
40
40
|
|
41
|
-
scraper = Metrocot.new(
|
41
|
+
scraper = Metrocot.new(
|
42
42
|
:starts_at => Metrocot::Scanners::DateTimeScanner,
|
43
43
|
:description => Metrocot::Scanners::TextScanner,
|
44
|
-
:title => Metrocot::Scanners::
|
45
|
-
)
|
44
|
+
:title => Metrocot::Scanners::StrippingTextScanner
|
45
|
+
)
|
46
46
|
|
47
|
-
mce_events = scraper.scrape(mce_doc).descend("//div[@id='content']/table/tr/td") { |td|
|
48
|
-
|
49
|
-
}.values.flatten
|
50
|
-
|
51
|
-
puts "Found #{mce_events.size} mce events:"
|
52
|
-
mce_events.each_with_index { |event, event_index|
|
53
|
-
|
54
|
-
}
|
47
|
+
mce_events = scraper.scrape(mce_doc).descend("//div[@id='content']/table/tr/td") { |td|
|
48
|
+
td.collect( "starts_at=.//h3 ... title=.//h2 ... description=((.//p )+)" ) { |starts_at, title, description| Event.new( starts_at, title, description, mce_url ) }
|
49
|
+
}.values.flatten
|
50
|
+
|
51
|
+
puts "Found #{mce_events.size} mce events:"
|
52
|
+
mce_events.each_with_index { |event, event_index|
|
53
|
+
puts "%3d %20s %s" % [event_index, event.starts_at, event.title]
|
54
|
+
}
|
55
55
|
|
56
56
|
|
57
57
|
== REQUIREMENTS:
|
data/Rakefile
CHANGED
@@ -5,8 +5,11 @@ require 'hoe'
|
|
5
5
|
require './lib/metrocot.rb'
|
6
6
|
|
7
7
|
Hoe.new('metrocot', Metrocot::VERSION) do |p|
|
8
|
-
p.rubyforge_name = 'metrocot'
|
8
|
+
p.rubyforge_name = 'metrocot'
|
9
9
|
p.developer('Helmut Hissen', 'helmut@zeebar.com')
|
10
|
+
p.summary = "An Hpricot based tool for harvesting list-like data from HTML pages."
|
11
|
+
p.remote_rdoc_dir = '' # Release to root
|
10
12
|
end
|
11
13
|
|
14
|
+
|
12
15
|
# vim: syntax=Ruby
|
data/lib/metrocot.rb
CHANGED
@@ -1,4 +1,8 @@
|
|
1
1
|
|
2
|
+
# Author:: Helmut Hissen (mailto:helmut@zeebar.com)
|
3
|
+
# Copyright:: Copyright(c) 2009 Metro Cascade Media, Inc.
|
4
|
+
# License:: Distributed under the BSD open source license
|
5
|
+
#
|
2
6
|
#############################################################################
|
3
7
|
#
|
4
8
|
# Copyright (c) 2009 Metro Cascade Media Inc
|
@@ -24,9 +28,6 @@
|
|
24
28
|
#
|
25
29
|
#############################################################################
|
26
30
|
#
|
27
|
-
# Helmut Hissen <helmut@zeebar.com> (Metro Cascade Media Inc)
|
28
|
-
# January 1 2009
|
29
|
-
#
|
30
31
|
#############################################################################
|
31
32
|
#
|
32
33
|
# We are like tiny pleasantly chirping hex bugs coding away on the
|
@@ -38,9 +39,39 @@
|
|
38
39
|
#############################################################################
|
39
40
|
#
|
40
41
|
|
42
|
+
# == Purpose
|
43
|
+
# This class implements the main Metrocot HTML scanner and a number of handy
|
44
|
+
# input scanners (for grabbing time, numbers, or text from HTML). The purpose
|
45
|
+
# of the Metrocot is to scan an XML dom for the patterns specified in the
|
46
|
+
# Metrocot pattern language.
|
47
|
+
#
|
48
|
+
# == Pattern Language
|
49
|
+
# The Metrocot pattern language allows for the following types of patterns:
|
50
|
+
#
|
51
|
+
# +...+:: matches anything
|
52
|
+
# +"some string"+:: matches that string
|
53
|
+
# +/(some|pattern)/+:: matches that regexp pattern
|
54
|
+
# +./HPRICOT_PATH+:: matches a certain type of dom subtree
|
55
|
+
# +SPACE+:: matches zero or more white spaces
|
56
|
+
# +(PATTERN_A PATTERN_B)+:: matches PATTERN_A followed by PATTERN_B
|
57
|
+
# +PATTERN\++:: matches one or more occurrences of PATTERN
|
58
|
+
#
|
59
|
+
# == Usage
|
60
|
+
# 0) create a Metrocot and define the types of fields you want to extract (and their names).
|
61
|
+
# 1) use Hpricot to get the doc's dom
|
62
|
+
# 2) use descend(xpath) to create a NodeScraper rooted at the Hpricot node(s) matching the xpath
|
63
|
+
# 3) use collect(pattern) to collect all entries found in the HTML which match the Metrocot pattern
|
64
|
+
|
41
65
|
class Metrocot < Object
|
42
66
|
|
43
|
-
VERSION = '1.0.
|
67
|
+
VERSION = '1.0.2'
|
68
|
+
|
69
|
+
|
70
|
+
# represents a subtree within a metrocot dom. the semantics are roughly equivalent
|
71
|
+
# to what you get from select-dragging your mouse pointer through a section of an html
|
72
|
+
# doc in your web browser. That is, a range specifies the first and last node in the
|
73
|
+
# pre-fix traversal of the dom. Additionally, the first and last node may be truncated
|
74
|
+
# (at their tail and head respectively) if they are text nodes.
|
44
75
|
|
45
76
|
class MatchRange
|
46
77
|
|
@@ -126,6 +157,9 @@ class Metrocot < Object
|
|
126
157
|
end
|
127
158
|
|
128
159
|
|
160
|
+
#
|
161
|
+
# base class for all other patterns. Provides some reasonable default behaviours.
|
162
|
+
#
|
129
163
|
|
130
164
|
class BasePattern
|
131
165
|
|
@@ -156,7 +190,11 @@ class Metrocot < Object
|
|
156
190
|
end
|
157
191
|
|
158
192
|
def description
|
159
|
-
|
193
|
+
if name
|
194
|
+
"#{self.class.name} \"#{name}\""
|
195
|
+
else
|
196
|
+
self.class.name
|
197
|
+
end
|
160
198
|
end
|
161
199
|
|
162
200
|
|
@@ -247,6 +285,8 @@ class Metrocot < Object
|
|
247
285
|
end
|
248
286
|
|
249
287
|
|
288
|
+
# matches a certain Hpricot path
|
289
|
+
|
250
290
|
class PathPattern < BasePattern
|
251
291
|
|
252
292
|
def initialize( source, path )
|
@@ -311,6 +351,8 @@ class Metrocot < Object
|
|
311
351
|
|
312
352
|
end
|
313
353
|
|
354
|
+
|
355
|
+
# matches zero or more white spaces
|
314
356
|
|
315
357
|
class OptSpacePattern < BasePattern
|
316
358
|
|
@@ -332,47 +374,58 @@ class Metrocot < Object
|
|
332
374
|
end
|
333
375
|
|
334
376
|
def priority
|
335
|
-
-
|
377
|
+
-8
|
336
378
|
end
|
337
379
|
|
338
380
|
def each_match( match_range, match_map )
|
381
|
+
|
382
|
+
result = nil
|
383
|
+
|
339
384
|
super(match_range, match_map)
|
340
|
-
|
385
|
+
|
386
|
+
match_start_index = match_range.start_index
|
341
387
|
match_start_offset = match_range.start_offset
|
342
|
-
match_end_index
|
343
|
-
match_end_offset
|
388
|
+
match_end_index = match_range.start_index
|
389
|
+
match_end_offset = match_range.start_offset
|
390
|
+
|
391
|
+
raise "negative range #{match_range}" if match_start_index > match_end_index
|
392
|
+
raise "negative range #{match_range}" if match_start_index == match_end_index && match_start_offset > match_end_offset
|
344
393
|
|
345
394
|
# consume rest of first text node
|
346
395
|
|
347
396
|
hnodes = match_range.hnodes
|
397
|
+
hnode_text = nil
|
348
398
|
|
349
399
|
if hnodes[match_start_index] && hnodes[match_start_index].text?
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
400
|
+
|
401
|
+
hnode_text = hnodes[match_start_index].inner_text
|
402
|
+
while match_end_offset < hnode_text.size && (/^\s+$/.=== hnode_text[match_start_offset .. match_end_offset])
|
403
|
+
match_end_offset += 1
|
404
|
+
end
|
354
405
|
|
355
|
-
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
match_range = match_range.tail( match_end_index, match_end_offset )
|
361
|
-
log( "matched first #{match_end_offset - match_start_offset} leading spaces" )
|
362
|
-
end
|
406
|
+
if match_end_offset >= match_start_offset
|
407
|
+
if match_end_offset >= hnode_text.size
|
408
|
+
log( "matched entire string of #{match_end_offset - match_start_offset} spaces" )
|
409
|
+
else
|
410
|
+
log( "matched first #{match_end_offset - match_start_offset} leading spaces" )
|
363
411
|
end
|
412
|
+
end
|
364
413
|
end
|
365
414
|
|
415
|
+
sp_match_range = match_range.crop( match_start_index, match_start_offset, match_end_index, match_end_offset )
|
416
|
+
|
366
417
|
result = with_scanned_match_data( match_map, hnodes[match_start_index ... match_end_index] ) { |match_map|
|
367
|
-
yield(
|
418
|
+
yield( sp_match_range, match_map )
|
368
419
|
}
|
369
|
-
result
|
370
420
|
|
421
|
+
result
|
371
422
|
end
|
372
423
|
|
373
424
|
end
|
374
425
|
|
375
426
|
|
427
|
+
# matches one or more occurrences of some other pattern
|
428
|
+
|
376
429
|
class OneOrMorePattern < BasePattern
|
377
430
|
|
378
431
|
def initialize(repeatee)
|
@@ -458,6 +511,8 @@ class Metrocot < Object
|
|
458
511
|
end
|
459
512
|
|
460
513
|
|
514
|
+
# Matches anything.
|
515
|
+
|
461
516
|
class AnythingPattern < BasePattern
|
462
517
|
|
463
518
|
def description
|
@@ -474,9 +529,25 @@ class Metrocot < Object
|
|
474
529
|
# it just expands to fill whatever gap
|
475
530
|
|
476
531
|
def each_match( match_range, match_map )
|
477
|
-
|
478
|
-
|
479
|
-
|
532
|
+
|
533
|
+
result = if match_range.start_index > match_range.end_index
|
534
|
+
log( "empty range" )
|
535
|
+
nil
|
536
|
+
elsif match_range.start_index > match_range.end_index && match_range.start_offset > match_range.end_offset
|
537
|
+
log( "empty node" )
|
538
|
+
nil
|
539
|
+
elsif match_range.start_index == match_range.end_index || (match_range.start_index + 1 == match_range.end_index && match_range.end_offset == 0) && match_range.hnodes[match_range.start_index].text?
|
540
|
+
log( "single text node range #{match_range.describe}" )
|
541
|
+
raise "bad range #{match_range.describe}" if match_range.hnodes[match_range.start_index].nil?
|
542
|
+
with_scanned_match_data( match_map, match_range.hnodes[match_range.start_index].inner_text[match_range.start_offset...match_range.end_offset] ) { |match_map|
|
543
|
+
yield( match_range, match_map )
|
544
|
+
}
|
545
|
+
else
|
546
|
+
log( "multi node range #{match_range.describe}" )
|
547
|
+
with_scanned_match_data( match_map, match_range.hnodes[match_range.start_index ... match_range.end_index] ) { |match_map|
|
548
|
+
yield( match_range, match_map )
|
549
|
+
}
|
550
|
+
end
|
480
551
|
end
|
481
552
|
|
482
553
|
|
@@ -487,6 +558,8 @@ class Metrocot < Object
|
|
487
558
|
end
|
488
559
|
|
489
560
|
|
561
|
+
# Matches a certain text string or regex pattern
|
562
|
+
|
490
563
|
class TextPattern < BasePattern
|
491
564
|
|
492
565
|
def initialize( source, text )
|
@@ -580,10 +653,9 @@ class Metrocot < Object
|
|
580
653
|
|
581
654
|
super(match_range, match_map)
|
582
655
|
|
583
|
-
|
584
|
-
|
585
|
-
|
586
|
-
match_end_offset = match_range.start_offset
|
656
|
+
match_index = match_range.start_index
|
657
|
+
match_offset = match_range.start_offset
|
658
|
+
|
587
659
|
|
588
660
|
# consume rest of first text node
|
589
661
|
|
@@ -591,62 +663,72 @@ class Metrocot < Object
|
|
591
663
|
|
592
664
|
actual_match = nil
|
593
665
|
|
594
|
-
while
|
666
|
+
while match_index < match_range.end_index || (match_index == match_range.end_index && match_offset < match_range.end_offset)
|
595
667
|
|
596
|
-
while
|
597
|
-
log( "not text: ##{
|
598
|
-
|
599
|
-
|
668
|
+
while (match_index < match_range.end_index || (match_index == match_range.end_index && match_offset < match_range.end_offset)) && ! hnodes[match_index].text?
|
669
|
+
log( "not text: ##{match_index} #{hnodes[match_index].class}" )
|
670
|
+
match_index += 1
|
671
|
+
match_offset = 0
|
600
672
|
end
|
601
673
|
|
602
|
-
unless
|
674
|
+
unless (match_index < match_range.end_index || (match_index == match_range.end_index && match_offset < match_range.end_offset)) && hnodes[match_index].text?
|
603
675
|
log( "no match found" )
|
604
676
|
return nil
|
605
677
|
end
|
606
678
|
|
607
|
-
hnode_text =
|
679
|
+
hnode_text = if match_index == match_range.end_index
|
680
|
+
hnodes[match_index].inner_text[0...match_range.end_offset]
|
681
|
+
else
|
682
|
+
hnodes[match_index].inner_text
|
683
|
+
end
|
608
684
|
|
609
|
-
log( "trying text match on: #{hnode_text[
|
685
|
+
log( "trying text match on: #{hnode_text[match_offset .. -1]}" )
|
610
686
|
|
611
|
-
|
687
|
+
next_match_offset = hnode_text.index( @text, match_offset )
|
612
688
|
|
613
|
-
if
|
689
|
+
if next_match_offset.nil?
|
690
|
+
log( "no match found for #{@text}" )
|
691
|
+
match_index += 1
|
692
|
+
match_offset = 0
|
693
|
+
next
|
694
|
+
end
|
614
695
|
|
615
|
-
|
616
|
-
|
617
|
-
|
618
|
-
|
619
|
-
|
696
|
+
actual_match = if @text.is_a? Regexp
|
697
|
+
hnode_text[next_match_offset..-1][@text]
|
698
|
+
else
|
699
|
+
@text
|
700
|
+
end
|
701
|
+
|
702
|
+
log( "next text match at #{match_index}.#{next_match_offset}: #{actual_match}" )
|
620
703
|
|
621
|
-
|
622
|
-
|
704
|
+
match_start_offset = next_match_offset
|
705
|
+
match_end_offset = match_start_offset + actual_match.size
|
623
706
|
|
624
|
-
|
625
|
-
|
626
|
-
|
627
|
-
|
628
|
-
log( "matched first #{match_end_offset - match_start_offset} chars" )
|
629
|
-
end
|
630
|
-
break
|
631
|
-
end
|
707
|
+
if match_end_offset >= hnode_text.size
|
708
|
+
log( "matched entire string of #{match_end_offset - match_start_offset} chars" )
|
709
|
+
else
|
710
|
+
log( "matched first #{match_end_offset - match_start_offset} chars" )
|
632
711
|
end
|
633
712
|
|
634
|
-
|
635
|
-
|
713
|
+
result = with_scanned_match_data( match_map, actual_match ) { |match_map|
|
714
|
+
yield( match_range.crop( match_index, match_start_offset, match_index, match_end_offset), match_map )
|
715
|
+
}
|
716
|
+
|
717
|
+
return result if result
|
718
|
+
|
719
|
+
match_offset = match_end_offset
|
636
720
|
|
637
721
|
end
|
638
722
|
|
639
|
-
|
640
|
-
result = with_scanned_match_data( match_map, actual_match ) { |match_map|
|
641
|
-
yield( match_range.crop( match_start_index, match_start_offset, match_start_index, match_end_offset), match_map )
|
642
|
-
}
|
643
|
-
result
|
723
|
+
return nil
|
644
724
|
|
645
725
|
end
|
646
726
|
|
647
727
|
end
|
648
728
|
|
649
729
|
|
730
|
+
# Matches a series of patterns
|
731
|
+
|
650
732
|
class CompositePattern < BasePattern
|
651
733
|
|
652
734
|
attr_reader :parts
|
@@ -676,7 +758,7 @@ class Metrocot < Object
|
|
676
758
|
end
|
677
759
|
|
678
760
|
|
679
|
-
def each_split_match(
|
761
|
+
def each_split_match( comp_match_range, match_map, parts_by_priority, ppx, part_matches )
|
680
762
|
|
681
763
|
pattern = nil
|
682
764
|
|
@@ -690,10 +772,13 @@ class Metrocot < Object
|
|
690
772
|
|
691
773
|
if ppx >= parts_by_priority.size
|
692
774
|
log("comp nothing left to do")
|
693
|
-
return yield(
|
775
|
+
return yield( comp_match_range, match_map )
|
694
776
|
end
|
695
777
|
|
696
778
|
|
779
|
+
match_range = comp_match_range
|
780
|
+
log("comp matching sub-pattern #{pattern.description} within #{match_range.describe}")
|
781
|
+
|
697
782
|
#
|
698
783
|
# figure out which gap this pattern is supposed to fill
|
699
784
|
#
|
@@ -718,15 +803,25 @@ class Metrocot < Object
|
|
718
803
|
if matched_on_left
|
719
804
|
log("comp matching must be right of #{matched_on_left.description}")
|
720
805
|
match_range = match_range.tail(matched_on_left.matched.end_index, matched_on_left.matched.end_offset)
|
806
|
+
elsif matched_on_right
|
807
|
+
right_node = match_range.hnodes[matched_on_right.matched.start_index]
|
808
|
+
parent_of_right_node = right_node && right_node.parent
|
809
|
+
parent_ix_of_right_node = parent_of_right_node && node_scraper.hnode_index[parent_of_right_node]
|
810
|
+
if parent_ix_of_right_node && parent_ix_of_right_node >= match_range.start_index
|
811
|
+
match_range = match_range.tail(parent_ix_of_right_node + 1, 0)
|
812
|
+
log("restricting left boundary to #{match_range} because would otherwise include subtree with right peer")
|
813
|
+
end
|
721
814
|
end
|
722
815
|
|
723
|
-
log("comp matching sub-pattern
|
816
|
+
log("comp matching sub-pattern #{pattern.description} at #{match_range.describe}")
|
724
817
|
|
725
818
|
pattern.each_match( match_range, match_map ) { |part_match_range, match_map|
|
726
819
|
|
727
820
|
pattern.matched = part_match_range
|
728
821
|
|
729
|
-
|
822
|
+
log("found sub-pattern #{pattern.description} at #{part_match_range.describe}")
|
823
|
+
|
824
|
+
result = each_split_match( comp_match_range, match_map, parts_by_priority, ppx + 1, part_matches ) { |sub_match_range, sub_match_map|
|
730
825
|
yield( sub_match_range, match_map )
|
731
826
|
}
|
732
827
|
|
@@ -785,6 +880,9 @@ class Metrocot < Object
|
|
785
880
|
end
|
786
881
|
|
787
882
|
|
883
|
+
# rooted at a node in the dom, the node scraper is used to collect all matches of
|
884
|
+
# patterns.
|
885
|
+
|
788
886
|
class NodeScraper
|
789
887
|
|
790
888
|
attr_accessor :mcot, :root, :parent, :hnode, :pattern_classes, :top_part_names, :verbose
|
@@ -883,15 +981,16 @@ class Metrocot < Object
|
|
883
981
|
result = nil
|
884
982
|
pattern.each_match( match_range, {} ) { |sub_match_range, match_map|
|
885
983
|
match_list = []
|
886
|
-
|
887
|
-
top_part_names.collect { |top_name|
|
984
|
+
result = if (call_with == :positional) && top_part_names.size > 0
|
985
|
+
block_args = top_part_names.collect { |top_name|
|
888
986
|
match_map[top_name]
|
889
987
|
}
|
988
|
+
log("calling pos scan block with: #{block_args.inspect}")
|
989
|
+
result = block.call( *block_args )
|
890
990
|
else
|
891
|
-
match_map
|
991
|
+
log("calling hash scan block with: #{match_map.inspect}")
|
992
|
+
result = block.call( match_map )
|
892
993
|
end
|
893
|
-
log("calling scan block with: #{block_args.join(", ")}")
|
894
|
-
result = block.call( *block_args )
|
895
994
|
if result
|
896
995
|
results << result
|
897
996
|
match_range = match_range.following( sub_match_range )
|
@@ -905,16 +1004,51 @@ class Metrocot < Object
|
|
905
1004
|
results
|
906
1005
|
end
|
907
1006
|
|
1007
|
+
# collects all occurrences of the data matching the pattern by calling the
|
1008
|
+
# yield block for everything part of the dom subtree matching the pattern.
|
1009
|
+
# The block can reject the dom match by returning nil. Anything other than
|
1010
|
+
# nil will be appended to the list returned at the end.
|
1011
|
+
#
|
1012
|
+
# Unlike collect_hashed(), the block will be given a list of parameter
|
1013
|
+
# values matching the list of named fields in the pattern.
|
1014
|
+
#
|
1015
|
+
# === Example
|
1016
|
+
#
|
1017
|
+
# mcot.scrape(doc).descend( "//ul/li" ) { |li|
|
1018
|
+
# li.collect( "liker=... \"likes\" likee=..." ) { |likes, liked|
|
1019
|
+
# [ likes, liked ]
|
1020
|
+
# }
|
1021
|
+
# }
|
1022
|
+
#
|
908
1023
|
|
909
1024
|
def collect( pattern_s, &block )
|
910
1025
|
collect_gen( pattern_s, :positional, &block )
|
911
1026
|
end
|
912
1027
|
|
913
1028
|
|
1029
|
+
# collects all occurrences of the data matching the pattern by calling the
|
1030
|
+
# yield block for everything part of the dom subtree matching the pattern.
|
1031
|
+
# The block can reject the dom match by returning nil. Anything other than
|
1032
|
+
# nil will be appended to the list returned at the end.
|
1033
|
+
#
|
1034
|
+
# Unlike collect(), the block will be given a map of parameter values
|
1035
|
+
# keyed by the names of the named fields in the pattern.
|
1036
|
+
#
|
1037
|
+
# === Example
|
1038
|
+
#
|
1039
|
+
# mcot.scrape(doc).descend( "//ul/li" ) { |li|
|
1040
|
+
# li.collect_hashed( "killer=... verb=/(stabbed|shot|strangled)/ victim=... \"(with|using)\" weapon=..." ) { |map|
|
1041
|
+
# Murder.new( map )
|
1042
|
+
# }
|
1043
|
+
# }
|
1044
|
+
|
914
1045
|
def collect_hashed( pattern_s, &block )
|
915
1046
|
collect_gen( pattern_s, :map, &block )
|
916
1047
|
end
|
917
1048
|
|
1049
|
+
|
1050
|
+
# returns the scanner declared with that name when the metrocot was created
|
1051
|
+
|
918
1052
|
def scanner_by_name( name )
|
919
1053
|
return mcot.scanner_by_name(name)
|
920
1054
|
end
|
@@ -922,17 +1056,125 @@ class Metrocot < Object
|
|
922
1056
|
end
|
923
1057
|
|
924
1058
|
|
1059
|
+
# Some useful scanners which should cover a good number of common
|
1060
|
+
# parsing scenarios.
|
1061
|
+
|
1062
|
+
module Scanners
|
1063
|
+
|
1064
|
+
class BaseScanner
|
1065
|
+
def scan(data)
|
1066
|
+
data.to_s
|
1067
|
+
end
|
1068
|
+
end
|
1069
|
+
|
1070
|
+
|
1071
|
+
# Scans the hpricot element or text for a date in one of the
|
1072
|
+
# various formats accepted by Time. Depending on the kinds of
|
1073
|
+
# dates expected in the doc, this may not be sufficient and
|
1074
|
+
# you may have to create a custom scanner.
|
1075
|
+
|
1076
|
+
class DateTimeScanner < BaseScanner
|
1077
|
+
def scan( data )
|
1078
|
+
if data.is_a? Hpricot::Elem
|
1079
|
+
data = data.inner_text
|
1080
|
+
end
|
1081
|
+
Time.parse(data)
|
1082
|
+
end
|
1083
|
+
end
|
1084
|
+
|
1085
|
+
|
1086
|
+
# Scans the dom subtree and converts it to textile where possible
|
1087
|
+
# returning a string containing the Textile version
|
1088
|
+
|
1089
|
+
class TextileScanner < BaseScanner
|
1090
|
+
def scan( data )
|
1091
|
+
if data.is_a? Hpricot::Elem
|
1092
|
+
data = data.inner_text
|
1093
|
+
end
|
1094
|
+
end
|
1095
|
+
end
|
1096
|
+
|
1097
|
+
|
1098
|
+
# just pulls out the plain text. This will probably be the most
|
1099
|
+
# commonly used scanner.
|
1100
|
+
|
1101
|
+
class TextScanner < BaseScanner
|
1102
|
+
def scan( data )
|
1103
|
+
if data.is_a? Hpricot::Elem
|
1104
|
+
data = data.inner_text
|
1105
|
+
else
|
1106
|
+
data = data.to_s
|
1107
|
+
end
|
1108
|
+
data
|
1109
|
+
end
|
1110
|
+
end
|
1111
|
+
|
1112
|
+
|
1113
|
+
# just pulls out the plain text and strips it. This will probably be one of the
|
1114
|
+
# most commonly used scanners.
|
1115
|
+
|
1116
|
+
class StrippingTextScanner < BaseScanner
|
1117
|
+
def scan( data )
|
1118
|
+
if data.is_a? Hpricot::Elem
|
1119
|
+
data = data.inner_text
|
1120
|
+
else
|
1121
|
+
data = data.to_s
|
1122
|
+
end
|
1123
|
+
data && data.strip
|
1124
|
+
end
|
1125
|
+
end
|
1126
|
+
|
1127
|
+
# pulls out text and then discards everything up to the end of line
|
1128
|
+
|
1129
|
+
class LineScanner < BaseScanner
|
1130
|
+
end
|
1131
|
+
|
1132
|
+
end
|
1133
|
+
|
925
1134
|
def log( s )
|
926
1135
|
puts( s ) if @verbose
|
927
1136
|
end
|
928
1137
|
|
929
1138
|
|
1139
|
+
# given a Symbol of a scanner (name), return a handle for that scanner
|
1140
|
+
# declared when the metrocot was created.
|
930
1141
|
|
931
1142
|
def scanner_by_name( name )
|
932
1143
|
@scanners[name]
|
933
1144
|
end
|
934
1145
|
|
935
1146
|
|
1147
|
+
|
1148
|
+
attr_accessor :verbose
|
1149
|
+
|
1150
|
+
|
1151
|
+
def initialize( scanners )
|
1152
|
+
|
1153
|
+
@scanners = {}
|
1154
|
+
@compiled_patterns = {}
|
1155
|
+
|
1156
|
+
scanners.each { |name, value|
|
1157
|
+
if value.is_a? Class
|
1158
|
+
@scanners[name] = value.new
|
1159
|
+
else
|
1160
|
+
@scanners[name] = value
|
1161
|
+
end
|
1162
|
+
}
|
1163
|
+
|
1164
|
+
@verbose = false
|
1165
|
+
|
1166
|
+
log("scanners: #{@scanners.inspect}")
|
1167
|
+
|
1168
|
+
end
|
1169
|
+
|
1170
|
+
#
|
1171
|
+
#
|
1172
|
+
|
1173
|
+
def scrape(doc)
|
1174
|
+
NodeScraper.new( self, nil, nil, doc )
|
1175
|
+
end
|
1176
|
+
|
1177
|
+
|
936
1178
|
def compile_pattern( pattern_s, node_scraper )
|
937
1179
|
|
938
1180
|
# if @compiled_patterns.key? pattern_s
|
@@ -1031,80 +1273,6 @@ class Metrocot < Object
|
|
1031
1273
|
|
1032
1274
|
end
|
1033
1275
|
|
1034
|
-
|
1035
|
-
attr_accessor :verbose
|
1036
|
-
|
1037
|
-
|
1038
|
-
def initialize( scanners )
|
1039
|
-
|
1040
|
-
@scanners = {}
|
1041
|
-
@compiled_patterns = {}
|
1042
|
-
|
1043
|
-
scanners.each { |name, value|
|
1044
|
-
if value.is_a? Class
|
1045
|
-
@scanners[name] = value.new
|
1046
|
-
else
|
1047
|
-
@scanners[name] = value
|
1048
|
-
end
|
1049
|
-
}
|
1050
|
-
|
1051
|
-
@verbose = false
|
1052
|
-
|
1053
|
-
log("scanners: #{@scanners.inspect}")
|
1054
|
-
|
1055
|
-
end
|
1056
|
-
|
1057
|
-
|
1058
|
-
def scrape(doc)
|
1059
|
-
NodeScraper.new( self, nil, nil, doc )
|
1060
|
-
end
|
1061
|
-
|
1062
|
-
|
1063
|
-
module Scanners
|
1064
|
-
|
1065
|
-
class BaseScanner
|
1066
|
-
def scan(data)
|
1067
|
-
data.to_s
|
1068
|
-
end
|
1069
|
-
end
|
1070
|
-
|
1071
|
-
class DateTimeScanner < BaseScanner
|
1072
|
-
def scan( data )
|
1073
|
-
if data.is_a? Hpricot::Elem
|
1074
|
-
data = data.inner_text
|
1075
|
-
end
|
1076
|
-
Time.parse(data)
|
1077
|
-
end
|
1078
|
-
end
|
1079
|
-
|
1080
|
-
class TextLookupScanner < BaseScanner
|
1081
|
-
end
|
1082
|
-
|
1083
|
-
class TextileScanner < BaseScanner
|
1084
|
-
def scan( data )
|
1085
|
-
if data.is_a? Hpricot::Elem
|
1086
|
-
data = data.inner_text
|
1087
|
-
end
|
1088
|
-
end
|
1089
|
-
end
|
1090
|
-
|
1091
|
-
class TextScanner < BaseScanner
|
1092
|
-
def scan( data )
|
1093
|
-
if data.is_a? Hpricot::Elem
|
1094
|
-
data = data.inner_text
|
1095
|
-
else
|
1096
|
-
data = data.to_s
|
1097
|
-
end
|
1098
|
-
data
|
1099
|
-
end
|
1100
|
-
end
|
1101
|
-
|
1102
|
-
class LineScanner < BaseScanner
|
1103
|
-
end
|
1104
|
-
|
1105
|
-
|
1106
|
-
end
|
1107
|
-
|
1108
1276
|
end
|
1109
1277
|
|
1110
1278
|
#
|
data/test/test_metrocot.rb
CHANGED
@@ -20,21 +20,32 @@ class TestMetrocot < Test::Unit::TestCase
|
|
20
20
|
end
|
21
21
|
end
|
22
22
|
|
23
|
+
class Murder < Object
|
24
|
+
attr_accessor :killer, :mod, :weapon, :victim
|
25
|
+
def initialize( map )
|
26
|
+
@killer = map[:killer]
|
27
|
+
@mod = map[:mod]
|
28
|
+
@weapon = map[:weapon]
|
29
|
+
@victim = map[:victim]
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
|
23
34
|
def test_nothing
|
24
35
|
|
25
36
|
html = "<html><head></head><body><h1>hello!</h1><h2>A</h2><p>a 1</p><p>a 2</p><h2>B</h2><p>b 1</p><p>b 2</p><h2>C</h2><p>c 1</p><p>c 2</p></body></html>"
|
26
37
|
|
27
38
|
doc = Hpricot(html)
|
28
39
|
|
29
|
-
|
40
|
+
mcot = Metrocot.new(
|
30
41
|
:a => Metrocot::Scanners::TextScanner,
|
31
42
|
:b => Metrocot::Scanners::TextScanner,
|
32
43
|
:c => Metrocot::Scanners::TextScanner
|
33
44
|
)
|
34
45
|
|
35
|
-
#
|
46
|
+
# mcot.verbose = true
|
36
47
|
|
37
|
-
assert_equal( [[]],
|
48
|
+
assert_equal( [[]], mcot.scrape(doc).descend("//html/body") { |td|
|
38
49
|
td.collect( "a=.//h3 b=.//p c=.//p" ) { |a, b, c|
|
39
50
|
Abc.new( a, b, c )
|
40
51
|
}
|
@@ -42,21 +53,22 @@ class TestMetrocot < Test::Unit::TestCase
|
|
42
53
|
|
43
54
|
end
|
44
55
|
|
56
|
+
|
45
57
|
def test_abc
|
46
58
|
|
47
|
-
html = "<html><head></head><body><h1>hello!</h1><h2>A</h2><p>a 1</p><p>a 2</p><h2>B</h2><p>b 1</p><p>b 2</p><h2>C</h2><p>c 1</p><p>c 2</p></body></html>"
|
59
|
+
html = "<html><head></head><body><h1>hello!</h1><h2>A</h2><h3>where are the cartoon foxes?</h3><p>a 1</p><p>a 2</p><h2>B</h2><p>b 1</p><p>b 2</p><h2>C</h2><p>c 1</p><p>c 2</p><p>Inquiring minds want to know!</p></body></html>"
|
48
60
|
|
49
61
|
doc = Hpricot(html)
|
50
62
|
|
51
|
-
|
63
|
+
mcot = Metrocot.new(
|
52
64
|
:a => Metrocot::Scanners::TextScanner,
|
53
65
|
:b => Metrocot::Scanners::TextScanner,
|
54
66
|
:c => Metrocot::Scanners::TextScanner
|
55
67
|
)
|
56
68
|
|
57
|
-
#
|
69
|
+
# mcot.verbose = true
|
58
70
|
|
59
|
-
abcs =
|
71
|
+
abcs = mcot.scrape(doc).descend("//html/body") { |td|
|
60
72
|
td.collect( "a=.//h2 b=.//p c=.//p" ) { |a, b, c|
|
61
73
|
Abc.new( a, b, c )
|
62
74
|
}
|
@@ -66,5 +78,57 @@ class TestMetrocot < Test::Unit::TestCase
|
|
66
78
|
|
67
79
|
end
|
68
80
|
|
81
|
+
|
82
|
+
def test_murder
|
83
|
+
|
84
|
+
html = %{
|
85
|
+
<html>
|
86
|
+
<head>
|
87
|
+
<title>Murder by Numbers</title>
|
88
|
+
</head>
|
89
|
+
<body>
|
90
|
+
<h1>The Who Killed Who Hoedown</h1>
|
91
|
+
<ul>
|
92
|
+
<li>Bob strangled the plumber with piano wire.</li>
|
93
|
+
<li>Collonel Clinket stabbed Aunt Elizabeth with a screw driver, while Dick shot Harry with a hunting rifle.</li>
|
94
|
+
<li>Other than that, not much happened here today. Honestly!</li>
|
95
|
+
<li>Accidents do and will happen and that's what happened last Friday.</li>
|
96
|
+
</ul>
|
97
|
+
<p>
|
98
|
+
ps: Collonel Clinket would have stabbed Aunt Elizabeth with an ice pick if that had been available.
|
99
|
+
</p>
|
100
|
+
</body>
|
101
|
+
</html>
|
102
|
+
}
|
103
|
+
|
104
|
+
doc = Hpricot(html)
|
105
|
+
|
106
|
+
mcot = Metrocot.new(
|
107
|
+
:killer => Metrocot::Scanners::StrippingTextScanner,
|
108
|
+
:victim => Metrocot::Scanners::StrippingTextScanner,
|
109
|
+
:mod => Metrocot::Scanners::StrippingTextScanner,
|
110
|
+
:weapon => Metrocot::Scanners::StrippingTextScanner
|
111
|
+
)
|
112
|
+
|
113
|
+
# mcot.verbose = true
|
114
|
+
|
115
|
+
murder_strings = []
|
116
|
+
mcot.scrape(doc).descend( "//ul/li" ) { |li|
|
117
|
+
li.collect_hashed( "killer=... mod=/(stabbed|shot|strangled)/ victim=... /(with|using)/ weapon=... /[.,]/" ) { |map| Murder.new( map ) }
|
118
|
+
}.each { |node, murders|
|
119
|
+
murders.each { |murder|
|
120
|
+
ms = " %-20s %-20s %-10s %-20s" % [murder.killer, murder.victim, murder.mod, murder.weapon]
|
121
|
+
# puts ms
|
122
|
+
murder_strings << ms
|
123
|
+
}
|
124
|
+
}
|
125
|
+
|
126
|
+
assert_equal([
|
127
|
+
" Bob the plumber strangled piano wire ",
|
128
|
+
" Collonel Clinket Aunt Elizabeth stabbed a screw driver ",
|
129
|
+
" while Dick Harry shot a hunting rifle "].sort, murder_strings.sort)
|
130
|
+
|
131
|
+
end
|
132
|
+
|
69
133
|
end
|
70
134
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: metrocot
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.
|
4
|
+
version: 1.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Helmut Hissen
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-01-
|
12
|
+
date: 2009-01-08 00:00:00 -08:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -22,7 +22,7 @@ dependencies:
|
|
22
22
|
- !ruby/object:Gem::Version
|
23
23
|
version: 1.8.2
|
24
24
|
version:
|
25
|
-
description: Metrocot builds on
|
25
|
+
description: Metrocot builds on Hpricot to allow scraping of list data from HTML pages with a minimum of code and page specific information. The specification is done in a very compact readable format.
|
26
26
|
email:
|
27
27
|
- helmut@zeebar.com
|
28
28
|
executables:
|
@@ -67,6 +67,6 @@ rubyforge_project: metrocot
|
|
67
67
|
rubygems_version: 1.2.0
|
68
68
|
signing_key:
|
69
69
|
specification_version: 2
|
70
|
-
summary:
|
70
|
+
summary: An Hpricot based tool for harvesting list-like data from HTML pages.
|
71
71
|
test_files:
|
72
72
|
- test/test_metrocot.rb
|