metrocot 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +7 -0
- data/Manifest.txt +7 -0
- data/README.txt +98 -0
- data/Rakefile +12 -0
- data/bin/metrocot +109 -0
- data/lib/metrocot.rb +1112 -0
- data/test/test_metrocot.rb +70 -0
- metadata +72 -0
data/History.txt
ADDED
data/Manifest.txt
ADDED
data/README.txt
ADDED
@@ -0,0 +1,98 @@
|
|
1
|
+
= metrocot
|
2
|
+
|
3
|
+
* http://www.metrocascade.com/mdn/opensource/metrocot (url)
|
4
|
+
|
5
|
+
== DESCRIPTION:
|
6
|
+
|
7
|
+
Metrocot builds on top of Hpricot to allow scraping of list data from HTML pages
|
8
|
+
with a minimum of code and page specific information. The specification is done
|
9
|
+
in a very compact, readable format.
|
10
|
+
|
11
|
+
|
12
|
+
== FEATURES/PROBLEMS:
|
13
|
+
|
14
|
+
* extremely concise specification via single string parameter (not even a DSL)
|
15
|
+
* leverages Hpricot to locate reference points in input data
|
16
|
+
* comes with standard scanners for Time, String, etc
|
17
|
+
* modular data validation through use of user defined scanners
|
18
|
+
|
19
|
+
|
20
|
+
== SYNOPSIS:
|
21
|
+
|
22
|
+
require 'rubygems'
|
23
|
+
require 'metrocot'
|
24
|
+
|
25
|
+
class Event < Object
|
26
|
+
|
27
|
+
attr_accessor :starts_at, :title, :description, :url
|
28
|
+
|
29
|
+
def initialize( starts_at, title, description, url )
|
30
|
+
@starts_at = starts_at
|
31
|
+
@title = title
|
32
|
+
@description = description
|
33
|
+
@url = url
|
34
|
+
end
|
35
|
+
|
36
|
+
end
|
37
|
+
|
38
|
+
mce_url = "http://www.musiccorner.ca/calendar.html"
|
39
|
+
mce_doc = open(URI.parse(mce_url)) { |data| Hpricot(data) }
|
40
|
+
|
41
|
+
scraper = Metrocot.new(
|
42
|
+
:starts_at => Metrocot::Scanners::DateTimeScanner,
|
43
|
+
:description => Metrocot::Scanners::TextScanner,
|
44
|
+
:title => Metrocot::Scanners::TextScanner
|
45
|
+
)
|
46
|
+
|
47
|
+
mce_events = scraper.scrape(mce_doc).descend("//div[@id='content']/table/tr/td") { |td|
|
48
|
+
td.collect( "starts_at=.//h3 ... title=.//h2 ... description=((.//p )+)" ) { |starts_at, title, description| Event.new( starts_at, title, description, mce_url ) }
|
49
|
+
}.values.flatten
|
50
|
+
|
51
|
+
puts "Found #{mce_events.size} mce events:"
|
52
|
+
mce_events.each_with_index { |event, event_index|
|
53
|
+
puts "%3d %20s %s" % [event_index, event.starts_at, event.title]
|
54
|
+
}
|
55
|
+
|
56
|
+
|
57
|
+
== REQUIREMENTS:
|
58
|
+
|
59
|
+
* Hpricot
|
60
|
+
|
61
|
+
== INSTALL:
|
62
|
+
|
63
|
+
* sudo gem install metrocot
|
64
|
+
|
65
|
+
== LICENSE:
|
66
|
+
|
67
|
+
(The MIT License)
|
68
|
+
|
69
|
+
Copyright (c) 2009 Metro Cascade Media Inc
|
70
|
+
|
71
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
72
|
+
a copy of this software and associated documentation files (the
|
73
|
+
'Software'), to deal in the Software without restriction, including
|
74
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
75
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
76
|
+
permit persons to whom the Software is furnished to do so, subject to
|
77
|
+
the following conditions:
|
78
|
+
|
79
|
+
The above copyright notice and this permission notice shall be
|
80
|
+
included in all copies or substantial portions of the Software.
|
81
|
+
|
82
|
+
THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
|
83
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
84
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
85
|
+
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
86
|
+
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
87
|
+
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
88
|
+
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
89
|
+
|
90
|
+
|
91
|
+
== ALL HAIL WHY
|
92
|
+
|
93
|
+
We are like tiny pleasantly chirping hex bugs coding away on the
|
94
|
+
shoulders of Why so that we can create more, and do more with less
|
95
|
+
code, not by virtue of any sharpness of mind on our part, or any
|
96
|
+
other distinction, but because we are carried high and raised up
|
97
|
+
by his giant size.
|
98
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
# -*- ruby -*-
|
2
|
+
|
3
|
+
require 'rubygems'
|
4
|
+
require 'hoe'
|
5
|
+
require './lib/metrocot.rb'
|
6
|
+
|
7
|
+
Hoe.new('metrocot', Metrocot::VERSION) do |p|
|
8
|
+
p.rubyforge_name = 'metrocot' # if different than lowercase project name
|
9
|
+
p.developer('Helmut Hissen', 'helmut@zeebar.com')
|
10
|
+
end
|
11
|
+
|
12
|
+
# vim: syntax=Ruby
|
data/bin/metrocot
ADDED
@@ -0,0 +1,109 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
############################################################################
|
3
|
+
#
|
4
|
+
# Copyright (c) 2009 Metro Cascade Media Inc
|
5
|
+
#
|
6
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
7
|
+
# a copy of this software and associated documentation files (the
|
8
|
+
# 'Software'), to deal in the Software without restriction, including
|
9
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
10
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
11
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
12
|
+
# the following conditions:
|
13
|
+
#
|
14
|
+
# The above copyright notice and this permission notice shall be
|
15
|
+
# included in all copies or substantial portions of the Software.
|
16
|
+
#
|
17
|
+
# THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
|
18
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
19
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
20
|
+
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
21
|
+
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
22
|
+
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
23
|
+
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
24
|
+
#
|
25
|
+
#############################################################################
|
26
|
+
#
|
27
|
+
# Helmut Hissen <helmut@zeebar.com> (Metro Cascade Media Inc)
|
28
|
+
# January 1 2009
|
29
|
+
#
|
30
|
+
#############################################################################
|
31
|
+
#
|
32
|
+
# We are like tiny pleasantly chirping hex bugs coding away on the
|
33
|
+
# shoulders of Why so that we can create more, and do more with less
|
34
|
+
# code, not by virtue of any sharpness of mind on our part, or any
|
35
|
+
# other distinction, but because we are carried high and raised up
|
36
|
+
# by his giant size.
|
37
|
+
#
|
38
|
+
#############################################################################
|
39
|
+
#
|
40
|
+
|
41
|
+
|
42
|
+
require 'rubygems'
|
43
|
+
require 'hpricot'
|
44
|
+
require 'metrocot'
|
45
|
+
require 'open-uri'
|
46
|
+
|
47
|
+
# example url - these guys give amazing concerts for kids
|
48
|
+
url = "http://www.musiccorner.ca/calendar.html"
|
49
|
+
|
50
|
+
# example xpath root - depends on the structure of the document
|
51
|
+
xpath_root = "//div[@id='content']/table/tr/td"
|
52
|
+
|
53
|
+
# example metrocot scan specification - also depends on the page
|
54
|
+
# notice how the values we pull out get names and that these
|
55
|
+
# names map to a specific scanner declared when the scraper
|
56
|
+
# was created.
|
57
|
+
#
|
58
|
+
# the following patterns are supported:
|
59
|
+
# "ABC" a fixed string
|
60
|
+
# /[abc]+/ Regexp
|
61
|
+
# ... anything
|
62
|
+
# ./h1/p Hpricot path
|
63
|
+
# (PATTERN1 PATTERN2) composite pattern
|
64
|
+
# PATTERN1+ one or more occurrences of PATTERN1
|
65
|
+
# SPACE optional white space
|
66
|
+
# NAME=PATTERN pull data out and scan, then collect
|
67
|
+
#
|
68
|
+
mspec = "starts_at=.//h3 ... title=.//h2 ... description=((.//p )+)"
|
69
|
+
|
70
|
+
if ARGV.size > 0
|
71
|
+
url = ARGV[0]
|
72
|
+
if ARGV.size > 1
|
73
|
+
xpath_root = ARGV[1]
|
74
|
+
if ARGV.size > 2
|
75
|
+
mspec = ARGV[2]
|
76
|
+
end
|
77
|
+
end
|
78
|
+
end
|
79
|
+
|
80
|
+
|
81
|
+
doc = open(URI.parse(url)) { |data| Hpricot(data) }
|
82
|
+
|
83
|
+
class Event < Object
|
84
|
+
attr_accessor :starts_at, :title, :description, :url
|
85
|
+
def initialize( starts_at, title, description, url )
|
86
|
+
@starts_at = starts_at
|
87
|
+
@title = title
|
88
|
+
@description = description
|
89
|
+
@url = url
|
90
|
+
end
|
91
|
+
end
|
92
|
+
|
93
|
+
scraper = Metrocot.new(
|
94
|
+
:starts_at => Metrocot::Scanners::DateTimeScanner,
|
95
|
+
:description => Metrocot::Scanners::TextScanner,
|
96
|
+
:title => Metrocot::Scanners::TextScanner
|
97
|
+
)
|
98
|
+
|
99
|
+
events = scraper.scrape(doc).descend(xpath_root) { |td|
|
100
|
+
td.collect( mspec ) { |starts_at, title, description|
|
101
|
+
Event.new( starts_at, title, description, url )
|
102
|
+
}
|
103
|
+
}.values.flatten
|
104
|
+
|
105
|
+
puts "Found #{events.size} events:"
|
106
|
+
events.each_with_index { |event, event_index|
|
107
|
+
puts "%3d %20s %s" % [event_index, event.starts_at, event.title]
|
108
|
+
}
|
109
|
+
|