metrocot 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
data/History.txt ADDED
@@ -0,0 +1,7 @@
1
+ === 1.0.0 / 2009-01-02
2
+
3
+ * First working version
4
+
5
+ * simple object collection based on single string specification
6
+ * scans several test URLs without problems
7
+
data/Manifest.txt ADDED
@@ -0,0 +1,7 @@
1
+ History.txt
2
+ Manifest.txt
3
+ README.txt
4
+ Rakefile
5
+ bin/metrocot
6
+ lib/metrocot.rb
7
+ test/test_metrocot.rb
data/README.txt ADDED
@@ -0,0 +1,98 @@
1
+ = metrocot
2
+
3
+ * http://www.metrocascade.com/mdn/opensource/metrocot (url)
4
+
5
+ == DESCRIPTION:
6
+
7
+ Metrocot builds on top of Hpricot to allow scraping of list data from HTML pages
8
+ with a minimum of code and page specific information. The specification is done
9
+ in a very compact, readable format.
10
+
11
+
12
+ == FEATURES/PROBLEMS:
13
+
14
+ * extremely concise specification via single string parameter (not even a DSL)
15
+ * leverages Hpricot to locate reference points in input data
16
+ * comes with standard scanners for Time, String, etc
17
+ * modular data validation through use of user defined scanners
18
+
19
+
20
+ == SYNOPSIS:
21
+
22
+ require 'rubygems'
23
+ require 'metrocot'
24
+
25
+ class Event < Object
26
+
27
+ attr_accessor :starts_at, :title, :description, :url
28
+
29
+ def initialize( starts_at, title, description, url )
30
+ @starts_at = starts_at
31
+ @title = title
32
+ @description = description
33
+ @url = url
34
+ end
35
+
36
+ end
37
+
38
+ mce_url = "http://www.musiccorner.ca/calendar.html"
39
+ mce_doc = URI.parse(mce_url).open { |data| Hpricot(data) } # URI#open comes from require 'open-uri'; Kernel#open no longer fetches URLs on Ruby 3+
40
+
41
+ scraper = Metrocot.new(
42
+ :starts_at => Metrocot::Scanners::DateTimeScanner,
43
+ :description => Metrocot::Scanners::TextScanner,
44
+ :title => Metrocot::Scanners::TextScanner
45
+ )
46
+
47
+ mce_events = scraper.scrape(mce_doc).descend("//div[@id='content']/table/tr/td") { |td|
48
+ td.collect( "starts_at=.//h3 ... title=.//h2 ... description=((.//p )+)" ) { |starts_at, title, description| Event.new( starts_at, title, description, mce_url ) }
49
+ }.values.flatten
50
+
51
+ puts "Found #{mce_events.size} mce events:"
52
+ mce_events.each_with_index { |event, event_index|
53
+ puts "%3d %20s %s" % [event_index, event.starts_at, event.title]
54
+ }
55
+
56
+
57
+ == REQUIREMENTS:
58
+
59
+ * Hpricot
60
+
61
+ == INSTALL:
62
+
63
+ * sudo gem install metrocot
64
+
65
+ == LICENSE:
66
+
67
+ (The MIT License)
68
+
69
+ Copyright (c) 2009 Metro Cascade Media Inc
70
+
71
+ Permission is hereby granted, free of charge, to any person obtaining
72
+ a copy of this software and associated documentation files (the
73
+ 'Software'), to deal in the Software without restriction, including
74
+ without limitation the rights to use, copy, modify, merge, publish,
75
+ distribute, sublicense, and/or sell copies of the Software, and to
76
+ permit persons to whom the Software is furnished to do so, subject to
77
+ the following conditions:
78
+
79
+ The above copyright notice and this permission notice shall be
80
+ included in all copies or substantial portions of the Software.
81
+
82
+ THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
83
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
84
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
85
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
86
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
87
+ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
88
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
89
+
90
+
91
+ == ALL HAIL WHY
92
+
93
+ We are like tiny pleasantly chirping hex bugs coding away on the
94
+ shoulders of Why so that we can create more, and do more with less
95
+ code, not by virtue of any sharpness of mind on our part, or any
96
+ other distinction, but because we are carried high and raised up
97
+ by his giant size.
98
+
data/Rakefile ADDED
@@ -0,0 +1,12 @@
1
+ # -*- ruby -*-
2
+
3
+ require 'rubygems'
4
+ require 'hoe'
5
+ require './lib/metrocot.rb' # loaded here so Metrocot::VERSION can be read below
6
+
7
+ Hoe.new('metrocot', Metrocot::VERSION) do |p| # hands the project name and library version to Hoe
8
+ p.rubyforge_name = 'metrocot' # if different than lowercase project name
9
+ p.developer('Helmut Hissen', 'helmut@zeebar.com') # developer name and contact address
10
+ end
11
+
12
+ # vim: syntax=Ruby
data/bin/metrocot ADDED
@@ -0,0 +1,109 @@
1
+ #!/usr/bin/env ruby
2
+ ############################################################################
3
+ #
4
+ # Copyright (c) 2009 Metro Cascade Media Inc
5
+ #
6
+ # Permission is hereby granted, free of charge, to any person obtaining
7
+ # a copy of this software and associated documentation files (the
8
+ # 'Software'), to deal in the Software without restriction, including
9
+ # without limitation the rights to use, copy, modify, merge, publish,
10
+ # distribute, sublicense, and/or sell copies of the Software, and to
11
+ # permit persons to whom the Software is furnished to do so, subject to
12
+ # the following conditions:
13
+ #
14
+ # The above copyright notice and this permission notice shall be
15
+ # included in all copies or substantial portions of the Software.
16
+ #
17
+ # THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
18
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
20
+ # IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
21
+ # CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
22
+ # TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
23
+ # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
24
+ #
25
+ #############################################################################
26
+ #
27
+ # Helmut Hissen <helmut@zeebar.com> (Metro Cascade Media Inc)
28
+ # January 1 2009
29
+ #
30
+ #############################################################################
31
+ #
32
+ # We are like tiny pleasantly chirping hex bugs coding away on the
33
+ # shoulders of Why so that we can create more, and do more with less
34
+ # code, not by virtue of any sharpness of mind on our part, or any
35
+ # other distinction, but because we are carried high and raised up
36
+ # by his giant size.
37
+ #
38
+ #############################################################################
39
+ #
40
+
41
+
42
+ require 'rubygems'
43
+ require 'hpricot'
44
+ require 'metrocot'
45
+ require 'open-uri'
46
+
47
+ # example url - these guys give amazing concerts for kids
48
+ url = "http://www.musiccorner.ca/calendar.html"
49
+
50
+ # example xpath root - depends on the structure of the document
51
+ xpath_root = "//div[@id='content']/table/tr/td"
52
+
53
+ # example metrocot scan specification - also depends on page
54
+ # notice how the values we pull out get names and that these
55
+ # names map to a specific scanner declared when the scraper
56
+ # was created.
57
+ #
58
+ # the following patterns are supported:
59
+ # "ABC" a fixed string
60
+ # /[abc]+/ Regexp
61
+ # ... anything
62
+ # ./h1/p Hpricot path
63
+ # (PATTERN1 PATTERN2) composite pattern
64
+ # PATTERN1+ one or more occurrences of PATTERN1
65
+ # SPACE optional white space
66
+ # NAME=PATTERN pull data out and scan, then collect
67
+ #
68
+ mspec = "starts_at=.//h3 ... title=.//h2 ... description=((.//p )+)"
69
+
70
+ if ARGV.size > 0 # optional positional arguments replace the example values assigned above
71
+ url = ARGV[0] # 1st argument: page URL
72
+ if ARGV.size > 1
73
+ xpath_root = ARGV[1] # 2nd argument: XPath root
74
+ if ARGV.size > 2
75
+ mspec = ARGV[2] # 3rd argument: scan specification
76
+ end
77
+ end
78
+ end
79
+
80
+
81
+ doc = URI.parse(url).open { |data| Hpricot(data) } # URI#open (open-uri) instead of Kernel#open, which no longer fetches URLs on Ruby 3+
82
+
83
+ class Event < Object # record for one scraped item; all four fields are readable and writable
84
+ attr_accessor :starts_at, :title, :description, :url
85
+ def initialize( starts_at, title, description, url ) # stores each argument unchanged
86
+ @starts_at = starts_at
87
+ @title = title
88
+ @description = description
89
+ @url = url
90
+ end
91
+ end
92
+
93
+ scraper = Metrocot.new( # maps each NAME usable in the scan specification to its scanner class
94
+ :starts_at => Metrocot::Scanners::DateTimeScanner,
95
+ :description => Metrocot::Scanners::TextScanner,
96
+ :title => Metrocot::Scanners::TextScanner
97
+ )
98
+
99
+ events = scraper.scrape(doc).descend(xpath_root) { |td| # presumably yields each node matched by xpath_root - confirm in lib/metrocot.rb
100
+ td.collect( mspec ) { |starts_at, title, description| # block params assumed to follow the NAME order in mspec - TODO confirm
101
+ Event.new( starts_at, title, description, url )
102
+ }
103
+ }.values.flatten # takes #values of descend's result and flattens them into a single array
104
+
105
+ puts "Found #{events.size} events:"
106
+ events.each_with_index { |event, event_index| # one output line per event: index, start time, title
107
+ puts "%3d %20s %s" % [event_index, event.starts_at, event.title]
108
+ }
109
+