scrapes 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,21 @@
1
+ <html>
2
+ <head>
3
+ <title>Rule Parser Test</title>
4
+ </head>
5
+ <body>
6
+ <p>here</p>
7
+ <p>there</p>
8
+ <div class="a">dude</div>
9
+ <div id="wow">wow</div>
10
+ <span>
11
+ <div class="ya" id="this">
12
+ <div class="inner">
13
+ <p>rabbit</p>
14
+ </div>
15
+ <font>a</font>
16
+ <fOnt>b</fOnt>
17
+ <FONT>c</FONT>
18
+ </div>
19
+ </span>
20
+ </body>
21
+ </html>
@@ -0,0 +1,8 @@
1
+ <html>
2
+ <head>
3
+ <title>Simple Test HTML</title>
4
+ </head>
5
+ <body>
6
+ <div id="one">One<span id="two">Two<strong id="three">Three</strong>owT</span>enO</div>
7
+ </body>
8
+ </html>
@@ -0,0 +1,151 @@
1
+ ################################################################################ #
2
+ # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person obtaining
5
+ # a copy of this software and associated documentation files (the
6
+ # "Software"), to deal in the Software without restriction, including
7
+ # without limitation the rights to use, copy, modify, merge, publish,
8
+ # distribute, sublicense, and/or sell copies of the Software, and to
9
+ # permit persons to whom the Software is furnished to do so, subject to
10
+ # the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be
13
+ # included in all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
+ #
23
+ ################################################################################
24
+ require 'rubygems'
25
+ ################################################################################
26
+ $REXTRA_DEBUG = true
27
+ require 'rextra/debug'
28
+ include Rextra::Debug
29
+ require 'scrapes'
30
+ ################################################################################
31
+ require 'test/lib/server'
32
+ require 'test/unit'
33
+
34
# Exercises the rules, selectors and extractors exposed by the RuleParserTest
# page class against the rule_parser.html fixture served by the local test
# HTTP server.
#
# NOTE(review): the class name does not match what is tested here (the page
# under test is RuleParserTest, not a "redhanded" page). If another test file
# defines a class with this same name, loading both in one process would
# reopen the class and merge their test methods -- confirm and rename.
class TestRedhandedPage < Test::Unit::TestCase
  include LocalHTTPServer

  # Starts the local server, runs the Scrapes initializer with 'test' as the
  # pages parent, then loads the fixture page into @page.
  def setup
    start_server
    Scrapes::Initializer.run do |initializer|
      initializer.pages_parent = 'test'
      initializer.process
    end
    Scrapes::Session.start do |session|
      @page = session.page(RuleParserTest, localhost_url('rule_parser.html'))
    end
  end

  # Shuts the local server down after every test.
  def teardown
    stop_server
  end

  # Sanity check: the fixtures built in setup exist.
  # @server is presumably assigned by start_server (LocalHTTPServer) -- it is
  # not set anywhere in this class.
  def test_truth
    assert @server
    assert @page
  end

  # Plain rules return an Array of Hpricot elements.
  # The expected counts presumably correspond to the three <p> and four <div>
  # elements in rule_parser.html -- verify against the page definition.
  def test_rule
    assert @page.p_test
    assert @page.div_test
    assert_equal Array, @page.p_test.class
    assert_equal Hpricot::Elem, @page.p_test.first.class
    assert_equal 3, @page.p_test.size
    assert_equal 4, @page.div_test.size
  end

  # The "_1" rule variants must not return a collection or a whole document.
  # assert_not_equal is used (rather than assert(a != b)) so that a failure
  # reports the offending class.
  def test_rule_1
    assert @page.p_test_1
    assert @page.div_test_1
    assert_not_equal Array, @page.p_test_1.class
    assert_not_equal Hpricot::Elements, @page.p_test_1.class
    assert_not_equal Hpricot::Doc, @page.div_test_1.class
  end

  # Selectors: no selector yields the document, CSS and XPath searches both
  # yield a single <div> element.
  def test_selector
    assert @page.just_doc_test
    assert @page.css_search_test
    assert @page.xpath_search_test
    # assert_equal takes (expected, actual); keep that order so failure
    # messages label the values correctly.
    assert_equal ::Hpricot::Doc, @page.just_doc_test.class
    assert_equal ::Hpricot::Elem, @page.css_search_test.class
    assert_equal 'div', @page.css_search_test.name
    assert_equal ::Hpricot::Elem, @page.xpath_search_test.class
    assert_equal 'div', @page.xpath_search_test.name
  end

  # Extractors: the bare node extractor yields the document, the attribute
  # extractor collects the class attribute values in document order.
  def test_extractor
    assert @page.just_node_test
    assert_equal ::Hpricot::Doc, @page.just_node_test.class
    assert_equal Array, @page.attributes_class_test.class
    assert_equal 3, @page.attributes_class_test.size
    assert_equal ['a', 'ya', 'inner'], @page.attributes_class_test
  end

  # "content" extraction: one string per matched element (or a single string
  # for the "_1" variant).
  def test_content
    assert @page.font_content
    assert_equal ["a", "b", "c"], @page.font_content
    assert @page.font_content_1
    assert_equal "a", @page.font_content_1
    # The div with id "this" has only whitespace as direct text.
    assert_equal "", @page.div_this.strip
    assert_equal "Rule Parser Test", @page.title
  end

  # "contents" extraction: one array of strings per matched element.
  def test_contents
    assert @page.font_contents
    assert_equal [["a"], ["b"], ["c"]], @page.font_contents
    assert @page.font_contents_1
    assert_equal ["a"], @page.font_contents_1
    assert_equal ["", "", "", "", ""], @page.div_this_s.map { |e| e.strip }
    assert_equal ["Rule Parser Test"], @page.title_s
  end

  # "text" extraction.
  def test_text
    assert @page.font_text
    assert_equal ["a", "b", "c"], @page.font_text
    assert @page.font_text_1
    assert_equal "a", @page.font_text_1
    #assert_equal "", @page.div_this_t.strip
    assert_equal "Rule Parser Test", @page.title_t
  end

  # "texts" extraction.
  def test_texts
    assert @page.font_texts
    assert_equal [["a"], ["b"], ["c"]], @page.font_texts
    assert @page.font_texts_1
    assert_equal ["a"], @page.font_texts_1
    #assert_equal ["","","","",""], @page.div_this_ts.flatten.map{|e|e.strip}
    assert_equal ["Rule Parser Test"], @page.title_ts
  end

  # "word" extraction.
  def test_word
    assert @page.font_word
    assert_equal ["a", "b", "c"], @page.font_word
    assert @page.font_word_1
    assert_equal "a", @page.font_word_1
    #assert_equal "", @page.div_this_t.strip
    assert_equal "Rule Parser Test", @page.title_w
  end

  # FIXME(review): despite its name this test exercises the *_texts rules and
  # is a line-for-line duplicate of test_texts, so no "words" rule is covered.
  # It presumably should call the plural "word" rules (by analogy with
  # test_word), but the page class is not visible here -- confirm the rule
  # names on RuleParserTest before changing the calls.
  def test_words
    assert @page.font_texts
    assert_equal [["a"], ["b"], ["c"]], @page.font_texts
    assert @page.font_texts_1
    assert_equal ["a"], @page.font_texts_1
    #assert_equal ["","","","",""], @page.div_this_ts.flatten.map{|e|e.strip}
    assert_equal ["Rule Parser Test"], @page.title_ts
  end

  # The standalone text helper accepts both a single element and a collection.
  # NOTE(review): `text` is not defined in this class; it is presumably
  # provided by one of the included/required modules -- confirm.
  def test_standalone_text
    assert text(@page.div_test.first)
    assert text(@page.div_test)
  end
end
@@ -0,0 +1,45 @@
1
+ ################################################################################
2
+ #
3
+ # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining
6
+ # a copy of this software and associated documentation files (the
7
+ # "Software"), to deal in the Software without restriction, including
8
+ # without limitation the rights to use, copy, modify, merge, publish,
9
+ # distribute, sublicense, and/or sell copies of the Software, and to
10
+ # permit persons to whom the Software is furnished to do so, subject to
11
+ # the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be
14
+ # included in all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
+ #
24
+ ################################################################################
25
+ require 'rubygems'
26
+ require 'scrapes/session'
27
+ require 'test/unit'
28
+ require 'webrick'
29
+
30
# Smoke test for Scrapes::Session: it only checks that a session object can
# be constructed directly with no arguments.
class TestSession < Test::Unit::TestCase
  # Creates a fresh session before each test.
  def setup
    @session = Scrapes::Session.new
  end

  # The session built in setup must be truthy.
  def test_truth
    assert(@session)
  end
end
39
+ #Scrapes::Session.start do |session|
40
+ # session.page(GoogleMain, 'http://google.com') do |main_page|
41
+ # session.page(GoogleAbout, main_page.about_link) do |about_page|
42
+ # puts about_page.title + ': ' + main_page.about_link
43
+ # end
44
+ # end
45
+ #end
@@ -0,0 +1,71 @@
1
+ ################################################################################ #
2
+ # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person obtaining
5
+ # a copy of this software and associated documentation files (the
6
+ # "Software"), to deal in the Software without restriction, including
7
+ # without limitation the rights to use, copy, modify, merge, publish,
8
+ # distribute, sublicense, and/or sell copies of the Software, and to
9
+ # permit persons to whom the Software is furnished to do so, subject to
10
+ # the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be
13
+ # included in all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
+ #
23
+ ################################################################################
24
+ require 'rubygems'
25
+ ################################################################################
26
+ require 'scrapes'
27
+ ################################################################################
28
+ require 'test/lib/server'
29
+ require 'test/unit'
30
+
31
# Checks the four text-extraction flavours of the LocalSimple page class
# against simple.html, whose body is three nested elements:
#   <div id="one">One<span id="two">Two<strong id="three">Three</strong>owT</span>enO</div>
class TestSimpleHTMLPage < Test::Unit::TestCase
  include LocalHTTPServer

  # Starts the local server and runs the Scrapes initializer with 'test' as
  # the pages parent.
  def setup
    start_server
    Scrapes::Initializer.run do |init|
      init.pages_parent = 'test'
      init.process
    end
  end

  # Stops the local server after each test.
  def teardown
    stop_server
  end

  # Sanity check that the server handle exists.
  # @server is presumably assigned by start_server -- not set in this class.
  def test_truth
    assert(@server)
  end

  # Loads simple.html and compares each extraction rule with its expected
  # value. Judging by the expected values below, the flavours differ in
  # whether nested elements are included and whether the result is joined
  # into one string or kept as a list.
  def test_texts
    Scrapes::Session.start do |browser|
      @page = browser.page(LocalSimple, localhost_url('simple.html'))

      # content_*: direct text of the element only, as a single string.
      assert_equal(["OneenO"], @page.content_one)
      assert_equal(["TwoowT"], @page.content_two)
      assert_equal(["Three"], @page.content_three)

      # contents_*: direct text of the element only, one entry per text node.
      assert_equal([["One", "enO"]], @page.contents_one)
      assert_equal([["Two", "owT"]], @page.contents_two)
      assert_equal([["Three"]], @page.contents_three)

      # text_*: text of the element and all descendants, as a single string.
      assert_equal(["OneTwoThreeowTenO"], @page.text_one)
      assert_equal(["TwoThreeowT"], @page.text_two)
      assert_equal(["Three"], @page.text_three)

      # texts_*: text of the element and descendants, nested like the markup.
      assert_equal([["One", ["Two", ["Three"], "owT"], "enO"]], @page.texts_one)
      assert_equal([["Two", ["Three"], "owT"]], @page.texts_two)
      assert_equal([["Three"]], @page.texts_three)
    end
  end
end
metadata ADDED
@@ -0,0 +1,123 @@
1
+ --- !ruby/object:Gem::Specification
2
+ rubygems_version: 0.9.0
3
+ specification_version: 1
4
+ name: scrapes
5
+ version: !ruby/object:Gem::Version
6
+ version: 0.2.0
7
+ date: 2006-12-01 00:00:00 -07:00
8
+ summary: Web site scraping framework
9
+ require_paths:
10
+ - lib
11
+ email: pjones@pmade.com
12
+ homepage: http://pmade.com/open-source-software/scrapes/
13
+ rubyforge_project: scrapes
14
+ description:
15
+ autorequire: scrapes.rb
16
+ default_executable:
17
+ bindir: bin
18
+ has_rdoc: true
19
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
20
+ requirements:
21
+ - - ">"
22
+ - !ruby/object:Gem::Version
23
+ version: 0.0.0
24
+ version:
25
+ platform: ruby
26
+ signing_key:
27
+ cert_chain:
28
+ post_install_message:
29
+ authors:
30
+ - Peter Jones
31
+ files:
32
+ - lib/scrapes
33
+ - lib/scrapes.rb
34
+ - lib/scrapes/cache.rb
35
+ - lib/scrapes/cookbook.rb
36
+ - lib/scrapes/cookies.rb
37
+ - lib/scrapes/crawler.rb
38
+ - lib/scrapes/hpricot.rb
39
+ - lib/scrapes/initializer.rb
40
+ - lib/scrapes/page.rb
41
+ - lib/scrapes/rule_parser.rb
42
+ - lib/scrapes/session.rb
43
+ - lib/scrapes/to_proxy.rb
44
+ - demo/demo.rb
45
+ - demo/pages
46
+ - demo/pages/about.rb
47
+ - demo/pages/main.rb
48
+ - test/cache.rb
49
+ - test/cookies.rb
50
+ - test/crawler.rb
51
+ - test/hpricot.rb
52
+ - test/initializer.rb
53
+ - test/lib
54
+ - test/page.rb
55
+ - test/pages
56
+ - test/public
57
+ - test/rule_parser.rb
58
+ - test/session.rb
59
+ - test/textcontent.rb
60
+ - test/lib/server.rb
61
+ - test/pages/foils.rb
62
+ - test/pages/foils2.rb
63
+ - test/pages/redhanded_entries.rb
64
+ - test/pages/redhanded_main.rb
65
+ - test/pages/rule_parser.rb
66
+ - test/pages/simple.rb
67
+ - test/public/foil72.html
68
+ - test/public/foil73.html
69
+ - test/public/foil74.html
70
+ - test/public/foo.txt
71
+ - test/public/index.html
72
+ - test/public/redhanded.html
73
+ - test/public/rule_parser.html
74
+ - test/public/simple.html
75
+ - README
76
+ - LICENSE
77
+ test_files: []
78
+
79
+ rdoc_options:
80
+ - --main
81
+ - README
82
+ - --title
83
+ - scrapes
84
+ - --line-numbers
85
+ extra_rdoc_files:
86
+ - README
87
+ - LICENSE
88
+ - lib/scrapes.rb
89
+ - lib/scrapes/cache.rb
90
+ - lib/scrapes/cookbook.rb
91
+ - lib/scrapes/cookies.rb
92
+ - lib/scrapes/crawler.rb
93
+ - lib/scrapes/hpricot.rb
94
+ - lib/scrapes/initializer.rb
95
+ - lib/scrapes/page.rb
96
+ - lib/scrapes/rule_parser.rb
97
+ - lib/scrapes/session.rb
98
+ - lib/scrapes/to_proxy.rb
99
+ executables: []
100
+
101
+ extensions: []
102
+
103
+ requirements: []
104
+
105
+ dependencies:
106
+ - !ruby/object:Gem::Dependency
107
+ name: hpricot
108
+ version_requirement:
109
+ version_requirements: !ruby/object:Gem::Version::Requirement
110
+ requirements:
111
+ - - ">="
112
+ - !ruby/object:Gem::Version
113
+ version: 0.4.59
114
+ version:
115
+ - !ruby/object:Gem::Dependency
116
+ name: rextra
117
+ version_requirement:
118
+ version_requirements: !ruby/object:Gem::Version::Requirement
119
+ requirements:
120
+ - - ">="
121
+ - !ruby/object:Gem::Version
122
+ version: 2.0.4
123
+ version: