scrapes 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ <html>
2
+ <head>
3
+ <title>Rule Parser Test</title>
4
+ </head>
5
+ <body>
6
+ <p>here</p>
7
+ <p>there</p>
8
+ <div class="a">dude</div>
9
+ <div id="wow">wow</div>
10
+ <span>
11
+ <div class="ya" id="this">
12
+ <div class="inner">
13
+ <p>rabbit</p>
14
+ </div>
15
+ <font>a</font>
16
+ <fOnt>b</fOnt>
17
+ <FONT>c</FONT>
18
+ </div>
19
+ </span>
20
+ </body>
21
+ </html>
@@ -0,0 +1,8 @@
1
+ <html>
2
+ <head>
3
+ <title>Simple Test HTML</title>
4
+ </head>
5
+ <body>
6
+ <div id="one">One<span id="two">Two<strong id="three">Three</strong>owT</span>enO</div>
7
+ </body>
8
+ </html>
@@ -0,0 +1,151 @@
1
+ ################################################################################ #
2
+ # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person obtaining
5
+ # a copy of this software and associated documentation files (the
6
+ # "Software"), to deal in the Software without restriction, including
7
+ # without limitation the rights to use, copy, modify, merge, publish,
8
+ # distribute, sublicense, and/or sell copies of the Software, and to
9
+ # permit persons to whom the Software is furnished to do so, subject to
10
+ # the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be
13
+ # included in all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
+ #
23
+ ################################################################################
24
+ require 'rubygems'
25
+ ################################################################################
26
+ $REXTRA_DEBUG = true
27
+ require 'rextra/debug'
28
+ include Rextra::Debug
29
+ require 'scrapes'
30
+ ################################################################################
31
+ require 'test/lib/server'
32
+ require 'test/unit'
33
+
34
+ class TestRedhandedPage < Test::Unit::TestCase # Checks the rules exposed by the RuleParserTest page, fetched from rule_parser.html.
35
+ include LocalHTTPServer # start_server, stop_server and localhost_url are not defined in this file; presumably supplied by this mixin
36
+ # NOTE(review): the class is named Redhanded but only RuleParserTest is used here; looks copy-pasted -- confirm the name is intended.
37
+ def setup # Starts the server, runs the Scrapes initializer with pages_parent set to test, then fetches the page into @page.
38
+ start_server
39
+ Scrapes::Initializer.run do |initializer|
40
+ initializer.pages_parent = 'test'
41
+ initializer.process
42
+ end
43
+ Scrapes::Session.start do |session|
44
+ @page = session.page(RuleParserTest, localhost_url('rule_parser.html'))
45
+ end
46
+ end
47
+ # Stops the server started in setup.
48
+ def teardown
49
+ stop_server
50
+ end
51
+ # Sanity check: @server and @page must both be truthy once setup has run (@server is presumably assigned by start_server).
52
+ def test_truth
53
+ assert @server
54
+ assert @page
55
+ end
56
+ # p_test must be an Array of size 3 whose first item is an Hpricot::Elem; div_test must hold 4 items (the Rule Parser Test fixture has 3 p and 4 div tags).
57
+ def test_rule
58
+ assert @page.p_test
59
+ assert @page.div_test
60
+ assert_equal Array, @page.p_test.class
61
+ assert_equal Hpricot::Elem, @page.p_test.first.class
62
+ assert_equal 3, @page.p_test.size
63
+ assert_equal 4, @page.div_test.size
64
+ end
65
+ # The _1 rules must be truthy; p_test_1 is neither an Array nor Hpricot::Elements, and div_test_1 is not an Hpricot::Doc.
66
+ def test_rule_1
67
+ assert @page.p_test_1
68
+ assert @page.div_test_1
69
+ assert(Array != @page.p_test_1.class)
70
+ assert(Hpricot::Elements != @page.p_test_1.class)
71
+ assert(Hpricot::Doc != @page.div_test_1.class)
72
+ end
73
+ # just_doc_test must be an Hpricot::Doc; the CSS and XPath search rules must each yield an Hpricot::Elem named div.
74
+ def test_selector
75
+ assert @page.just_doc_test
76
+ assert @page.css_search_test
77
+ assert @page.xpath_search_test
78
+ assert_equal @page.just_doc_test.class, ::Hpricot::Doc # NOTE(review): from here on assert_equal is called as (actual, expected), the reverse of Test::Unit's (expected, actual), so failure messages label the values the wrong way round
79
+ assert_equal @page.css_search_test.class, ::Hpricot::Elem
80
+ assert_equal @page.css_search_test.name, 'div'
81
+ assert_equal @page.xpath_search_test.class, ::Hpricot::Elem
82
+ assert_equal @page.xpath_search_test.name, 'div'
83
+ end
84
+ # just_node_test must be an Hpricot::Doc; attributes_class_test must be the 3-item Array a, ya, inner (the class attributes of the fixture divs, in document order).
85
+ def test_extractor
86
+ assert @page.just_node_test
87
+ assert_equal @page.just_node_test.class, ::Hpricot::Doc
88
+ assert_equal @page.attributes_class_test.class, Array
89
+ assert_equal @page.attributes_class_test.size, 3
90
+ assert_equal @page.attributes_class_test, ['a','ya','inner']
91
+ end
92
+ # content rules: font_content gives a, b, c (the fixture spells the tag font, fOnt and FONT, so tag-name case is presumably being exercised); font_content_1 gives only a; div_this is blank once stripped.
93
+ def test_content
94
+ assert @page.font_content
95
+ assert_equal @page.font_content, ["a","b","c"]
96
+ assert @page.font_content_1
97
+ assert_equal @page.font_content_1, "a"
98
+ assert_equal @page.div_this.strip, ""
99
+ assert_equal @page.title, "Rule Parser Test"
100
+ end
101
+ # contents rules: same data as test_content but each value is wrapped in its own array; div_this_s yields five strings that are blank once stripped.
102
+ def test_contents
103
+ assert @page.font_contents
104
+ assert_equal @page.font_contents, [["a"],["b"],["c"]]
105
+ assert @page.font_contents_1
106
+ assert_equal @page.font_contents_1, ["a"]
107
+ assert_equal @page.div_this_s.map{|e|e.strip}, ["","","","",""]
108
+ assert_equal @page.title_s, ["Rule Parser Test"]
109
+ end
110
+ # text rules: expected values mirror test_content; the div_this_t check is disabled (reason not recorded here).
111
+ def test_text
112
+ assert @page.font_text
113
+ assert_equal @page.font_text, ["a","b","c"]
114
+ assert @page.font_text_1
115
+ assert_equal @page.font_text_1, "a"
116
+ #assert_equal @page.div_this_t.strip, ""
117
+ assert_equal @page.title_t, "Rule Parser Test"
118
+ end
119
+ # texts rules: expected values mirror test_contents; the div_this_ts check is disabled (reason not recorded here).
120
+ def test_texts
121
+ assert @page.font_texts
122
+ assert_equal @page.font_texts, [["a"],["b"],["c"]]
123
+ assert @page.font_texts_1
124
+ assert_equal @page.font_texts_1, ["a"]
125
+ #assert_equal @page.div_this_ts.flatten.map{|e|e.strip}, ["","","","",""]
126
+ assert_equal @page.title_ts, ["Rule Parser Test"]
127
+ end
128
+ # word rules: font_word gives a, b, c; font_word_1 gives a; title_w gives the page title.
129
+ def test_word
130
+ assert @page.font_word
131
+ assert_equal @page.font_word, ["a","b","c"]
132
+ assert @page.font_word_1
133
+ assert_equal @page.font_word_1, "a"
134
+ #assert_equal @page.div_this_t.strip, ""
135
+ assert_equal @page.title_w, "Rule Parser Test"
136
+ end
137
+ # NOTE(review): this body is line-for-line identical to test_texts (font_texts, font_texts_1, title_ts), so no words rule is exercised; probably a copy-paste slip -- confirm which accessors RuleParserTest defines before changing.
138
+ def test_words
139
+ assert @page.font_texts
140
+ assert_equal @page.font_texts, [["a"],["b"],["c"]]
141
+ assert @page.font_texts_1
142
+ assert_equal @page.font_texts_1, ["a"]
143
+ #assert_equal @page.div_this_ts.flatten.map{|e|e.strip}, ["","","","",""]
144
+ assert_equal @page.title_ts, ["Rule Parser Test"]
145
+ end
146
+ # Calls text directly on one element and on the whole div_test result and expects a truthy value; text is not defined in this file (presumably provided by one of the required libraries).
147
+ def test_standalone_text
148
+ assert text(@page.div_test.first)
149
+ assert text(@page.div_test)
150
+ end
151
+ end
@@ -0,0 +1,45 @@
1
+ ################################################################################
2
+ #
3
+ # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining
6
+ # a copy of this software and associated documentation files (the
7
+ # "Software"), to deal in the Software without restriction, including
8
+ # without limitation the rights to use, copy, modify, merge, publish,
9
+ # distribute, sublicense, and/or sell copies of the Software, and to
10
+ # permit persons to whom the Software is furnished to do so, subject to
11
+ # the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be
14
+ # included in all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
+ #
24
+ ################################################################################
25
+ require 'rubygems'
26
+ require 'scrapes/session'
27
+ require 'test/unit'
28
+ require 'webrick'
29
+
30
+ class TestSession < Test::Unit::TestCase # Smoke test: a Scrapes::Session can be instantiated without arguments.
31
+ def setup # Builds a fresh session in @session before each test.
32
+ @session = Scrapes::Session.new
33
+ end
34
+ # The session created in setup must be truthy.
35
+ def test_truth
36
+ assert @session
37
+ end
38
+ end
39
+ #Scrapes::Session.start do |session|
40
+ # session.page(GoogleMain, 'http://google.com') do |main_page|
41
+ # session.page(GoogleAbout, main_page.about_link) do |about_page|
42
+ # puts about_page.title + ': ' + main_page.about_link
43
+ # end
44
+ # end
45
+ #end
@@ -0,0 +1,71 @@
1
+ ################################################################################ #
2
+ # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person obtaining
5
+ # a copy of this software and associated documentation files (the
6
+ # "Software"), to deal in the Software without restriction, including
7
+ # without limitation the rights to use, copy, modify, merge, publish,
8
+ # distribute, sublicense, and/or sell copies of the Software, and to
9
+ # permit persons to whom the Software is furnished to do so, subject to
10
+ # the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be
13
+ # included in all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
22
+ #
23
+ ################################################################################
24
+ require 'rubygems'
25
+ ################################################################################
26
+ require 'scrapes'
27
+ ################################################################################
28
+ require 'test/lib/server'
29
+ require 'test/unit'
30
+
31
+ class TestSimpleHTMLPage < Test::Unit::TestCase # Checks the content, contents, text and texts rules of the LocalSimple page against simple.html.
32
+ include LocalHTTPServer # start_server, stop_server and localhost_url are not defined in this file; presumably supplied by this mixin
33
+ # Starts the server and runs the Scrapes initializer with pages_parent set to test; the page itself is fetched inside test_texts.
34
+ def setup
35
+ start_server
36
+ Scrapes::Initializer.run do |initializer|
37
+ initializer.pages_parent = 'test'
38
+ initializer.process
39
+ end
40
+ end
41
+ # Stops the server started in setup.
42
+ def teardown
43
+ stop_server
44
+ end
45
+ # Sanity check: @server must be truthy once setup has run (presumably assigned by start_server).
46
+ def test_truth
47
+ assert @server
48
+ end
49
+ # Fixture markup is div#one > span#two > strong#three with text One/Two/Three before each child and owT/enO after it; every rule result is wrapped in an outer array.
50
+ def test_texts
51
+ Scrapes::Session.start do |session|
52
+ @page = session.page(LocalSimple, localhost_url('simple.html'))
53
+ # content_*: only the text directly inside the element, joined into one string (text of child elements is left out).
54
+ assert_equal ["OneenO"], @page.content_one
55
+ assert_equal ["TwoowT"], @page.content_two
56
+ assert_equal ["Three"], @page.content_three
57
+ # contents_*: the same direct text pieces, kept as separate strings.
58
+ assert_equal [["One", "enO"]], @page.contents_one
59
+ assert_equal [["Two", "owT"]], @page.contents_two
60
+ assert_equal [["Three"]], @page.contents_three
61
+ # text_*: all text including that of descendants, flattened into one string in document order.
62
+ assert_equal ["OneTwoThreeowTenO"], @page.text_one
63
+ assert_equal ["TwoThreeowT"], @page.text_two
64
+ assert_equal ["Three"], @page.text_three
65
+ # texts_*: all text including that of descendants, nested so that each child element becomes its own sub-array.
66
+ assert_equal [["One", ["Two", ["Three"], "owT"], "enO"]], @page.texts_one
67
+ assert_equal [["Two", ["Three"], "owT"]], @page.texts_two
68
+ assert_equal [["Three"]], @page.texts_three
69
+ end
70
+ end
71
+ end
metadata ADDED
@@ -0,0 +1,123 @@
1
+ --- !ruby/object:Gem::Specification
2
+ rubygems_version: 0.9.0
3
+ specification_version: 1
4
+ name: scrapes
5
+ version: !ruby/object:Gem::Version
6
+ version: 0.2.0
7
+ date: 2006-12-01 00:00:00 -07:00
8
+ summary: Web site scraping framework
9
+ require_paths:
10
+ - lib
11
+ email: pjones@pmade.com
12
+ homepage: http://pmade.com/open-source-software/scrapes/
13
+ rubyforge_project: scrapes
14
+ description:
15
+ autorequire: scrapes.rb
16
+ default_executable:
17
+ bindir: bin
18
+ has_rdoc: true
19
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
20
+ requirements:
21
+ - - ">"
22
+ - !ruby/object:Gem::Version
23
+ version: 0.0.0
24
+ version:
25
+ platform: ruby
26
+ signing_key:
27
+ cert_chain:
28
+ post_install_message:
29
+ authors:
30
+ - Peter Jones
31
+ files:
32
+ - lib/scrapes
33
+ - lib/scrapes.rb
34
+ - lib/scrapes/cache.rb
35
+ - lib/scrapes/cookbook.rb
36
+ - lib/scrapes/cookies.rb
37
+ - lib/scrapes/crawler.rb
38
+ - lib/scrapes/hpricot.rb
39
+ - lib/scrapes/initializer.rb
40
+ - lib/scrapes/page.rb
41
+ - lib/scrapes/rule_parser.rb
42
+ - lib/scrapes/session.rb
43
+ - lib/scrapes/to_proxy.rb
44
+ - demo/demo.rb
45
+ - demo/pages
46
+ - demo/pages/about.rb
47
+ - demo/pages/main.rb
48
+ - test/cache.rb
49
+ - test/cookies.rb
50
+ - test/crawler.rb
51
+ - test/hpricot.rb
52
+ - test/initializer.rb
53
+ - test/lib
54
+ - test/page.rb
55
+ - test/pages
56
+ - test/public
57
+ - test/rule_parser.rb
58
+ - test/session.rb
59
+ - test/textcontent.rb
60
+ - test/lib/server.rb
61
+ - test/pages/foils.rb
62
+ - test/pages/foils2.rb
63
+ - test/pages/redhanded_entries.rb
64
+ - test/pages/redhanded_main.rb
65
+ - test/pages/rule_parser.rb
66
+ - test/pages/simple.rb
67
+ - test/public/foil72.html
68
+ - test/public/foil73.html
69
+ - test/public/foil74.html
70
+ - test/public/foo.txt
71
+ - test/public/index.html
72
+ - test/public/redhanded.html
73
+ - test/public/rule_parser.html
74
+ - test/public/simple.html
75
+ - README
76
+ - LICENSE
77
+ test_files: []
78
+
79
+ rdoc_options:
80
+ - --main
81
+ - README
82
+ - --title
83
+ - scrapes
84
+ - --line-numbers
85
+ extra_rdoc_files:
86
+ - README
87
+ - LICENSE
88
+ - lib/scrapes.rb
89
+ - lib/scrapes/cache.rb
90
+ - lib/scrapes/cookbook.rb
91
+ - lib/scrapes/cookies.rb
92
+ - lib/scrapes/crawler.rb
93
+ - lib/scrapes/hpricot.rb
94
+ - lib/scrapes/initializer.rb
95
+ - lib/scrapes/page.rb
96
+ - lib/scrapes/rule_parser.rb
97
+ - lib/scrapes/session.rb
98
+ - lib/scrapes/to_proxy.rb
99
+ executables: []
100
+
101
+ extensions: []
102
+
103
+ requirements: []
104
+
105
+ dependencies:
106
+ - !ruby/object:Gem::Dependency
107
+ name: hpricot
108
+ version_requirement:
109
+ version_requirements: !ruby/object:Gem::Version::Requirement
110
+ requirements:
111
+ - - ">="
112
+ - !ruby/object:Gem::Version
113
+ version: 0.4.59
114
+ version:
115
+ - !ruby/object:Gem::Dependency
116
+ name: rextra
117
+ version_requirement:
118
+ version_requirements: !ruby/object:Gem::Version::Requirement
119
+ requirements:
120
+ - - ">="
121
+ - !ruby/object:Gem::Version
122
+ version: 2.0.4
123
+ version: