scrapes 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +22 -0
- data/README +123 -0
- data/demo/demo.rb +33 -0
- data/demo/pages/about.rb +32 -0
- data/demo/pages/main.rb +32 -0
- data/lib/scrapes.rb +41 -0
- data/lib/scrapes/cache.rb +110 -0
- data/lib/scrapes/cookbook.rb +53 -0
- data/lib/scrapes/cookies.rb +45 -0
- data/lib/scrapes/crawler.rb +97 -0
- data/lib/scrapes/hpricot.rb +110 -0
- data/lib/scrapes/initializer.rb +86 -0
- data/lib/scrapes/page.rb +319 -0
- data/lib/scrapes/rule_parser.rb +327 -0
- data/lib/scrapes/session.rb +155 -0
- data/lib/scrapes/to_proxy.rb +50 -0
- data/test/cache.rb +75 -0
- data/test/cookies.rb +34 -0
- data/test/crawler.rb +69 -0
- data/test/hpricot.rb +55 -0
- data/test/initializer.rb +54 -0
- data/test/lib/server.rb +63 -0
- data/test/page.rb +77 -0
- data/test/pages/foils.rb +61 -0
- data/test/pages/foils2.rb +38 -0
- data/test/pages/redhanded_entries.rb +36 -0
- data/test/pages/redhanded_main.rb +58 -0
- data/test/pages/rule_parser.rb +81 -0
- data/test/pages/simple.rb +21 -0
- data/test/public/foil72.html +10 -0
- data/test/public/foil73.html +9 -0
- data/test/public/foil74.html +11 -0
- data/test/public/foo.txt +1 -0
- data/test/public/index.html +20 -0
- data/test/public/redhanded.html +1208 -0
- data/test/public/rule_parser.html +21 -0
- data/test/public/simple.html +8 -0
- data/test/rule_parser.rb +151 -0
- data/test/session.rb +45 -0
- data/test/textcontent.rb +71 -0
- metadata +123 -0
@@ -0,0 +1,21 @@
|
|
1
|
+
<html>
|
2
|
+
<head>
|
3
|
+
<title>Rule Parser Test</title>
|
4
|
+
</head>
|
5
|
+
<body>
|
6
|
+
<p>here</p>
|
7
|
+
<p>there</p>
|
8
|
+
<div class="a">dude</div>
|
9
|
+
<div id="wow">wow</div>
|
10
|
+
<span>
|
11
|
+
<div class="ya" id="this">
|
12
|
+
<div class="inner">
|
13
|
+
<p>rabbit</p>
|
14
|
+
</div>
|
15
|
+
<font>a</font>
|
16
|
+
<fOnt>b</fOnt>
|
17
|
+
<FONT>c</FONT>
|
18
|
+
</div>
|
19
|
+
</span>
|
20
|
+
</body>
|
21
|
+
</html>
|
data/test/rule_parser.rb
ADDED
@@ -0,0 +1,151 @@
|
|
1
|
+
################################################################################ #
|
2
|
+
# Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
|
3
|
+
#
|
4
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
5
|
+
# a copy of this software and associated documentation files (the
|
6
|
+
# "Software"), to deal in the Software without restriction, including
|
7
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
8
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
9
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
10
|
+
# the following conditions:
|
11
|
+
#
|
12
|
+
# The above copyright notice and this permission notice shall be
|
13
|
+
# included in all copies or substantial portions of the Software.
|
14
|
+
#
|
15
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
16
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
17
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
18
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
19
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
20
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
21
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
22
|
+
#
|
23
|
+
################################################################################
|
24
|
+
require 'rubygems'
|
25
|
+
################################################################################
|
26
|
+
$REXTRA_DEBUG = true
|
27
|
+
require 'rextra/debug'
|
28
|
+
include Rextra::Debug
|
29
|
+
require 'scrapes'
|
30
|
+
################################################################################
|
31
|
+
require 'test/lib/server'
|
32
|
+
require 'test/unit'
|
33
|
+
|
34
|
+
# Exercises the rules exposed by the RuleParserTest page class against the
# rule_parser.html fixture served through LocalHTTPServer.
#
# Every assert_equal below is written as (expected, actual) so that a
# failure message labels the two values correctly.
#
# NOTE(review): the class name looks copied from the redhanded page test. If
# another test file also defines TestRedhandedPage and both are loaded into
# the same process, the two definitions would merge -- confirm before renaming.
class TestRedhandedPage < Test::Unit::TestCase
  include LocalHTTPServer

  # Starts the local server, runs the Scrapes initializer with pages_parent
  # set to 'test', then fetches rule_parser.html as a RuleParserTest page
  # and keeps it in @page for the tests.
  def setup
    start_server
    Scrapes::Initializer.run do |initializer|
      initializer.pages_parent = 'test'
      initializer.process
    end
    Scrapes::Session.start do |session|
      @page = session.page(RuleParserTest, localhost_url('rule_parser.html'))
    end
  end

  # Stops the server started in setup.
  def teardown
    stop_server
  end

  # Sanity check: the server handle (presumably set by start_server) and the
  # fetched page both exist.
  def test_truth
    assert @server
    assert @page
  end

  # Plural rules return an Array of Hpricot::Elem with the expected counts
  # (presumably the 3 <p> and 4 <div> elements of the fixture).
  def test_rule
    assert @page.p_test
    assert @page.div_test
    assert_equal Array, @page.p_test.class
    assert_equal Hpricot::Elem, @page.p_test.first.class
    assert_equal 3, @page.p_test.size
    assert_equal 4, @page.div_test.size
  end

  # The "_1" rule variants must not return a collection or a whole document.
  def test_rule_1
    assert @page.p_test_1
    assert @page.div_test_1
    assert_not_equal Array, @page.p_test_1.class
    assert_not_equal Hpricot::Elements, @page.p_test_1.class
    assert_not_equal Hpricot::Doc, @page.div_test_1.class
  end

  # Selector rules: the bare document, a CSS search and an XPath search.
  def test_selector
    assert @page.just_doc_test
    assert @page.css_search_test
    assert @page.xpath_search_test
    assert_equal ::Hpricot::Doc, @page.just_doc_test.class
    assert_equal ::Hpricot::Elem, @page.css_search_test.class
    assert_equal 'div', @page.css_search_test.name
    assert_equal ::Hpricot::Elem, @page.xpath_search_test.class
    assert_equal 'div', @page.xpath_search_test.name
  end

  # Extractor rules: the untouched node and the collected class attributes.
  def test_extractor
    assert @page.just_node_test
    assert_equal ::Hpricot::Doc, @page.just_node_test.class
    assert_equal Array, @page.attributes_class_test.class
    assert_equal 3, @page.attributes_class_test.size
    assert_equal ['a', 'ya', 'inner'], @page.attributes_class_test
  end

  # "content" rules.
  def test_content
    assert @page.font_content
    assert_equal ["a", "b", "c"], @page.font_content
    assert @page.font_content_1
    assert_equal "a", @page.font_content_1
    assert_equal "", @page.div_this.strip
    assert_equal "Rule Parser Test", @page.title
  end

  # "contents" rules.
  def test_contents
    assert @page.font_contents
    assert_equal [["a"], ["b"], ["c"]], @page.font_contents
    assert @page.font_contents_1
    assert_equal ["a"], @page.font_contents_1
    assert_equal ["", "", "", "", ""], @page.div_this_s.map { |e| e.strip }
    assert_equal ["Rule Parser Test"], @page.title_s
  end

  # "text" rules.
  def test_text
    assert @page.font_text
    assert_equal ["a", "b", "c"], @page.font_text
    assert @page.font_text_1
    assert_equal "a", @page.font_text_1
    # Disabled in the original source:
    #assert_equal @page.div_this_t.strip, ""
    assert_equal "Rule Parser Test", @page.title_t
  end

  # "texts" rules.
  def test_texts
    assert @page.font_texts
    assert_equal [["a"], ["b"], ["c"]], @page.font_texts
    assert @page.font_texts_1
    assert_equal ["a"], @page.font_texts_1
    # Disabled in the original source:
    #assert_equal @page.div_this_ts.flatten.map{|e|e.strip}, ["","","","",""]
    assert_equal ["Rule Parser Test"], @page.title_ts
  end

  # "word" rules.
  def test_word
    assert @page.font_word
    assert_equal ["a", "b", "c"], @page.font_word
    assert @page.font_word_1
    assert_equal "a", @page.font_word_1
    # Disabled in the original source:
    #assert_equal @page.div_this_t.strip, ""
    assert_equal "Rule Parser Test", @page.title_w
  end

  # NOTE(review): this body repeats test_texts and never calls a "*_words"
  # rule. It presumably should exercise font_words / font_words_1 / title_ws,
  # but the page class is not visible from here, so the original calls are
  # kept -- confirm against test/pages/rule_parser.rb.
  def test_words
    assert @page.font_texts
    assert_equal [["a"], ["b"], ["c"]], @page.font_texts
    assert @page.font_texts_1
    assert_equal ["a"], @page.font_texts_1
    # Disabled in the original source:
    #assert_equal @page.div_this_ts.flatten.map{|e|e.strip}, ["","","","",""]
    assert_equal ["Rule Parser Test"], @page.title_ts
  end

  # The text helper (not defined in this class; presumably mixed in by one
  # of the required libraries) accepts both a single element and a
  # collection of elements.
  def test_standalone_text
    assert text(@page.div_test.first)
    assert text(@page.div_test)
  end
end
|
data/test/session.rb
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
################################################################################
|
2
|
+
#
|
3
|
+
# Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
|
4
|
+
#
|
5
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
# a copy of this software and associated documentation files (the
|
7
|
+
# "Software"), to deal in the Software without restriction, including
|
8
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
# the following conditions:
|
12
|
+
#
|
13
|
+
# The above copyright notice and this permission notice shall be
|
14
|
+
# included in all copies or substantial portions of the Software.
|
15
|
+
#
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
23
|
+
#
|
24
|
+
################################################################################
|
25
|
+
require 'rubygems'
|
26
|
+
require 'scrapes/session'
|
27
|
+
require 'test/unit'
|
28
|
+
require 'webrick'
|
29
|
+
|
30
|
+
# Smoke test: a Scrapes::Session can be constructed without arguments.
class TestSession < Test::Unit::TestCase
  # Builds a fresh session before each test.
  def setup
    @session = Scrapes::Session.new
  end

  # Passes as long as setup left a truthy session object behind.
  def test_truth
    assert(@session)
  end
end
|
39
|
+
#Scrapes::Session.start do |session|
|
40
|
+
# session.page(GoogleMain, 'http://google.com') do |main_page|
|
41
|
+
# session.page(GoogleAbout, main_page.about_link) do |about_page|
|
42
|
+
# puts about_page.title + ': ' + main_page.about_link
|
43
|
+
# end
|
44
|
+
# end
|
45
|
+
#end
|
data/test/textcontent.rb
ADDED
@@ -0,0 +1,71 @@
|
|
1
|
+
################################################################################ #
|
2
|
+
# Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
|
3
|
+
#
|
4
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
5
|
+
# a copy of this software and associated documentation files (the
|
6
|
+
# "Software"), to deal in the Software without restriction, including
|
7
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
8
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
9
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
10
|
+
# the following conditions:
|
11
|
+
#
|
12
|
+
# The above copyright notice and this permission notice shall be
|
13
|
+
# included in all copies or substantial portions of the Software.
|
14
|
+
#
|
15
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
16
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
17
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
18
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
19
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
20
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
21
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
22
|
+
#
|
23
|
+
################################################################################
|
24
|
+
require 'rubygems'
|
25
|
+
################################################################################
|
26
|
+
require 'scrapes'
|
27
|
+
################################################################################
|
28
|
+
require 'test/lib/server'
|
29
|
+
require 'test/unit'
|
30
|
+
|
31
|
+
# Checks the content / contents / text / texts readers of the LocalSimple
# page class against the simple.html fixture served through LocalHTTPServer.
class TestSimpleHTMLPage < Test::Unit::TestCase
  include LocalHTTPServer

  # Starts the local server and runs the Scrapes initializer with
  # pages_parent set to 'test'.
  def setup
    start_server
    Scrapes::Initializer.run do |initializer|
      initializer.pages_parent = 'test'
      initializer.process
    end
  end

  # Stops the server started in setup.
  def teardown
    stop_server
  end

  # Sanity check: the server handle (presumably set by start_server) exists.
  def test_truth
    assert @server
  end

  # Fetches simple.html and compares each reader with its expected value.
  # The pairs are kept in an array so the assertions run in a fixed order.
  def test_texts
    expectations = [
      [:content_one,    ["OneenO"]],
      [:content_two,    ["TwoowT"]],
      [:content_three,  ["Three"]],

      [:contents_one,   [["One", "enO"]]],
      [:contents_two,   [["Two", "owT"]]],
      [:contents_three, [["Three"]]],

      [:text_one,       ["OneTwoThreeowTenO"]],
      [:text_two,       ["TwoThreeowT"]],
      [:text_three,     ["Three"]],

      [:texts_one,      [["One", ["Two", ["Three"], "owT"], "enO"]]],
      [:texts_two,      [["Two", ["Three"], "owT"]]],
      [:texts_three,    [["Three"]]]
    ]

    Scrapes::Session.start do |session|
      @page = session.page(LocalSimple, localhost_url('simple.html'))

      expectations.each do |reader, expected|
        assert_equal expected, @page.__send__(reader)
      end
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,123 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
rubygems_version: 0.9.0
|
3
|
+
specification_version: 1
|
4
|
+
name: scrapes
|
5
|
+
version: !ruby/object:Gem::Version
|
6
|
+
version: 0.2.0
|
7
|
+
date: 2006-12-01 00:00:00 -07:00
|
8
|
+
summary: Web site scraping framework
|
9
|
+
require_paths:
|
10
|
+
- lib
|
11
|
+
email: pjones@pmade.com
|
12
|
+
homepage: http://pmade.com/open-source-software/scrapes/
|
13
|
+
rubyforge_project: scrapes
|
14
|
+
description:
|
15
|
+
autorequire: scrapes.rb
|
16
|
+
default_executable:
|
17
|
+
bindir: bin
|
18
|
+
has_rdoc: true
|
19
|
+
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">"
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 0.0.0
|
24
|
+
version:
|
25
|
+
platform: ruby
|
26
|
+
signing_key:
|
27
|
+
cert_chain:
|
28
|
+
post_install_message:
|
29
|
+
authors:
|
30
|
+
- Peter Jones
|
31
|
+
files:
|
32
|
+
- lib/scrapes
|
33
|
+
- lib/scrapes.rb
|
34
|
+
- lib/scrapes/cache.rb
|
35
|
+
- lib/scrapes/cookbook.rb
|
36
|
+
- lib/scrapes/cookies.rb
|
37
|
+
- lib/scrapes/crawler.rb
|
38
|
+
- lib/scrapes/hpricot.rb
|
39
|
+
- lib/scrapes/initializer.rb
|
40
|
+
- lib/scrapes/page.rb
|
41
|
+
- lib/scrapes/rule_parser.rb
|
42
|
+
- lib/scrapes/session.rb
|
43
|
+
- lib/scrapes/to_proxy.rb
|
44
|
+
- demo/demo.rb
|
45
|
+
- demo/pages
|
46
|
+
- demo/pages/about.rb
|
47
|
+
- demo/pages/main.rb
|
48
|
+
- test/cache.rb
|
49
|
+
- test/cookies.rb
|
50
|
+
- test/crawler.rb
|
51
|
+
- test/hpricot.rb
|
52
|
+
- test/initializer.rb
|
53
|
+
- test/lib
|
54
|
+
- test/page.rb
|
55
|
+
- test/pages
|
56
|
+
- test/public
|
57
|
+
- test/rule_parser.rb
|
58
|
+
- test/session.rb
|
59
|
+
- test/textcontent.rb
|
60
|
+
- test/lib/server.rb
|
61
|
+
- test/pages/foils.rb
|
62
|
+
- test/pages/foils2.rb
|
63
|
+
- test/pages/redhanded_entries.rb
|
64
|
+
- test/pages/redhanded_main.rb
|
65
|
+
- test/pages/rule_parser.rb
|
66
|
+
- test/pages/simple.rb
|
67
|
+
- test/public/foil72.html
|
68
|
+
- test/public/foil73.html
|
69
|
+
- test/public/foil74.html
|
70
|
+
- test/public/foo.txt
|
71
|
+
- test/public/index.html
|
72
|
+
- test/public/redhanded.html
|
73
|
+
- test/public/rule_parser.html
|
74
|
+
- test/public/simple.html
|
75
|
+
- README
|
76
|
+
- LICENSE
|
77
|
+
test_files: []
|
78
|
+
|
79
|
+
rdoc_options:
|
80
|
+
- --main
|
81
|
+
- README
|
82
|
+
- --title
|
83
|
+
- scrapes
|
84
|
+
- --line-numbers
|
85
|
+
extra_rdoc_files:
|
86
|
+
- README
|
87
|
+
- LICENSE
|
88
|
+
- lib/scrapes.rb
|
89
|
+
- lib/scrapes/cache.rb
|
90
|
+
- lib/scrapes/cookbook.rb
|
91
|
+
- lib/scrapes/cookies.rb
|
92
|
+
- lib/scrapes/crawler.rb
|
93
|
+
- lib/scrapes/hpricot.rb
|
94
|
+
- lib/scrapes/initializer.rb
|
95
|
+
- lib/scrapes/page.rb
|
96
|
+
- lib/scrapes/rule_parser.rb
|
97
|
+
- lib/scrapes/session.rb
|
98
|
+
- lib/scrapes/to_proxy.rb
|
99
|
+
executables: []
|
100
|
+
|
101
|
+
extensions: []
|
102
|
+
|
103
|
+
requirements: []
|
104
|
+
|
105
|
+
dependencies:
|
106
|
+
- !ruby/object:Gem::Dependency
|
107
|
+
name: hpricot
|
108
|
+
version_requirement:
|
109
|
+
version_requirements: !ruby/object:Gem::Version::Requirement
|
110
|
+
requirements:
|
111
|
+
- - ">="
|
112
|
+
- !ruby/object:Gem::Version
|
113
|
+
version: 0.4.59
|
114
|
+
version:
|
115
|
+
- !ruby/object:Gem::Dependency
|
116
|
+
name: rextra
|
117
|
+
version_requirement:
|
118
|
+
version_requirements: !ruby/object:Gem::Version::Requirement
|
119
|
+
requirements:
|
120
|
+
- - ">="
|
121
|
+
- !ruby/object:Gem::Version
|
122
|
+
version: 2.0.4
|
123
|
+
version:
|