scrubyt 0.2.6 → 0.2.8
- data/CHANGELOG +59 -12
- data/Rakefile +2 -2
- data/lib/scrubyt.rb +24 -6
- data/lib/scrubyt/core/navigation/fetch_action.rb +91 -56
- data/lib/scrubyt/core/navigation/navigation_actions.rb +32 -22
- data/lib/scrubyt/core/scraping/constraint.rb +53 -57
- data/lib/scrubyt/core/scraping/constraint_adder.rb +15 -38
- data/lib/scrubyt/core/scraping/filters/attribute_filter.rb +17 -0
- data/lib/scrubyt/core/scraping/filters/base_filter.rb +111 -0
- data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +14 -0
- data/lib/scrubyt/core/scraping/filters/download_filter.rb +49 -0
- data/lib/scrubyt/core/scraping/filters/html_subtree_filter.rb +7 -0
- data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +17 -0
- data/lib/scrubyt/core/scraping/filters/tree_filter.rb +121 -0
- data/lib/scrubyt/core/scraping/pattern.rb +292 -157
- data/lib/scrubyt/core/scraping/result_indexer.rb +51 -47
- data/lib/scrubyt/core/shared/evaluation_context.rb +3 -42
- data/lib/scrubyt/core/shared/extractor.rb +122 -163
- data/lib/scrubyt/output/export.rb +59 -174
- data/lib/scrubyt/output/post_processor.rb +4 -3
- data/lib/scrubyt/output/result.rb +8 -9
- data/lib/scrubyt/output/result_dumper.rb +81 -42
- data/lib/scrubyt/utils/compound_example_lookup.rb +11 -11
- data/lib/scrubyt/utils/ruby_extensions.rb +113 -0
- data/lib/scrubyt/utils/shared_utils.rb +39 -26
- data/lib/scrubyt/utils/simple_example_lookup.rb +6 -6
- data/lib/scrubyt/utils/xpathutils.rb +31 -30
- data/test/unittests/constraint_test.rb +11 -7
- data/test/unittests/extractor_test.rb +6 -6
- data/test/unittests/filter_test.rb +66 -66
- metadata +22 -15
- data/lib/scrubyt/core/scraping/filter.rb +0 -201
data/CHANGELOG
CHANGED
@@ -1,29 +1,76 @@
 = scRUBYt! Changelog
 
-== 0.2.6
+== 0.2.7
+=== 15th April, 2007
+
+=<tt>changes:</tt>
+
+[NEW] download pattern: download the file pointed to by the
+      parent pattern
+[NEW] checking checkboxes
+[NEW] basic authentication support
+[NEW] default values for missing elements
+[NEW] possibility to resolve relative paths against a custom url
+[NEW] first simple version of to_csv and to_hash
+[NEW] complete rewrite of the exporting system (Credit: Neelance)
+[NEW] first version of smart regular expressions: they are constructed
+      from examples, just as the XPaths are (Credit: Neelance)
+[NEW] Possibility to click the n-th link
+[FIX] Clicking on links using scRUBYt's advanced example lookup
+[NEW] Forcing writing text of non-leaf nodes with :write_text => true
+[NEW] Possibility to set custom user-agent; Specified default user agent
+      as Microsoft IE6
+[FIX] Fixed crawling to detail pages in case of leaving the
+      original site (Credit: Michael Mazour)
+[FIX] fixing the '//' problem - if the relative url contained two
+      slashes, the fetching failed
+[FIX] scrubyt assumed that documents have a list of nested elements
+      (Credit: Rick Bradley)
+[FIX] crawling to detail pages works also if the parent pattern is
+      a string pattern
+[FIX] shortcut url fixed again
+[FIX] regexp pattern fixed in case its parent was a string
+[FIX] refactoring the core classes, lots of bugfixes and stabilization
+
+== 0.2.6
 === 22nd March, 2007
 
-The mission of this release was to add even more powerful features,
-
+The mission of this release was to add even more powerful features,
+like crawling to detail pages or compound example specification,
+as well as fixing the most frequently popping-up bugs. Scraping
+of concrete sites is more and more frequently the cause for new
+features and bugfixes, which in my opinion means that the
+framework is beginning to make sense: from a shiny toy which
+looks cool and everybody wants to play with, it is moving
+towards a tool which you reach for if you seriously want
+to scrape a site.
+
+The new stuff in this release is 99% scraping related - if
+you are looking for new features in the navigation part,
+probably the next version will be for you, where I will
+concentrate more on adding new widgets and possibilities
+to the navigation process. FireWatir integration is very
+close, too - perhaps already the next release will
+support FireWatir navigation!
 
 =<tt>changes:</tt>
 * [NEW] Automatically crawling to and extracting from detail pages
 * [NEW] Compound example specification: So far the example of a pattern had to be a string.
   Now it can be a hash as well, like {:contains => /\d\d-\d/, :begins_with => 'Telephone'}
 * [NEW] More sophisticated example specification: Possible to use regexp as well, and need not
-  (but still possible of course) to specify the whole content of the node - nodes that
+  (but still possible of course) to specify the whole content of the node - nodes that
   contain the string/match the regexp will be returned, too
 * [NEW] Possibility to force writing text in case of non-leaf nodes
-* [NEW] Crawling to the next page now possible via image links as well
+* [NEW] Crawling to the next page now possible via image links as well
 * [NEW] Possibility to define examples for any pattern (before it did not make sense for ancestors)
 * [NEW] Implementation of crawling to the next page with different methods
-* [NEW] Heuristics: if something ends with _url, it is a shortcut for:
+* [NEW] Heuristics: if something ends with _url, it is a shortcut for:
   some_url 'href', :type => :attribute
 * [FIX] Crawling to the next page (the broken google example): if the next
-  link text is not an <a>, traverse down until the <a> is found; if it is
+  link text is not an <a>, traverse down until the <a> is found; if it is
   still not found, traverse up until it is found
 * [FIX] Crawling to next pages does not break if the next link is greyed out
-  (or otherwise present but has no href attribute (Credit:
+  (or otherwise present but has no href attribute (Credit: Robert Au)
 * [FIX] DRY-ed next link lookup - it should be much more robust now as it uses the 'standard' example lookup
 * [NEW] Correct exporting of detail page extractors
 * [NEW] Added more powerful XPath regexp (Credit: Karol Hosiawa)
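For orientation, a minimal extractor sketch exercising a few of the 0.2.7 additions above; the site, credentials and link text are hypothetical, and only the option keys and action names come from this diff:

    require 'rubygems'
    require 'scrubyt'

    # hypothetical URL and credentials; option keys match the new FetchAction#fetch below
    data = Scrubyt::Extractor.define do
      fetch 'http://www.example.com/members/list.html',
            :basic_auth => 'user@secret',   # [NEW] basic authentication support
            :user_agent => 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)'
      click_link 'Next', 1                  # [NEW] click the n-th (here: second) matching link
    end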
@@ -105,13 +152,13 @@ This is a preview release before the first real public release, 0.2.0. Basically
 * Enhanced heuristics for example text detection
 * First version of algorithm to remove dupes resulting from multiple examples
 * empty XML leaf nodes are not written
-* new examples
+* new examples
 * TONS of bugfixes
 
 = 0.1
 === 15th January, 2007
 
-First pre-alpha (non-public) release
+First pre-alpha (non-public) release
 This release was made more for myself (to try and test rubyforge, gems, etc) rather than for the community at this time.
 
 Fairly nice set of features, but still need a lot of testing and stabilizing before it will be really usable.
@@ -201,13 +248,13 @@ This is a preview release before the first real public release, 0.2.0. Basically
 * Enhanced heuristics for example text detection
 * First version of algorithm to remove dupes resulting from multiple examples
 * empty XML leaf nodes are not written
-* new examples
+* new examples
 * TONS of bugfixes
 
 = 0.1
 === 15th January, 2007
 
-First pre-alpha (non-public) release
+First pre-alpha (non-public) release
 This release was made more for myself (to try and test rubyforge, gems, etc) rather than for the community at this time.
 
 Fairly nice set of features, but still need a lot of testing and stabilizing before it will be really usable.
data/Rakefile
CHANGED
@@ -18,7 +18,7 @@ task "cleanup_readme" => ["rdoc"]
 
 gem_spec = Gem::Specification.new do |s|
   s.name = 'scrubyt'
-  s.version = '0.2.6'
+  s.version = '0.2.8'
   s.summary = 'A powerful Web-scraping framework'
   s.description = %{scRUBYt! is an easy to learn and use, yet powerful and effective web scraping framework. It's most interesting part is a Web-scraping DSL built on HPricot and WWW::Mechanize, which allows to navigate to the page of interest, then extract and query data records with a few lines of code. It is hard to describe scRUBYt! in a few sentences - you have to see it for yourself!}
   # Files containing Test::Unit test cases.
@@ -82,7 +82,7 @@ Rake::GemPackageTask.new(gem_spec) do |pkg|
   pkg.need_tar = false
 end
 
-Rake::PackageTask.new('scrubyt-examples', '0.2.6') do |pkg|
+Rake::PackageTask.new('scrubyt-examples', '0.2.8') do |pkg|
   pkg.need_zip = true
   pkg.need_tar = true
   pkg.package_files.include("examples/**/*")
data/lib/scrubyt.rb
CHANGED
@@ -1,3 +1,19 @@
+#ruby core
+require 'open-uri'
+
+#gems
+require 'rubygems'
+require 'mechanize'
+require 'hpricot'
+require 'parse_tree'
+require 'ruby2ruby'
+
+#scrubyt
+require 'scrubyt/utils/ruby_extensions.rb'
+require 'scrubyt/utils/xpathutils.rb'
+require 'scrubyt/utils/shared_utils.rb'
+require 'scrubyt/utils/simple_example_lookup.rb'
+require 'scrubyt/utils/compound_example_lookup.rb'
 require 'scrubyt/core/scraping/constraint_adder.rb'
 require 'scrubyt/core/scraping/constraint.rb'
 require 'scrubyt/core/scraping/result_indexer.rb'
@@ -5,16 +21,18 @@ require 'scrubyt/core/scraping/pre_filter_document.rb'
 require 'scrubyt/core/scraping/compound_example.rb'
 require 'scrubyt/output/export.rb'
 require 'scrubyt/core/shared/extractor.rb'
-require 'scrubyt/core/scraping/filter.rb'
+require 'scrubyt/core/scraping/filters/base_filter.rb'
+require 'scrubyt/core/scraping/filters/attribute_filter.rb'
+require 'scrubyt/core/scraping/filters/detail_page_filter.rb'
+require 'scrubyt/core/scraping/filters/download_filter.rb'
+require 'scrubyt/core/scraping/filters/html_subtree_filter.rb'
+require 'scrubyt/core/scraping/filters/regexp_filter.rb'
+require 'scrubyt/core/scraping/filters/tree_filter.rb'
 require 'scrubyt/core/scraping/pattern.rb'
 require 'scrubyt/output/result_dumper.rb'
 require 'scrubyt/output/result.rb'
-require 'scrubyt/utils/xpathutils.rb'
 require 'scrubyt/output/post_processor.rb'
 require 'scrubyt/core/navigation/navigation_actions.rb'
 require 'scrubyt/core/navigation/fetch_action.rb'
 require 'scrubyt/core/shared/evaluation_context.rb'
-require 'scrubyt/core/shared/u_r_i_builder.rb'
-require 'scrubyt/utils/shared_utils.rb'
-require 'scrubyt/utils/simple_example_lookup.rb'
-require 'scrubyt/utils/compound_example_lookup.rb'
+require 'scrubyt/core/shared/u_r_i_builder.rb'
data/lib/scrubyt/core/navigation/fetch_action.rb
CHANGED
@@ -2,38 +2,46 @@ module Scrubyt
   ##
   #=<tt>Fetching pages (and related functionality)</tt>
   #
-  #Since lot of things are happening during (and before)
+  #Since lot of things are happening during (and before)
   #the fetching of a document, I decided to move out fetching related
   #functionality to a separate class - so if you are looking for anything
   #which is loading a document (even by submitting a form or clicking a link)
   #and related things like setting a proxy etc. you should find it here.
   class FetchAction
     def initialize
-      @@current_doc_url = nil
+      @@current_doc_url = nil
       @@current_doc_protocol = nil
       @@base_dir = nil
       @@host_name = nil
-      @@agent = WWW::Mechanize.new
+      @@agent = WWW::Mechanize.new
+      @@history = []
     end
-
+
     ##
     #Action to fetch a document (either a file or a http address)
-    #
+    #
     #*parameters*
     #
     #_doc_url_ - the url or file name to fetch
-    def self.fetch(doc_url,
-
-
+    def self.fetch(doc_url, *args)
+      #Refactor this crap!!! with option_accessor stuff
+      proxy = args[0][:proxy]
+      mechanize_doc = args[0][:mechanize_doc]
+      resolve = args[0][:resolve] || :full
+      basic_auth = args[0][:basic_auth]
+      user_agent = args[0][:user_agent] || "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)"
+      #Refactor this whole stuff as well!!! It looks awful...
+      parse_and_set_proxy(proxy) if proxy
+      set_user_agent(user_agent)
+      parse_and_set_basic_auth(basic_auth) if basic_auth
+      if !mechanize_doc
         @@current_doc_url = doc_url
         @@current_doc_protocol = determine_protocol
         handle_relative_path(doc_url)
-        handle_relative_url(doc_url)
-
+        handle_relative_url(doc_url,resolve)
        puts "[ACTION] fetching document: #{@@current_doc_url}"
        if @@current_doc_protocol != 'file'
-          @@mechanize_doc = @@agent.get(@@current_doc_url)
-          store_host_name(doc_url)
+          @@mechanize_doc = @@agent.get(@@current_doc_url)
        end
      else
        @@current_doc_url = doc_url
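The old positional parameters are replaced by a trailing options hash which fetch reads via args[0]. A sketch of a direct call (URL and proxy hypothetical; the keys are exactly the ones read above):

    Scrubyt::FetchAction.fetch('http://www.example.com/index.html',
                               :proxy      => 'localhost:8080',
                               :resolve    => :full,
                               :user_agent => 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)')

Note that at least one option must be passed, since args[0] is dereferenced unconditionally; the internal callers below (submit and click_link) always supply :mechanize_doc.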
@@ -44,60 +52,75 @@ module Scrubyt
        @@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(open(@@current_doc_url).read))
      else
        @@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(@@mechanize_doc.body))
+        store_host_name(self.get_current_doc_url) # in case we're on a new host
      end
    end
-
+
    ##
-    #Submit the last form;
+    #Submit the last form;
    def self.submit(current_form, button=nil)
      puts '[ACTION] submitting form...'
-      if button == nil
+      if button == nil
        result_page = @@agent.submit(current_form)
      else
        result_page = @@agent.submit(current_form, button)
      end
      @@current_doc_url = result_page.uri.to_s
      puts "[ACTION] fetched #{@@current_doc_url}"
-      fetch(@@current_doc_url,
+      fetch(@@current_doc_url, :mechanize_doc => result_page)
    end
-
+
    ##
-    #Click the link specified by the text
-    def self.click_link(
-
-
-
+    #Click the link specified by the text
+    def self.click_link(link_spec,index = 0)
+      print "[ACTION] clicking link specified by: "; p link_spec
+      if link_spec.is_a? Hash
+        clicked_elem = CompoundExampleLookup.find_node_from_compund_example(@@hpricot_doc, link_spec, false, index)
+      else
+        clicked_elem = SimpleExampleLookup.find_node_from_text(@@hpricot_doc, link_spec, false, index)
+      end
+      clicked_elem = XPathUtils.find_nearest_node_with_attribute(clicked_elem, 'href')
+      result_page = @@agent.click(clicked_elem)
      @@current_doc_url = result_page.uri.to_s
      puts "[ACTION] fetched #{@@current_doc_url}"
-      fetch(@@current_doc_url,
-    end
-
+      fetch(@@current_doc_url, :mechanize_doc => result_page)
+    end
+
    ##
    # At any given point, the current document can be queried with this method; Typically used
-    # when the navigation is over and the result document is passed to the wrapper
+    # when the navigation is over and the result document is passed to the wrapper
    def self.get_current_doc_url
      @@current_doc_url
    end
-
+
    def self.get_mechanize_doc
      @@mechanize_doc
    end
-
+
    def self.get_hpricot_doc
      @@hpricot_doc
    end
-
+
    def self.get_host_name
-      @@host_name
+      @@host_name
    end
-
+
    def self.restore_host_name
+      return if @@current_doc_protocol == 'file'
      @@host_name = @@original_host_name
-    end
-
+    end
+
+    def self.store_page
+      @@history.push @@hpricot_doc
+    end
+
+    def self.restore_page
+      @@hpricot_doc = @@history.pop
+    end
+
    def self.determine_protocol
      old_protocol = @@current_doc_protocol
-      new_protocol = case @@current_doc_url
+      new_protocol = case @@current_doc_url
      when /^https/
        'https'
      when /^http/
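Two additions in this hunk are worth spelling out: click_link now takes either a text example or a compound-example hash, plus a 0-based index selecting the n-th match (the changelog's "click the n-th link"), and store_page/restore_page keep a simple stack of Hpricot documents, presumably for crawling to detail pages and back. A sketch with hypothetical link texts:

    Scrubyt::FetchAction.click_link('Next')                  # first link whose text matches 'Next'
    Scrubyt::FetchAction.click_link('Next', 2)               # the third matching link (index is 0-based)
    Scrubyt::FetchAction.click_link(:begins_with => 'More')  # compound example lookup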
@@ -110,10 +133,9 @@ private
      return 'http' if ((old_protocol == 'http') && new_protocol == 'file')
      return 'https' if ((old_protocol == 'https') && new_protocol == 'file')
      new_protocol
-    end
-
+    end
+
    def self.parse_and_set_proxy(proxy)
-      proxy = proxy[:proxy]
      if proxy.downcase == 'localhost'
        @@host = 'localhost'
        @@port = proxy.split(':').last
@@ -130,34 +152,47 @@ private
      puts "[ACTION] Setting proxy: host=<#{@@host}>, port=<#{@@port}>"
      @@agent.set_proxy(@@host, @@port)
    end
-
+
+    def self.parse_and_set_basic_auth(basic_auth)
+      login, pass = basic_auth.split('@')
+      puts "[ACTION] Basic authentication: login=<#{login}>, pass=<#{pass}>"
+      @@agent.basic_auth(login, pass)
+    end
+
+    def self.set_user_agent(user_agent)
+      #puts "[ACTION] Setting user-agent to #{user_agent}"
+      @@agent.user_agent = user_agent
+    end
+
    def self.handle_relative_path(doc_url)
      if @@base_dir == nil
        @@base_dir = doc_url.scan(/.+\//)[0] if @@current_doc_protocol == 'file'
-      else
+      else
        @@current_doc_url = ((@@base_dir + doc_url) if doc_url !~ /#{@@base_dir}/)
      end
    end
-
-    def self.handle_relative_url(doc_url)
-      return if doc_url =~ /^http/
-      if @@host_name != nil
-        #p doc_url
-        #p @@host_name
-        if doc_url !~ /#{@@host_name}/
-          @@current_doc_url = (@@host_name + doc_url)
-          #remove duplicate parts, like /blogs/en/blogs/en
-          @@current_doc_url = @@current_doc_url.split('/').uniq.reject{|x| x == ""}.join('/')
-          @@current_doc_url.sub!('http:/', 'http://')
-        end
-      end
-    end
-
+
    def self.store_host_name(doc_url)
      @@host_name = 'http://' + @@mechanize_doc.uri.to_s.scan(/http:\/\/(.+\/)+/).flatten[0] if @@current_doc_protocol == 'http'
      @@host_name = 'https://' + @@mechanize_doc.uri.to_s.scan(/https:\/\/(.+\/)+/).flatten[0] if @@current_doc_protocol == 'https'
      @@host_name = doc_url if @@host_name == nil
-      @@
-
+      @@host_name = @@host_name[0..-2] if @@host_name[-1].chr == '/'
+      @@original_host_name ||= @@host_name
+    end #end of method store_host_name
+
+    def self.handle_relative_url(doc_url, resolve)
+      return if doc_url =~ /^http/
+      case resolve
+      when :full
+        @@current_doc_url = (@@host_name + doc_url) if ( @@host_name != nil && (doc_url !~ /#{@@host_name}/))
+        @@current_doc_url = @@current_doc_url.split('/').uniq.join('/')
+      when :host
+        base_host_name = @@host_name.scan(/(http.+?\/\/.+?)\//)[0][0]
+        @@current_doc_url = base_host_name + doc_url
+      else
+        #custom resilving
+        @@current_doc_url = resolve + doc_url
+      end
+    end #end of function handle_relative_url
  end #end of class FetchAction
end #end of module Scrubyt
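The rewritten handle_relative_url gives three resolution modes for relative URLs: :full (the default) prepends the stored host name and drops duplicated path segments - the '//' and '/blogs/en/blogs/en' problems from the changelog; :host prepends only the scheme-and-host part of the stored host name; anything else is used as a literal prefix. Sketched against a hypothetical stored host name:

    # assuming @@host_name == 'http://www.example.com/blogs/en' (hypothetical)
    fetch '/blogs/en/article1.html'                                # :full -> http://www.example.com/blogs/en/article1.html
    fetch '/news.html', :resolve => :host                          # -> http://www.example.com/news.html
    fetch 'page2.html', :resolve => 'http://example.org/archive/'  # -> http://example.org/archive/page2.html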
data/lib/scrubyt/core/navigation/navigation_actions.rb
CHANGED
@@ -8,25 +8,26 @@ module Scrubyt
  class NavigationActions
    #These are reserved keywords - they can not be the name of any pattern
    #since they are reserved for describing the navigation
-    KEYWORDS = ['fetch',
+    KEYWORDS = ['fetch',
                'fill_textfield',
-                'fill_textarea',
+                'fill_textarea',
                'submit',
                'click_link',
-                'select_option',
+                'select_option',
+                'check_checkbox',
                'end']
-
+
    def initialize
      @@current_form = nil
      FetchAction.new
    end
-
+
    ##
    #Action to fill a textfield with a query string
    #
    ##*parameters*
    #
-    #_textfield_name_ - the name of the textfield (e.g. the name of the google search
+    #_textfield_name_ - the name of the textfield (e.g. the name of the google search
    #textfield is 'q'
    #
    #_query_string_ - the string that should be entered into the textfield
@@ -34,15 +35,15 @@ module Scrubyt
      lookup_form_for_tag('input','textfield',textfield_name,query_string)
      eval("@@current_form['#{textfield_name}'] = '#{query_string}'")
    end
-
+
    ##
    #Action to fill a textarea with text
    def self.fill_textarea(textarea_name, text)
      lookup_form_for_tag('textarea','textarea',textarea_name,text)
      eval("@@current_form['#{textarea_name}'] = '#{text}'")
    end
-
-    ##
+
+    ##
    #Action for selecting an option from a dropdown box
    def self.select_option(selectlist_name, option)
      lookup_form_for_tag('select','select list',selectlist_name,option)
@@ -51,13 +52,19 @@ module Scrubyt
      searched_option.click
    end
 
+    def self.check_checkbox(checkbox_name)
+      puts checkbox_name
+      lookup_form_for_tag('input','checkbox',checkbox_name, '')
+      @@current_form.checkboxes.name(checkbox_name).check
+    end
+
    ##
    #Fetch the document
-    def self.fetch(
-      FetchAction.fetch(
+    def self.fetch(*args)
+      FetchAction.fetch(*args)
    end
    ##
-    #Submit the current form (delegate it to NavigationActions)
+    #Submit the current form (delegate it to NavigationActions)
    def self.submit(index=nil)
      if index == nil
        FetchAction.submit(@@current_form)
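check_checkbox follows the same pattern as the other form actions: lookup_form_for_tag locates the form owning the named input, then the box is ticked through WWW::Mechanize's checkbox API. Inside an extractor this reads (site and field names hypothetical):

    Scrubyt::Extractor.define do
      fetch          'http://www.example.com/advanced_search.html'
      fill_textfield 'q', 'scrubyt'
      check_checkbox 'include_archives'   # new in 0.2.7
      submit 0                            # submit with the form's first button
    end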
@@ -65,39 +72,42 @@ module Scrubyt
        FetchAction.submit(@@current_form, @@current_form.buttons[index])
      end
    end
-
+
    ##
    #Click the link specified by the text ((delegate it to NavigationActions)
-    def self.click_link(
-      FetchAction.click_link(
+    def self.click_link(link_spec,index=0)
+      FetchAction.click_link(link_spec,index)
    end
-
+
    def self.get_hpricot_doc
      FetchAction.get_hpricot_doc
    end
-
+
    def self.get_current_doc_url
      FetchAction.get_current_doc_url
    end
-
+
+    def self.get_host_name
+      FetchAction.get_host_name
+    end
+
private
    def self.lookup_form_for_tag(tag,widget_name,name_attribute,query_string)
      puts "[ACTION] typing #{query_string} into the #{widget_name} named '#{name_attribute}'"
      widget = (FetchAction.get_hpricot_doc/"#{tag}[@name=#{name_attribute}]").map()[0]
      form_tag = Scrubyt::XPathUtils.traverse_up_until_name(widget, 'form')
-      find_form_based_on_tag(form_tag, ['name', 'id', 'action'])
+      find_form_based_on_tag(form_tag, ['name', 'id', 'action'])
    end
-
+
    def self.find_form_based_on_tag(tag, possible_attrs)
      lookup_attribute_name = nil
      lookup_attribute_value = nil
-
+
      possible_attrs.each { |a|
        lookup_attribute_name = a
        lookup_attribute_value = tag.attributes[a]
        break if lookup_attribute_value != nil
      }
-
      i = 0
      loop do
        @@current_form = FetchAction.get_mechanize_doc.forms[i]