scrubyt 0.2.6 → 0.2.8
This diff shows the changes between these package versions as published to their public registry. It is provided for informational purposes only.
- data/CHANGELOG +59 -12
- data/Rakefile +2 -2
- data/lib/scrubyt.rb +24 -6
- data/lib/scrubyt/core/navigation/fetch_action.rb +91 -56
- data/lib/scrubyt/core/navigation/navigation_actions.rb +32 -22
- data/lib/scrubyt/core/scraping/constraint.rb +53 -57
- data/lib/scrubyt/core/scraping/constraint_adder.rb +15 -38
- data/lib/scrubyt/core/scraping/filters/attribute_filter.rb +17 -0
- data/lib/scrubyt/core/scraping/filters/base_filter.rb +111 -0
- data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +14 -0
- data/lib/scrubyt/core/scraping/filters/download_filter.rb +49 -0
- data/lib/scrubyt/core/scraping/filters/html_subtree_filter.rb +7 -0
- data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +17 -0
- data/lib/scrubyt/core/scraping/filters/tree_filter.rb +121 -0
- data/lib/scrubyt/core/scraping/pattern.rb +292 -157
- data/lib/scrubyt/core/scraping/result_indexer.rb +51 -47
- data/lib/scrubyt/core/shared/evaluation_context.rb +3 -42
- data/lib/scrubyt/core/shared/extractor.rb +122 -163
- data/lib/scrubyt/output/export.rb +59 -174
- data/lib/scrubyt/output/post_processor.rb +4 -3
- data/lib/scrubyt/output/result.rb +8 -9
- data/lib/scrubyt/output/result_dumper.rb +81 -42
- data/lib/scrubyt/utils/compound_example_lookup.rb +11 -11
- data/lib/scrubyt/utils/ruby_extensions.rb +113 -0
- data/lib/scrubyt/utils/shared_utils.rb +39 -26
- data/lib/scrubyt/utils/simple_example_lookup.rb +6 -6
- data/lib/scrubyt/utils/xpathutils.rb +31 -30
- data/test/unittests/constraint_test.rb +11 -7
- data/test/unittests/extractor_test.rb +6 -6
- data/test/unittests/filter_test.rb +66 -66
- metadata +22 -15
- data/lib/scrubyt/core/scraping/filter.rb +0 -201
data/CHANGELOG
CHANGED
@@ -1,29 +1,76 @@
 = scRUBYt! Changelog
 
-== 0.2.6
+== 0.2.7
+=== 15th April, 2007
+
+=<tt>changes:</tt>
+
+[NEW] download pattern: download the file pointed to by the
+      parent pattern
+[NEW] checking checkboxes
+[NEW] basic authentication support
+[NEW] default values for missing elements
+[NEW] possibility to resolve relative paths against a custom url
+[NEW] first simple version of to_csv and to_hash
+[NEW] first version of smart regular expressions: they are constructed
+      from examples, just as regular expressions (Credit: Neelance)
+[NEW] complete rewrite of the exporting system (Credit: Neelance)
+[NEW] Possibility to click the n-th link
+[FIX] Clicking on links using scRUBYt's aadvanced example lookup
+[NEW] Forcing writing text of non-leaf nodes with :write_text => true
+[NEW] Possibility to set custom user-agent; Specified default user agent
+      as Microsoft IE6
+[FIX] Fixed crawling to detail pages in case of leaving the
+      original site (Credit: Michael Mazour)
+[FIX] fixing the '//' problem - if the relative url contained two
+      slashes, the fetching failed
+[FIX] scrubyt assumed that documents have a list of nested elements
+      (Credit: Rick Bradley)
+[FIX] crawling to detail pages works also if the parent pattern is
+      a string pattern
+[FIX] shorcut url fixed again
+[FIX] regexp pattern fixed in case it's parent was a string
+[FIX] refactoring the core classes, lots of bugfixes and stabilization
+
+== 0.2.6
 === 22th March, 2007
 
-The mission of this release was to add even more powerful features,
-
+The mission of this release was to add even more powerful features,
+like crawling to detail pages or compound example specification,
+as well as fixing the most frequently popping-up bugs. Scraping
+of concrete sites is more and more frequently the cause for new
+features and bugfixes, which in my opinion means that the
+framework is beginning to make sense: from a shiny toy which
+looks cool and everybody wants to play with, it is moving
+towards a tool which you reach after if you seriously want
+to scrape a site.
+
+The new stuff in this release is 99% scraping related - if
+you are looking for new features in the navigation part,
+probably the next version will be for you, where I will
+concentrate more on adding new widgets and possibilities
+to the navigation process. Firewatir integration is very
+close, too - perhaps already the next release will
+support FireWatir navigation!
 
 =<tt>changes:</tt>
 * [NEW] Automatically crawling to and extracting from detail pages
 * [NEW] Compound example specification: So far the example of a pattern had to be a string.
   Now it can be a hash as well, like {:contains => /\d\d-\d/, :begins_with => 'Telephone'}
 * [NEW] More sophisticated example specification: Possible to use regexp as well, and need not
-  (but still possible of course) to specify the whole content of the node - nodes that
+  (but still possible of course) to specify the whole content of the node - nodes that
   contain the string/match the regexp will be returned, too
 * [NEW] Possibility to force writing text in case of non-leaf nodes
-* [NEW] Crawling to the next page now possible via image links as well
+* [NEW] Crawling to the next page now possible via image links as well
 * [NEW] Possibility to define examples for any pattern (before it did not make sense for ancestors)
 * [NEW] Implementation of crawling to the next page with different methods
-* [NEW] Heuristics: if something ends with _url, it is a shortcut for:
+* [NEW] Heuristics: if something ends with _url, it is a shortcut for:
   some_url 'href', :type => :attribute
 * [FIX] Crawling to the next page (the broken google example): if the next
-  link text is not an <a>, traverse down until the <a> is found; if it is
+  link text is not an <a>, traverse down until the <a> is found; if it is
   still not found, traverse up until it is found
 * [FIX] Crawling to next pages does not break if the next link is greyed out
-  (or otherwise present but has no href attribute (Credit:
+  (or otherwise present but has no href attribute (Credit: Robert Au)
 * [FIX] DRY-ed next link lookup - it should be much more robust now as it is uses the 'standard' example lookup
 * [NEW] Correct exporting of detail page extractors
 * [NEW] Added more powerful XPath regexp (Credit: Karol Hosiawa)
@@ -105,13 +152,13 @@ This is a preview release before the first real public release, 0.2.0. Basically
 * Enhanced heuristics for example text detection
 * First version of algorithm to remove dupes resulting from multiple examples
 * empty XML leaf nodes are not written
-* new examples
+* new examples
 * TONS of bugfixes
 
 = 0.1
 === 15th January, 2007
 
-First pre-alpha (non-public) release
+First pre-alpha (non-public) release
 This release was made more for myself (to try and test rubyforge, gems, etc) rather than for the community at this time.
 
 Fairly nice set of features, but still need a lot of testing and stabilizing before it will be really usable.
@@ -201,13 +248,13 @@ This is a preview release before the first real public release, 0.2.0. Basically
 * Enhanced heuristics for example text detection
 * First version of algorithm to remove dupes resulting from multiple examples
 * empty XML leaf nodes are not written
-* new examples
+* new examples
 * TONS of bugfixes
 
 = 0.1
 === 15th January, 2007
 
-First pre-alpha (non-public) release
+First pre-alpha (non-public) release
 This release was made more for myself (to try and test rubyforge, gems, etc) rather than for the community at this time.
 
 Fairly nice set of features, but still need a lot of testing and stabilizing before it will be really usable.
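To make the new scraping features listed above concrete, here is a minimal extractor sketch in the style of the scRUBYt! DSL. The site, pattern names and example strings are invented for illustration, and the exact option forms should be checked against the gem's bundled examples:

  # hypothetical target page and pattern names - illustration only
  camera_data = Scrubyt::Extractor.define do
    fetch 'http://www.example.com/cameras.html'

    camera do
      # plain string example for a pattern
      name 'Canon EOS 400D'
      # compound example: a hash instead of a string, as described above
      phone :contains => /\d\d-\d/, :begins_with => 'Telephone'
      # what the *_url heuristic ("ends with _url") expands to
      detail_url 'href', :type => :attribute
    end
  end

  puts camera_data.to_xml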
data/Rakefile
CHANGED
@@ -18,7 +18,7 @@ task "cleanup_readme" => ["rdoc"]
 
 gem_spec = Gem::Specification.new do |s|
   s.name = 'scrubyt'
-  s.version = '0.2.6'
+  s.version = '0.2.8'
   s.summary = 'A powerful Web-scraping framework'
   s.description = %{scRUBYt! is an easy to learn and use, yet powerful and effective web scraping framework. It's most interesting part is a Web-scraping DSL built on HPricot and WWW::Mechanize, which allows to navigate to the page of interest, then extract and query data records with a few lines of code. It is hard to describe scRUBYt! in a few sentences - you have to see it for yourself!}
   # Files containing Test::Unit test cases.
@@ -82,7 +82,7 @@ Rake::GemPackageTask.new(gem_spec) do |pkg|
   pkg.need_tar = false
 end
 
-Rake::PackageTask.new('scrubyt-examples', '0.2.6') do |pkg|
+Rake::PackageTask.new('scrubyt-examples', '0.2.8') do |pkg|
   pkg.need_zip = true
   pkg.need_tar = true
   pkg.package_files.include("examples/**/*")
data/lib/scrubyt.rb
CHANGED
@@ -1,3 +1,19 @@
+#ruby core
+require 'open-uri'
+
+#gems
+require 'rubygems'
+require 'mechanize'
+require 'hpricot'
+require 'parse_tree'
+require 'ruby2ruby'
+
+#scrubyt
+require 'scrubyt/utils/ruby_extensions.rb'
+require 'scrubyt/utils/xpathutils.rb'
+require 'scrubyt/utils/shared_utils.rb'
+require 'scrubyt/utils/simple_example_lookup.rb'
+require 'scrubyt/utils/compound_example_lookup.rb'
 require 'scrubyt/core/scraping/constraint_adder.rb'
 require 'scrubyt/core/scraping/constraint.rb'
 require 'scrubyt/core/scraping/result_indexer.rb'
@@ -5,16 +21,18 @@ require 'scrubyt/core/scraping/pre_filter_document.rb'
 require 'scrubyt/core/scraping/compound_example.rb'
 require 'scrubyt/output/export.rb'
 require 'scrubyt/core/shared/extractor.rb'
-require 'scrubyt/core/scraping/filter.rb'
+require 'scrubyt/core/scraping/filters/base_filter.rb'
+require 'scrubyt/core/scraping/filters/attribute_filter.rb'
+require 'scrubyt/core/scraping/filters/detail_page_filter.rb'
+require 'scrubyt/core/scraping/filters/download_filter.rb'
+require 'scrubyt/core/scraping/filters/html_subtree_filter.rb'
+require 'scrubyt/core/scraping/filters/regexp_filter.rb'
+require 'scrubyt/core/scraping/filters/tree_filter.rb'
 require 'scrubyt/core/scraping/pattern.rb'
 require 'scrubyt/output/result_dumper.rb'
 require 'scrubyt/output/result.rb'
-require 'scrubyt/utils/xpathutils.rb'
 require 'scrubyt/output/post_processor.rb'
 require 'scrubyt/core/navigation/navigation_actions.rb'
 require 'scrubyt/core/navigation/fetch_action.rb'
 require 'scrubyt/core/shared/evaluation_context.rb'
-require 'scrubyt/core/shared/u_r_i_builder.rb'
-require 'scrubyt/utils/shared_utils.rb'
-require 'scrubyt/utils/simple_example_lookup.rb'
-require 'scrubyt/utils/compound_example_lookup.rb'
+require 'scrubyt/core/shared/u_r_i_builder.rb'
data/lib/scrubyt/core/navigation/fetch_action.rb
CHANGED
@@ -2,38 +2,46 @@ module Scrubyt
   ##
   #=<tt>Fetching pages (and related functionality)</tt>
   #
-  #Since lot of things are happening during (and before)
+  #Since lot of things are happening during (and before)
   #the fetching of a document, I decided to move out fetching related
   #functionality to a separate class - so if you are looking for anything
   #which is loading a document (even by submitting a form or clicking a link)
   #and related things like setting a proxy etc. you should find it here.
   class FetchAction
     def initialize
-      @@current_doc_url = nil
+      @@current_doc_url = nil
       @@current_doc_protocol = nil
       @@base_dir = nil
       @@host_name = nil
-      @@agent = WWW::Mechanize.new
+      @@agent = WWW::Mechanize.new
+      @@history = []
     end
-
+
     ##
     #Action to fetch a document (either a file or a http address)
-    #
+    #
     #*parameters*
     #
     #_doc_url_ - the url or file name to fetch
-    def self.fetch(doc_url,
-
-
+    def self.fetch(doc_url, *args)
+      #Refactor this crap!!! with option_accessor stuff
+      proxy = args[0][:proxy]
+      mechanize_doc = args[0][:mechanize_doc]
+      resolve = args[0][:resolve] || :full
+      basic_auth = args[0][:basic_auth]
+      user_agent = args[0][:user_agent] || "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)"
+      #Refactor this whole stuff as well!!! It looks awful...
+      parse_and_set_proxy(proxy) if proxy
+      set_user_agent(user_agent)
+      parse_and_set_basic_auth(basic_auth) if basic_auth
+      if !mechanize_doc
        @@current_doc_url = doc_url
        @@current_doc_protocol = determine_protocol
        handle_relative_path(doc_url)
-        handle_relative_url(doc_url)
-
+        handle_relative_url(doc_url,resolve)
        puts "[ACTION] fetching document: #{@@current_doc_url}"
        if @@current_doc_protocol != 'file'
-          @@mechanize_doc = @@agent.get(@@current_doc_url)
-          store_host_name(doc_url)
+          @@mechanize_doc = @@agent.get(@@current_doc_url)
        end
      else
        @@current_doc_url = doc_url
@@ -44,60 +52,75 @@ module Scrubyt
        @@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(open(@@current_doc_url).read))
      else
        @@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(@@mechanize_doc.body))
+        store_host_name(self.get_current_doc_url) # in case we're on a new host
      end
    end
-
+
    ##
-    #Submit the last form;
+    #Submit the last form;
    def self.submit(current_form, button=nil)
      puts '[ACTION] submitting form...'
-      if button == nil
+      if button == nil
        result_page = @@agent.submit(current_form)
      else
        result_page = @@agent.submit(current_form, button)
      end
      @@current_doc_url = result_page.uri.to_s
      puts "[ACTION] fetched #{@@current_doc_url}"
-      fetch(@@current_doc_url,
+      fetch(@@current_doc_url, :mechanize_doc => result_page)
    end
-
+
    ##
-    #Click the link specified by the text
-    def self.click_link(
-
-
-
+    #Click the link specified by the text
+    def self.click_link(link_spec,index = 0)
+      print "[ACTION] clicking link specified by: "; p link_spec
+      if link_spec.is_a? Hash
+        clicked_elem = CompoundExampleLookup.find_node_from_compund_example(@@hpricot_doc, link_spec, false, index)
+      else
+        clicked_elem = SimpleExampleLookup.find_node_from_text(@@hpricot_doc, link_spec, false, index)
+      end
+      clicked_elem = XPathUtils.find_nearest_node_with_attribute(clicked_elem, 'href')
+      result_page = @@agent.click(clicked_elem)
      @@current_doc_url = result_page.uri.to_s
      puts "[ACTION] fetched #{@@current_doc_url}"
-      fetch(@@current_doc_url,
-    end
-
+      fetch(@@current_doc_url, :mechanize_doc => result_page)
+    end
+
    ##
    # At any given point, the current document can be queried with this method; Typically used
-    # when the navigation is over and the result document is passed to the wrapper
+    # when the navigation is over and the result document is passed to the wrapper
    def self.get_current_doc_url
      @@current_doc_url
    end
-
+
    def self.get_mechanize_doc
      @@mechanize_doc
    end
-
+
    def self.get_hpricot_doc
      @@hpricot_doc
    end
-
+
    def self.get_host_name
-      @@host_name
+      @@host_name
    end
-
+
    def self.restore_host_name
+      return if @@current_doc_protocol == 'file'
      @@host_name = @@original_host_name
-    end
-
+    end
+
+    def self.store_page
+      @@history.push @@hpricot_doc
+    end
+
+    def self.restore_page
+      @@hpricot_doc = @@history.pop
+    end
+
    def self.determine_protocol
      old_protocol = @@current_doc_protocol
-      new_protocol = case @@current_doc_url
+      new_protocol = case @@current_doc_url
      when /^https/
        'https'
      when /^http/
@@ -110,10 +133,9 @@ private
      return 'http' if ((old_protocol == 'http') && new_protocol == 'file')
      return 'https' if ((old_protocol == 'https') && new_protocol == 'file')
      new_protocol
-    end
-
+    end
+
    def self.parse_and_set_proxy(proxy)
-      proxy = proxy[:proxy]
      if proxy.downcase == 'localhost'
        @@host = 'localhost'
        @@port = proxy.split(':').last
@@ -130,34 +152,47 @@ private
      puts "[ACTION] Setting proxy: host=<#{@@host}>, port=<#{@@port}>"
      @@agent.set_proxy(@@host, @@port)
    end
-
+
+    def self.parse_and_set_basic_auth(basic_auth)
+      login, pass = basic_auth.split('@')
+      puts "[ACTION] Basic authentication: login=<#{login}>, pass=<#{pass}>"
+      @@agent.basic_auth(login, pass)
+    end
+
+    def self.set_user_agent(user_agent)
+      #puts "[ACTION] Setting user-agent to #{user_agent}"
+      @@agent.user_agent = user_agent
+    end
+
    def self.handle_relative_path(doc_url)
      if @@base_dir == nil
        @@base_dir = doc_url.scan(/.+\//)[0] if @@current_doc_protocol == 'file'
-      else
+      else
        @@current_doc_url = ((@@base_dir + doc_url) if doc_url !~ /#{@@base_dir}/)
      end
    end
-
-    def self.handle_relative_url(doc_url)
-      return if doc_url =~ /^http/
-      if @@host_name != nil
-        #p doc_url
-        #p @@host_name
-        if doc_url !~ /#{@@host_name}/
-          @@current_doc_url = (@@host_name + doc_url)
-          #remove duplicate parts, like /blogs/en/blogs/en
-          @@current_doc_url = @@current_doc_url.split('/').uniq.reject{|x| x == ""}.join('/')
-          @@current_doc_url.sub!('http:/', 'http://')
-        end
-      end
-    end
-
+
    def self.store_host_name(doc_url)
      @@host_name = 'http://' + @@mechanize_doc.uri.to_s.scan(/http:\/\/(.+\/)+/).flatten[0] if @@current_doc_protocol == 'http'
      @@host_name = 'https://' + @@mechanize_doc.uri.to_s.scan(/https:\/\/(.+\/)+/).flatten[0] if @@current_doc_protocol == 'https'
      @@host_name = doc_url if @@host_name == nil
-      @@
-
+      @@host_name = @@host_name[0..-2] if @@host_name[-1].chr == '/'
+      @@original_host_name ||= @@host_name
+    end #end of method store_host_name
+
+    def self.handle_relative_url(doc_url, resolve)
+      return if doc_url =~ /^http/
+      case resolve
+      when :full
+        @@current_doc_url = (@@host_name + doc_url) if ( @@host_name != nil && (doc_url !~ /#{@@host_name}/))
+        @@current_doc_url = @@current_doc_url.split('/').uniq.join('/')
+      when :host
+        base_host_name = @@host_name.scan(/(http.+?\/\/.+?)\//)[0][0]
+        @@current_doc_url = base_host_name + doc_url
+      else
+        #custom resilving
+        @@current_doc_url = resolve + doc_url
+      end
+    end #end of function handle_relative_url
  end #end of class FetchAction
end #end of module Scrubyt
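Based on the new fetch signature shown above (an options hash read from args[0]), a direct call might look like the sketch below. The URL and credential values are made up; only the option keys that appear in the diff are used:

  Scrubyt::FetchAction.new                        # sets up the shared WWW::Mechanize agent
  Scrubyt::FetchAction.fetch('http://www.example.com/list.html',
    :proxy      => 'localhost:8080',              # handled by parse_and_set_proxy
    :basic_auth => 'login@secret',                # parse_and_set_basic_auth splits on '@'
    :user_agent => 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
    :resolve    => :full)                         # :full, :host, or a custom URL prefix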
data/lib/scrubyt/core/navigation/navigation_actions.rb
CHANGED
@@ -8,25 +8,26 @@ module Scrubyt
   class NavigationActions
     #These are reserved keywords - they can not be the name of any pattern
     #since they are reserved for describing the navigation
-    KEYWORDS = ['fetch',
+    KEYWORDS = ['fetch',
                 'fill_textfield',
-                'fill_textarea',
+                'fill_textarea',
                 'submit',
                 'click_link',
-                'select_option',
+                'select_option',
+                'check_checkbox',
                 'end']
-
+
     def initialize
       @@current_form = nil
       FetchAction.new
     end
-
+
     ##
     #Action to fill a textfield with a query string
     #
     ##*parameters*
     #
-    #_textfield_name_ - the name of the textfield (e.g. the name of the google search
+    #_textfield_name_ - the name of the textfield (e.g. the name of the google search
     #textfield is 'q'
     #
     #_query_string_ - the string that should be entered into the textfield
@@ -34,15 +35,15 @@ module Scrubyt
       lookup_form_for_tag('input','textfield',textfield_name,query_string)
       eval("@@current_form['#{textfield_name}'] = '#{query_string}'")
     end
-
+
     ##
     #Action to fill a textarea with text
     def self.fill_textarea(textarea_name, text)
       lookup_form_for_tag('textarea','textarea',textarea_name,text)
       eval("@@current_form['#{textarea_name}'] = '#{text}'")
     end
-
-    ##
+
+    ##
     #Action for selecting an option from a dropdown box
     def self.select_option(selectlist_name, option)
       lookup_form_for_tag('select','select list',selectlist_name,option)
@@ -51,13 +52,19 @@ module Scrubyt
       searched_option.click
     end
 
+    def self.check_checkbox(checkbox_name)
+      puts checkbox_name
+      lookup_form_for_tag('input','checkbox',checkbox_name, '')
+      @@current_form.checkboxes.name(checkbox_name).check
+    end
+
     ##
     #Fetch the document
-    def self.fetch(
-      FetchAction.fetch(
+    def self.fetch(*args)
+      FetchAction.fetch(*args)
     end
     ##
-    #Submit the current form (delegate it to NavigationActions)
+    #Submit the current form (delegate it to NavigationActions)
     def self.submit(index=nil)
       if index == nil
         FetchAction.submit(@@current_form)
@@ -65,39 +72,42 @@ module Scrubyt
         FetchAction.submit(@@current_form, @@current_form.buttons[index])
       end
     end
-
+
     ##
     #Click the link specified by the text ((delegate it to NavigationActions)
-    def self.click_link(
-      FetchAction.click_link(
+    def self.click_link(link_spec,index=0)
+      FetchAction.click_link(link_spec,index)
     end
-
+
     def self.get_hpricot_doc
       FetchAction.get_hpricot_doc
     end
-
+
     def self.get_current_doc_url
       FetchAction.get_current_doc_url
     end
-
+
+    def self.get_host_name
+      FetchAction.get_host_name
+    end
+
 private
     def self.lookup_form_for_tag(tag,widget_name,name_attribute,query_string)
       puts "[ACTION] typing #{query_string} into the #{widget_name} named '#{name_attribute}'"
       widget = (FetchAction.get_hpricot_doc/"#{tag}[@name=#{name_attribute}]").map()[0]
       form_tag = Scrubyt::XPathUtils.traverse_up_until_name(widget, 'form')
-      find_form_based_on_tag(form_tag, ['name', 'id', 'action'])
+      find_form_based_on_tag(form_tag, ['name', 'id', 'action'])
     end
-
+
     def self.find_form_based_on_tag(tag, possible_attrs)
       lookup_attribute_name = nil
       lookup_attribute_value = nil
-
+
       possible_attrs.each { |a|
         lookup_attribute_name = a
        lookup_attribute_value = tag.attributes[a]
        break if lookup_attribute_value != nil
      }
-
      i = 0
      loop do
        @@current_form = FetchAction.get_mechanize_doc.forms[i]