scrubyt 0.3.4 → 0.4.1
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +31 -0
- data/README +1 -1
- data/Rakefile +4 -9
- data/lib/scrubyt.rb +37 -56
- data/lib/scrubyt/core/navigation/agents/firewatir.rb +249 -0
- data/lib/scrubyt/core/navigation/agents/mechanize.rb +253 -0
- data/lib/scrubyt/core/navigation/fetch_action.rb +2 -183
- data/lib/scrubyt/core/navigation/navigation_actions.rb +30 -48
- data/lib/scrubyt/core/scraping/filters/attribute_filter.rb +0 -3
- data/lib/scrubyt/core/scraping/filters/base_filter.rb +6 -6
- data/lib/scrubyt/core/scraping/filters/constant_filter.rb +0 -3
- data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +6 -6
- data/lib/scrubyt/core/scraping/filters/download_filter.rb +0 -4
- data/lib/scrubyt/core/scraping/filters/html_subtree_filter.rb +0 -3
- data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +0 -4
- data/lib/scrubyt/core/scraping/filters/script_filter.rb +0 -3
- data/lib/scrubyt/core/scraping/filters/text_filter.rb +1 -5
- data/lib/scrubyt/core/scraping/filters/tree_filter.rb +0 -8
- data/lib/scrubyt/core/scraping/pattern.rb +6 -27
- data/lib/scrubyt/core/scraping/result_indexer.rb +0 -4
- data/lib/scrubyt/core/shared/extractor.rb +15 -1
- data/lib/scrubyt/output/result_node.rb +42 -6
- data/lib/scrubyt/output/scrubyt_result.rb +35 -30
- data/lib/scrubyt/utils/ruby_extensions.rb +0 -53
- data/lib/scrubyt/utils/xpathutils.rb +2 -1
- metadata +84 -119
- data/lib/scrubyt/output/export.rb +0 -157
data/CHANGELOG
CHANGED
@@ -1,5 +1,36 @@
|
|
1
1
|
= scRUBYt! Changelog
|
2
2
|
|
3
|
+
== 0.4.05
|
4
|
+
== 20th October
|
5
|
+
|
6
|
+
=<tt>changes:</tt>
|
7
|
+
- [NEW] possibility to use FireWatir as the agent for scraping (credit: Glenn Gillen, Glen Gillen and... did I mention Glenn already?)
|
8
|
+
- [FIX] navigation doesn't crash if a 404/500 is returned (credit: Glenn Gillen)
|
9
|
+
- [NEW] navigation action: click_by_xpath to click arbitrary elements
|
10
|
+
- [MOD] dropped dependencies: RubyInline, ParseTree, Ruby2Ruby (hooray for win32 users)
|
11
|
+
- [NEW] scraping through frames (e.g. google analytics)
|
12
|
+
- [MOD] exporting temporarily doesn't work - for now, generated XPaths are printed to the screen
|
13
|
+
- [MOD] possibility to wait after clicking link/filling textfield (to be able to scrape inserted AJAX stuff)
|
14
|
+
- [NEW] possibility to fetch from a string, by specifying nil as the url and the html string with the :html option
|
15
|
+
- [FIX] firewatir slowness (credit: jak4)
|
16
|
+
- [FIX] lots of bug fixes and stability fixes
|
17
|
+
-
|
18
|
+
|
19
|
+
== 0.4.0 (unofficial)
|
20
|
+
=== 31st October, 2007
|
21
|
+
|
22
|
+
=<tt>changes:</tt>
|
23
|
+
- [NEW] possibility to define a default value for patterns
|
24
|
+
- [MOD] rewrite of to_flat_xml to a more robust algorithm
|
25
|
+
- [NEW] find_string method in text pattern; return the string if it's present in the input
|
26
|
+
|
27
|
+
== 0.3.4
|
28
|
+
=== 26th September, 2007
|
29
|
+
|
30
|
+
=<tt>changes:</tt>
|
31
|
+
It seems I have been too busy to update the changelog ;)
|
32
|
+
|
33
|
+
|
3
34
|
== 0.3.1
|
4
35
|
=== 29th May, 2007
|
5
36
|
|
data/README
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
= scRUBYt! - Hpricot and Mechanize on steroids
|
1
|
+
= scRUBYt! - Hpricot and Mechanize (or FireWatir) on steroids
|
2
2
|
|
3
3
|
A simple to learn and use, yet very powerful web extraction framework written in Ruby. Navigate through the Web, Extract, query, transform and save relevant data from the Web page of your interest by the concise and easy to use DSL.
|
4
4
|
|
data/Rakefile
CHANGED
@@ -17,8 +17,8 @@ task "cleanup_readme" => ["rdoc"]
|
|
17
17
|
|
18
18
|
gem_spec = Gem::Specification.new do |s|
|
19
19
|
s.name = 'scrubyt'
|
20
|
-
s.version = '0.
|
21
|
-
s.summary = 'A powerful Web-scraping framework built on Mechanize and Hpricot'
|
20
|
+
s.version = '0.4.1'
|
21
|
+
s.summary = 'A powerful Web-scraping framework built on Mechanize and Hpricot (and FireWatir)'
|
22
22
|
s.description = %{scRUBYt! is an easy to learn and use, yet powerful and effective web scraping framework. It's most interesting part is a Web-scraping DSL built on HPricot and WWW::Mechanize, which allows to navigate to the page of interest, then extract and query data records with a few lines of code. It is hard to describe scRUBYt! in a few sentences - you have to see it for yourself!}
|
23
23
|
# Files containing Test::Unit test cases.
|
24
24
|
s.test_files = FileList['test/unittests/**/*']
|
@@ -29,12 +29,7 @@ gem_spec = Gem::Specification.new do |s|
|
|
29
29
|
s.homepage = 'http://www.scrubyt.org'
|
30
30
|
s.add_dependency('hpricot', '>= 0.5')
|
31
31
|
s.add_dependency('mechanize', '>= 0.6.3')
|
32
|
-
s.
|
33
|
-
s.add_dependency('RubyInlineAcceleration')
|
34
|
-
s.add_dependency('RubyInline', '= 3.6.3')
|
35
|
-
s.add_dependency('ParseTree', '= 1.7.1')
|
36
|
-
s.add_dependency('ruby2ruby', '= 1.1.6')
|
37
|
-
#s.has_rdoc = 'true'
|
32
|
+
s.has_rdoc = 'true'
|
38
33
|
end
|
39
34
|
|
40
35
|
###################################################
|
@@ -99,7 +94,7 @@ Rake::GemPackageTask.new(gem_spec) do |pkg|
|
|
99
94
|
pkg.need_tar = false
|
100
95
|
end
|
101
96
|
|
102
|
-
#Rake::PackageTask.new('scrubyt-examples', '0.
|
97
|
+
#Rake::PackageTask.new('scrubyt-examples', '0.4.03') do |pkg|
|
103
98
|
# pkg.need_zip = true
|
104
99
|
# pkg.need_tar = true
|
105
100
|
# pkg.package_files.include("examples/**/*")
|
data/lib/scrubyt.rb
CHANGED
@@ -1,62 +1,43 @@
|
|
1
|
-
$KCODE =
|
2
|
-
require
|
1
|
+
$KCODE = "u"
|
2
|
+
require "jcode"
|
3
3
|
|
4
4
|
#ruby core
|
5
|
-
require
|
6
|
-
require
|
5
|
+
require "open-uri"
|
6
|
+
require "erb"
|
7
7
|
|
8
8
|
#gems
|
9
|
-
require
|
10
|
-
require
|
11
|
-
require
|
12
|
-
require
|
13
|
-
require 'rexml/text'
|
14
|
-
|
15
|
-
#little hack to avoid that ruby2ruby tries to load the original parse_tree
|
16
|
-
if Gem
|
17
|
-
module Gem
|
18
|
-
class << self
|
19
|
-
alias_method :activate_orig, :activate
|
20
|
-
def activate(gem, autorequire, *version_requirements)
|
21
|
-
activate_orig(gem, autorequire, *version_requirements) unless gem.is_a?(Gem::Dependency) && gem.name == 'ParseTree'
|
22
|
-
end
|
23
|
-
end
|
24
|
-
end
|
25
|
-
end
|
26
|
-
module Kernel
|
27
|
-
alias_method :require_orig, :require
|
28
|
-
def require(path)
|
29
|
-
require_orig(path) unless path == 'parse_tree'
|
30
|
-
end
|
31
|
-
end
|
32
|
-
require 'ruby2ruby'
|
9
|
+
require "rexml/text"
|
10
|
+
require "rubygems"
|
11
|
+
require "mechanize"
|
12
|
+
require "hpricot"
|
33
13
|
|
34
14
|
#scrubyt
|
35
|
-
require
|
36
|
-
require
|
37
|
-
require
|
38
|
-
require
|
39
|
-
require
|
40
|
-
require
|
41
|
-
require
|
42
|
-
require
|
43
|
-
require
|
44
|
-
require
|
45
|
-
require
|
46
|
-
require
|
47
|
-
require
|
48
|
-
require
|
49
|
-
require
|
50
|
-
require
|
51
|
-
require
|
52
|
-
require
|
53
|
-
require
|
54
|
-
require
|
55
|
-
require
|
56
|
-
require
|
57
|
-
require
|
58
|
-
require
|
59
|
-
require
|
60
|
-
require
|
61
|
-
require
|
62
|
-
require
|
15
|
+
require "#{File.dirname(__FILE__)}/scrubyt/logging"
|
16
|
+
require "#{File.dirname(__FILE__)}/scrubyt/utils/ruby_extensions.rb"
|
17
|
+
require "#{File.dirname(__FILE__)}/scrubyt/utils/xpathutils.rb"
|
18
|
+
require "#{File.dirname(__FILE__)}/scrubyt/utils/shared_utils.rb"
|
19
|
+
require "#{File.dirname(__FILE__)}/scrubyt/utils/simple_example_lookup.rb"
|
20
|
+
require "#{File.dirname(__FILE__)}/scrubyt/utils/compound_example_lookup.rb"
|
21
|
+
require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/constraint_adder.rb"
|
22
|
+
require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/constraint.rb"
|
23
|
+
require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/result_indexer.rb"
|
24
|
+
require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/pre_filter_document.rb"
|
25
|
+
require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/compound_example.rb"
|
26
|
+
require "#{File.dirname(__FILE__)}/scrubyt/output/result_node.rb"
|
27
|
+
require "#{File.dirname(__FILE__)}/scrubyt/output/scrubyt_result.rb"
|
28
|
+
require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/agents/mechanize.rb"
|
29
|
+
require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/agents/firewatir.rb"
|
30
|
+
require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/navigation_actions.rb"
|
31
|
+
require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/fetch_action.rb"
|
32
|
+
require "#{File.dirname(__FILE__)}/scrubyt/core/shared/extractor.rb"
|
33
|
+
require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/base_filter.rb"
|
34
|
+
require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/attribute_filter.rb"
|
35
|
+
require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/constant_filter.rb"
|
36
|
+
require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/script_filter.rb"
|
37
|
+
require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/text_filter.rb"
|
38
|
+
require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/detail_page_filter.rb"
|
39
|
+
require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/download_filter.rb"
|
40
|
+
require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/html_subtree_filter.rb"
|
41
|
+
require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/regexp_filter.rb"
|
42
|
+
require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/tree_filter.rb"
|
43
|
+
require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/pattern.rb"
|
@@ -0,0 +1,249 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'firewatir'
|
3
|
+
module Scrubyt
  ##
  #=<tt>Fetching pages (and related functionality)</tt>
  #
  #Since a lot of things are happening during (and before)
  #the fetching of a document, fetching related functionality
  #lives in a separate place - so if you are looking for anything
  #which is loading a document (even by submitting a form or clicking a link)
  #and related things like setting a proxy etc. you should find it here.
  module Navigation
    ##
    #Navigation agent that drives a real browser through FireWatir.
    #
    #Including this module defines the navigation actions as singleton
    #methods on the including class/module (see +included+).
    #
    #NOTE(review): the @@variables below are written inside a +module_eval+
    #block; Ruby resolves class variables lexically, so the state presumably
    #lives on this module and is shared by every includer - confirm before
    #relying on per-class isolation.
    module Firewatir

      ##
      #Hook run on +include+: creates the browser instance and defines the
      #navigation actions on _base_.
      def self.included(base)
        base.module_eval do
          @@agent = FireWatir::Firefox.new
          @@current_doc_url = nil
          @@current_doc_protocol = nil
          @@base_dir = nil
          @@host_name = nil
          @@history = []
          @@current_form = nil
          @@current_frame = nil

          ##
          #Action to fetch a document (either a file or a http address)
          #
          #*parameters*
          #
          #_doc_url_ - the url or file name to fetch
          #
          #_args_ - optional options hash; the keys read here are
          #<tt>:mechanize_doc</tt> (an already loaded document to use instead
          #of navigating), <tt>:resolve</tt> (how to resolve relative urls:
          #<tt>:full</tt>, <tt>:host</tt> or a custom prefix string) and
          #<tt>:basic_auth</tt> (a "login@password" string)
          def self.fetch(doc_url, *args)
            if args.size > 0
              mechanize_doc = args[0][:mechanize_doc]
              #Fall back to :full so an options hash without :resolve behaves
              #like a call without options (previously nil + doc_url blew up)
              resolve = args[0][:resolve] || :full
              basic_auth = args[0][:basic_auth]
              parse_and_set_basic_auth(basic_auth) if basic_auth
            else
              mechanize_doc = nil
              resolve = :full
            end

            @@current_doc_url = doc_url
            @@current_doc_protocol = determine_protocol
            if mechanize_doc.nil?
              handle_relative_path(doc_url) unless @@current_doc_protocol == 'xpath'
              handle_relative_url(doc_url, resolve)
              Scrubyt.log :ACTION, "fetching document: #{@@current_doc_url}"
              case @@current_doc_protocol
              when 'file' then @@agent.goto("file://" + @@current_doc_url)
              else @@agent.goto(@@current_doc_url)
              end
              @@mechanize_doc = "<html>#{@@agent.html}</html>"
            else
              @@mechanize_doc = mechanize_doc
            end
            @@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(@@mechanize_doc))
            store_host_name(@@agent.url) # in case we're on a new host
          end

          ##
          #Select the frame identified by _attribute_ / _value_. The first
          #call stores the frame as the current one; later calls look the
          #frame up inside the current frame and return it.
          #
          #NOTE(review): the nested frame is returned but not stored -
          #confirm whether that is intended.
          def self.frame(attribute, value)
            if @@current_frame
              @@current_frame.frame(attribute, value)
            else
              @@current_frame = @@agent.frame(attribute, value)
            end
          end

          ##
          #Submit the last form;
          #
          #When a frame is selected the first FORM element found in it is
          #submitted, otherwise the form located by the XPath remembered by
          #the last fill/select/check action. If _sleep_time_ is given, sleeps
          #that long and waits for the browser before re-reading the page.
          #_current_form_, _button_ and _type_ are accepted but not used here.
          def self.submit(current_form, sleep_time=nil, button=nil, type=nil)
            if @@current_frame
              #Workaround: the frame has to be located explicitly before its
              #elements can be enumerated
              @@current_frame.locate
              form = Document.new(@@current_frame).all.find { |t| t.tagName == "FORM" }
              form.submit
            else
              @@agent.element_by_xpath(@@current_form).submit
            end

            if sleep_time
              sleep sleep_time
              @@agent.wait
            end

            @@current_doc_url = @@agent.url
            refresh_document
          end

          ##
          #Click the link specified by the text
          #
          #_link_spec_ is either the link text or a compound example (Hash);
          #_index_ is passed to the compound example lookup; _wait_secs_ is an
          #optional pause after the click.
          def self.click_link(link_spec, index = 0, wait_secs = 0)
            Scrubyt.log :ACTION, "Clicking link specified by: #{link_spec.inspect}"
            if link_spec.is_a?(Hash)
              node = CompoundExampleLookup.find_node_from_compund_example(@@hpricot_doc, link_spec, false, index)
              @@agent.element_by_xpath(XPathUtils.generate_XPath(node, nil, true)).click
            else
              @@agent.link(:innerHTML, Regexp.escape(link_spec)).click
            end
            sleep(wait_secs) if wait_secs > 0
            @@agent.wait
            @@current_doc_url = @@agent.url
            refresh_document
            Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
          end

          ##
          #Click the element addressed by _xpath_ and re-read the page.
          def self.click_by_xpath(xpath)
            Scrubyt.log :ACTION, "Clicking by XPath : #{xpath.inspect}"
            @@agent.element_by_xpath(xpath).click
            @@agent.wait
            @@current_doc_url = @@agent.url
            refresh_document
            Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
          end

          ##
          #Image maps are not supported by this agent.
          #
          #(An earlier, Mechanize based definition of this method in the same
          #module was always overridden by this one and has been dropped.)
          def self.click_image_map(index=0)
            raise 'NotImplemented'
          end

          ##
          #Remember the host part of _doc_url_ (everything up to the first
          #slash that follows a dot); the very first host seen is kept in
          #@@original_host_name.
          def self.store_host_name(doc_url)
            host = doc_url.to_s[/.*\..*?\//]
            @@host_name = host if host
            @@original_host_name ||= @@host_name
          end #end of method store_host_name

          ##
          #Guess the protocol of @@current_doc_url from its prefix. A
          #url that looks like a file while the previous protocol was
          #http(s) is treated as a relative url and keeps that protocol.
          def self.determine_protocol
            old_protocol = @@current_doc_protocol
            new_protocol = case @@current_doc_url
              when /^\/\//
                'xpath'
              when /^https/
                'https'
              when /^http/
                'http'
              when /^www\./
                'http'
              else
                'file'
            end
            return old_protocol if new_protocol == 'file' && %w[http https].include?(old_protocol)
            new_protocol
          end

          ##
          #Parse a "login@password" string and hand the credentials to the
          #agent. Only the first '@' separates the two parts, so passwords
          #may contain '@'. The password is not written to the log.
          def self.parse_and_set_basic_auth(basic_auth)
            login, pass = basic_auth.split('@', 2)
            Scrubyt.log :ACTION, "Basic authentication: login=<#{login}>, pass=<[FILTERED]>"
            @@agent.basic_auth(login, pass)
          end

          ##
          #Resolve a relative file name against the directory of the first
          #(or last absolute) file fetched. Only applies to the 'file'
          #protocol; the url is left alone when it already contains the base
          #directory (previously it was overwritten with nil in that case).
          def self.handle_relative_path(doc_url)
            return unless @@current_doc_protocol == 'file'
            if @@base_dir.nil? || doc_url[0..0] == "/"
              @@base_dir = doc_url.scan(/.+\//)[0]
            elsif !doc_url.include?(@@base_dir)
              @@current_doc_url = @@base_dir + doc_url
            end
          end

          ##
          #Turn a relative url into an absolute one and store it in
          #@@current_doc_url. Absolute http/https urls and javascript: urls
          #are left untouched.
          #
          #_resolve_ - <tt>:full</tt> (prefix with the stored host name),
          #<tt>:host</tt> (prefix with scheme and host only) or a custom
          #prefix string
          def self.handle_relative_url(doc_url, resolve)
            return if doc_url =~ /^(https?:|javascript:)/
            if doc_url !~ /^\//
              first_char = doc_url[0..0]
              doc_url = ( first_char == '?' ? '' : '/' ) + doc_url
              if first_char == '?'
                #Query-only url: append it to the page the browser is on.
                #(@@mechanize_doc is a plain String in this agent, so the url
                #has to come from the browser itself.)
                current_uri = @@agent.url.to_s
                if current_uri =~ /\/popup\// && @@agent.respond_to?(:history)
                  current_uri = @@agent.history.first.uri.to_s
                end
                if current_uri.include?('?')
                  current_uri = current_uri.scan(/.+\//)[0]
                else
                  current_uri += '/' unless current_uri[-1..-1] == '/'
                end
                @@current_doc_url = current_uri + doc_url
                return
              end
            end
            case resolve
              when :full
                @@current_doc_url = (@@host_name + doc_url) if @@host_name && !doc_url.include?(@@host_name)
                #NOTE(review): uniq also collapses legitimately repeated path
                #segments (/a/b/a becomes /a/b) - kept as is, needs a decision
                @@current_doc_url = @@current_doc_url.split('/').uniq.join('/')
              when :host
                base_host_name = (@@host_name.count("/") == 2 ? @@host_name : @@host_name.scan(/(http.+?\/\/.+?)\//)[0][0])
                @@current_doc_url = base_host_name + doc_url
              else
                #custom resolving
                @@current_doc_url = resolve + doc_url
            end
          end

          ##
          #Action to fill a textfield with a query string
          #
          #_wait_secs_ - pause after typing (e.g. to let AJAX content load);
          #_useValue_ - assign the value directly instead of calling +set+
          def self.fill_textfield(textfield_name, query_string, wait_secs, useValue)
            @@current_form = "//input[@name='#{textfield_name}']/ancestor::form"
            target = @@current_frame || @@agent
            if useValue
              target.text_field(:name,textfield_name).value = query_string
            else
              target.text_field(:name,textfield_name).set(query_string)
            end
            sleep(wait_secs) if wait_secs && wait_secs > 0
            refresh_document
          end

          ##
          #Action to fill a textarea with text
          def self.fill_textarea(textarea_name, text)
            #A textarea is a <textarea> element, not an <input>
            @@current_form = "//textarea[@name='#{textarea_name}']/ancestor::form"
            @@agent.text_field(:name,textarea_name).set(text)
          end

          ##
          #Action for selecting an option from a dropdown box
          def self.select_option(selectlist_name, option)
            @@current_form = "//select[@name='#{selectlist_name}']/ancestor::form"
            @@agent.select_list(:name,selectlist_name).select(option)
          end

          ##
          #Action for checking the checkbox named _checkbox_name_
          def self.check_checkbox(checkbox_name)
            @@current_form = "//input[@name='#{checkbox_name}']/ancestor::form"
            @@agent.checkbox(:name,checkbox_name).set(true)
          end

          ##
          #Action for selecting the _index_-th radio button named
          #_checkbox_name_
          def self.check_radiobutton(checkbox_name, index=0)
            @@current_form = "//input[@name='#{checkbox_name}']/ancestor::form"
            @@agent.elements_by_xpath("//input[@name='#{checkbox_name}']")[index].set
          end

          ##
          #Sleep for _time_ seconds, then wait for the browser.
          def self.wait(time=1)
            sleep(time)
            @@agent.wait
          end

          ##
          #Re-read the html from the browser and re-parse it; returns the
          #parsed document.
          def self.refresh_document
            @@mechanize_doc = "<html>#{@@agent.html}</html>"
            @@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(@@mechanize_doc))
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,253 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'mechanize'
|
3
|
+
module Scrubyt
  ##
  #=<tt>Fetching pages (and related functionality)</tt>
  #
  #Since a lot of things are happening during (and before)
  #the fetching of a document, fetching related functionality
  #lives in a separate place - so if you are looking for anything
  #which is loading a document (even by submitting a form or clicking a link)
  #and related things like setting a proxy etc. you should find it here.
  module Navigation
    ##
    #Navigation agent built on WWW::Mechanize.
    #
    #Including this module defines the navigation actions as singleton
    #methods on the including class/module (see +included+).
    #
    #NOTE(review): the @@variables below are written inside a +module_eval+
    #block; Ruby resolves class variables lexically, so the state presumably
    #lives on this module and is shared by every includer - confirm before
    #relying on per-class isolation.
    module Mechanize

      ##
      #Hook run on +include+: creates the Mechanize agent and defines the
      #navigation actions on _base_.
      def self.included(base)
        base.module_eval do
          @@agent = WWW::Mechanize.new
          @@current_doc_url = nil
          @@current_doc_protocol = nil
          @@base_dir = nil
          @@host_name = nil
          @@history = []

          ##
          #Action to fetch a document (either a file or a http address)
          #
          #*parameters*
          #
          #_doc_url_ - the url or file name to fetch (nil when fetching from
          #a string through the <tt>:html</tt> option)
          #
          #_args_ - optional options hash; the keys read here are
          #<tt>:mechanize_doc</tt> (an already fetched page),
          #<tt>:html</tt> (an html string to parse instead of fetching),
          #<tt>:resolve</tt> (<tt>:full</tt>, <tt>:host</tt> or a custom
          #prefix string) and <tt>:basic_auth</tt>
          def self.fetch(doc_url, *args)
            html = nil
            if args.size > 0
              mechanize_doc = args[0][:mechanize_doc]
              html = args[0][:html]
              #Fall back to :full so an options hash without :resolve behaves
              #like a call without options (previously nil + doc_url blew up)
              resolve = args[0][:resolve] || :full
              basic_auth = args[0][:basic_auth]
              #NOTE(review): parse_and_set_basic_auth is not defined in this
              #module - presumably provided by the includer; verify
              parse_and_set_basic_auth(basic_auth) if basic_auth
              if html
                @@current_doc_protocol = 'string'
                mechanize_doc = WWW::Mechanize::Page.new(nil, {'content-type' => 'text/html'}, html)
              end
            else
              mechanize_doc = nil
              resolve = :full
            end

            @@current_doc_url = doc_url
            #Re-evaluate the protocol on every fetch; only the string fetch
            #pins it. (With ||= the protocol of the very first fetch stuck
            #for the whole session.)
            @@current_doc_protocol = determine_protocol unless html

            if mechanize_doc.nil? && @@current_doc_protocol != 'file'
              handle_relative_path(doc_url)
              handle_relative_url(doc_url, resolve)
              Scrubyt.log :ACTION, "fetching document: #{@@current_doc_url}"
              @@mechanize_doc = @@agent.get(@@current_doc_url)
            else
              @@mechanize_doc = mechanize_doc
            end

            if @@current_doc_protocol == 'file'
              #Block form closes the handle once the content has been read
              raw_content = open(@@current_doc_url) { |io| io.read }
              @@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(raw_content))
            else
              @@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(@@mechanize_doc.body))
              store_host_name(self.get_current_doc_url) if self.get_current_doc_url # in case we're on a new host
            end
          end

          ##
          #Submit the last form;
          #
          #_index_ - nil (plain submit), the name of a button (only when
          #_type_ is given as well) or the numeric index of a button
          #
          #_sleep_time_ is accepted but not used by this agent.
          #
          #The form is submitted exactly once; the actual work is done by
          #+process_submit+.
          def self.submit(index=nil, sleep_time=nil, type=nil)
            Scrubyt.log :ACTION, 'Submitting form...'
            if index.nil?
              process_submit(@@current_form)
            #----- added by nickmerwin@gmail.com -----
            elsif index.class == String && !type.nil?
              button = @@current_form.buttons.detect { |b| b.name == index }
              process_submit(@@current_form, button, type)
            #-----------------------------------------
            else
              process_submit(@@current_form, @@current_form.buttons[index])
            end
          end

          ##
          #Click the link specified by the text
          #
          #_link_spec_ is either the link text or a compound example (Hash);
          #_wait_secs_ is accepted but not used by this agent.
          def self.click_link(link_spec, index = 0, wait_secs = 0)
            Scrubyt.log :ACTION, "Clicking link specified by: #{link_spec.inspect}"
            if link_spec.is_a? Hash
              clicked_elem = CompoundExampleLookup.find_node_from_compund_example(@@hpricot_doc, link_spec, false, index)
            else
              clicked_elem = SimpleExampleLookup.find_node_from_text(@@hpricot_doc, link_spec, false, index)
            end
            clicked_elem = XPathUtils.find_nearest_node_with_attribute(clicked_elem, 'href')
            result_page = @@agent.click(clicked_elem)
            @@current_doc_url = result_page.uri.to_s
            Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
            fetch(@@current_doc_url, :mechanize_doc => result_page)
          end

          ##
          #Follow the href of the _index_-th <area> element of the page.
          def self.click_image_map(index = 0)
            Scrubyt.log :ACTION, "Clicking image map at index: #{index.inspect}"
            uri = @@mechanize_doc.search("//area")[index]['href']
            result_page = @@agent.get(uri)
            @@current_doc_url = result_page.uri.to_s
            Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
            fetch(@@current_doc_url, :mechanize_doc => result_page)
          end

          ##
          #Remember scheme and host of the current page (without a trailing
          #slash). Falls back to _doc_url_ when no host is known yet. The very
          #first host seen is kept in @@original_host_name.
          def self.store_host_name(doc_url)
            if %w[http https].include?(@@current_doc_protocol)
              #Take "scheme://host" only; prefixing the whole match again
              #used to yield "http://http://host"
              host = @@mechanize_doc.uri.to_s[%r{\A#{@@current_doc_protocol}://[^/]+}]
              @@host_name = host if host
            end
            @@host_name = doc_url if @@host_name.nil?
            @@host_name = @@host_name[0..-2] if @@host_name[-1..-1] == '/'
            @@original_host_name ||= @@host_name
          end #end of method store_host_name

          ##
          #Guess the protocol of @@current_doc_url from its prefix. A
          #url that looks like a file while the previous protocol was
          #http(s) is treated as a relative url and keeps that protocol.
          def self.determine_protocol
            old_protocol = @@current_doc_protocol
            new_protocol = case @@current_doc_url
              when /^https/
                'https'
              when /^http/
                'http'
              when /^www\./
                'http'
              else
                'file'
            end
            return old_protocol if new_protocol == 'file' && %w[http https].include?(old_protocol)
            new_protocol
          end

          ##
          #Resolve a relative file name against the directory of the first
          #file fetched. Only applies to the 'file' protocol; the url is left
          #alone when it already contains the base directory (previously it
          #was overwritten with nil in that case).
          def self.handle_relative_path(doc_url)
            return unless @@current_doc_protocol == 'file'
            if @@base_dir.nil?
              @@base_dir = doc_url.scan(/.+\//)[0]
            elsif !doc_url.include?(@@base_dir)
              @@current_doc_url = @@base_dir + doc_url
            end
          end

          ##
          #Turn a relative url into an absolute one and store it in
          #@@current_doc_url. Urls starting with http are left untouched.
          #
          #_resolve_ - <tt>:full</tt> (prefix with the stored host name),
          #<tt>:host</tt> (prefix with scheme and host only) or a custom
          #prefix string
          def self.handle_relative_url(doc_url, resolve)
            return if doc_url =~ /^http/
            if doc_url !~ /^\//
              first_char = doc_url[0..0]
              doc_url = ( first_char == '?' ? '' : '/' ) + doc_url
              if first_char == '?'
                #Query-only url: append it to the uri of the current page
                current_uri = @@mechanize_doc.uri.to_s
                current_uri = @@agent.history.first.uri.to_s if current_uri =~ /\/popup\//
                if current_uri.include?('?')
                  current_uri = current_uri.scan(/.+\//)[0]
                else
                  current_uri += '/' unless current_uri[-1..-1] == '/'
                end
                @@current_doc_url = current_uri + doc_url
                return
              end
            end
            case resolve
              when :full
                @@current_doc_url = (@@host_name + doc_url) if @@host_name && !doc_url.include?(@@host_name)
                #NOTE(review): uniq also collapses legitimately repeated path
                #segments (/a/b/a becomes /a/b) - kept as is, needs a decision
                @@current_doc_url = @@current_doc_url.split('/').uniq.join('/')
              when :host
                base_host_name = (@@host_name.count("/") == 2 ? @@host_name : @@host_name.scan(/(http.+?\/\/.+?)\//)[0][0])
                @@current_doc_url = base_host_name + doc_url
              else
                #custom resolving
                @@current_doc_url = resolve + doc_url
            end
          end

          ##
          #Action to fill a textfield with a query string
          #
          #The value is assigned directly (no eval), so quotes and
          #backslashes in _query_string_ are passed through verbatim.
          def self.fill_textfield(textfield_name, query_string, *unused)
            lookup_form_for_tag('input','textfield',textfield_name,query_string)
            @@current_form[textfield_name] = query_string
          end

          ##
          #Action to fill a textarea with text
          def self.fill_textarea(textarea_name, text)
            lookup_form_for_tag('textarea','textarea',textarea_name,text)
            @@current_form[textarea_name] = text
          end

          ##
          #Action for selecting an option from a dropdown box
          #
          #_option_ is compared with the stripped text of each option.
          def self.select_option(selectlist_name, option)
            lookup_form_for_tag('select','select list',selectlist_name,option)
            select_list = @@current_form.fields.find { |f| f.name == selectlist_name }
            searched_option = select_list.options.find { |f| f.text.strip == option }
            searched_option.click
          end

          ##
          #Action for checking the checkbox named _checkbox_name_
          def self.check_checkbox(checkbox_name)
            lookup_form_for_tag('input','checkbox',checkbox_name, '')
            @@current_form.checkboxes.name(checkbox_name).check
          end

          ##
          #Action for selecting the _index_-th radio button named
          #_checkbox_name_
          def self.check_radiobutton(checkbox_name, index=0)
            lookup_form_for_tag('input','radiobutton',checkbox_name, '',index)
            @@current_form.radiobuttons.name(checkbox_name)[index].check
          end

          #private

          ##
          #Submit _current_form_ (optionally through _button_) and fetch the
          #resulting page. With a _type_ the form submits itself, otherwise
          #the agent does.
          def self.process_submit(current_form, button=nil, type=nil)
            if button == nil
              result_page = @@agent.submit(current_form)
            elsif type
              result_page = current_form.submit(button)
            else
              result_page = @@agent.submit(current_form, button)
            end
            @@current_doc_url = result_page.uri.to_s
            Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
            fetch(@@current_doc_url, :mechanize_doc => result_page)
          end

          ##
          #Find the _index_-th _tag_ element named _name_attribute_ in the
          #current document and make its enclosing form the current form.
          def self.lookup_form_for_tag(tag, widget_name, name_attribute, query_string, index=0)
            Scrubyt.log :ACTION, "typing #{query_string} into the #{widget_name} named '#{name_attribute}'"
            #to_a instead of a block-less map, which returns an Enumerator
            #(no [] method) on Ruby >= 1.9
            widget = (FetchAction.get_hpricot_doc/"#{tag}[@name=#{name_attribute}]").to_a[index]
            form_tag = Scrubyt::XPathUtils.traverse_up_until_name(widget, 'form')
            find_form_based_on_tag(form_tag, ['name', 'id', 'action'])
          end

          ##
          #Set @@current_form to the first form of the current page whose
          #identifying attribute equals the one of _tag_. The identifying
          #attribute is the first of _possible_attrs_ that _tag_ has (the last
          #one if it has none). @@current_form becomes nil when nothing
          #matches. Always returns nil.
          def self.find_form_based_on_tag(tag, possible_attrs)
            lookup_attribute_name = possible_attrs.find { |a| !tag.attributes[a].nil? } || possible_attrs.last
            lookup_attribute_value = lookup_attribute_name && tag.attributes[lookup_attribute_name]

            @@current_form = FetchAction.get_mechanize_doc.forms.find do |form|
              form.form_node.attributes[lookup_attribute_name] == lookup_attribute_value
            end
            nil
          end
        end
      end
    end
  end
end
|