scrubyt 0.3.4 → 0.4.1

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGELOG CHANGED
@@ -1,5 +1,36 @@
1
1
  = scRUBYt! Changelog
2
2
 
3
+ == 0.4.05
4
+ === 20th October
5
+
6
+ =<tt>changes:</tt>
7
+ - [NEW] possibility to use FireWatir as the agent for scraping (credit: Glenn Gillen, Glen Gillen and... did I mention Glenn already?)
8
+ - [FIX] navigation doesn't crash if a 404/500 is returned (credit: Glen Gillen)
9
+ - [NEW] navigation action: click_by_xpath to click arbitrary elements
10
+ - [MOD] dropped dependencies: RubyInline, ParseTree, Ruby2Ruby (hooray for win32 users)
11
+ - [NEW] scraping through frames (e.g. google analytics)
12
+ - [MOD] exporting temporarily doesn't work - for now, generated XPaths are printed to the screen
13
+ - [MOD] possibility to wait after clicking link/filling textfield (to be able to scrape inserted AJAX stuff)
14
+ - [NEW] possibility to fetch from a string, by specifying nil as the url and the html string with the :html option
15
+ - [FIX] firewatir slowness (credit: jak4)
16
+ - [FIX] lot of bugfixes and stability fixes
17
+
18
+
19
+ == 0.4.0 (unofficial)
20
+ === 31st October, 2007
21
+
22
+ =<tt>changes:</tt>
23
+ - [NEW] possibility to define a default value for patterns
24
+ - [MOD] rewrite of to_flat_xml to a more robust algorithm
25
+ - [NEW] find_string method in text pattern; return the string if it's present in the input
26
+
27
+ == 0.3.4
28
+ === 26th September, 2007
29
+
30
+ =<tt>changes:</tt>
31
+ It seems I have been too busy to update the changelog ;)
32
+
33
+
3
34
  == 0.3.1
4
35
  === 29th May, 2007
5
36
 
data/README CHANGED
@@ -1,4 +1,4 @@
1
- = scRUBYt! - Hpricot and Mechanize on steroids
1
+ = scRUBYt! - Hpricot and Mechanize (or FireWatir) on steroids
2
2
 
3
3
  A simple to learn and use, yet very powerful web extraction framework written in Ruby. Navigate through the Web, Extract, query, transform and save relevant data from the Web page of your interest by the concise and easy to use DSL.
4
4
 
data/Rakefile CHANGED
@@ -17,8 +17,8 @@ task "cleanup_readme" => ["rdoc"]
17
17
 
18
18
  gem_spec = Gem::Specification.new do |s|
19
19
  s.name = 'scrubyt'
20
- s.version = '0.3.4'
21
- s.summary = 'A powerful Web-scraping framework built on Mechanize and Hpricot'
20
+ s.version = '0.4.1'
21
+ s.summary = 'A powerful Web-scraping framework built on Mechanize and Hpricot (and FireWatir)'
22
22
  s.description = %{scRUBYt! is an easy to learn and use, yet powerful and effective web scraping framework. It's most interesting part is a Web-scraping DSL built on HPricot and WWW::Mechanize, which allows to navigate to the page of interest, then extract and query data records with a few lines of code. It is hard to describe scRUBYt! in a few sentences - you have to see it for yourself!}
23
23
  # Files containing Test::Unit test cases.
24
24
  s.test_files = FileList['test/unittests/**/*']
@@ -29,12 +29,7 @@ gem_spec = Gem::Specification.new do |s|
29
29
  s.homepage = 'http://www.scrubyt.org'
30
30
  s.add_dependency('hpricot', '>= 0.5')
31
31
  s.add_dependency('mechanize', '>= 0.6.3')
32
- s.add_dependency('ParseTreeReloaded')
33
- s.add_dependency('RubyInlineAcceleration')
34
- s.add_dependency('RubyInline', '= 3.6.3')
35
- s.add_dependency('ParseTree', '= 1.7.1')
36
- s.add_dependency('ruby2ruby', '= 1.1.6')
37
- #s.has_rdoc = 'true'
32
+ s.has_rdoc = 'true'
38
33
  end
39
34
 
40
35
  ###################################################
@@ -99,7 +94,7 @@ Rake::GemPackageTask.new(gem_spec) do |pkg|
99
94
  pkg.need_tar = false
100
95
  end
101
96
 
102
- #Rake::PackageTask.new('scrubyt-examples', '0.3.0') do |pkg|
97
+ #Rake::PackageTask.new('scrubyt-examples', '0.4.03') do |pkg|
103
98
  # pkg.need_zip = true
104
99
  # pkg.need_tar = true
105
100
  # pkg.package_files.include("examples/**/*")
@@ -1,62 +1,43 @@
1
- $KCODE = 'u'
2
- require 'jcode'
1
+ $KCODE = "u"
2
+ require "jcode"
3
3
 
4
4
  #ruby core
5
- require 'open-uri'
6
- require 'erb'
5
+ require "open-uri"
6
+ require "erb"
7
7
 
8
8
  #gems
9
- require 'rubygems'
10
- require 'mechanize'
11
- require 'hpricot'
12
- require 'parse_tree_reloaded'
13
- require 'rexml/text'
14
-
15
- #little hack to avoid that ruby2ruby tries to load the original parse_tree
16
- if Gem
17
- module Gem
18
- class << self
19
- alias_method :activate_orig, :activate
20
- def activate(gem, autorequire, *version_requirements)
21
- activate_orig(gem, autorequire, *version_requirements) unless gem.is_a?(Gem::Dependency) && gem.name == 'ParseTree'
22
- end
23
- end
24
- end
25
- end
26
- module Kernel
27
- alias_method :require_orig, :require
28
- def require(path)
29
- require_orig(path) unless path == 'parse_tree'
30
- end
31
- end
32
- require 'ruby2ruby'
9
+ require "rexml/text"
10
+ require "rubygems"
11
+ require "mechanize"
12
+ require "hpricot"
33
13
 
34
14
  #scrubyt
35
- require 'scrubyt/logging'
36
- require 'scrubyt/utils/ruby_extensions.rb'
37
- require 'scrubyt/utils/xpathutils.rb'
38
- require 'scrubyt/utils/shared_utils.rb'
39
- require 'scrubyt/utils/simple_example_lookup.rb'
40
- require 'scrubyt/utils/compound_example_lookup.rb'
41
- require 'scrubyt/core/scraping/constraint_adder.rb'
42
- require 'scrubyt/core/scraping/constraint.rb'
43
- require 'scrubyt/core/scraping/result_indexer.rb'
44
- require 'scrubyt/core/scraping/pre_filter_document.rb'
45
- require 'scrubyt/core/scraping/compound_example.rb'
46
- require 'scrubyt/output/result_node.rb'
47
- require 'scrubyt/output/scrubyt_result.rb'
48
- require 'scrubyt/output/export.rb'
49
- require 'scrubyt/core/navigation/navigation_actions.rb'
50
- require 'scrubyt/core/navigation/fetch_action.rb'
51
- require 'scrubyt/core/shared/extractor.rb'
52
- require 'scrubyt/core/scraping/filters/base_filter.rb'
53
- require 'scrubyt/core/scraping/filters/attribute_filter.rb'
54
- require 'scrubyt/core/scraping/filters/constant_filter.rb'
55
- require 'scrubyt/core/scraping/filters/script_filter.rb'
56
- require 'scrubyt/core/scraping/filters/text_filter.rb'
57
- require 'scrubyt/core/scraping/filters/detail_page_filter.rb'
58
- require 'scrubyt/core/scraping/filters/download_filter.rb'
59
- require 'scrubyt/core/scraping/filters/html_subtree_filter.rb'
60
- require 'scrubyt/core/scraping/filters/regexp_filter.rb'
61
- require 'scrubyt/core/scraping/filters/tree_filter.rb'
62
- require 'scrubyt/core/scraping/pattern.rb'
15
+ require "#{File.dirname(__FILE__)}/scrubyt/logging"
16
+ require "#{File.dirname(__FILE__)}/scrubyt/utils/ruby_extensions.rb"
17
+ require "#{File.dirname(__FILE__)}/scrubyt/utils/xpathutils.rb"
18
+ require "#{File.dirname(__FILE__)}/scrubyt/utils/shared_utils.rb"
19
+ require "#{File.dirname(__FILE__)}/scrubyt/utils/simple_example_lookup.rb"
20
+ require "#{File.dirname(__FILE__)}/scrubyt/utils/compound_example_lookup.rb"
21
+ require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/constraint_adder.rb"
22
+ require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/constraint.rb"
23
+ require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/result_indexer.rb"
24
+ require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/pre_filter_document.rb"
25
+ require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/compound_example.rb"
26
+ require "#{File.dirname(__FILE__)}/scrubyt/output/result_node.rb"
27
+ require "#{File.dirname(__FILE__)}/scrubyt/output/scrubyt_result.rb"
28
+ require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/agents/mechanize.rb"
29
+ require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/agents/firewatir.rb"
30
+ require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/navigation_actions.rb"
31
+ require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/fetch_action.rb"
32
+ require "#{File.dirname(__FILE__)}/scrubyt/core/shared/extractor.rb"
33
+ require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/base_filter.rb"
34
+ require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/attribute_filter.rb"
35
+ require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/constant_filter.rb"
36
+ require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/script_filter.rb"
37
+ require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/text_filter.rb"
38
+ require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/detail_page_filter.rb"
39
+ require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/download_filter.rb"
40
+ require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/html_subtree_filter.rb"
41
+ require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/regexp_filter.rb"
42
+ require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/tree_filter.rb"
43
+ require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/pattern.rb"
@@ -0,0 +1,249 @@
1
+ require 'rubygems'
2
+ require 'firewatir'
3
+ module Scrubyt
4
+ ##
5
+ #=<tt>Fetching pages (and related functionality)</tt>
6
+ #
7
+ #Since lot of things are happening during (and before)
8
+ #the fetching of a document, I decided to move out fetching related
9
+ #functionality to a separate class - so if you are looking for anything
10
+ #which is loading a document (even by submitting a form or clicking a link)
11
+ #and related things like setting a proxy etc. you should find it here.
12
+ module Navigation
13
+ module Firewatir
14
+
15
+ def self.included(base)
16
+ base.module_eval do
17
+ @@agent = FireWatir::Firefox.new
18
+ @@current_doc_url = nil
19
+ @@current_doc_protocol = nil
20
+ @@base_dir = nil
21
+ @@host_name = nil
22
+ @@history = []
23
+ @@current_form = nil
24
+ @@current_frame = nil
25
+
26
+ ##
27
+ #Action to fetch a document (either a file or a http address)
28
+ #
29
+ #*parameters*
30
+ #
31
+ #_doc_url_ - the url or file name to fetch
32
+ def self.fetch(doc_url, *args)
33
+ #Refactor this crap!!! with option_accessor stuff
34
+ if args.size > 0
35
+ mechanize_doc = args[0][:mechanize_doc]
36
+ resolve = args[0][:resolve]
37
+ basic_auth = args[0][:basic_auth]
38
+ #Refactor this whole stuff as well!!! It looks awful...
39
+ parse_and_set_basic_auth(basic_auth) if basic_auth
40
+ else
41
+ mechanize_doc = nil
42
+ resolve = :full
43
+ end
44
+
45
+ @@current_doc_url = doc_url
46
+ @@current_doc_protocol = determine_protocol
47
+ if mechanize_doc.nil?
48
+ handle_relative_path(doc_url) unless @@current_doc_protocol == 'xpath'
49
+ handle_relative_url(doc_url, resolve)
50
+ Scrubyt.log :ACTION, "fetching document: #{@@current_doc_url}"
51
+ case @@current_doc_protocol
52
+ when 'file': @@agent.goto("file://"+ @@current_doc_url)
53
+ else @@agent.goto(@@current_doc_url)
54
+ end
55
+ @@mechanize_doc = "<html>#{@@agent.html}</html>"
56
+ else
57
+ @@mechanize_doc = mechanize_doc
58
+ end
59
+ @@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(@@mechanize_doc))
60
+ store_host_name(@@agent.url) # in case we're on a new host
61
+ end
62
+
63
+ def self.frame(attribute, value)
64
+ if @@current_frame
65
+ @@current_frame.frame(attribute, value)
66
+ else
67
+ @@current_frame = @@agent.frame(attribute, value)
68
+ end
69
+ end
70
+
71
+ ##
72
+ #Submit the last form;
73
+ def self.submit(current_form, sleep_time=nil, button=nil, type=nil)
74
+ if @@current_frame
75
+ #BRUTAL hax but FW is such a shitty piece of software
76
+ #this sucks FAIL omg
77
+ @@current_frame.locate
78
+ form = Document.new(@@current_frame).all.find{|t| t.tagName=="FORM"}
79
+ form.submit
80
+ else
81
+ @@agent.element_by_xpath(@@current_form).submit
82
+ end
83
+
84
+ if sleep_time
85
+ sleep sleep_time
86
+ @@agent.wait
87
+ end
88
+
89
+ @@current_doc_url = @@agent.url
90
+ @@mechanize_doc = "<html>#{@@agent.html}</html>"
91
+ @@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(@@mechanize_doc))
92
+ end
93
+
94
+ ##
95
+ #Click the link specified by the text
96
+ def self.click_link(link_spec,index = 0,wait_secs=0)
97
+ Scrubyt.log :ACTION, "Clicking link specified by: %p" % link_spec
98
+ if link_spec.is_a?(Hash)
99
+ elem = XPathUtils.generate_XPath(CompoundExampleLookup.find_node_from_compund_example(@@hpricot_doc, link_spec, false, index), nil, true)
100
+ result_page = @@agent.element_by_xpath(elem).click
101
+ else
102
+ @@agent.link(:innerHTML, Regexp.escape(link_spec)).click
103
+ end
104
+ sleep(wait_secs) if wait_secs > 0
105
+ @@agent.wait
106
+ @@current_doc_url = @@agent.url
107
+ @@mechanize_doc = "<html>#{@@agent.html}</html>"
108
+ @@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(@@mechanize_doc))
109
+ Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
110
+ end
111
+
112
+ def self.click_by_xpath(xpath)
113
+ Scrubyt.log :ACTION, "Clicking by XPath : %p" % xpath
114
+ @@agent.element_by_xpath(xpath).click
115
+ @@agent.wait
116
+ @@current_doc_url = @@agent.url
117
+ @@mechanize_doc = "<html>#{@@agent.html}</html>"
118
+ @@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(@@mechanize_doc))
119
+ Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
120
+ end
121
+
122
+ def self.click_image_map(index = 0)
123
+ Scrubyt.log :ACTION, "Clicking image map at index: %p" % index
124
+ uri = @@mechanize_doc.search("//area")[index]['href']
125
+ result_page = @@agent.get(uri)
126
+ @@current_doc_url = result_page.uri.to_s
127
+ Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
128
+ fetch(@@current_doc_url, :mechanize_doc => result_page)
129
+ end
130
+
131
+ def self.store_host_name(doc_url)
132
+ @@host_name = doc_url.match(/.*\..*?\//)[0] if doc_url.match(/.*\..*?\//)
133
+ @@original_host_name ||= @@host_name
134
+ end #end of method store_host_name
135
+
136
+ def self.determine_protocol
137
+ old_protocol = @@current_doc_protocol
138
+ new_protocol = case @@current_doc_url
139
+ when /^\/\//
140
+ 'xpath'
141
+ when /^https/
142
+ 'https'
143
+ when /^http/
144
+ 'http'
145
+ when /^www\./
146
+ 'http'
147
+ else
148
+ 'file'
149
+ end
150
+ return 'http' if ((old_protocol == 'http') && new_protocol == 'file')
151
+ return 'https' if ((old_protocol == 'https') && new_protocol == 'file')
152
+ new_protocol
153
+ end
154
+
155
+ def self.parse_and_set_basic_auth(basic_auth)
156
+ login, pass = basic_auth.split('@')
157
+ Scrubyt.log :ACTION, "Basic authentication: login=<#{login}>, pass=<#{pass}>"
158
+ @@agent.basic_auth(login, pass)
159
+ end
160
+
161
+ def self.handle_relative_path(doc_url)
162
+ if @@base_dir == nil || doc_url[0..0] == "/"
163
+ @@base_dir = doc_url.scan(/.+\//)[0] if @@current_doc_protocol == 'file'
164
+ else
165
+ @@current_doc_url = ((@@base_dir + doc_url) if doc_url !~ /#{@@base_dir}/)
166
+ end
167
+ end
168
+
169
+ def self.handle_relative_url(doc_url, resolve)
170
+ return if doc_url =~ /^(http:|javascript:)/
171
+ if doc_url !~ /^\//
172
+ first_char = doc_url[0..0]
173
+ doc_url = ( first_char == '?' ? '' : '/' ) + doc_url
174
+ if first_char == '?' #This is an ugly hack... really have to throw this shit out and go with mechanize's
175
+ current_uri = @@mechanize_doc.uri.to_s
176
+ current_uri = @@agent.history.first.uri.to_s if current_uri =~ /\/popup\//
177
+ if (current_uri.include? '?')
178
+ current_uri = current_uri.scan(/.+\//)[0]
179
+ else
180
+ current_uri += '/' unless current_uri[-1..-1] == '/'
181
+ end
182
+ @@current_doc_url = current_uri + doc_url
183
+ return
184
+ end
185
+ end
186
+ case resolve
187
+ when :full
188
+ @@current_doc_url = (@@host_name + doc_url) if ( @@host_name != nil && (doc_url !~ /#{@@host_name}/))
189
+ @@current_doc_url = @@current_doc_url.split('/').uniq.join('/')
190
+ when :host
191
+ base_host_name = (@@host_name.count("/") == 2 ? @@host_name : @@host_name.scan(/(http.+?\/\/.+?)\//)[0][0])
192
+ @@current_doc_url = base_host_name + doc_url
193
+ else
194
+ #custom resolving
195
+ @@current_doc_url = resolve + doc_url
196
+ end
197
+ end
198
+
199
+ def self.fill_textfield(textfield_name, query_string, wait_secs, useValue)
200
+ @@current_form = "//input[@name='#{textfield_name}']/ancestor::form"
201
+ target = @@current_frame || @@agent
202
+ if useValue
203
+ target.text_field(:name,textfield_name).value = query_string
204
+ else
205
+ target.text_field(:name,textfield_name).set(query_string)
206
+ end
207
+ sleep(wait_secs) if wait_secs > 0
208
+ @@mechanize_doc = "<html>#{@@agent.html}</html>"
209
+ @@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(@@mechanize_doc))
210
+
211
+ end
212
+
213
+ ##
214
+ #Action to fill a textarea with text
215
+ def self.fill_textarea(textarea_name, text)
216
+ @@current_form = "//input[@name='#{textarea_name}']/ancestor::form"
217
+ @@agent.text_field(:name,textarea_name).set(text)
218
+ end
219
+
220
+ ##
221
+ #Action for selecting an option from a dropdown box
222
+ def self.select_option(selectlist_name, option)
223
+ @@current_form = "//select[@name='#{selectlist_name}']/ancestor::form"
224
+ @@agent.select_list(:name,selectlist_name).select(option)
225
+ end
226
+
227
+ def self.check_checkbox(checkbox_name)
228
+ @@current_form = "//input[@name='#{checkbox_name}']/ancestor::form"
229
+ @@agent.checkbox(:name,checkbox_name).set(true)
230
+ end
231
+
232
+ def self.check_radiobutton(checkbox_name, index=0)
233
+ @@current_form = "//input[@name='#{checkbox_name}']/ancestor::form"
234
+ @@agent.elements_by_xpath("//input[@name='#{checkbox_name}']")[index].set
235
+ end
236
+
237
+ def self.click_image_map(index=0)
238
+ raise 'NotImplemented'
239
+ end
240
+
241
+ def self.wait(time=1)
242
+ sleep(time)
243
+ @@agent.wait
244
+ end
245
+ end
246
+ end
247
+ end
248
+ end
249
+ end
@@ -0,0 +1,253 @@
1
+ require 'rubygems'
2
+ require 'mechanize'
3
+ module Scrubyt
4
+ ##
5
+ #=<tt>Fetching pages (and related functionality)</tt>
6
+ #
7
+ #Since lot of things are happening during (and before)
8
+ #the fetching of a document, I decided to move out fetching related
9
+ #functionality to a separate class - so if you are looking for anything
10
+ #which is loading a document (even by submitting a form or clicking a link)
11
+ #and related things like setting a proxy etc. you should find it here.
12
+ module Navigation
13
+ module Mechanize
14
+
15
+ def self.included(base)
16
+ base.module_eval do
17
+ @@agent = WWW::Mechanize.new
18
+ @@current_doc_url = nil
19
+ @@current_doc_protocol = nil
20
+ @@base_dir = nil
21
+ @@host_name = nil
22
+ @@history = []
23
+
24
+ ##
25
+ #Action to fetch a document (either a file or a http address)
26
+ #
27
+ #*parameters*
28
+ #
29
+ #_doc_url_ - the url or file name to fetch
30
+ def self.fetch(doc_url, *args)
31
+ #Refactor this crap!!! with option_accessor stuff
32
+
33
+ if args.size > 0
34
+ mechanize_doc = args[0][:mechanize_doc]
35
+ html = args[0][:html]
36
+ resolve = args[0][:resolve]
37
+ basic_auth = args[0][:basic_auth]
38
+ parse_and_set_basic_auth(basic_auth) if basic_auth
39
+ if html
40
+ @@current_doc_protocol = 'string'
41
+ mechanize_doc = page = WWW::Mechanize::Page.new(nil, {'content-type' => 'text/html'}, html)
42
+ end
43
+ else
44
+ mechanize_doc = nil
45
+ resolve = :full
46
+ end
47
+
48
+ @@current_doc_url = doc_url
49
+ @@current_doc_protocol ||= determine_protocol
50
+
51
+ if mechanize_doc.nil? && @@current_doc_protocol != 'file'
52
+ handle_relative_path(doc_url)
53
+ handle_relative_url(doc_url, resolve)
54
+ Scrubyt.log :ACTION, "fetching document: #{@@current_doc_url}"
55
+
56
+ unless 'file' == @@current_doc_protocol
57
+ @@mechanize_doc = @@agent.get(@@current_doc_url)
58
+ end
59
+ else
60
+ @@mechanize_doc = mechanize_doc
61
+ end
62
+
63
+ if @@current_doc_protocol == 'file'
64
+ @@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(open(@@current_doc_url).read))
65
+ else
66
+ @@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(@@mechanize_doc.body))
67
+ store_host_name(self.get_current_doc_url) if self.get_current_doc_url # in case we're on a new host
68
+ end
69
+ end
70
+
71
+ ##
72
+ #Submit the last form;
73
+ def self.submit(index=nil, sleep_time=nil, type=nil)
74
+ Scrubyt.log :ACTION, 'Submitting form...'
75
+ if index == nil
76
+ result_page = @@agent.submit(@@current_form)
77
+ process_submit(@@current_form)
78
+ #----- added by nickmerwin@gmail.com -----
79
+ elsif index.class == String && !type.nil?
80
+ button = @@current_form.buttons.detect{|b| b.name == index}
81
+ result_page = @@current_form.submit(button)
82
+ process_submit(@@current_form, button,type)
83
+ #-----------------------------------------
84
+ else
85
+ result_page = @@agent.submit(@@current_form, @@current_form.buttons[index])
86
+ end
87
+ @@current_doc_url = result_page.uri.to_s
88
+ Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
89
+ fetch(@@current_doc_url, :mechanize_doc => result_page)
90
+ end
91
+
92
+ ##
93
+ #Click the link specified by the text
94
+ def self.click_link(link_spec,index = 0,wait_secs=0)
95
+ Scrubyt.log :ACTION, "Clicking link specified by: %p" % link_spec
96
+ if link_spec.is_a? Hash
97
+ clicked_elem = CompoundExampleLookup.find_node_from_compund_example(@@hpricot_doc, link_spec, false, index)
98
+ else
99
+ clicked_elem = SimpleExampleLookup.find_node_from_text(@@hpricot_doc, link_spec, false, index)
100
+ end
101
+ clicked_elem = XPathUtils.find_nearest_node_with_attribute(clicked_elem, 'href')
102
+ result_page = @@agent.click(clicked_elem)
103
+ @@current_doc_url = result_page.uri.to_s
104
+ Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
105
+ fetch(@@current_doc_url, :mechanize_doc => result_page)
106
+ end
107
+
108
+ def self.click_image_map(index = 0)
109
+ Scrubyt.log :ACTION, "Clicking image map at index: %p" % index
110
+ uri = @@mechanize_doc.search("//area")[index]['href']
111
+ result_page = @@agent.get(uri)
112
+ @@current_doc_url = result_page.uri.to_s
113
+ Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
114
+ fetch(@@current_doc_url, :mechanize_doc => result_page)
115
+ end
116
+
117
+ def self.store_host_name(doc_url)
118
+ @@host_name = 'http://' + @@mechanize_doc.uri.to_s.match(%r{http://(.+?)/+})[0] if @@current_doc_protocol == 'http'
119
+ @@host_name = 'https://' + @@mechanize_doc.uri.to_s.match(%r{https://(.+?)/+})[0] if @@current_doc_protocol == 'https'
120
+ @@host_name = doc_url if @@host_name == nil
121
+ @@host_name = @@host_name[0..-2] if @@host_name[-1].chr == '/'
122
+ @@original_host_name ||= @@host_name
123
+ end #end of method store_host_name
124
+
125
+ def self.determine_protocol
126
+ old_protocol = @@current_doc_protocol
127
+ new_protocol = case @@current_doc_url
128
+ when /^https/
129
+ 'https'
130
+ when /^http/
131
+ 'http'
132
+ when /^www/
133
+ 'http'
134
+ else
135
+ 'file'
136
+ end
137
+ return 'http' if ((old_protocol == 'http') && new_protocol == 'file')
138
+ return 'https' if ((old_protocol == 'https') && new_protocol == 'file')
139
+ new_protocol
140
+ end
141
+
142
+ def self.handle_relative_path(doc_url)
143
+ if @@base_dir == nil
144
+ @@base_dir = doc_url.scan(/.+\//)[0] if @@current_doc_protocol == 'file'
145
+ else
146
+ @@current_doc_url = ((@@base_dir + doc_url) if doc_url !~ /#{@@base_dir}/)
147
+ end
148
+ end
149
+
150
+ def self.handle_relative_url(doc_url, resolve)
151
+ return if doc_url =~ /^http/
152
+ if doc_url !~ /^\//
153
+ first_char = doc_url[0..0]
154
+ doc_url = ( first_char == '?' ? '' : '/' ) + doc_url
155
+ if first_char == '?' #This is an ugly hack... really have to throw this shit out and go with mechanize's
156
+ current_uri = @@mechanize_doc.uri.to_s
157
+ current_uri = @@agent.history.first.uri.to_s if current_uri =~ /\/popup\//
158
+ if (current_uri.include? '?')
159
+ current_uri = current_uri.scan(/.+\//)[0]
160
+ else
161
+ current_uri += '/' unless current_uri[-1..-1] == '/'
162
+ end
163
+ @@current_doc_url = current_uri + doc_url
164
+ return
165
+ end
166
+ end
167
+ case resolve
168
+ when :full
169
+ @@current_doc_url = (@@host_name + doc_url) if ( @@host_name != nil && (doc_url !~ /#{@@host_name}/))
170
+ @@current_doc_url = @@current_doc_url.split('/').uniq.join('/')
171
+ when :host
172
+ base_host_name = (@@host_name.count("/") == 2 ? @@host_name : @@host_name.scan(/(http.+?\/\/.+?)\//)[0][0])
173
+ @@current_doc_url = base_host_name + doc_url
174
+ else
175
+ #custom resilving
176
+ @@current_doc_url = resolve + doc_url
177
+ end
178
+ end
179
+
180
+ def self.fill_textfield(textfield_name, query_string, *unused)
181
+ lookup_form_for_tag('input','textfield',textfield_name,query_string)
182
+ eval("@@current_form['#{textfield_name}'] = '#{query_string}'")
183
+ end
184
+
185
+ ##
186
+ #Action to fill a textarea with text
187
+ def self.fill_textarea(textarea_name, text)
188
+ lookup_form_for_tag('textarea','textarea',textarea_name,text)
189
+ eval("@@current_form['#{textarea_name}'] = '#{text}'")
190
+ end
191
+
192
+ ##
193
+ #Action for selecting an option from a dropdown box
194
+ def self.select_option(selectlist_name, option)
195
+ lookup_form_for_tag('select','select list',selectlist_name,option)
196
+ select_list = @@current_form.fields.find {|f| f.name == selectlist_name}
197
+ searched_option = select_list.options.find{|f| f.text.strip == option}
198
+ searched_option.click
199
+ end
200
+
201
+ def self.check_checkbox(checkbox_name)
202
+ lookup_form_for_tag('input','checkbox',checkbox_name, '')
203
+ @@current_form.checkboxes.name(checkbox_name).check
204
+ end
205
+
206
+ def self.check_radiobutton(checkbox_name, index=0)
207
+ lookup_form_for_tag('input','radiobutton',checkbox_name, '',index)
208
+ @@current_form.radiobuttons.name(checkbox_name)[index].check
209
+ end
210
+
211
+ #private
212
+ def self.process_submit(current_form, button=nil, type=nil)
213
+ if button == nil
214
+ result_page = @@agent.submit(current_form)
215
+ elsif type
216
+ result_page = current_form.submit(button)
217
+ else
218
+ result_page = @@agent.submit(current_form, button)
219
+ end
220
+ @@current_doc_url = result_page.uri.to_s
221
+ Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
222
+ fetch(@@current_doc_url, :mechanize_doc => result_page)
223
+ end
224
+
225
+ def self.lookup_form_for_tag(tag, widget_name, name_attribute, query_string, index=0)
226
+ Scrubyt.log :ACTION, "typing #{query_string} into the #{widget_name} named '#{name_attribute}'"
227
+ widget = (FetchAction.get_hpricot_doc/"#{tag}[@name=#{name_attribute}]").map()[index]
228
+ form_tag = Scrubyt::XPathUtils.traverse_up_until_name(widget, 'form')
229
+ find_form_based_on_tag(form_tag, ['name', 'id', 'action'])
230
+ end
231
+
232
+ def self.find_form_based_on_tag(tag, possible_attrs)
233
+ lookup_attribute_name = nil
234
+ lookup_attribute_value = nil
235
+
236
+ possible_attrs.each { |a|
237
+ lookup_attribute_name = a
238
+ lookup_attribute_value = tag.attributes[a]
239
+ break if lookup_attribute_value != nil
240
+ }
241
+ i = 0
242
+ loop do
243
+ @@current_form = FetchAction.get_mechanize_doc.forms[i]
244
+ return nil if @@current_form == nil
245
+ break if @@current_form.form_node.attributes[lookup_attribute_name] == lookup_attribute_value
246
+ i+= 1
247
+ end
248
+ end
249
+ end
250
+ end
251
+ end
252
+ end
253
+ end