sutch-scrubyt 0.4.20

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. data/CHANGELOG +350 -0
  2. data/COPYING +340 -0
  3. data/README +121 -0
  4. data/Rakefile +101 -0
  5. data/lib/scrubyt.rb +45 -0
  6. data/lib/scrubyt/core/navigation/agents/firewatir.rb +253 -0
  7. data/lib/scrubyt/core/navigation/agents/mechanize.rb +289 -0
  8. data/lib/scrubyt/core/navigation/fetch_action.rb +54 -0
  9. data/lib/scrubyt/core/navigation/navigation_actions.rb +95 -0
  10. data/lib/scrubyt/core/scraping/compound_example.rb +30 -0
  11. data/lib/scrubyt/core/scraping/constraint.rb +169 -0
  12. data/lib/scrubyt/core/scraping/constraint_adder.rb +49 -0
  13. data/lib/scrubyt/core/scraping/filters/attribute_filter.rb +14 -0
  14. data/lib/scrubyt/core/scraping/filters/base_filter.rb +112 -0
  15. data/lib/scrubyt/core/scraping/filters/constant_filter.rb +9 -0
  16. data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +37 -0
  17. data/lib/scrubyt/core/scraping/filters/download_filter.rb +64 -0
  18. data/lib/scrubyt/core/scraping/filters/html_subtree_filter.rb +9 -0
  19. data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +13 -0
  20. data/lib/scrubyt/core/scraping/filters/script_filter.rb +11 -0
  21. data/lib/scrubyt/core/scraping/filters/text_filter.rb +34 -0
  22. data/lib/scrubyt/core/scraping/filters/tree_filter.rb +138 -0
  23. data/lib/scrubyt/core/scraping/pattern.rb +359 -0
  24. data/lib/scrubyt/core/scraping/pre_filter_document.rb +14 -0
  25. data/lib/scrubyt/core/scraping/result_indexer.rb +90 -0
  26. data/lib/scrubyt/core/shared/extractor.rb +168 -0
  27. data/lib/scrubyt/logging.rb +154 -0
  28. data/lib/scrubyt/output/post_processor.rb +139 -0
  29. data/lib/scrubyt/output/result.rb +44 -0
  30. data/lib/scrubyt/output/result_dumper.rb +154 -0
  31. data/lib/scrubyt/output/result_node.rb +140 -0
  32. data/lib/scrubyt/output/scrubyt_result.rb +42 -0
  33. data/lib/scrubyt/utils/compound_example_lookup.rb +50 -0
  34. data/lib/scrubyt/utils/ruby_extensions.rb +85 -0
  35. data/lib/scrubyt/utils/shared_utils.rb +58 -0
  36. data/lib/scrubyt/utils/simple_example_lookup.rb +40 -0
  37. data/lib/scrubyt/utils/xpathutils.rb +202 -0
  38. data/test/blackbox_test.rb +60 -0
  39. data/test/blackbox_tests/basic/multi_root.rb +6 -0
  40. data/test/blackbox_tests/basic/simple.rb +5 -0
  41. data/test/blackbox_tests/detail_page/one_detail_page.rb +9 -0
  42. data/test/blackbox_tests/detail_page/two_detail_pages.rb +9 -0
  43. data/test/blackbox_tests/next_page/next_page_link.rb +7 -0
  44. data/test/blackbox_tests/next_page/page_list_links.rb +7 -0
  45. metadata +117 -0
data/README ADDED
@@ -0,0 +1,121 @@
+ = scRUBYt! - Hpricot and Mechanize (or FireWatir) on steroids
+
+ An easy to learn and use, yet very powerful web extraction framework written in Ruby. Navigate through the Web, then
+ extract, query, transform and save relevant data from the Web page of your interest with a concise and easy to use DSL.
+
+
+ Do you think that Mechanize and Hpricot are powerful libraries? You're right, they are, indeed - hats off to their
+ authors: without these libs scRUBYt! could not exist now! I have been wondering whether their functionality could be
+ enhanced still further - so I took these two powerful ingredients, threw in a handful of smart heuristics, wrapped them
+ in a chunky DSL coating and sprinkled the whole thing with lots of convention over configuration(tm) goodies
+ - and ... enter scRUBYt! and decide for yourself.
+
+ = Wait... why do we need one more web-scraping toolkit?
+
+ After all, we have Hpricot, and Rubyful-soup, and Mechanize, and scrAPI, and ARIEL and scrapes and ...
+ Well, because scRUBYt! is different. It has an entirely different philosophy, underlying techniques, theoretical
+ background, use cases, todo list, real-life scenarios etc. - in short, it should be used in different situations with
+ different requirements than the previously mentioned ones.
+
+ If you need something quick and/or would like to have maximal control over the scraping process, I recommend Hpricot.
+ Mechanize shines when it comes to interaction with Web pages. Since scRUBYt! operates on XPaths, sometimes you
+ will choose scrAPI because CSS selectors better suit your needs. The list goes on and on, boiling down to the good
+ old mantra: use the right tool for the right job!
+
+ I hope there will also be times when you want to experiment with Pandora's box and reach for the power of
+ scRUBYt! :-)
+
+ = Sounds fine - show me an example!
+
+ Let's apply the "show, don't tell" principle. Okay, here we go:
+
+ <tt>ebay_data = Scrubyt::Extractor.define do</tt>
+
+   fetch          'http://www.ebay.com/'
+   fill_textfield 'satitle', 'ipod'
+   submit
+   click_link     'Apple iPod'
+
+   record do
+     item_name 'APPLE NEW IPOD MINI 6GB MP3 PLAYER SILVER'
+     price     '$71.99'
+   end
+   next_page 'Next >', :limit => 5
+
+ <tt>end</tt>
+
+ output:
+
+ <tt><root></tt>
+   <record>
+     <item_name>APPLE IPOD NANO 4GB - PINK - MP3 PLAYER</item_name>
+     <price>$149.95</price>
+   </record>
+   <record>
+     <item_name>APPLE IPOD 30GB BLACK VIDEO/PHOTO/MP3 PLAYER</item_name>
+     <price>$172.50</price>
+   </record>
+   <record>
+     <item_name>NEW APPLE IPOD NANO 4GB PINK MP3 PLAYER</item_name>
+     <price>$171.06</price>
+   </record>
+   <!-- another 200+ results -->
+ <tt></root></tt>
+
+ This was a relatively beginner-level example (scRUBYt! knows a lot more than this and there are much more complicated
+ extractors than the one above) - yet it did a lot of things automagically. First of all,
+ it automatically loaded the page of interest (by going to ebay.com, automatically searching for ipods
+ and narrowing down the results by clicking on 'Apple iPod'), then it extracted *all* the items that
+ looked like the specified example (which, btw, also describes how the output structure should look) - on the first 5
+ result pages. Not so bad for about 10 lines of code, eh?
+
+ = OK, OK, I believe you, what should I do?
+
+ You can find everything you will need at these addresses (or if not, I doubt you will find it elsewhere...). See the
+ next section about installation, and after installing be sure to check out these URLs:
+
+ * <a href='http://www.rubyrailways.com'>rubyrailways.com</a> - for some theory; if you would like to take a sneak peek
+   at web scraping in general and/or you would like to understand what's going on under the hood, check out <a
+   href='http://www.rubyrailways.com/data-extraction-for-web-20-screen-scraping-in-rubyrails'>this article about
+   web-scraping</a>!
+ * <a href='http://scrubyt.org'>http://scrubyt.org</a> - your source of tutorials, howtos, news etc.
+ * <a href='http://scrubyt.rubyforge.org'>scrubyt.rubyforge.org</a> - for an up-to-date, online Rdoc
+ * <a href='http://projects.rubyforge.org/scrubyt'>projects.rubyforge.org/scrubyt</a> - for developer info, including
+   open and closed bugs, files etc.
+ * projects.rubyforge.org/scrubyt/files... - a fair amount (still growing with every release) of examples, showcasing
+   the features of scRUBYt!
+ * planned: public extractor repository - hopefully (after people realize how great this package is :-)) scRUBYt! will
+   have a community, and people will upload their extractors for whatever reason
+
+ If you still can't find something here, drop a mail to the guys at scrubyt@/NO-SPAM/scrubyt.org!
+
+ = How to install
+
+ scRUBYt! requires these packages to be installed:
+
+ * Ruby 1.8.4
+ * Hpricot 0.5
+ * Mechanize 0.6.3
+
+ I assume you have ruby and rubygems installed. To install WWW::Mechanize 0.6.3 or higher, just run
+
+ <tt>sudo gem install mechanize</tt>
+
+ Hpricot 0.5 is just hot off the frying pan - perfect timing, _why! - install it with
+
+ <tt>sudo gem install hpricot</tt>
+
+ Once all the dependencies (Mechanize and Hpricot) are up and running, you can install scrubyt with
+
+ <tt>sudo gem install scrubyt</tt>
+
+ If you encounter any problems, drop a mail to the guys at scrubyt@/NO-SPAM/scrubyt.org!
+
+ = Author
+
+ Copyright (c) 2006 by Peter Szinek (peter@/NO-SPAM/rubyrailways.com)
+
+ = Copyright
+
+ This library is distributed under the GPL. Please see the COPYING file.
+
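Note: the README's example depends on live eBay navigation. As a minimal sketch of the same DSL shape against a single fetched page - the URL and example strings here are hypothetical, and ScrubytResult#to_xml is assumed to serialize the result as in the scrubyt tutorials - an extractor can be as small as:

  require 'rubygems'
  require 'scrubyt'

  # Hypothetical target page and example values; only the DSL calls
  # (Extractor.define, fetch, nested pattern blocks) come from this gem.
  # A bare method call inside the block (record, title, price) declares
  # a pattern named after it, seeded with an example string from the page.
  data = Scrubyt::Extractor.define do
    fetch 'http://www.example.com/products.html'

    record do
      title 'A product name copied verbatim from the page'
      price '$9.99'
    end
  end

  puts data.to_xml  # assumed serialization, as used in the scrubyt docs
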
data/Rakefile ADDED
@@ -0,0 +1,101 @@
+ require 'rake/rdoctask'
+ require 'rake/testtask'
+ require 'rake/gempackagetask'
+ require 'rake/packagetask'
+
+ ###################################################
+ # Dependencies
+ ###################################################
+
+ task "default" => ["test_all"]
+ task "generate_rdoc" => ["cleanup_readme"]
+ task "cleanup_readme" => ["rdoc"]
+
+ ###################################################
+ # Gem specification
+ ###################################################
+
+ gem_spec = Gem::Specification.new do |s|
+   s.name = 'scrubyt'
+   s.version = '0.4.20'
+   s.summary = 'A powerful Web-scraping framework built on Mechanize and Hpricot (and FireWatir)'
+   s.description = %{scRUBYt! is an easy to learn and use, yet powerful and effective web scraping framework. Its most interesting part is a Web-scraping DSL built on Hpricot and WWW::Mechanize, which allows you to navigate to the page of interest, then extract and query data records with a few lines of code. It is hard to describe scRUBYt! in a few sentences - you have to see it for yourself!}
+   # Files containing Test::Unit test cases.
+   s.test_files = FileList['test/unittests/**/*']
+   # List of other files to be included.
+   s.files = FileList['COPYING', 'README', 'CHANGELOG', 'Rakefile', 'lib/**/*.rb']
+   s.author = 'Peter Szinek'
+   s.email = 'peter@rubyrailways.com'
+   s.homepage = 'http://www.scrubyt.org'
+   s.add_dependency('hpricot', '>= 0.5')
+   s.add_dependency('mechanize', '>= 0.6.3')
+   s.has_rdoc = 'true'
+ end
+
+ ###################################################
+ # Tasks
+ ###################################################
+
+ Rake::RDocTask.new do |generate_rdoc|
+   files = ['lib/**/*.rb', 'README', 'CHANGELOG']
+   generate_rdoc.rdoc_files.add(files)
+   generate_rdoc.main = "README" # page to start on
+   generate_rdoc.title = "Scrubyt Documentation"
+   generate_rdoc.template = "resources/allison/allison.rb"
+   generate_rdoc.rdoc_dir = 'doc' # rdoc output folder
+   generate_rdoc.options << '--line-numbers' << '--inline-source'
+ end
+
+ Rake::TestTask.new(:test_all) do |task|
+   task.pattern = 'test/*_test.rb'
+ end
+
+ Rake::TestTask.new(:test_blackbox) do |task|
+   task.test_files = ['test/blackbox_test.rb']
+ end
+
+ task "test_specific" do
+   ruby "test/blackbox_test.rb #{ARGV[1]}"
+ end
+
+ Rake::TestTask.new(:test_non_blackbox) do |task|
+   task.test_files = FileList['test/*_test.rb'] - ['test/blackbox_test.rb']
+ end
+
+ task "rcov" do
+   sh 'rcov --xrefs test/*.rb'
+   puts 'Report done.'
+ end
+
+ task "cleanup_readme" do
+   puts "Cleaning up README..."
+   readme_in = open('./doc/files/README.html')
+   content = readme_in.read
+   content.sub!('<h1 id="item_name">File: README</h1>','')
+   content.sub!('<h1>Description</h1>','')
+   readme_in.close
+   open('./doc/files/README.html', 'w') {|f| f.write(content)}
+   #OK, this is ugly as hell and as non-DRY as possible, but
+   #I don't have time to deal with it right now
+   puts "Cleaning up CHANGELOG..."
+   readme_in = open('./doc/files/CHANGELOG.html')
+   content = readme_in.read
+   content.sub!('<h1 id="item_name">File: CHANGELOG</h1>','')
+   content.sub!('<h1>Description</h1>','')
+   readme_in.close
+   open('./doc/files/CHANGELOG.html', 'w') {|f| f.write(content)}
+ end
+
+ task "generate_rdoc" do
+ end
+
+ Rake::GemPackageTask.new(gem_spec) do |pkg|
+   pkg.need_zip = false
+   pkg.need_tar = false
+ end
+
+ #Rake::PackageTask.new('scrubyt-examples', '0.4.03') do |pkg|
+ #  pkg.need_zip = true
+ #  pkg.need_tar = true
+ #  pkg.package_files.include("examples/**/*")
+ #end
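Note: the cleanup_readme task's own comment concedes it is not DRY. A sketch of the same post-processing factored through a helper (strip_rdoc_headers is a hypothetical name, not part of this gem) could look like:

  # Hypothetical helper: applies the same two substitutions the original
  # task performs, once per generated HTML file.
  def strip_rdoc_headers(path, title)
    content = File.read(path)
    content.sub!(%{<h1 id="item_name">File: #{title}</h1>}, '')
    content.sub!('<h1>Description</h1>', '')
    File.open(path, 'w') { |f| f.write(content) }
  end

  task "cleanup_readme" do
    %w[README CHANGELOG].each do |title|
      puts "Cleaning up #{title}..."
      strip_rdoc_headers("./doc/files/#{title}.html", title)
    end
  end
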
data/lib/scrubyt.rb ADDED
@@ -0,0 +1,45 @@
+ if RUBY_VERSION < '1.9'
+   $KCODE = "u"
+   require "jcode"
+ end
+
+ #ruby core
+ require "open-uri"
+ require "erb"
+
+ #gems
+ require "rexml/text"
+ require "rubygems"
+ require "mechanize"
+ require "hpricot"
+
+ #scrubyt
+ require "#{File.dirname(__FILE__)}/scrubyt/logging"
+ require "#{File.dirname(__FILE__)}/scrubyt/utils/ruby_extensions.rb"
+ require "#{File.dirname(__FILE__)}/scrubyt/utils/xpathutils.rb"
+ require "#{File.dirname(__FILE__)}/scrubyt/utils/shared_utils.rb"
+ require "#{File.dirname(__FILE__)}/scrubyt/utils/simple_example_lookup.rb"
+ require "#{File.dirname(__FILE__)}/scrubyt/utils/compound_example_lookup.rb"
+ require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/constraint_adder.rb"
+ require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/constraint.rb"
+ require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/result_indexer.rb"
+ require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/pre_filter_document.rb"
+ require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/compound_example.rb"
+ require "#{File.dirname(__FILE__)}/scrubyt/output/result_node.rb"
+ require "#{File.dirname(__FILE__)}/scrubyt/output/scrubyt_result.rb"
+ require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/agents/mechanize.rb"
+ require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/agents/firewatir.rb"
+ require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/navigation_actions.rb"
+ require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/fetch_action.rb"
+ require "#{File.dirname(__FILE__)}/scrubyt/core/shared/extractor.rb"
+ require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/base_filter.rb"
+ require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/attribute_filter.rb"
+ require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/constant_filter.rb"
+ require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/script_filter.rb"
+ require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/text_filter.rb"
+ require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/detail_page_filter.rb"
+ require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/download_filter.rb"
+ require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/html_subtree_filter.rb"
+ require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/regexp_filter.rb"
+ require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/tree_filter.rb"
+ require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/pattern.rb"
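Note: the require list repeats File.dirname(__FILE__) once per file. An equivalent refactor sketch - same files, same load order, not how the gem ships - would be:

  # Equivalent refactor sketch: require the same files in the same order.
  base = File.dirname(__FILE__)
  %w[ scrubyt/logging
      scrubyt/utils/ruby_extensions
      scrubyt/utils/xpathutils
      scrubyt/utils/shared_utils
      scrubyt/utils/simple_example_lookup
      scrubyt/utils/compound_example_lookup
      scrubyt/core/scraping/constraint_adder
      scrubyt/core/scraping/constraint
      scrubyt/core/scraping/result_indexer
      scrubyt/core/scraping/pre_filter_document
      scrubyt/core/scraping/compound_example
      scrubyt/output/result_node
      scrubyt/output/scrubyt_result
      scrubyt/core/navigation/agents/mechanize
      scrubyt/core/navigation/agents/firewatir
      scrubyt/core/navigation/navigation_actions
      scrubyt/core/navigation/fetch_action
      scrubyt/core/shared/extractor
      scrubyt/core/scraping/filters/base_filter
      scrubyt/core/scraping/filters/attribute_filter
      scrubyt/core/scraping/filters/constant_filter
      scrubyt/core/scraping/filters/script_filter
      scrubyt/core/scraping/filters/text_filter
      scrubyt/core/scraping/filters/detail_page_filter
      scrubyt/core/scraping/filters/download_filter
      scrubyt/core/scraping/filters/html_subtree_filter
      scrubyt/core/scraping/filters/regexp_filter
      scrubyt/core/scraping/filters/tree_filter
      scrubyt/core/scraping/pattern
  ].each { |file| require "#{base}/#{file}" }
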
data/lib/scrubyt/core/navigation/agents/firewatir.rb ADDED
@@ -0,0 +1,253 @@
+ require 'rubygems'
+ require 'firewatir'
+ module Scrubyt
+   ##
+   #=<tt>Fetching pages (and related functionality)</tt>
+   #
+   #Since a lot of things happen during (and before)
+   #the fetching of a document, I decided to move fetching-related
+   #functionality out to a separate class - so if you are looking for anything
+   #which loads a document (even by submitting a form or clicking a link)
+   #and related things like setting a proxy etc., you should find it here.
+   module Navigation
+     module Firewatir
+
+       def self.included(base)
+         base.module_eval do
+           @@agent = FireWatir::Firefox.new
+           @@current_doc_url = nil
+           @@current_doc_protocol = nil
+           @@base_dir = nil
+           @@host_name = nil
+           @@history = []
+           @@current_form = nil
+           @@current_frame = nil
+
+           ##
+           #Action to fetch a document (either a file or a http address)
+           #
+           #*parameters*
+           #
+           #_doc_url_ - the url or file name to fetch
+           def self.fetch(doc_url, *args)
+             #Refactor this crap!!! with option_accessor stuff
+             if args.size > 0
+               mechanize_doc = args[0][:mechanize_doc]
+               resolve = args[0][:resolve]
+               basic_auth = args[0][:basic_auth]
+               #Refactor this whole stuff as well!!! It looks awful...
+               parse_and_set_basic_auth(basic_auth) if basic_auth
+             else
+               mechanize_doc = nil
+               resolve = :full
+             end
+
+             @@current_doc_url = doc_url
+             @@current_doc_protocol = determine_protocol
+             if mechanize_doc.nil?
+               handle_relative_path(doc_url) unless @@current_doc_protocol == 'xpath'
+               handle_relative_url(doc_url, resolve)
+               Scrubyt.log :ACTION, "fetching document: #{@@current_doc_url}"
+               case @@current_doc_protocol
+               when 'file': @@agent.goto("file://" + @@current_doc_url)
+               else @@agent.goto(@@current_doc_url)
+               end
+               @@mechanize_doc = "<html>#{@@agent.html}</html>"
+             else
+               @@mechanize_doc = mechanize_doc
+             end
+             @@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(@@mechanize_doc))
+             store_host_name(@@agent.url) # in case we're on a new host
+           end
+
+           def self.frame(attribute, value)
+             if @@current_frame
+               @@current_frame.frame(attribute, value)
+             else
+               @@current_frame = @@agent.frame(attribute, value)
+             end
+           end
+
+           ##
+           #Submit the last form.
+           def self.submit(current_form, sleep_time=nil, button=nil, type=nil)
+             if @@current_frame
+               #BRUTAL hax but FW is such a shitty piece of software
+               #this sucks FAIL omg
+               @@current_frame.locate
+               form = Document.new(@@current_frame).all.find{|t| t.tagName=="FORM"}
+               form.submit
+             else
+               @@agent.element_by_xpath(@@current_form).submit
+             end
+
+             if sleep_time
+               sleep sleep_time
+               @@agent.wait
+             end
+
+             @@current_doc_url = @@agent.url
+             @@mechanize_doc = "<html>#{@@agent.html}</html>"
+             @@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(@@mechanize_doc))
+           end
+
+           ##
+           #Click the link specified by the text
+           def self.click_link(link_spec, index=0, wait_secs=0)
+             Scrubyt.log :ACTION, "Clicking link specified by: %p" % link_spec
+             if link_spec.is_a?(Hash)
+               elem = XPathUtils.generate_XPath(CompoundExampleLookup.find_node_from_compund_example(@@hpricot_doc, link_spec, false, index), nil, true)
+               result_page = @@agent.element_by_xpath(elem).click
+             else
+               @@agent.link(:innerHTML, Regexp.escape(link_spec)).click
+             end
+             sleep(wait_secs) if wait_secs > 0
+             @@agent.wait
+             @@current_doc_url = @@agent.url
+             @@mechanize_doc = "<html>#{@@agent.html}</html>"
+             @@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(@@mechanize_doc))
+             Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
+           end
+
+           def self.click_by_xpath(xpath)
+             Scrubyt.log :ACTION, "Clicking by XPath: %p" % xpath
+             @@agent.element_by_xpath(xpath).click
+             @@agent.wait
+             @@current_doc_url = @@agent.url
+             @@mechanize_doc = "<html>#{@@agent.html}</html>"
+             @@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(@@mechanize_doc))
+             Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
+           end
+
+           def self.click_image_map(index=0)
+             Scrubyt.log :ACTION, "Clicking image map at index: %p" % index
+             uri = @@mechanize_doc.search("//area")[index]['href']
+             result_page = @@agent.get(uri)
+             @@current_doc_url = result_page.uri.to_s
+             Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
+             fetch(@@current_doc_url, :mechanize_doc => result_page)
+           end
+
+           def self.store_host_name(doc_url)
+             @@host_name = doc_url.match(/.*\..*?\//)[0] if doc_url.match(/.*\..*?\//)
+             @@original_host_name ||= @@host_name
+           end #end of method store_host_name
+
+           def self.determine_protocol
+             old_protocol = @@current_doc_protocol
+             new_protocol = case @@current_doc_url
+                            when /^\/\//
+                              'xpath'
+                            when /^https/
+                              'https'
+                            when /^http/
+                              'http'
+                            when /^www\./
+                              'http'
+                            else
+                              'file'
+                            end
+             return 'http' if ((old_protocol == 'http') && new_protocol == 'file')
+             return 'https' if ((old_protocol == 'https') && new_protocol == 'file')
+             new_protocol
+           end
+
+           def self.parse_and_set_basic_auth(basic_auth)
+             login, pass = basic_auth.split('@')
+             Scrubyt.log :ACTION, "Basic authentication: login=<#{login}>, pass=<#{pass}>"
+             @@agent.basic_auth(login, pass)
+           end
+
+           def self.handle_relative_path(doc_url)
+             if @@base_dir == nil || doc_url[0..0] == "/"
+               @@base_dir = doc_url.scan(/.+\//)[0] if @@current_doc_protocol == 'file'
+             else
+               @@current_doc_url = ((@@base_dir + doc_url) if doc_url !~ /#{@@base_dir}/)
+             end
+           end
+
+           def self.handle_relative_url(doc_url, resolve)
+             return if doc_url =~ /^(http:|javascript:)/
+             if doc_url !~ /^\//
+               first_char = doc_url[0..0]
+               doc_url = (first_char == '?' ? '' : '/') + doc_url
+               if first_char == '?' #This is an ugly hack... really have to throw this shit out and go with mechanize's
+                 current_uri = @@mechanize_doc.uri.to_s
+                 current_uri = @@agent.history.first.uri.to_s if current_uri =~ /\/popup\//
+                 if (current_uri.include? '?')
+                   current_uri = current_uri.scan(/.+\//)[0]
+                 else
+                   current_uri += '/' unless current_uri[-1..-1] == '/'
+                 end
+                 @@current_doc_url = current_uri + doc_url
+                 return
+               end
+             end
+             case resolve
+             when :full
+               @@current_doc_url = (@@host_name + doc_url) if (@@host_name != nil && (doc_url !~ /#{@@host_name}/))
+               @@current_doc_url = @@current_doc_url.split('/').uniq.join('/')
+             when :host
+               base_host_name = (@@host_name.count("/") == 2 ? @@host_name : @@host_name.scan(/(http.+?\/\/.+?)\//)[0][0])
+               @@current_doc_url = base_host_name + doc_url
+             else
+               #custom resolving
+               @@current_doc_url = resolve + doc_url
+             end
+           end
+
+           def self.fill_textfield(textfield_name, query_string, wait_secs, useValue)
+             @@current_form = "//input[@name='#{textfield_name}']/ancestor::form"
+             target = @@current_frame || @@agent
+             if useValue
+               target.text_field(:name, textfield_name).value = query_string
+             else
+               target.text_field(:name, textfield_name).set(query_string)
+             end
+             sleep(wait_secs) if wait_secs > 0
+             @@mechanize_doc = "<html>#{@@agent.html}</html>"
+             @@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(@@mechanize_doc))
+
+           end
+
+           ##
+           #Action to fill a textarea with text
+           def self.fill_textarea(textarea_name, text)
+             @@current_form = "//input[@name='#{textarea_name}']/ancestor::form"
+             @@agent.text_field(:name, textarea_name).set(text)
+           end
+
+           ##
+           #Action for selecting an option from a dropdown box
+           def self.select_option(selectlist_name, option)
+             @@current_form = "//select[@name='#{selectlist_name}']/ancestor::form"
+             @@agent.select_list(:name, selectlist_name).select(option)
+           end
+
+           def self.check_checkbox(checkbox_name)
+             @@current_form = "//input[@name='#{checkbox_name}']/ancestor::form"
+             @@agent.checkbox(:name, checkbox_name).set(true)
+           end
+
+           def self.check_radiobutton(checkbox_name, index=0)
+             @@current_form = "//input[@name='#{checkbox_name}']/ancestor::form"
+             @@agent.elements_by_xpath("//input[@name='#{checkbox_name}']")[index].set
+           end
+
+           def self.click_image_map(index=0)
+             raise 'NotImplemented'
+           end
+
+           def self.wait(time=1)
+             sleep(time)
+             @@agent.wait
+           end
+
+           def self.close_firefox
+             @@agent.close
+           end
+         end
+       end
+     end
+   end
+ end
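Note: determine_protocol above classifies the current URL by prefix, with one subtlety - when a relative path (which falls through to 'file') follows an http or https fetch, the previous protocol is kept. A standalone distillation of that logic, stripped of the class-variable plumbing (classify_protocol is a hypothetical name, not part of the gem), behaves like this:

  # Distilled from determine_protocol: classify a URL string, carrying the
  # previous protocol forward when a relative path follows an http(s) fetch.
  def classify_protocol(url, previous = nil)
    protocol = case url
               when /^\/\//  then 'xpath'  # a leading // means an XPath, not a URL
               when /^https/ then 'https'
               when /^http/  then 'http'
               when /^www\./ then 'http'
               else 'file'
               end
    return previous if %w[http https].include?(previous.to_s) && protocol == 'file'
    protocol
  end

  classify_protocol('http://scrubyt.org')     # => "http"
  classify_protocol('search?q=ipod', 'http')  # => "http" (relative URL, protocol kept)
  classify_protocol('/tmp/page.html')         # => "file"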