sutch-scrubyt 0.4.20
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +350 -0
- data/COPYING +340 -0
- data/README +121 -0
- data/Rakefile +101 -0
- data/lib/scrubyt.rb +45 -0
- data/lib/scrubyt/core/navigation/agents/firewatir.rb +253 -0
- data/lib/scrubyt/core/navigation/agents/mechanize.rb +289 -0
- data/lib/scrubyt/core/navigation/fetch_action.rb +54 -0
- data/lib/scrubyt/core/navigation/navigation_actions.rb +95 -0
- data/lib/scrubyt/core/scraping/compound_example.rb +30 -0
- data/lib/scrubyt/core/scraping/constraint.rb +169 -0
- data/lib/scrubyt/core/scraping/constraint_adder.rb +49 -0
- data/lib/scrubyt/core/scraping/filters/attribute_filter.rb +14 -0
- data/lib/scrubyt/core/scraping/filters/base_filter.rb +112 -0
- data/lib/scrubyt/core/scraping/filters/constant_filter.rb +9 -0
- data/lib/scrubyt/core/scraping/filters/detail_page_filter.rb +37 -0
- data/lib/scrubyt/core/scraping/filters/download_filter.rb +64 -0
- data/lib/scrubyt/core/scraping/filters/html_subtree_filter.rb +9 -0
- data/lib/scrubyt/core/scraping/filters/regexp_filter.rb +13 -0
- data/lib/scrubyt/core/scraping/filters/script_filter.rb +11 -0
- data/lib/scrubyt/core/scraping/filters/text_filter.rb +34 -0
- data/lib/scrubyt/core/scraping/filters/tree_filter.rb +138 -0
- data/lib/scrubyt/core/scraping/pattern.rb +359 -0
- data/lib/scrubyt/core/scraping/pre_filter_document.rb +14 -0
- data/lib/scrubyt/core/scraping/result_indexer.rb +90 -0
- data/lib/scrubyt/core/shared/extractor.rb +168 -0
- data/lib/scrubyt/logging.rb +154 -0
- data/lib/scrubyt/output/post_processor.rb +139 -0
- data/lib/scrubyt/output/result.rb +44 -0
- data/lib/scrubyt/output/result_dumper.rb +154 -0
- data/lib/scrubyt/output/result_node.rb +140 -0
- data/lib/scrubyt/output/scrubyt_result.rb +42 -0
- data/lib/scrubyt/utils/compound_example_lookup.rb +50 -0
- data/lib/scrubyt/utils/ruby_extensions.rb +85 -0
- data/lib/scrubyt/utils/shared_utils.rb +58 -0
- data/lib/scrubyt/utils/simple_example_lookup.rb +40 -0
- data/lib/scrubyt/utils/xpathutils.rb +202 -0
- data/test/blackbox_test.rb +60 -0
- data/test/blackbox_tests/basic/multi_root.rb +6 -0
- data/test/blackbox_tests/basic/simple.rb +5 -0
- data/test/blackbox_tests/detail_page/one_detail_page.rb +9 -0
- data/test/blackbox_tests/detail_page/two_detail_pages.rb +9 -0
- data/test/blackbox_tests/next_page/next_page_link.rb +7 -0
- data/test/blackbox_tests/next_page/page_list_links.rb +7 -0
- metadata +117 -0
data/README
ADDED
@@ -0,0 +1,121 @@
= scRUBYt! - Hpricot and Mechanize (or FireWatir) on steroids

An easy to learn and use, yet very powerful web extraction framework written in Ruby. Navigate through the Web,
then extract, query, transform and save relevant data from the Web page of your interest with a concise and easy to use DSL.

Do you think that Mechanize and Hpricot are powerful libraries? You're right, they are, indeed - hats off to their
authors: without these libs scRUBYt! could not exist! I have been wondering whether their functionality could be
enhanced still further - so I took these two powerful ingredients, threw in a handful of smart heuristics, wrapped them
in a chunky DSL coating and sprinkled the whole thing with lots of convention-over-configuration(tm) goodies
- and... enter scRUBYt! Decide for yourself.

= Wait... why do we need one more web-scraping toolkit?

After all, we have Hpricot, and Rubyful-soup, and Mechanize, and scrAPI, and ARIEL and scrapes and...
Well, because scRUBYt! is different. It has an entirely different philosophy, underlying techniques, theoretical
background, use cases, todo list, real-life scenarios etc. - in short, it should be used in different situations, with
different requirements, than the previously mentioned ones.

If you need something quick and/or would like maximal control over the scraping process, I recommend Hpricot.
Mechanize shines when it comes to interaction with Web pages. Since scRUBYt! operates on XPaths, sometimes you
will choose scrAPI because CSS selectors suit your needs better. The list goes on and on, boiling down to the good
old mantra: use the right tool for the right job!

I hope there will also be times when you will want to experiment with Pandora's box and reach for the power of
scRUBYt! :-)

= Sounds fine - show me an example!

Let's apply the "show, don't tell" principle. Okay, here we go:

<tt>ebay_data = Scrubyt::Extractor.define do</tt>

  fetch          'http://www.ebay.com/'
  fill_textfield 'satitle', 'ipod'
  submit
  click_link     'Apple iPod'

  record do
    item_name 'APPLE NEW IPOD MINI 6GB MP3 PLAYER SILVER'
    price     '$71.99'
  end
  next_page 'Next >', :limit => 5

<tt>end</tt>

output:

<tt><root></tt>
  <record>
    <item_name>APPLE IPOD NANO 4GB - PINK - MP3 PLAYER</item_name>
    <price>$149.95</price>
  </record>
  <record>
    <item_name>APPLE IPOD 30GB BLACK VIDEO/PHOTO/MP3 PLAYER</item_name>
    <price>$172.50</price>
  </record>
  <record>
    <item_name>NEW APPLE IPOD NANO 4GB PINK MP3 PLAYER</item_name>
    <price>$171.06</price>
  </record>
  <!-- another 200+ results -->
<tt></root></tt>

This was a relatively beginner-level example (scRUBYt! knows a lot more than this, and there are much more complicated
extractors than the one above) - yet it did a lot of things automagically. First of all, it automatically loaded the
page of interest (by going to ebay.com, automatically searching for ipods and narrowing down the results by clicking
on 'Apple iPod'), then it extracted *all* the items that looked like the specified example (which, by the way, also
described what the output structure should look like) - on the first 5 result pages. Not bad for about 10 lines of
code, eh?

= OK, OK, I believe you, what should I do?

You can find everything you will need at these addresses (and if it's not there, I doubt you will find it
elsewhere...). See the next section about installation, and after installing be sure to check out these URLs:

* <a href='http://www.rubyrailways.com'>rubyrailways.com</a> - for some theory; if you would like to take a sneak peek
  at web scraping in general and/or you would like to understand what's going on under the hood, check out <a
  href='http://www.rubyrailways.com/data-extraction-for-web-20-screen-scraping-in-rubyrails'>this article about
  web-scraping</a>!
* <a href='http://scrubyt.org'>http://scrubyt.org</a> - your source of tutorials, howtos, news etc.
* <a href='http://scrubyt.rubyforge.org'>scrubyt.rubyforge.org</a> - for an up-to-date, online RDoc
* <a href='http://projects.rubyforge.org/scrubyt'>projects.rubyforge.org/scrubyt</a> - for developer info, including
  open and closed bugs, files etc.
* projects.rubyforge.org/scrubyt/files... - a fair amount (still growing with every release) of examples, showcasing
  the features of scRUBYt!
* planned: a public extractor repository - hopefully (after people realize how great this package is :-)) scRUBYt!
  will have a community, and people will upload their extractors for whatever reason

If you still can't find something here, drop a mail to the guys at scrubyt@/NO-SPAM/scrubyt.org!

= How to install

scRUBYt! requires these packages to be installed:

* Ruby 1.8.4
* Hpricot 0.5
* Mechanize 0.6.3

I assume you have Ruby and RubyGems installed. To install WWW::Mechanize 0.6.3 or higher, just run

<tt>sudo gem install mechanize</tt>

Hpricot 0.5 is just hot off the frying pan - perfect timing, _why! - install it with

<tt>sudo gem install hpricot</tt>

Once all the dependencies (Mechanize and Hpricot) are up and running, you can install scRUBYt! with

<tt>sudo gem install scrubyt</tt>

If you encounter any problems, drop a mail to the guys at scrubyt@/NO-SPAM/scrubyt.org!

= Author

Copyright (c) 2006 by Peter Szinek (peter@/NO-SPAM/rubyrailways.com)

= Copyright

This library is distributed under the GPL. Please see the LICENSE file.
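A note on running the example above: Scrubyt::Extractor.define returns a result object (see
data/lib/scrubyt/output/scrubyt_result.rb and result_dumper.rb in this release), so a complete standalone script is
only a few lines more. A minimal sketch - the trailing to_xml export call is an assumption based on the output
classes shipped here, not something the README itself demonstrates:

  require 'rubygems'
  require 'scrubyt'

  # Define and immediately run the extractor from the README example.
  ebay_data = Scrubyt::Extractor.define do
    fetch          'http://www.ebay.com/'
    fill_textfield 'satitle', 'ipod'
    submit
    click_link     'Apple iPod'

    record do
      item_name 'APPLE NEW IPOD MINI 6GB MP3 PLAYER SILVER'
      price     '$71.99'
    end
    next_page 'Next >', :limit => 5
  end

  # Assumed export call: dump the collected records as XML.
  puts ebay_data.to_xml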
data/Rakefile
ADDED
@@ -0,0 +1,101 @@
require 'rake/rdoctask'
require 'rake/testtask'
require 'rake/gempackagetask'
require 'rake/packagetask'

###################################################
# Dependencies
###################################################

task "default" => ["test_all"]
task "generate_rdoc" => ["cleanup_readme"]
task "cleanup_readme" => ["rdoc"]

###################################################
# Gem specification
###################################################

gem_spec = Gem::Specification.new do |s|
  s.name = 'scrubyt'
  s.version = '0.4.20'
  s.summary = 'A powerful Web-scraping framework built on Mechanize and Hpricot (and FireWatir)'
  s.description = %{scRUBYt! is an easy to learn and use, yet powerful and effective web scraping framework. Its most interesting part is a Web-scraping DSL built on Hpricot and WWW::Mechanize, which allows you to navigate to the page of interest, then extract and query data records with a few lines of code. It is hard to describe scRUBYt! in a few sentences - you have to see it for yourself!}
  # Files containing Test::Unit test cases.
  s.test_files = FileList['test/unittests/**/*']
  # List of other files to be included.
  s.files = FileList['COPYING', 'README', 'CHANGELOG', 'Rakefile', 'lib/**/*.rb']
  s.author = 'Peter Szinek'
  s.email = 'peter@rubyrailways.com'
  s.homepage = 'http://www.scrubyt.org'
  s.add_dependency('hpricot', '>= 0.5')
  s.add_dependency('mechanize', '>= 0.6.3')
  s.has_rdoc = 'true'
end

###################################################
# Tasks
###################################################

Rake::RDocTask.new do |generate_rdoc|
  files = ['lib/**/*.rb', 'README', 'CHANGELOG']
  generate_rdoc.rdoc_files.add(files)
  generate_rdoc.main = "README" # page to start on
  generate_rdoc.title = "Scrubyt Documentation"
  generate_rdoc.template = "resources/allison/allison.rb"
  generate_rdoc.rdoc_dir = 'doc' # rdoc output folder
  generate_rdoc.options << '--line-numbers' << '--inline-source'
end

Rake::TestTask.new(:test_all) do |task|
  task.pattern = 'test/*_test.rb'
end

Rake::TestTask.new(:test_blackbox) do |task|
  task.test_files = ['test/blackbox_test.rb']
end

task "test_specific" do
  ruby "test/blackbox_test.rb #{ARGV[1]}"
end

Rake::TestTask.new(:test_non_blackbox) do |task|
  task.test_files = FileList['test/*_test.rb'] - ['test/blackbox_test.rb']
end

task "rcov" do
  sh 'rcov --xrefs test/*.rb'
  puts 'Report done.'
end

task "cleanup_readme" do
  puts "Cleaning up README..."
  readme_in = open('./doc/files/README.html')
  content = readme_in.read
  content.sub!('<h1 id="item_name">File: README</h1>','')
  content.sub!('<h1>Description</h1>','')
  readme_in.close
  open('./doc/files/README.html', 'w') {|f| f.write(content)}
  # OK, this is ugly as hell and as non-DRY as possible, but
  # I don't have time to deal with it right now
  puts "Cleaning up CHANGELOG..."
  readme_in = open('./doc/files/CHANGELOG.html')
  content = readme_in.read
  content.sub!('<h1 id="item_name">File: CHANGELOG</h1>','')
  content.sub!('<h1>Description</h1>','')
  readme_in.close
  open('./doc/files/CHANGELOG.html', 'w') {|f| f.write(content)}
end

task "generate_rdoc" do
end

Rake::GemPackageTask.new(gem_spec) do |pkg|
  pkg.need_zip = false
  pkg.need_tar = false
end

#Rake::PackageTask.new('scrubyt-examples', '0.4.03') do |pkg|
#  pkg.need_zip = true
#  pkg.need_tar = true
#  pkg.package_files.include("examples/**/*")
#end
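For reference, Rake::GemPackageTask wires up the standard packaging tasks for the gem_spec above, so the common
invocations look roughly like this (a sketch; the gem/package task names come from Rake's GemPackageTask defaults,
not from anything declared explicitly in this Rakefile):

  rake                # default task: runs test_all
  rake gem            # builds pkg/scrubyt-0.4.20.gem from gem_spec (no .zip/.tgz - both are disabled)
  rake generate_rdoc  # runs rdoc, then cleanup_readme strips the boilerplate headings from the HTML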
data/lib/scrubyt.rb
ADDED
@@ -0,0 +1,45 @@
if RUBY_VERSION < '1.9'
  $KCODE = "u"
  require "jcode"
end

#ruby core
require "open-uri"
require "erb"

#gems
require "rexml/text"
require "rubygems"
require "mechanize"
require "hpricot"

#scrubyt
require "#{File.dirname(__FILE__)}/scrubyt/logging"
require "#{File.dirname(__FILE__)}/scrubyt/utils/ruby_extensions.rb"
require "#{File.dirname(__FILE__)}/scrubyt/utils/xpathutils.rb"
require "#{File.dirname(__FILE__)}/scrubyt/utils/shared_utils.rb"
require "#{File.dirname(__FILE__)}/scrubyt/utils/simple_example_lookup.rb"
require "#{File.dirname(__FILE__)}/scrubyt/utils/compound_example_lookup.rb"
require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/constraint_adder.rb"
require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/constraint.rb"
require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/result_indexer.rb"
require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/pre_filter_document.rb"
require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/compound_example.rb"
require "#{File.dirname(__FILE__)}/scrubyt/output/result_node.rb"
require "#{File.dirname(__FILE__)}/scrubyt/output/scrubyt_result.rb"
require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/agents/mechanize.rb"
require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/agents/firewatir.rb"
require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/navigation_actions.rb"
require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/fetch_action.rb"
require "#{File.dirname(__FILE__)}/scrubyt/core/shared/extractor.rb"
require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/base_filter.rb"
require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/attribute_filter.rb"
require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/constant_filter.rb"
require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/script_filter.rb"
require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/text_filter.rb"
require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/detail_page_filter.rb"
require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/download_filter.rb"
require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/html_subtree_filter.rb"
require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/regexp_filter.rb"
require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/filters/tree_filter.rb"
require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/pattern.rb"
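Since every internal file above is required relative to File.dirname(__FILE__), user code never touches the
individual paths - a single top-level require is enough. A minimal sketch:

  require 'rubygems'
  require 'scrubyt'  # loads Hpricot, Mechanize and all scrubyt internals in the order listed above

  # Sanity check that loading succeeded (Ruby 1.8's Module#constants returns strings).
  puts Scrubyt.constants.include?('Extractor')  # => true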
data/lib/scrubyt/core/navigation/agents/firewatir.rb
ADDED
@@ -0,0 +1,253 @@
require 'rubygems'
require 'firewatir'
module Scrubyt
  ##
  #=<tt>Fetching pages (and related functionality)</tt>
  #
  #Since a lot of things happen during (and before) the fetching of a
  #document, I decided to move fetching-related functionality out to a
  #separate class - so if you are looking for anything which loads a
  #document (even by submitting a form or clicking a link) and related
  #things like setting a proxy etc., you should find it here.
  module Navigation
    module Firewatir

      def self.included(base)
        base.module_eval do
          @@agent = FireWatir::Firefox.new
          @@current_doc_url = nil
          @@current_doc_protocol = nil
          @@base_dir = nil
          @@host_name = nil
          @@history = []
          @@current_form = nil
          @@current_frame = nil

          ##
          #Action to fetch a document (either a file or a http address)
          #
          #*parameters*
          #
          #_doc_url_ - the url or file name to fetch
          def self.fetch(doc_url, *args)
            #Refactor this crap!!! with option_accessor stuff
            if args.size > 0
              mechanize_doc = args[0][:mechanize_doc]
              resolve = args[0][:resolve]
              basic_auth = args[0][:basic_auth]
              #Refactor this whole stuff as well!!! It looks awful...
              parse_and_set_basic_auth(basic_auth) if basic_auth
            else
              mechanize_doc = nil
              resolve = :full
            end

            @@current_doc_url = doc_url
            @@current_doc_protocol = determine_protocol
            if mechanize_doc.nil?
              handle_relative_path(doc_url) unless @@current_doc_protocol == 'xpath'
              handle_relative_url(doc_url, resolve)
              Scrubyt.log :ACTION, "fetching document: #{@@current_doc_url}"
              case @@current_doc_protocol
              when 'file' then @@agent.goto("file://" + @@current_doc_url)
              else @@agent.goto(@@current_doc_url)
              end
              @@mechanize_doc = "<html>#{@@agent.html}</html>"
            else
              @@mechanize_doc = mechanize_doc
            end
            @@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(@@mechanize_doc))
            store_host_name(@@agent.url) # in case we're on a new host
          end

          def self.frame(attribute, value)
            if @@current_frame
              @@current_frame.frame(attribute, value)
            else
              @@current_frame = @@agent.frame(attribute, value)
            end
          end

          ##
          #Submit the last form
          def self.submit(current_form, sleep_time=nil, button=nil, type=nil)
            if @@current_frame
              #BRUTAL hax but FW is such a shitty piece of software
              #this sucks FAIL omg
              @@current_frame.locate
              form = Document.new(@@current_frame).all.find{|t| t.tagName=="FORM"}
              form.submit
            else
              @@agent.element_by_xpath(@@current_form).submit
            end

            if sleep_time
              sleep sleep_time
              @@agent.wait
            end

            @@current_doc_url = @@agent.url
            @@mechanize_doc = "<html>#{@@agent.html}</html>"
            @@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(@@mechanize_doc))
          end

          ##
          #Click the link specified by the text
          def self.click_link(link_spec, index=0, wait_secs=0)
            Scrubyt.log :ACTION, "Clicking link specified by: %p" % link_spec
            if link_spec.is_a?(Hash)
              elem = XPathUtils.generate_XPath(CompoundExampleLookup.find_node_from_compund_example(@@hpricot_doc, link_spec, false, index), nil, true)
              result_page = @@agent.element_by_xpath(elem).click
            else
              @@agent.link(:innerHTML, Regexp.escape(link_spec)).click
            end
            sleep(wait_secs) if wait_secs > 0
            @@agent.wait
            @@current_doc_url = @@agent.url
            @@mechanize_doc = "<html>#{@@agent.html}</html>"
            @@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(@@mechanize_doc))
            Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
          end

          def self.click_by_xpath(xpath)
            Scrubyt.log :ACTION, "Clicking by XPath : %p" % xpath
            @@agent.element_by_xpath(xpath).click
            @@agent.wait
            @@current_doc_url = @@agent.url
            @@mechanize_doc = "<html>#{@@agent.html}</html>"
            @@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(@@mechanize_doc))
            Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
          end

          def self.click_image_map(index=0)
            Scrubyt.log :ACTION, "Clicking image map at index: %p" % index
            uri = @@mechanize_doc.search("//area")[index]['href']
            result_page = @@agent.get(uri)
            @@current_doc_url = result_page.uri.to_s
            Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
            fetch(@@current_doc_url, :mechanize_doc => result_page)
          end

          def self.store_host_name(doc_url)
            @@host_name = doc_url.match(/.*\..*?\//)[0] if doc_url.match(/.*\..*?\//)
            @@original_host_name ||= @@host_name
          end #end of method store_host_name

          def self.determine_protocol
            old_protocol = @@current_doc_protocol
            new_protocol = case @@current_doc_url
                           when /^\/\//
                             'xpath'
                           when /^https/
                             'https'
                           when /^http/
                             'http'
                           when /^www\./
                             'http'
                           else
                             'file'
                           end
            return 'http' if ((old_protocol == 'http') && new_protocol == 'file')
            return 'https' if ((old_protocol == 'https') && new_protocol == 'file')
            new_protocol
          end

          def self.parse_and_set_basic_auth(basic_auth)
            login, pass = basic_auth.split('@')
            Scrubyt.log :ACTION, "Basic authentication: login=<#{login}>, pass=<#{pass}>"
            @@agent.basic_auth(login, pass)
          end

          def self.handle_relative_path(doc_url)
            if @@base_dir == nil || doc_url[0..0] == "/"
              @@base_dir = doc_url.scan(/.+\//)[0] if @@current_doc_protocol == 'file'
            else
              @@current_doc_url = ((@@base_dir + doc_url) if doc_url !~ /#{@@base_dir}/)
            end
          end

          def self.handle_relative_url(doc_url, resolve)
            return if doc_url =~ /^(http:|javascript:)/
            if doc_url !~ /^\//
              first_char = doc_url[0..0]
              doc_url = ( first_char == '?' ? '' : '/' ) + doc_url
              if first_char == '?' #This is an ugly hack... really have to throw this shit out and go with mechanize's
                current_uri = @@mechanize_doc.uri.to_s
                current_uri = @@agent.history.first.uri.to_s if current_uri =~ /\/popup\//
                if (current_uri.include? '?')
                  current_uri = current_uri.scan(/.+\//)[0]
                else
                  current_uri += '/' unless current_uri[-1..-1] == '/'
                end
                @@current_doc_url = current_uri + doc_url
                return
              end
            end
            case resolve
            when :full
              @@current_doc_url = (@@host_name + doc_url) if ( @@host_name != nil && (doc_url !~ /#{@@host_name}/))
              @@current_doc_url = @@current_doc_url.split('/').uniq.join('/')
            when :host
              base_host_name = (@@host_name.count("/") == 2 ? @@host_name : @@host_name.scan(/(http.+?\/\/.+?)\//)[0][0])
              @@current_doc_url = base_host_name + doc_url
            else
              #custom resolving
              @@current_doc_url = resolve + doc_url
            end
          end

          def self.fill_textfield(textfield_name, query_string, wait_secs, useValue)
            @@current_form = "//input[@name='#{textfield_name}']/ancestor::form"
            target = @@current_frame || @@agent
            if useValue
              target.text_field(:name, textfield_name).value = query_string
            else
              target.text_field(:name, textfield_name).set(query_string)
            end
            sleep(wait_secs) if wait_secs > 0
            @@mechanize_doc = "<html>#{@@agent.html}</html>"
            @@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(@@mechanize_doc))
          end

          ##
          #Action to fill a textarea with text
          def self.fill_textarea(textarea_name, text)
            @@current_form = "//input[@name='#{textarea_name}']/ancestor::form"
            @@agent.text_field(:name, textarea_name).set(text)
          end

          ##
          #Action for selecting an option from a dropdown box
          def self.select_option(selectlist_name, option)
            @@current_form = "//select[@name='#{selectlist_name}']/ancestor::form"
            @@agent.select_list(:name, selectlist_name).select(option)
          end

          def self.check_checkbox(checkbox_name)
            @@current_form = "//input[@name='#{checkbox_name}']/ancestor::form"
            @@agent.checkbox(:name, checkbox_name).set(true)
          end

          def self.check_radiobutton(checkbox_name, index=0)
            @@current_form = "//input[@name='#{checkbox_name}']/ancestor::form"
            @@agent.elements_by_xpath("//input[@name='#{checkbox_name}']")[index].set
          end

          #NB: this redefinition overrides the click_image_map defined earlier in this block
          def self.click_image_map(index=0)
            raise 'NotImplemented'
          end

          def self.wait(time=1)
            sleep(time)
            @@agent.wait
          end

          def self.close_firefox
            @@agent.close
          end
        end
      end
    end
  end
end
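The URL classification in determine_protocol above is driven purely by the shape of the string. The following
stand-alone sketch mirrors that case logic with hypothetical inputs to make the mapping explicit (note the 'file'
fallback, which fetch then prefixes with file://):

  # Stand-alone mirror of determine_protocol's classification, for illustration only.
  def classify(url)
    case url
    when /^\/\//  then 'xpath'  # a leading '//' means an XPath expression, not a URL
    when /^https/ then 'https'
    when /^http/  then 'http'
    when /^www\./ then 'http'   # scheme-less host names default to http
    else               'file'   # everything else is treated as a local file
    end
  end

  classify('//table//a')            # => "xpath"
  classify('https://example.com/')  # => "https"
  classify('www.ebay.com')          # => "http"
  classify('pages/results.html')    # => "file"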