scrubber-scrubyt 0.4.28 → 0.4.30
Sign up to get free protection for your applications and to get access to all the features.
data/Rakefile
CHANGED
@@ -17,7 +17,7 @@ task "cleanup_readme" => ["rdoc"]
|
|
17
17
|
|
18
18
|
gem_spec = Gem::Specification.new do |s|
|
19
19
|
s.name = 'scrubyt'
|
20
|
-
s.version = '0.4.28'
|
20
|
+
s.version = '0.4.30'
|
21
21
|
s.summary = 'A powerful Web-scraping framework built on Mechanize and Hpricot (and FireWatir)'
|
22
22
|
s.description = %{scRUBYt! is an easy to learn and use, yet powerful and effective web scraping framework. It's most interesting part is a Web-scraping DSL built on HPricot and WWW::Mechanize, which allows to navigate to the page of interest, then extract and query data records with a few lines of code. It is hard to describe scRUBYt! in a few sentences - you have to see it for yourself!}
|
23
23
|
# Files containing Test::Unit test cases.
|
@@ -13,8 +13,8 @@ module Scrubyt
|
|
13
13
|
module Firewatir
|
14
14
|
|
15
15
|
def self.included(base)
|
16
|
-
base.module_eval do
|
17
|
-
@@agent = FireWatir::Firefox.new
|
16
|
+
base.module_eval do
|
17
|
+
@@agent = FireWatir::Firefox.new unless defined? @@agent
|
18
18
|
@@current_doc_url = nil
|
19
19
|
@@current_doc_protocol = nil
|
20
20
|
@@base_dir = nil
|
@@ -60,6 +60,11 @@ module Scrubyt
|
|
60
60
|
store_host_name(@@agent.url) # in case we're on a new host
|
61
61
|
end
|
62
62
|
|
63
|
+
def self.use_current_page
|
64
|
+
@@mechanize_doc = "<html>#{@@agent.html}</html>"
|
65
|
+
@@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(@@mechanize_doc))
|
66
|
+
end
|
67
|
+
|
63
68
|
def self.frame(attribute, value)
|
64
69
|
if @@current_frame
|
65
70
|
@@current_frame.frame(attribute, value)
|
@@ -111,7 +116,24 @@ module Scrubyt
|
|
111
116
|
@@mechanize_doc = "<html>#{@@agent.html}</html>"
|
112
117
|
@@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(@@mechanize_doc))
|
113
118
|
Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
|
114
|
-
end
|
119
|
+
end
|
120
|
+
|
121
|
+
def self.click_by_xpath_if_exists(xpath, wait_secs=0)
|
122
|
+
begin
|
123
|
+
result_page = @@agent.element_by_xpath(xpath).click
|
124
|
+
sleep(wait_secs) if wait_secs > 0
|
125
|
+
@@agent.wait
|
126
|
+
|
127
|
+
extractor.evaluate_extractor
|
128
|
+
|
129
|
+
@@current_doc_url = @@agent.url
|
130
|
+
@@mechanize_doc = "<html>#{@@agent.html}</html>"
|
131
|
+
@@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(@@mechanize_doc))
|
132
|
+
Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
|
133
|
+
rescue Watir::Exception::UnknownObjectException
|
134
|
+
Scrubyt.log :INFO, "XPath #{xpath} doesn't exist in this document"
|
135
|
+
end
|
136
|
+
end
|
115
137
|
|
116
138
|
def self.click_by_xpath(xpath, wait_secs=0)
|
117
139
|
Scrubyt.log :ACTION, "Clicking by XPath : %p" % xpath
|
@@ -56,6 +56,10 @@ module Scrubyt
|
|
56
56
|
def fetch(*args)
|
57
57
|
FetchAction.fetch(*args)
|
58
58
|
end
|
59
|
+
|
60
|
+
def use_current_page
|
61
|
+
FetchAction.use_current_page
|
62
|
+
end
|
59
63
|
##
|
60
64
|
#Submit the current form
|
61
65
|
def submit(index=nil, type=nil)
|
@@ -76,6 +80,10 @@ module Scrubyt
|
|
76
80
|
FetchAction.click_link(link_spec, 0, sleep_secs)
|
77
81
|
end
|
78
82
|
|
83
|
+
def click_by_xpath_if_exists(xpath, sleep_secs=0)
|
84
|
+
FetchAction.click_by_xpath_if_exists(xpath, sleep_secs)
|
85
|
+
end
|
86
|
+
|
79
87
|
def click_by_xpath(xpath)
|
80
88
|
FetchAction.click_by_xpath(xpath)
|
81
89
|
end
|