scrubber-scrubyt 0.4.25 → 0.4.28
Sign up to get free protection for your applications and to get access to all the features.
data/Rakefile
CHANGED
@@ -17,13 +17,13 @@ task "cleanup_readme" => ["rdoc"]
|
|
17
17
|
|
18
18
|
gem_spec = Gem::Specification.new do |s|
|
19
19
|
s.name = 'scrubyt'
|
20
|
-
s.version = '0.4.
|
20
|
+
s.version = '0.4.26'
|
21
21
|
s.summary = 'A powerful Web-scraping framework built on Mechanize and Hpricot (and FireWatir)'
|
22
22
|
s.description = %{scRUBYt! is an easy to learn and use, yet powerful and effective web scraping framework. It's most interesting part is a Web-scraping DSL built on HPricot and WWW::Mechanize, which allows to navigate to the page of interest, then extract and query data records with a few lines of code. It is hard to describe scRUBYt! in a few sentences - you have to see it for yourself!}
|
23
23
|
# Files containing Test::Unit test cases.
|
24
24
|
s.test_files = FileList['test/unittests/**/*']
|
25
25
|
# List of other files to be included.
|
26
|
-
s.files = FileList['COPYING', 'README', 'CHANGELOG', 'Rakefile', 'lib/**/*.rb']
|
26
|
+
s.files = FileList['COPYING', 'README.rdoc', 'CHANGELOG', 'Rakefile', 'lib/**/*.rb']
|
27
27
|
s.author = 'Peter Szinek'
|
28
28
|
s.email = 'peter@rubyrailways.com'
|
29
29
|
s.homepage = 'http://www.scrubyt.org'
|
@@ -37,9 +37,9 @@ end
|
|
37
37
|
###################################################
|
38
38
|
|
39
39
|
Rake::RDocTask.new do |generate_rdoc|
|
40
|
-
files = ['lib/**/*.rb', 'README', 'CHANGELOG']
|
40
|
+
files = ['lib/**/*.rb', 'README.rdoc', 'CHANGELOG']
|
41
41
|
generate_rdoc.rdoc_files.add(files)
|
42
|
-
generate_rdoc.main = "README" # page to start on
|
42
|
+
generate_rdoc.main = "README.rdoc" # page to start on
|
43
43
|
generate_rdoc.title = "Scrubyt Documentation"
|
44
44
|
generate_rdoc.template = "resources/allison/allison.rb"
|
45
45
|
generate_rdoc.rdoc_dir = 'doc' # rdoc output folder
|
data/lib/scrubyt.rb
CHANGED
@@ -30,11 +30,11 @@ require "#{File.dirname(__FILE__)}/scrubyt/output/scrubyt_result.rb"
|
|
30
30
|
require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/agents/mechanize.rb"
|
31
31
|
|
32
32
|
# -- Making Firewatir optional --
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
33
|
+
begin
|
34
|
+
require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/agents/firewatir.rb"
|
35
|
+
rescue LoadError
|
36
|
+
puts "The gem firewatir is not installed, you'll be able to use Mechanize as the agent only"
|
37
|
+
end
|
38
38
|
# --
|
39
39
|
|
40
40
|
require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/navigation_actions.rb"
|
@@ -1,5 +1,5 @@
|
|
1
|
-
require 'rubygems'
|
2
1
|
require 'firewatir'
|
2
|
+
|
3
3
|
module Scrubyt
|
4
4
|
##
|
5
5
|
#=<tt>Fetching pages (and related functionality)</tt>
|
@@ -113,10 +113,16 @@ module Scrubyt
|
|
113
113
|
Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
|
114
114
|
end
|
115
115
|
|
116
|
-
def self.click_by_xpath(xpath)
|
116
|
+
def self.click_by_xpath(xpath, wait_secs=0)
|
117
117
|
Scrubyt.log :ACTION, "Clicking by XPath : %p" % xpath
|
118
118
|
@@agent.element_by_xpath(xpath).click
|
119
|
+
Scrubyt.log :INFO, "sleeping #{wait_secs}..."
|
120
|
+
sleep(wait_secs) if wait_secs > 0
|
119
121
|
@@agent.wait
|
122
|
+
|
123
|
+
# evaluate the results
|
124
|
+
extractor.evaluate_extractor
|
125
|
+
|
120
126
|
@@current_doc_url = @@agent.url
|
121
127
|
@@mechanize_doc = "<html>#{@@agent.html}</html>"
|
122
128
|
@@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(@@mechanize_doc))
|
@@ -74,13 +74,13 @@ module Scrubyt
|
|
74
74
|
def self.submit(index=nil, sleep_time=nil, type=nil)
|
75
75
|
Scrubyt.log :ACTION, 'Submitting form...'
|
76
76
|
if index == nil
|
77
|
-
result_page = @@agent.submit(@@current_form)
|
78
|
-
process_submit(@@current_form)
|
77
|
+
#result_page = @@agent.submit(@@current_form)
|
78
|
+
result_page = process_submit(@@current_form)
|
79
79
|
#----- added by nickmerwin@gmail.com -----
|
80
80
|
elsif index.class == String && !type.nil?
|
81
81
|
button = @@current_form.buttons.detect{|b| b.name == index}
|
82
|
-
result_page = @@current_form.submit(button)
|
83
|
-
process_submit(@@current_form, button,type)
|
82
|
+
#result_page = @@current_form.submit(button)
|
83
|
+
result_page = process_submit(@@current_form, button,type)
|
84
84
|
#-----------------------------------------
|
85
85
|
else
|
86
86
|
result_page = @@agent.submit(@@current_form, @@current_form.buttons[index])
|
@@ -215,7 +215,11 @@ module Scrubyt
|
|
215
215
|
|
216
216
|
def self.fill_textfield(textfield_name, query_string, *unused)
|
217
217
|
lookup_form_for_tag('input','textfield',textfield_name,query_string)
|
218
|
-
|
218
|
+
if(@@current_form)
|
219
|
+
eval("@@current_form['#{textfield_name}'] = '#{query_string}'")
|
220
|
+
else
|
221
|
+
Scrubyt.log :ERROR, "Couldn't find the form that contains this textfield. Please report a bug!"
|
222
|
+
end
|
219
223
|
end
|
220
224
|
|
221
225
|
##
|
@@ -253,16 +257,29 @@ module Scrubyt
|
|
253
257
|
else
|
254
258
|
result_page = @@agent.submit(current_form, button)
|
255
259
|
end
|
256
|
-
|
257
|
-
Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
|
258
|
-
fetch(@@current_doc_url, :mechanize_doc => result_page)
|
260
|
+
#@@current_doc_url = result_page.uri.to_s
|
261
|
+
#Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
|
262
|
+
#fetch(@@current_doc_url, :mechanize_doc => result_page)
|
263
|
+
result_page
|
259
264
|
end
|
260
265
|
|
261
266
|
def self.lookup_form_for_tag(tag, widget_name, name_attribute, query_string, index=0)
|
262
267
|
Scrubyt.log :ACTION, "typing #{query_string} into the #{widget_name} named '#{name_attribute}'"
|
263
268
|
widget = (FetchAction.get_hpricot_doc/"#{tag}[@name=#{name_attribute}]").map()[index]
|
264
269
|
form_tag = Scrubyt::XPathUtils.traverse_up_until_name(widget, 'form')
|
265
|
-
|
270
|
+
puts "=" * 100
|
271
|
+
puts ">>#{Scrubyt::XPathUtils.generate_XPath(form_tag, nil, true)}<<"
|
272
|
+
puts "=" * 100
|
273
|
+
xp = Scrubyt::XPathUtils.generate_XPath(form_tag, nil, true)
|
274
|
+
form_element = FetchAction.get_mechanize_doc/xp
|
275
|
+
|
276
|
+
FetchAction.get_mechanize_doc.forms.each do |f|
|
277
|
+
@@current_form = f
|
278
|
+
break if f.form_node == form_element
|
279
|
+
end
|
280
|
+
|
281
|
+
|
282
|
+
#find_form_based_on_tag(form_tag, ['name', 'id', 'action'])
|
266
283
|
end
|
267
284
|
|
268
285
|
def self.find_form_based_on_tag(tag, possible_attrs)
|
@@ -274,10 +291,16 @@ module Scrubyt
|
|
274
291
|
lookup_attribute_value = tag.attributes[a]
|
275
292
|
break if lookup_attribute_value != nil
|
276
293
|
}
|
294
|
+
|
295
|
+
#puts lookup_attribute_name
|
296
|
+
#puts lookup_attribute_value
|
297
|
+
|
277
298
|
i = 0
|
278
299
|
loop do
|
279
300
|
@@current_form = FetchAction.get_mechanize_doc.forms[i]
|
301
|
+
#p @@current_form.form_node
|
280
302
|
return nil if @@current_form == nil
|
303
|
+
#puts ">>#{@@current_form.form_node.attributes[lookup_attribute_name].to_s}<< :: >>#{lookup_attribute_value}<<"
|
281
304
|
break if @@current_form.form_node.attributes[lookup_attribute_name].to_s == lookup_attribute_value
|
282
305
|
i+= 1
|
283
306
|
end
|