scrubber-scrubyt 0.4.25 → 0.4.28
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Rakefile
CHANGED
|
@@ -17,13 +17,13 @@ task "cleanup_readme" => ["rdoc"]
|
|
|
17
17
|
|
|
18
18
|
gem_spec = Gem::Specification.new do |s|
|
|
19
19
|
s.name = 'scrubyt'
|
|
20
|
-
s.version = '0.4.
|
|
20
|
+
s.version = '0.4.26'
|
|
21
21
|
s.summary = 'A powerful Web-scraping framework built on Mechanize and Hpricot (and FireWatir)'
|
|
22
22
|
s.description = %{scRUBYt! is an easy to learn and use, yet powerful and effective web scraping framework. It's most interesting part is a Web-scraping DSL built on HPricot and WWW::Mechanize, which allows to navigate to the page of interest, then extract and query data records with a few lines of code. It is hard to describe scRUBYt! in a few sentences - you have to see it for yourself!}
|
|
23
23
|
# Files containing Test::Unit test cases.
|
|
24
24
|
s.test_files = FileList['test/unittests/**/*']
|
|
25
25
|
# List of other files to be included.
|
|
26
|
-
s.files = FileList['COPYING', 'README', 'CHANGELOG', 'Rakefile', 'lib/**/*.rb']
|
|
26
|
+
s.files = FileList['COPYING', 'README.rdoc', 'CHANGELOG', 'Rakefile', 'lib/**/*.rb']
|
|
27
27
|
s.author = 'Peter Szinek'
|
|
28
28
|
s.email = 'peter@rubyrailways.com'
|
|
29
29
|
s.homepage = 'http://www.scrubyt.org'
|
|
@@ -37,9 +37,9 @@ end
|
|
|
37
37
|
###################################################
|
|
38
38
|
|
|
39
39
|
Rake::RDocTask.new do |generate_rdoc|
|
|
40
|
-
files = ['lib/**/*.rb', 'README', 'CHANGELOG']
|
|
40
|
+
files = ['lib/**/*.rb', 'README.rdoc', 'CHANGELOG']
|
|
41
41
|
generate_rdoc.rdoc_files.add(files)
|
|
42
|
-
generate_rdoc.main = "README" # page to start on
|
|
42
|
+
generate_rdoc.main = "README.rdoc" # page to start on
|
|
43
43
|
generate_rdoc.title = "Scrubyt Documentation"
|
|
44
44
|
generate_rdoc.template = "resources/allison/allison.rb"
|
|
45
45
|
generate_rdoc.rdoc_dir = 'doc' # rdoc output folder
|
data/lib/scrubyt.rb
CHANGED
|
@@ -30,11 +30,11 @@ require "#{File.dirname(__FILE__)}/scrubyt/output/scrubyt_result.rb"
|
|
|
30
30
|
require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/agents/mechanize.rb"
|
|
31
31
|
|
|
32
32
|
# -- Making Firewatir optional --
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
33
|
+
begin
|
|
34
|
+
require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/agents/firewatir.rb"
|
|
35
|
+
rescue LoadError
|
|
36
|
+
puts "The gem firewatir is not installed, you'll be able to use Mechanize as the agent only"
|
|
37
|
+
end
|
|
38
38
|
# --
|
|
39
39
|
|
|
40
40
|
require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/navigation_actions.rb"
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
require 'rubygems'
|
|
2
1
|
require 'firewatir'
|
|
2
|
+
|
|
3
3
|
module Scrubyt
|
|
4
4
|
##
|
|
5
5
|
#=<tt>Fetching pages (and related functionality)</tt>
|
|
@@ -113,10 +113,16 @@ module Scrubyt
|
|
|
113
113
|
Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
|
|
114
114
|
end
|
|
115
115
|
|
|
116
|
-
def self.click_by_xpath(xpath)
|
|
116
|
+
def self.click_by_xpath(xpath, wait_secs=0)
|
|
117
117
|
Scrubyt.log :ACTION, "Clicking by XPath : %p" % xpath
|
|
118
118
|
@@agent.element_by_xpath(xpath).click
|
|
119
|
+
Scrubyt.log :INFO, "sleeping #{wait_secs}..."
|
|
120
|
+
sleep(wait_secs) if wait_secs > 0
|
|
119
121
|
@@agent.wait
|
|
122
|
+
|
|
123
|
+
# evaluate the results
|
|
124
|
+
extractor.evaluate_extractor
|
|
125
|
+
|
|
120
126
|
@@current_doc_url = @@agent.url
|
|
121
127
|
@@mechanize_doc = "<html>#{@@agent.html}</html>"
|
|
122
128
|
@@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(@@mechanize_doc))
|
|
@@ -74,13 +74,13 @@ module Scrubyt
|
|
|
74
74
|
def self.submit(index=nil, sleep_time=nil, type=nil)
|
|
75
75
|
Scrubyt.log :ACTION, 'Submitting form...'
|
|
76
76
|
if index == nil
|
|
77
|
-
result_page = @@agent.submit(@@current_form)
|
|
78
|
-
process_submit(@@current_form)
|
|
77
|
+
#result_page = @@agent.submit(@@current_form)
|
|
78
|
+
result_page = process_submit(@@current_form)
|
|
79
79
|
#----- added by nickmerwin@gmail.com -----
|
|
80
80
|
elsif index.class == String && !type.nil?
|
|
81
81
|
button = @@current_form.buttons.detect{|b| b.name == index}
|
|
82
|
-
result_page = @@current_form.submit(button)
|
|
83
|
-
process_submit(@@current_form, button,type)
|
|
82
|
+
#result_page = @@current_form.submit(button)
|
|
83
|
+
result_page = process_submit(@@current_form, button,type)
|
|
84
84
|
#-----------------------------------------
|
|
85
85
|
else
|
|
86
86
|
result_page = @@agent.submit(@@current_form, @@current_form.buttons[index])
|
|
@@ -215,7 +215,11 @@ module Scrubyt
|
|
|
215
215
|
|
|
216
216
|
def self.fill_textfield(textfield_name, query_string, *unused)
|
|
217
217
|
lookup_form_for_tag('input','textfield',textfield_name,query_string)
|
|
218
|
-
|
|
218
|
+
if(@@current_form)
|
|
219
|
+
eval("@@current_form['#{textfield_name}'] = '#{query_string}'")
|
|
220
|
+
else
|
|
221
|
+
Scrubyt.log :ERROR, "Couldn't find the form that contains this textfield. Please report a bug!"
|
|
222
|
+
end
|
|
219
223
|
end
|
|
220
224
|
|
|
221
225
|
##
|
|
@@ -253,16 +257,29 @@ module Scrubyt
|
|
|
253
257
|
else
|
|
254
258
|
result_page = @@agent.submit(current_form, button)
|
|
255
259
|
end
|
|
256
|
-
|
|
257
|
-
Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
|
|
258
|
-
fetch(@@current_doc_url, :mechanize_doc => result_page)
|
|
260
|
+
#@@current_doc_url = result_page.uri.to_s
|
|
261
|
+
#Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
|
|
262
|
+
#fetch(@@current_doc_url, :mechanize_doc => result_page)
|
|
263
|
+
result_page
|
|
259
264
|
end
|
|
260
265
|
|
|
261
266
|
def self.lookup_form_for_tag(tag, widget_name, name_attribute, query_string, index=0)
|
|
262
267
|
Scrubyt.log :ACTION, "typing #{query_string} into the #{widget_name} named '#{name_attribute}'"
|
|
263
268
|
widget = (FetchAction.get_hpricot_doc/"#{tag}[@name=#{name_attribute}]").map()[index]
|
|
264
269
|
form_tag = Scrubyt::XPathUtils.traverse_up_until_name(widget, 'form')
|
|
265
|
-
|
|
270
|
+
puts "=" * 100
|
|
271
|
+
puts ">>#{Scrubyt::XPathUtils.generate_XPath(form_tag, nil, true)}<<"
|
|
272
|
+
puts "=" * 100
|
|
273
|
+
xp = Scrubyt::XPathUtils.generate_XPath(form_tag, nil, true)
|
|
274
|
+
form_element = FetchAction.get_mechanize_doc/xp
|
|
275
|
+
|
|
276
|
+
FetchAction.get_mechanize_doc.forms.each do |f|
|
|
277
|
+
@@current_form = f
|
|
278
|
+
break if f.form_node == form_element
|
|
279
|
+
end
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
#find_form_based_on_tag(form_tag, ['name', 'id', 'action'])
|
|
266
283
|
end
|
|
267
284
|
|
|
268
285
|
def self.find_form_based_on_tag(tag, possible_attrs)
|
|
@@ -274,10 +291,16 @@ module Scrubyt
|
|
|
274
291
|
lookup_attribute_value = tag.attributes[a]
|
|
275
292
|
break if lookup_attribute_value != nil
|
|
276
293
|
}
|
|
294
|
+
|
|
295
|
+
#puts lookup_attribute_name
|
|
296
|
+
#puts lookup_attribute_value
|
|
297
|
+
|
|
277
298
|
i = 0
|
|
278
299
|
loop do
|
|
279
300
|
@@current_form = FetchAction.get_mechanize_doc.forms[i]
|
|
301
|
+
#p @@current_form.form_node
|
|
280
302
|
return nil if @@current_form == nil
|
|
303
|
+
#puts ">>#{@@current_form.form_node.attributes[lookup_attribute_name].to_s}<< :: >>#{lookup_attribute_value}<<"
|
|
281
304
|
break if @@current_form.form_node.attributes[lookup_attribute_name].to_s == lookup_attribute_value
|
|
282
305
|
i+= 1
|
|
283
306
|
end
|