scrubber-scrubyt 0.4.25 → 0.4.28

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Rakefile CHANGED
@@ -17,13 +17,13 @@ task "cleanup_readme" => ["rdoc"]
17
17
 
18
18
  gem_spec = Gem::Specification.new do |s|
19
19
  s.name = 'scrubyt'
20
- s.version = '0.4.20'
20
+ s.version = '0.4.26'
21
21
  s.summary = 'A powerful Web-scraping framework built on Mechanize and Hpricot (and FireWatir)'
22
22
  s.description = %{scRUBYt! is an easy to learn and use, yet powerful and effective web scraping framework. It's most interesting part is a Web-scraping DSL built on HPricot and WWW::Mechanize, which allows to navigate to the page of interest, then extract and query data records with a few lines of code. It is hard to describe scRUBYt! in a few sentences - you have to see it for yourself!}
23
23
  # Files containing Test::Unit test cases.
24
24
  s.test_files = FileList['test/unittests/**/*']
25
25
  # List of other files to be included.
26
- s.files = FileList['COPYING', 'README', 'CHANGELOG', 'Rakefile', 'lib/**/*.rb']
26
+ s.files = FileList['COPYING', 'README.rdoc', 'CHANGELOG', 'Rakefile', 'lib/**/*.rb']
27
27
  s.author = 'Peter Szinek'
28
28
  s.email = 'peter@rubyrailways.com'
29
29
  s.homepage = 'http://www.scrubyt.org'
@@ -37,9 +37,9 @@ end
37
37
  ###################################################
38
38
 
39
39
  Rake::RDocTask.new do |generate_rdoc|
40
- files = ['lib/**/*.rb', 'README', 'CHANGELOG']
40
+ files = ['lib/**/*.rb', 'README.rdoc', 'CHANGELOG']
41
41
  generate_rdoc.rdoc_files.add(files)
42
- generate_rdoc.main = "README" # page to start on
42
+ generate_rdoc.main = "README.rdoc" # page to start on
43
43
  generate_rdoc.title = "Scrubyt Documentation"
44
44
  generate_rdoc.template = "resources/allison/allison.rb"
45
45
  generate_rdoc.rdoc_dir = 'doc' # rdoc output folder
data/lib/scrubyt.rb CHANGED
@@ -30,11 +30,11 @@ require "#{File.dirname(__FILE__)}/scrubyt/output/scrubyt_result.rb"
30
30
  require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/agents/mechanize.rb"
31
31
 
32
32
  # -- Making Firewatir optional --
33
- if defined? Firewatir::Firefox
34
- require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/agents/firewatir.rb"
35
- else
36
- puts "The gem firewatir is not installed"
37
- end
33
+ begin
34
+ require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/agents/firewatir.rb"
35
+ rescue LoadError
36
+ puts "The gem firewatir is not installed, you'll be able to use Mechanize as the agent only"
37
+ end
38
38
  # --
39
39
 
40
40
  require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/navigation_actions.rb"
@@ -1,5 +1,5 @@
1
- require 'rubygems'
2
1
  require 'firewatir'
2
+
3
3
  module Scrubyt
4
4
  ##
5
5
  #=<tt>Fetching pages (and related functionality)</tt>
@@ -113,10 +113,16 @@ module Scrubyt
113
113
  Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
114
114
  end
115
115
 
116
- def self.click_by_xpath(xpath)
116
+ def self.click_by_xpath(xpath, wait_secs=0)
117
117
  Scrubyt.log :ACTION, "Clicking by XPath : %p" % xpath
118
118
  @@agent.element_by_xpath(xpath).click
119
+ Scrubyt.log :INFO, "sleeping #{wait_secs}..."
120
+ sleep(wait_secs) if wait_secs > 0
119
121
  @@agent.wait
122
+
123
+ # evaluate the results
124
+ extractor.evaluate_extractor
125
+
120
126
  @@current_doc_url = @@agent.url
121
127
  @@mechanize_doc = "<html>#{@@agent.html}</html>"
122
128
  @@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(@@mechanize_doc))
@@ -74,13 +74,13 @@ module Scrubyt
74
74
  def self.submit(index=nil, sleep_time=nil, type=nil)
75
75
  Scrubyt.log :ACTION, 'Submitting form...'
76
76
  if index == nil
77
- result_page = @@agent.submit(@@current_form)
78
- process_submit(@@current_form)
77
+ #result_page = @@agent.submit(@@current_form)
78
+ result_page = process_submit(@@current_form)
79
79
  #----- added by nickmerwin@gmail.com -----
80
80
  elsif index.class == String && !type.nil?
81
81
  button = @@current_form.buttons.detect{|b| b.name == index}
82
- result_page = @@current_form.submit(button)
83
- process_submit(@@current_form, button,type)
82
+ #result_page = @@current_form.submit(button)
83
+ result_page = process_submit(@@current_form, button,type)
84
84
  #-----------------------------------------
85
85
  else
86
86
  result_page = @@agent.submit(@@current_form, @@current_form.buttons[index])
@@ -215,7 +215,11 @@ module Scrubyt
215
215
 
216
216
  def self.fill_textfield(textfield_name, query_string, *unused)
217
217
  lookup_form_for_tag('input','textfield',textfield_name,query_string)
218
- eval("@@current_form['#{textfield_name}'] = '#{query_string}'")
218
+ if(@@current_form)
219
+ eval("@@current_form['#{textfield_name}'] = '#{query_string}'")
220
+ else
221
+ Scrubyt.log :ERROR, "Couldn't find the form that contains this textfield. Please report a bug!"
222
+ end
219
223
  end
220
224
 
221
225
  ##
@@ -253,16 +257,29 @@ module Scrubyt
253
257
  else
254
258
  result_page = @@agent.submit(current_form, button)
255
259
  end
256
- @@current_doc_url = result_page.uri.to_s
257
- Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
258
- fetch(@@current_doc_url, :mechanize_doc => result_page)
260
+ #@@current_doc_url = result_page.uri.to_s
261
+ #Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
262
+ #fetch(@@current_doc_url, :mechanize_doc => result_page)
263
+ result_page
259
264
  end
260
265
 
261
266
  def self.lookup_form_for_tag(tag, widget_name, name_attribute, query_string, index=0)
262
267
  Scrubyt.log :ACTION, "typing #{query_string} into the #{widget_name} named '#{name_attribute}'"
263
268
  widget = (FetchAction.get_hpricot_doc/"#{tag}[@name=#{name_attribute}]").map()[index]
264
269
  form_tag = Scrubyt::XPathUtils.traverse_up_until_name(widget, 'form')
265
- find_form_based_on_tag(form_tag, ['name', 'id', 'action'])
270
+ puts "=" * 100
271
+ puts ">>#{Scrubyt::XPathUtils.generate_XPath(form_tag, nil, true)}<<"
272
+ puts "=" * 100
273
+ xp = Scrubyt::XPathUtils.generate_XPath(form_tag, nil, true)
274
+ form_element = FetchAction.get_mechanize_doc/xp
275
+
276
+ FetchAction.get_mechanize_doc.forms.each do |f|
277
+ @@current_form = f
278
+ break if f.form_node == form_element
279
+ end
280
+
281
+
282
+ #find_form_based_on_tag(form_tag, ['name', 'id', 'action'])
266
283
  end
267
284
 
268
285
  def self.find_form_based_on_tag(tag, possible_attrs)
@@ -274,10 +291,16 @@ module Scrubyt
274
291
  lookup_attribute_value = tag.attributes[a]
275
292
  break if lookup_attribute_value != nil
276
293
  }
294
+
295
+ #puts lookup_attribute_name
296
+ #puts lookup_attribute_value
297
+
277
298
  i = 0
278
299
  loop do
279
300
  @@current_form = FetchAction.get_mechanize_doc.forms[i]
301
+ #p @@current_form.form_node
280
302
  return nil if @@current_form == nil
303
+ #puts ">>#{@@current_form.form_node.attributes[lookup_attribute_name].to_s}<< :: >>#{lookup_attribute_value}<<"
281
304
  break if @@current_form.form_node.attributes[lookup_attribute_name].to_s == lookup_attribute_value
282
305
  i+= 1
283
306
  end
@@ -80,6 +80,10 @@ module Scrubyt
80
80
  FetchAction.click_by_xpath(xpath)
81
81
  end
82
82
 
83
+ def click_by_xpath_and_wait(xpath, secs)
84
+ FetchAction.click_by_xpath(xpath, secs)
85
+ end
86
+
83
87
  def click_image_map(index=0)
84
88
  FetchAction.click_image_map(index)
85
89
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scrubber-scrubyt
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.25
4
+ version: 0.4.28
5
5
  platform: ruby
6
6
  authors:
7
7
  - Peter Szinek