scrubber-scrubyt 0.4.25 → 0.4.28

Sign up to get free protection for your applications and to get access to all the features.
data/Rakefile CHANGED
@@ -17,13 +17,13 @@ task "cleanup_readme" => ["rdoc"]
17
17
 
18
18
  gem_spec = Gem::Specification.new do |s|
19
19
  s.name = 'scrubyt'
20
- s.version = '0.4.20'
20
+ s.version = '0.4.26'
21
21
  s.summary = 'A powerful Web-scraping framework built on Mechanize and Hpricot (and FireWatir)'
22
22
  s.description = %{scRUBYt! is an easy to learn and use, yet powerful and effective web scraping framework. It's most interesting part is a Web-scraping DSL built on HPricot and WWW::Mechanize, which allows to navigate to the page of interest, then extract and query data records with a few lines of code. It is hard to describe scRUBYt! in a few sentences - you have to see it for yourself!}
23
23
  # Files containing Test::Unit test cases.
24
24
  s.test_files = FileList['test/unittests/**/*']
25
25
  # List of other files to be included.
26
- s.files = FileList['COPYING', 'README', 'CHANGELOG', 'Rakefile', 'lib/**/*.rb']
26
+ s.files = FileList['COPYING', 'README.rdoc', 'CHANGELOG', 'Rakefile', 'lib/**/*.rb']
27
27
  s.author = 'Peter Szinek'
28
28
  s.email = 'peter@rubyrailways.com'
29
29
  s.homepage = 'http://www.scrubyt.org'
@@ -37,9 +37,9 @@ end
37
37
  ###################################################
38
38
 
39
39
  Rake::RDocTask.new do |generate_rdoc|
40
- files = ['lib/**/*.rb', 'README', 'CHANGELOG']
40
+ files = ['lib/**/*.rb', 'README.rdoc', 'CHANGELOG']
41
41
  generate_rdoc.rdoc_files.add(files)
42
- generate_rdoc.main = "README" # page to start on
42
+ generate_rdoc.main = "README.rdoc" # page to start on
43
43
  generate_rdoc.title = "Scrubyt Documentation"
44
44
  generate_rdoc.template = "resources/allison/allison.rb"
45
45
  generate_rdoc.rdoc_dir = 'doc' # rdoc output folder
data/lib/scrubyt.rb CHANGED
@@ -30,11 +30,11 @@ require "#{File.dirname(__FILE__)}/scrubyt/output/scrubyt_result.rb"
30
30
  require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/agents/mechanize.rb"
31
31
 
32
32
  # -- Making Firewatir optional --
33
- if defined? Firewatir::Firefox
34
- require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/agents/firewatir.rb"
35
- else
36
- puts "The gem firewatir is not installed"
37
- end
33
+ begin
34
+ require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/agents/firewatir.rb"
35
+ rescue LoadError
36
+ puts "The gem firewatir is not installed, you'll be able to use Mechanize as the agent only"
37
+ end
38
38
  # --
39
39
 
40
40
  require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/navigation_actions.rb"
@@ -1,5 +1,5 @@
1
- require 'rubygems'
2
1
  require 'firewatir'
2
+
3
3
  module Scrubyt
4
4
  ##
5
5
  #=<tt>Fetching pages (and related functionality)</tt>
@@ -113,10 +113,16 @@ module Scrubyt
113
113
  Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
114
114
  end
115
115
 
116
- def self.click_by_xpath(xpath)
116
+ def self.click_by_xpath(xpath, wait_secs=0)
117
117
  Scrubyt.log :ACTION, "Clicking by XPath : %p" % xpath
118
118
  @@agent.element_by_xpath(xpath).click
119
+ Scrubyt.log :INFO, "sleeping #{wait_secs}..."
120
+ sleep(wait_secs) if wait_secs > 0
119
121
  @@agent.wait
122
+
123
+ # evaluate the results
124
+ extractor.evaluate_extractor
125
+
120
126
  @@current_doc_url = @@agent.url
121
127
  @@mechanize_doc = "<html>#{@@agent.html}</html>"
122
128
  @@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(@@mechanize_doc))
@@ -74,13 +74,13 @@ module Scrubyt
74
74
  def self.submit(index=nil, sleep_time=nil, type=nil)
75
75
  Scrubyt.log :ACTION, 'Submitting form...'
76
76
  if index == nil
77
- result_page = @@agent.submit(@@current_form)
78
- process_submit(@@current_form)
77
+ #result_page = @@agent.submit(@@current_form)
78
+ result_page = process_submit(@@current_form)
79
79
  #----- added by nickmerwin@gmail.com -----
80
80
  elsif index.class == String && !type.nil?
81
81
  button = @@current_form.buttons.detect{|b| b.name == index}
82
- result_page = @@current_form.submit(button)
83
- process_submit(@@current_form, button,type)
82
+ #result_page = @@current_form.submit(button)
83
+ result_page = process_submit(@@current_form, button,type)
84
84
  #-----------------------------------------
85
85
  else
86
86
  result_page = @@agent.submit(@@current_form, @@current_form.buttons[index])
@@ -215,7 +215,11 @@ module Scrubyt
215
215
 
216
216
  def self.fill_textfield(textfield_name, query_string, *unused)
217
217
  lookup_form_for_tag('input','textfield',textfield_name,query_string)
218
- eval("@@current_form['#{textfield_name}'] = '#{query_string}'")
218
+ if(@@current_form)
219
+ eval("@@current_form['#{textfield_name}'] = '#{query_string}'")
220
+ else
221
+ Scrubyt.log :ERROR, "Couldn't find the form that contains this textfield. Please report a bug!"
222
+ end
219
223
  end
220
224
 
221
225
  ##
@@ -253,16 +257,29 @@ module Scrubyt
253
257
  else
254
258
  result_page = @@agent.submit(current_form, button)
255
259
  end
256
- @@current_doc_url = result_page.uri.to_s
257
- Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
258
- fetch(@@current_doc_url, :mechanize_doc => result_page)
260
+ #@@current_doc_url = result_page.uri.to_s
261
+ #Scrubyt.log :ACTION, "Fetching #{@@current_doc_url}"
262
+ #fetch(@@current_doc_url, :mechanize_doc => result_page)
263
+ result_page
259
264
  end
260
265
 
261
266
  def self.lookup_form_for_tag(tag, widget_name, name_attribute, query_string, index=0)
262
267
  Scrubyt.log :ACTION, "typing #{query_string} into the #{widget_name} named '#{name_attribute}'"
263
268
  widget = (FetchAction.get_hpricot_doc/"#{tag}[@name=#{name_attribute}]").map()[index]
264
269
  form_tag = Scrubyt::XPathUtils.traverse_up_until_name(widget, 'form')
265
- find_form_based_on_tag(form_tag, ['name', 'id', 'action'])
270
+ puts "=" * 100
271
+ puts ">>#{Scrubyt::XPathUtils.generate_XPath(form_tag, nil, true)}<<"
272
+ puts "=" * 100
273
+ xp = Scrubyt::XPathUtils.generate_XPath(form_tag, nil, true)
274
+ form_element = FetchAction.get_mechanize_doc/xp
275
+
276
+ FetchAction.get_mechanize_doc.forms.each do |f|
277
+ @@current_form = f
278
+ break if f.form_node == form_element
279
+ end
280
+
281
+
282
+ #find_form_based_on_tag(form_tag, ['name', 'id', 'action'])
266
283
  end
267
284
 
268
285
  def self.find_form_based_on_tag(tag, possible_attrs)
@@ -274,10 +291,16 @@ module Scrubyt
274
291
  lookup_attribute_value = tag.attributes[a]
275
292
  break if lookup_attribute_value != nil
276
293
  }
294
+
295
+ #puts lookup_attribute_name
296
+ #puts lookup_attribute_value
297
+
277
298
  i = 0
278
299
  loop do
279
300
  @@current_form = FetchAction.get_mechanize_doc.forms[i]
301
+ #p @@current_form.form_node
280
302
  return nil if @@current_form == nil
303
+ #puts ">>#{@@current_form.form_node.attributes[lookup_attribute_name].to_s}<< :: >>#{lookup_attribute_value}<<"
281
304
  break if @@current_form.form_node.attributes[lookup_attribute_name].to_s == lookup_attribute_value
282
305
  i+= 1
283
306
  end
@@ -80,6 +80,10 @@ module Scrubyt
80
80
  FetchAction.click_by_xpath(xpath)
81
81
  end
82
82
 
83
+ def click_by_xpath_and_wait(xpath, secs)
84
+ FetchAction.click_by_xpath(xpath, secs)
85
+ end
86
+
83
87
  def click_image_map(index=0)
84
88
  FetchAction.click_image_map(index)
85
89
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scrubber-scrubyt
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.25
4
+ version: 0.4.28
5
5
  platform: ruby
6
6
  authors:
7
7
  - Peter Szinek