RubyGems - scrubber-scrubyt - Versions diffs - 0.4.20 → 0.4.25 - Mend

scrubber-scrubyt 0.4.20 → 0.4.25

Files changed (9)

data/CHANGELOG +6 -1
data/{README → README.rdoc} +0 -0
data/lib/scrubyt.rb +9 -1
data/lib/scrubyt/core/navigation/agents/firewatir.rb +4 -0
data/lib/scrubyt/core/navigation/fetch_action.rb +9 -0
data/lib/scrubyt/core/shared/extractor.rb +7 -4
data/lib/scrubyt/output/result_node.rb +5 -0
data/lib/scrubyt/utils/shared_utils.rb +1 -1
metadata +2 -2

data/CHANGELOG CHANGED Viewed

@@ -1,10 +1,15 @@
 = scRUBYt! Changelog
 == 0.4.3
-== 20th April
+== 23rd April
 - [NEW] option to close the firefox window after the scraping is finished (thanks to Mikkel Garcia and Damien Garros)
+- [NEW] Added the ability to scrape, click_link, then scrape again.  (Only for firefox agent) (thanks to Mikkel Garcia)
 - [FIX] scRUBYt! now works with latest version of mechanize (thanks to nesquena, austinmoore and Leandro Nunes)
+- [NEW] added a wrapper around the firewatir requirement to make firewatir optional
+- [FIX] added a check to prevent traverse_for_match from attempting to traverse nil children (thanks to Dennis Sutch)
 == 0.4.05

data/{README → README.rdoc} RENAMED Viewed

File without changes

data/lib/scrubyt.rb CHANGED Viewed

@@ -28,7 +28,15 @@ require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/compound_example.rb"
 require "#{File.dirname(__FILE__)}/scrubyt/output/result_node.rb"
 require "#{File.dirname(__FILE__)}/scrubyt/output/scrubyt_result.rb"
 require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/agents/mechanize.rb"
-require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/agents/firewatir.rb"
+# -- Making Firewatir optional --
+  if defined? Firewatir::Firefox
+    require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/agents/firewatir.rb"
+  else
+    puts "The gem firewatir is not installed"
+  end
+# --
 require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/navigation_actions.rb"
 require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/fetch_action.rb"
 require "#{File.dirname(__FILE__)}/scrubyt/core/shared/extractor.rb"

data/lib/scrubyt/core/navigation/agents/firewatir.rb CHANGED Viewed

@@ -103,6 +103,10 @@ module Scrubyt
             end
             sleep(wait_secs) if wait_secs > 0
             @@agent.wait
+            # evaluate the results
+            extractor.evaluate_extractor
             @@current_doc_url = @@agent.url
             @@mechanize_doc = "<html>#{@@agent.html}</html>"
             @@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(@@mechanize_doc))

data/lib/scrubyt/core/navigation/fetch_action.rb CHANGED Viewed

@@ -14,6 +14,15 @@ module Scrubyt
     @@host_name = nil
     @@history = []
     @@current_form = nil
+    @@extractor = nil
+    def self.extractor=(extractor)
+      @@extractor = extractor
+    end
+    def self.extractor
+      return @@extractor
+    end
     ##
     # At any given point, the current document can be queried with this method; Typically used

data/lib/scrubyt/core/shared/extractor.rb CHANGED Viewed

@@ -71,6 +71,7 @@ module Scrubyt
           root_pattern
         end
       end
+      FetchAction.extractor = self
       context.extractor = self
       context.instance_eval(&extractor_definition)
       @evaluating_extractor_definition = false
@@ -84,9 +85,10 @@ module Scrubyt
       #Once all is set up, evaluate the extractor from the root pattern!
       root_results = evaluate_extractor
       FetchAction.close_firefox if @mode.is_a?(Hash) && @mode[:close]
       @result = ScrubytResult.new('root')
-      @result.push(*root_results)
+      @result.push(*@root_results)
       @result.root_patterns = @root_patterns
       @result.source_file = source_file
       @result.source_proc = extractor_definition
@@ -128,14 +130,14 @@ module Scrubyt
     end
     def evaluate_extractor
-      root_results = []
+      @root_results ||= []
       current_page_count = 1
       catch :quit_next_page_loop do
         loop do
           url = get_current_doc_url #TODO need absolute address here 2/4
           @processed_pages << url
           @root_patterns.each do |root_pattern|
-            root_results.push(*root_pattern.evaluate(get_hpricot_doc, nil))
+            @root_results.push(*root_pattern.evaluate(get_hpricot_doc, nil))
           end
           while @processed_pages.include? url #TODO need absolute address here 3/4
@@ -161,7 +163,8 @@ module Scrubyt
           current_page_count += 1
         end
       end
-      root_results
+      @root_patterns = []
+      @root_results
     end
   end

data/lib/scrubyt/output/result_node.rb CHANGED Viewed

@@ -20,6 +20,7 @@ module Scrubyt
     end
     def to_s
+      return "" if result.nil?
       text = (@result.is_a? String) ? @result : @result.inner_html.gsub(/<.*?>/, '')
       text = SharedUtils.unescape_entities(text)
       text.strip!
@@ -29,6 +30,10 @@ module Scrubyt
         text
       end
     end
+    def inspect
+      to_s
+    end
     def to_libxml
       libxml_node = XML::Node.new(name)

data/lib/scrubyt/utils/shared_utils.rb CHANGED Viewed

@@ -39,7 +39,7 @@ module Scrubyt
           results << node
           results.delete node.parent if node.is_a? Hpricot::Elem
         end
-        node.children.each { |child| traverse_for_match_inner.call(child, regexp) if (child.is_a? Hpricot::Elem) }
+        node.children.each { |child| traverse_for_match_inner.call(child, regexp) if (child.is_a? Hpricot::Elem) } if ! node.children.nil?
       }
       traverse_for_match_inner.call(node,regexp)
       results

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: scrubber-scrubyt
 version: !ruby/object:Gem::Version
-  version: 0.4.20
+  version: 0.4.25
 platform: ruby
 authors:
 - Peter Szinek
@@ -45,7 +45,7 @@ extra_rdoc_files: []
 files:
 - COPYING
-- README
+- README.rdoc
 - CHANGELOG
 - Rakefile
 - lib/scrubyt/core/navigation/agents/firewatir.rb