RubyGems - scrubber-scrubyt - Versions diffs - 0.4.20 → 0.4.25 - Mend

scrubber-scrubyt 0.4.20 → 0.4.25

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9)

data/CHANGELOG +6 -1
data/{README → README.rdoc} +0 -0
data/lib/scrubyt.rb +9 -1
data/lib/scrubyt/core/navigation/agents/firewatir.rb +4 -0
data/lib/scrubyt/core/navigation/fetch_action.rb +9 -0
data/lib/scrubyt/core/shared/extractor.rb +7 -4
data/lib/scrubyt/output/result_node.rb +5 -0
data/lib/scrubyt/utils/shared_utils.rb +1 -1
metadata +2 -2

data/CHANGELOG CHANGED Viewed

@@ -1,10 +1,15 @@
 = scRUBYt! Changelog
 == 0.4.3
-== 20th April
+== 23rd April
 - [NEW] option to close the firefox window after the scraping is finished (thanks to Mikkel Garcia and Damien Garros)
+- [NEW] Added the ability to scrape, click_link, then scrape again.  (Only for firefox agent) (thanks to Mikkel Garcia)
 - [FIX] scRUBYt! now works with latest version of mechanize (thanks to nesquena, austinmoore and Leandro Nunes)
+- [NEW] added a wrapper around the firewatir requirement to make firewatir optional
+- [FIX] added test to prohibit traverse_for_match from attempting to traverse nil children (thanks to Dennis Sutch)
 == 0.4.05

data/{README → README.rdoc} RENAMED Viewed

File without changes

data/lib/scrubyt.rb CHANGED Viewed

@@ -28,7 +28,15 @@ require "#{File.dirname(__FILE__)}/scrubyt/core/scraping/compound_example.rb"
 require "#{File.dirname(__FILE__)}/scrubyt/output/result_node.rb"
 require "#{File.dirname(__FILE__)}/scrubyt/output/scrubyt_result.rb"
 require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/agents/mechanize.rb"
-require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/agents/firewatir.rb"
+# -- Making Firewatir optional --
+  if defined? Firewatir::Firefox
+    require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/agents/firewatir.rb"
+  else
+    puts "The gem firewatir is not installed"
+  end
+# --
 require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/navigation_actions.rb"
 require "#{File.dirname(__FILE__)}/scrubyt/core/navigation/fetch_action.rb"
 require "#{File.dirname(__FILE__)}/scrubyt/core/shared/extractor.rb"

data/lib/scrubyt/core/navigation/agents/firewatir.rb CHANGED Viewed

@@ -103,6 +103,10 @@ module Scrubyt
             end
             sleep(wait_secs) if wait_secs > 0
             @@agent.wait
+            # evaluate the results
+            extractor.evaluate_extractor
             @@current_doc_url = @@agent.url
             @@mechanize_doc = "<html>#{@@agent.html}</html>"
             @@hpricot_doc = Hpricot(PreFilterDocument.br_to_newline(@@mechanize_doc))

data/lib/scrubyt/core/navigation/fetch_action.rb CHANGED Viewed

@@ -14,6 +14,15 @@ module Scrubyt
     @@host_name = nil
     @@history = []
     @@current_form = nil
+    @@extractor = nil
+    def self.extractor=(extractor)
+      @@extractor = extractor
+    end
+    def self.extractor
+      return @@extractor
+    end
     ##
     # At any given point, the current document can be queried with this method; Typically used

data/lib/scrubyt/core/shared/extractor.rb CHANGED Viewed

@@ -71,6 +71,7 @@ module Scrubyt
           root_pattern
         end
       end
+      FetchAction.extractor = self
       context.extractor = self
       context.instance_eval(&extractor_definition)
       @evaluating_extractor_definition = false
@@ -84,9 +85,10 @@ module Scrubyt
       #Once all is set up, evaluate the extractor from the root pattern!
       root_results = evaluate_extractor
       FetchAction.close_firefox if @mode.is_a?(Hash) && @mode[:close]
       @result = ScrubytResult.new('root')
-      @result.push(*root_results)
+      @result.push(*@root_results)
       @result.root_patterns = @root_patterns
       @result.source_file = source_file
       @result.source_proc = extractor_definition
@@ -128,14 +130,14 @@ module Scrubyt
     end
     def evaluate_extractor
-      root_results = []
+      @root_results ||= []
       current_page_count = 1
       catch :quit_next_page_loop do
         loop do
           url = get_current_doc_url #TODO need absolute address here 2/4
           @processed_pages << url
           @root_patterns.each do |root_pattern|
-            root_results.push(*root_pattern.evaluate(get_hpricot_doc, nil))
+            @root_results.push(*root_pattern.evaluate(get_hpricot_doc, nil))
           end
           while @processed_pages.include? url #TODO need absolute address here 3/4
@@ -161,7 +163,8 @@ module Scrubyt
           current_page_count += 1
         end
       end
-      root_results
+      @root_patterns = []
+      @root_results
     end
   end

data/lib/scrubyt/output/result_node.rb CHANGED Viewed

@@ -20,6 +20,7 @@ module Scrubyt
     end
     def to_s
+      return "" if result.nil?
       text = (@result.is_a? String) ? @result : @result.inner_html.gsub(/<.*?>/, '')
       text = SharedUtils.unescape_entities(text)
       text.strip!
@@ -29,6 +30,10 @@ module Scrubyt
         text
       end
     end
+    def inspect
+      to_s
+    end
     def to_libxml
       libxml_node = XML::Node.new(name)

data/lib/scrubyt/utils/shared_utils.rb CHANGED Viewed

@@ -39,7 +39,7 @@ module Scrubyt
           results << node
           results.delete node.parent if node.is_a? Hpricot::Elem
         end
-        node.children.each { |child| traverse_for_match_inner.call(child, regexp) if (child.is_a? Hpricot::Elem) }
+        node.children.each { |child| traverse_for_match_inner.call(child, regexp) if (child.is_a? Hpricot::Elem) } if ! node.children.nil?
       }
       traverse_for_match_inner.call(node,regexp)
       results

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: scrubber-scrubyt
 version: !ruby/object:Gem::Version
-  version: 0.4.20
+  version: 0.4.25
 platform: ruby
 authors:
 - Peter Szinek
@@ -45,7 +45,7 @@ extra_rdoc_files: []
 files:
 - COPYING
-- README
+- README.rdoc
 - CHANGELOG
 - Rakefile
 - lib/scrubyt/core/navigation/agents/firewatir.rb