RubyGems - scrappy - Versions diffs - 0.1.16 → 0.1.17 - Mend

scrappy 0.1.16 → 0.1.17

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (11)

data/History.txt +5 -0
data/README.rdoc +3 -3
data/bin/scrappy +3 -3
data/lib/scrappy.rb +1 -1
data/lib/scrappy/agent/agent.rb +3 -3
data/lib/scrappy/agent/formats.rb +5 -1
data/lib/scrappy/selectors/root.rb +3 -1
data/lib/scrappy/selectors/section.rb +2 -2
data/lib/scrappy/selectors/xpath.rb +1 -1
data/scrappy.gemspec +2 -2
metadata +3 -3

data/History.txt CHANGED Viewed

@@ -1,3 +1,8 @@
+=== 0.1.17 2011-02-15
+* Enabling headless yarf serialization in shell mode
+* Small fixes
 === 0.1.16 2011-02-09
 * Added support to RDF abbreviated output by loading kb's prefixes

data/README.rdoc CHANGED Viewed

@@ -123,13 +123,13 @@ scrappy offers many different interfaces to get RDF data from a web page:
     require 'scrappy'
     # Parse a knowledge base
-    kb = RDF::Parser.parse(:rdf, open("kb.rdf").read)
+    kb = RDF::Parser.parse :yarf, open("https://github.com/josei/scrappy/raw/master/kb/elmundo.yarf").read
     # Create an agent
-    agent = scrappy::Agent.create :kb=>kb
+    agent = Scrappy::Agent.create :kb=>kb
     # Get RDF output
-    output = agent.request :method=>:get, :uri=>'http://www.example.com'
+    output = agent.request :method=>:get, :uri=>'http://www.elmundo.es'
     # Output all titles from the web page
     titles = output.find([], Node('dc:title'), nil)

data/bin/scrappy CHANGED Viewed

@@ -41,7 +41,7 @@ module Scrappy
         opts.on('-p URL', '--post URL')         { |url| Options.url = url; Options.http_method=:post }
         opts.on('-D', '--dump')                 { Agent::Options.dump = true; Agent::Options.format = :rdf }
         opts.on('-u', '--debug')                { Agent::Options.debug = true }
-        opts.on('-i', '--interactive')          { Options.shell = true }
+        opts.on('-i', '--interactive')          { Options.shell = true; Agent::Options.format_header = false }
         opts.on('-s', '--server')               { Options.server = true }
         opts.on('-S', '--proxy-server')         { Options.proxy = true }
         opts.on('-P P', '--port P')             { |p| Options.port = p }
@@ -109,8 +109,8 @@ Options
   -S, --proxy-server       Runs web proxy
   -P, --port PORT          Selects port number (default is 3434)
   -v, --visual             Uses visual agent (slow)
-  -r, --reference          Outputs referenceable data (requires -v)
-  -R, --reference-all      Outputs all HTML referenceable data (requires -v)
+  -r, --reference          Outputs referenceable data
+  -R, --reference-all      Outputs all HTML referenceable data
   -w, --window             Shows browser window (requires -v)
 Authors

data/lib/scrappy.rb CHANGED Viewed

@@ -21,7 +21,7 @@ require 'scrappy/agent/agent'
 Namespace :sc, 'http://lab.gsi.dit.upm.es/scraping.rdf#'
 module Scrappy
-  VERSION = '0.1.16'
+  VERSION = '0.1.17'
 end
 # Require selectors

data/lib/scrappy/agent/agent.rb CHANGED Viewed

@@ -5,7 +5,7 @@ module Scrappy
     include MapReduce
     include Cached
-    Options = OpenStruct.new :format=>:yarf, :depth=>0, :agent=>:blind, :delay=>0, :workers=>10
+    Options = OpenStruct.new :format=>:yarf, :format_header=>true, :depth=>0, :agent=>:blind, :delay=>0, :workers=>10
     ContentTypes = { :png => 'image/png', :rdfxml => 'application/rdf+xml',
                      :rdf => 'application/rdf+xml' }
@@ -41,7 +41,7 @@ module Scrappy
     end
     def map args, queue=nil
-      depth = args[:depth]
+      depth = args[:depth] || options.depth
       request = { :method=>args[:method]||:get, :uri=>complete_uri(args[:uri]), :inputs=>args[:inputs]||{} }
       # Expire cache
@@ -139,7 +139,7 @@ module Scrappy
           print "Serializing..."; $stdout.flush
         end
-        output = response.serialize request[:format]
+        output = response.serialize request[:format], @options.format_header
         puts 'done!'if options.debug

data/lib/scrappy/agent/formats.rb CHANGED Viewed

@@ -1,16 +1,20 @@
 module Scrappy
   module Formats
-    def format node, formats
+    def format node, formats, uri
       case formats.first
       when Node('sc:WikiText') then
         doc = Nokogiri::XML(node.to_html)
+        doc.search("a").each {|n| n.replace(Nokogiri::XML::Text.new(URI.parse(uri).merge(n["href"]).to_s, n.document)) }
         doc.search("h1").each {|n| n.replace(Nokogiri::XML::Text.new("= #{n.text.strip} =", n.document)) }
         doc.search("h2").each {|n| n.replace(Nokogiri::XML::Text.new("== #{n.text.strip} ==", n.document)) }
         doc.search("h3").each {|n| n.replace(Nokogiri::XML::Text.new("=== #{n.text.strip} ===", n.document)) }
         doc.search("h4").each {|n| n.replace(Nokogiri::XML::Text.new("==== #{n.text.strip} ====", n.document)) }
         doc.search("h5").each {|n| n.replace(Nokogiri::XML::Text.new("===== #{n.text.strip} =====", n.document)) }
         doc.search("b").each  {|n| n.replace(Nokogiri::XML::Text.new("'''#{n.text.strip}'''", n.document)) }
+        doc.search("td").each     {|n| n.replace(Nokogiri::XML::Text.new("<td>#{n.text.strip}</td>", n.document)) }
+        doc.search("tr").each     {|n| n.replace(Nokogiri::XML::Text.new("<tr>#{n.text.strip}</tr>", n.document)) }
+        doc.search("table").each  {|n| n.replace(Nokogiri::XML::Text.new("<table>#{n.text.strip}</table>", n.document)) }
         doc.search("li li li li li").each {|n| n.replace(Nokogiri::XML::Text.new("***** #{n.text.strip}", n.document)) }
         doc.search("li li li li").each {|n| n.replace(Nokogiri::XML::Text.new("**** #{n.text.strip}", n.document)) }
         doc.search("li li li").each {|n| n.replace(Nokogiri::XML::Text.new("*** #{n.text.strip}", n.document)) }

data/lib/scrappy/selectors/root.rb CHANGED Viewed

@@ -1,10 +1,12 @@
 module RootSelector
+  extend Scrappy::Formats
   def self.filter selector, doc
     if selector.sc::attribute.first
       # Select node's attribute if given
       selector.sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>doc[:content], :value=>doc[:content][attribute] } }
     else
-      [ { :uri=>doc[:uri], :content=>doc[:content], :value=>doc[:content].text } ]
+      [ { :uri=>doc[:uri], :content=>doc[:content], :value=>format(doc[:content], selector.sc::format, doc[:uri]) } ]
     end
   end
 end

data/lib/scrappy/selectors/section.rb CHANGED Viewed

@@ -5,9 +5,9 @@ module SectionSelector
     selector.rdf::value.map do |pattern|
       doc[:content].search('h1, h2, h3, h4, h5, h6, h7, h8, h9, h10').select { |n| n.parent.name!='script' and n.text.downcase.strip == pattern }.map do |node|
         found = false
-        content = node.parent.children[node.parent.children.index(node)+1..-1].select { |n| found ||= (n.name==node.name); !found }
+        content = node.parent.children[node.parent.children.index(node)+1..-1].select { |n| found ||= (n.name==node.name or n.name=='div'); !found }
-        [ { :uri=>doc[:uri], :content=>content, :value=>content.map{|t| format(t, selector.sc::format)}.select{|t| t.to_s.strip!=''}*"\n\n" } ]
+        [ { :uri=>doc[:uri], :content=>content, :value=>content.map{|t| format(t, selector.sc::format, doc[:uri])}.select{|t| t.to_s.strip!=''}*"\n\n" } ]
       end
     end.flatten
   end

data/lib/scrappy/selectors/xpath.rb CHANGED Viewed

@@ -15,7 +15,7 @@ module XPathSelector
           selector.sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>result, :value=>result[attribute] } }
         else
           # Select node
-          [ { :uri=>doc[:uri], :content=>result, :value=>format(result, selector.sc::format) } ]
+          [ { :uri=>doc[:uri], :content=>result, :value=>format(result, selector.sc::format, doc[:uri]) } ]
         end
       end
     end.flatten

data/scrappy.gemspec CHANGED Viewed

@@ -2,11 +2,11 @@
 Gem::Specification.new do |s|
   s.name = %q{scrappy}
-  s.version = "0.1.16"
+  s.version = "0.1.17"
   s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
   s.authors = ["Jose Ignacio"]
-  s.date = %q{2011-02-09}
+  s.date = %q{2011-02-15}
   s.default_executable = %q{scrappy}
   s.description = %q{RDF web scraper}
   s.email = %q{joseignacio.fernandez@gmail.com}

metadata CHANGED Viewed

@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
   segments:
   - 0
   - 1
-  - 16
-  version: 0.1.16
+  - 17
+  version: 0.1.17
 platform: ruby
 authors:
 - Jose Ignacio
@@ -14,7 +14,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2011-02-09 00:00:00 +01:00
+date: 2011-02-15 00:00:00 +01:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency