scrappy 0.1.16 → 0.1.17

data/History.txt CHANGED
@@ -1,3 +1,8 @@
+ === 0.1.17 2011-02-15
+
+ * Enabling headless yarf serialization in shell mode
+ * Small fixes
+
  === 0.1.16 2011-02-09

  * Added support to RDF abbreviated output by loading kb's prefixes
data/README.rdoc CHANGED
@@ -123,13 +123,13 @@ scrappy offers many different interfaces to get RDF data from a web page:
    require 'scrappy'

    # Parse a knowledge base
-   kb = RDF::Parser.parse(:rdf, open("kb.rdf").read)
+   kb = RDF::Parser.parse :yarf, open("https://github.com/josei/scrappy/raw/master/kb/elmundo.yarf").read

    # Create an agent
-   agent = scrappy::Agent.create :kb=>kb
+   agent = Scrappy::Agent.create :kb=>kb

    # Get RDF output
-   output = agent.request :method=>:get, :uri=>'http://www.example.com'
+   output = agent.request :method=>:get, :uri=>'http://www.elmundo.es'

    # Output all titles from the web page
    titles = output.find([], Node('dc:title'), nil)
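Besides pointing the example at a published knowledge base, the rewrite fixes a real bug: `scrappy::Agent` is not a valid constant path in Ruby, so the old snippet raised a NameError before doing any work. A minimal illustration (stand-in module, just to show the constant lookup):

    module Scrappy; class Agent; end; end   # stand-in for the real gem

    Scrappy::Agent   # => Scrappy::Agent
    scrappy::Agent   # NameError: lowercase `scrappy' is parsed as a method call, not a constant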
data/bin/scrappy CHANGED
@@ -41,7 +41,7 @@ module Scrappy
    opts.on('-p URL', '--post URL') { |url| Options.url = url; Options.http_method=:post }
    opts.on('-D', '--dump') { Agent::Options.dump = true; Agent::Options.format = :rdf }
    opts.on('-u', '--debug') { Agent::Options.debug = true }
-   opts.on('-i', '--interactive') { Options.shell = true }
+   opts.on('-i', '--interactive') { Options.shell = true; Agent::Options.format_header = false }
    opts.on('-s', '--server') { Options.server = true }
    opts.on('-S', '--proxy-server') { Options.proxy = true }
    opts.on('-P P', '--port P') { |p| Options.port = p }
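This is the "headless yarf serialization in shell mode" entry from the changelog: passing -i now also clears the new format_header flag, so results printed in the interactive shell skip the YARF namespace header. A sketch of the equivalent from plain Ruby (names are exactly the ones this diff sets):

    require 'scrappy'

    # What -i now does before the shell starts: subsequent
    # serializations omit the YARF namespace header.
    Scrappy::Agent::Options.format_header = false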
@@ -109,8 +109,8 @@ Options
    -S, --proxy-server Runs web proxy
    -P, --port PORT Selects port number (default is 3434)
    -v, --visual Uses visual agent (slow)
-   -r, --reference Outputs referenceable data (requires -v)
-   -R, --reference-all Outputs all HTML referenceable data (requires -v)
+   -r, --reference Outputs referenceable data
+   -R, --reference-all Outputs all HTML referenceable data
    -w, --window Shows browser window (requires -v)

  Authors
data/lib/scrappy.rb CHANGED
@@ -21,7 +21,7 @@ require 'scrappy/agent/agent'
  Namespace :sc, 'http://lab.gsi.dit.upm.es/scraping.rdf#'

  module Scrappy
-   VERSION = '0.1.16'
+   VERSION = '0.1.17'
  end

  # Require selectors
@@ -5,7 +5,7 @@ module Scrappy
    include MapReduce
    include Cached

-   Options = OpenStruct.new :format=>:yarf, :depth=>0, :agent=>:blind, :delay=>0, :workers=>10
+   Options = OpenStruct.new :format=>:yarf, :format_header=>true, :depth=>0, :agent=>:blind, :delay=>0, :workers=>10
    ContentTypes = { :png => 'image/png', :rdfxml => 'application/rdf+xml',
                     :rdf => 'application/rdf+xml' }

@@ -41,7 +41,7 @@ module Scrappy
    end

    def map args, queue=nil
-     depth = args[:depth]
+     depth = args[:depth] || options.depth
      request = { :method=>args[:method]||:get, :uri=>complete_uri(args[:uri]), :inputs=>args[:inputs]||{} }

      # Expire cache
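Before this change, a call without an explicit :depth left depth as nil; now it falls back to the configured default (0, per the Options struct above). Hypothetical calls, for illustration only:

    agent.map :uri => 'http://www.example.com'               # depth => options.depth (0 by default)
    agent.map :uri => 'http://www.example.com', :depth => 2  # explicit depth still wins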
@@ -139,7 +139,7 @@ module Scrappy
      print "Serializing..."; $stdout.flush
    end

-   output = response.serialize request[:format]
+   output = response.serialize request[:format], @options.format_header
 
    puts 'done!'if options.debug

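The extra argument threads the new flag down to serialization, which is what makes the shell's headless mode take effect. Roughly (a sketch; response is the RDF graph the agent built, and the boolean is the format_header option set above):

    response.serialize :yarf, true   # full output, namespace header included (old behaviour)
    response.serialize :yarf, false  # headless output, as in interactive shell mode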
@@ -1,16 +1,20 @@
  module Scrappy
    module Formats

-     def format node, formats
+     def format node, formats, uri
        case formats.first
        when Node('sc:WikiText') then
          doc = Nokogiri::XML(node.to_html)
+         doc.search("a").each {|n| n.replace(Nokogiri::XML::Text.new(URI.parse(uri).merge(n["href"]).to_s, n.document)) }
          doc.search("h1").each {|n| n.replace(Nokogiri::XML::Text.new("= #{n.text.strip} =", n.document)) }
          doc.search("h2").each {|n| n.replace(Nokogiri::XML::Text.new("== #{n.text.strip} ==", n.document)) }
          doc.search("h3").each {|n| n.replace(Nokogiri::XML::Text.new("=== #{n.text.strip} ===", n.document)) }
          doc.search("h4").each {|n| n.replace(Nokogiri::XML::Text.new("==== #{n.text.strip} ====", n.document)) }
          doc.search("h5").each {|n| n.replace(Nokogiri::XML::Text.new("===== #{n.text.strip} =====", n.document)) }
          doc.search("b").each {|n| n.replace(Nokogiri::XML::Text.new("'''#{n.text.strip}'''", n.document)) }
+         doc.search("td").each {|n| n.replace(Nokogiri::XML::Text.new("<td>#{n.text.strip}</td>", n.document)) }
+         doc.search("tr").each {|n| n.replace(Nokogiri::XML::Text.new("<tr>#{n.text.strip}</tr>", n.document)) }
+         doc.search("table").each {|n| n.replace(Nokogiri::XML::Text.new("<table>#{n.text.strip}</table>", n.document)) }
          doc.search("li li li li li").each {|n| n.replace(Nokogiri::XML::Text.new("***** #{n.text.strip}", n.document)) }
          doc.search("li li li li").each {|n| n.replace(Nokogiri::XML::Text.new("**** #{n.text.strip}", n.document)) }
          doc.search("li li li").each {|n| n.replace(Nokogiri::XML::Text.new("*** #{n.text.strip}", n.document)) }
@@ -1,10 +1,12 @@
  module RootSelector
+   extend Scrappy::Formats
+
    def self.filter selector, doc
      if selector.sc::attribute.first
        # Select node's attribute if given
        selector.sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>doc[:content], :value=>doc[:content][attribute] } }
      else
-       [ { :uri=>doc[:uri], :content=>doc[:content], :value=>doc[:content].text } ]
+       [ { :uri=>doc[:uri], :content=>doc[:content], :value=>format(doc[:content], selector.sc::format, doc[:uri]) } ]
      end
    end
  end
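extend (rather than include) is what makes format callable here: filter is defined on the module itself, so the helper has to land on the module's singleton. A minimal illustration (hypothetical Helpers module, unrelated to the gem):

    module Helpers
      def shout(s) s.upcase end
    end

    module WithExtend
      extend Helpers
      def self.filter(s) shout(s) end   # works: shout is now a method of WithExtend itself
    end

    WithExtend.filter("hi")  # => "HI"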
@@ -5,9 +5,9 @@ module SectionSelector
    selector.rdf::value.map do |pattern|
      doc[:content].search('h1, h2, h3, h4, h5, h6, h7, h8, h9, h10').select { |n| n.parent.name!='script' and n.text.downcase.strip == pattern }.map do |node|
        found = false
-       content = node.parent.children[node.parent.children.index(node)+1..-1].select { |n| found ||= (n.name==node.name); !found }
+       content = node.parent.children[node.parent.children.index(node)+1..-1].select { |n| found ||= (n.name==node.name or n.name=='div'); !found }

-       [ { :uri=>doc[:uri], :content=>content, :value=>content.map{|t| format(t, selector.sc::format)}.select{|t| t.to_s.strip!=''}*"\n\n" } ]
+       [ { :uri=>doc[:uri], :content=>content, :value=>content.map{|t| format(t, selector.sc::format, doc[:uri])}.select{|t| t.to_s.strip!=''}*"\n\n" } ]
      end
    end.flatten
  end
@@ -15,7 +15,7 @@ module XPathSelector
      selector.sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>result, :value=>result[attribute] } }
    else
      # Select node
-     [ { :uri=>doc[:uri], :content=>result, :value=>format(result, selector.sc::format) } ]
+     [ { :uri=>doc[:uri], :content=>result, :value=>format(result, selector.sc::format, doc[:uri]) } ]
    end
  end
  end.flatten
data/scrappy.gemspec CHANGED
@@ -2,11 +2,11 @@

  Gem::Specification.new do |s|
    s.name = %q{scrappy}
-   s.version = "0.1.16"
+   s.version = "0.1.17"

    s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
    s.authors = ["Jose Ignacio"]
-   s.date = %q{2011-02-09}
+   s.date = %q{2011-02-15}
    s.default_executable = %q{scrappy}
    s.description = %q{RDF web scraper}
    s.email = %q{joseignacio.fernandez@gmail.com}
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
    segments:
    - 0
    - 1
-   - 16
-   version: 0.1.16
+   - 17
+   version: 0.1.17
  platform: ruby
  authors:
  - Jose Ignacio
@@ -14,7 +14,7 @@ autorequire:
  bindir: bin
  cert_chain: []

- date: 2011-02-09 00:00:00 +01:00
+ date: 2011-02-15 00:00:00 +01:00
  default_executable:
  dependencies:
  - !ruby/object:Gem::Dependency