RubyGems - scrappy - Versions diffs - 0.1.1 → 0.1.2 - Mend

scrappy 0.1.1 → 0.1.2

Files changed (15)

data/History.txt +5 -1
data/Manifest +8 -1
data/README.rdoc +6 -0
data/bin/scrappy +3 -2
data/lib/scrappy/agent/extractor.rb +14 -31
data/lib/scrappy/selectors/base_uri.rb +5 -0
data/lib/scrappy/selectors/css.rb +6 -0
data/lib/scrappy/selectors/root.rb +5 -0
data/lib/scrappy/selectors/slice.rb +8 -0
data/lib/scrappy/selectors/uri.rb +10 -0
data/lib/scrappy/selectors/uri_pattern.rb +10 -0
data/lib/scrappy/selectors/xpath.rb +15 -0
data/lib/scrappy.rb +5 -1
data/scrappy.gemspec +4 -4
metadata +18 -4

data/History.txt CHANGED Viewed

@@ -1,4 +1,8 @@
-=== 0.1.1 2010-09-30
+=== 0.1.2 2010-11-03
+* Fix for script portability (shebang arguments)
+=== 0.1.1 2010-10-30
 * Extremely basic annotator tool
 * Visual agent forces no concurrency to prevent memory leaks

data/Manifest CHANGED Viewed

@@ -1,4 +1,5 @@
 History.txt
+Manifest
 README.rdoc
 Rakefile
 bin/scrappy
@@ -11,10 +12,16 @@ lib/scrappy/agent/cluster.rb
 lib/scrappy/agent/extractor.rb
 lib/scrappy/agent/visual_agent.rb
 lib/scrappy/proxy.rb
+lib/scrappy/selectors/base_uri.rb
+lib/scrappy/selectors/css.rb
+lib/scrappy/selectors/root.rb
+lib/scrappy/selectors/slice.rb
+lib/scrappy/selectors/uri.rb
+lib/scrappy/selectors/uri_pattern.rb
+lib/scrappy/selectors/xpath.rb
 lib/scrappy/server.rb
 lib/scrappy/shell.rb
 lib/scrappy/support.rb
 lib/scrappy/webkit/webkit.rb
 test/test_helper.rb
 test/test_scrappy.rb
-Manifest

data/README.rdoc CHANGED Viewed

@@ -150,6 +150,12 @@ Additionally, some extra libraries are needed for certain features:
 * PNG output of RDF graphs requires Graphviz (in Debian systems: sudo aptitude install graphviz).
+== CONTRIBUTORS:
+* José Ignacio Fernández
+* Jacobo Blasco
 == LICENSE:
 (The MIT License)

data/bin/scrappy CHANGED Viewed

@@ -30,6 +30,7 @@ module Scrappy
       Options.port = 3434
       Options.concurrence = 10
       Agent::Options.depth = 1
+      args = ARGV.map { |arg| arg.split(" ") }.flatten
       OptionParser.new do |opts|
         opts.on('-V', '--version')              { output_version; exit 0 }
@@ -48,8 +49,8 @@ module Scrappy
         opts.on('-R', '--reference-all')        { Agent::Options.referenceable = :dump }
         opts.on('-w', '--window')               { Agent::Options.window = true }
         opts.on('-f FORMAT', '--format FORMAT') { |f| Agent::Options.format = f.to_sym }
-      end.parse!(ARGV)
-      @file = ARGV.shift
+      end.parse!(args)
+      @file = args.shift
     end
     def run

data/lib/scrappy/agent/extractor.rb CHANGED Viewed

@@ -5,10 +5,15 @@ module Scrappy
     def extract uri, html, referenceable=nil
       triples = []
       content = Nokogiri::HTML(html, nil, 'utf-8')
-      uri_selectors  = kb.find(nil, Node('rdf:type'), Node('sc:UriSelector')).select{ |n| n.rdf::value.include?(uri.match(/\A([^\?]*)(\?.*\Z)?/).captures.first) }
-      uri_selectors += kb.find(nil, Node('rdf:type'), Node('sc:UriPatternSelector')).select{|n| n.rdf::value.any?{|v| /\A#{v.gsub('.','\.').gsub('*', '.+')}\Z/ =~ uri} }
+      uri_selectors  = kb.find(nil, Node('rdf:type'), Node('sc:UriSelector')) + kb.find(nil, Node('rdf:type'), Node('sc:UriPatternSelector')).flatten.select do |uri_selector|
+        class_name = uri_selector.rdf::type.first.to_s.split('#').last
+        results = Kernel.const_get(class_name).filter uri_selector, {:content=>content, :uri=>uri}
+        !results.empty?
+      end
       fragments = uri_selectors.map { |uri_selector| kb.find(nil, Node('sc:selector'), uri_selector) }.flatten
       fragments.each do |fragment|
         extract_fragment fragment, :doc=>{:uri=>uri, :content=>content },
                                    :parent=>uri, :triples=>triples, :referenceable=>!referenceable.nil?
@@ -69,38 +74,16 @@ module Scrappy
     end
     def filter selector, doc
-      content = doc[:content]
-      uri = doc[:uri]
-      results = if selector.rdf::type.include?(Node('sc:CssSelector')) or
-         selector.rdf::type.include?(Node('sc:XPathSelector'))
-        selector.rdf::value.map do |pattern|
-          content.search(pattern).map do |result|
-            if selector.sc::attribute.first
-              # Select node's attribute if given
-              selector.sc::attribute.map { |attribute| { :uri=>uri, :content=>result, :value=>result[attribute] } }
-            else
-              # Select node
-              [ { :uri=>uri, :content=>result, :value=>result.text } ]
-            end
-          end
-        end.flatten
-      elsif selector.rdf::type.include?(Node('sc:SliceSelector'))
-        text = content.text
-        selector.rdf::value.map do |separator|
-          slices = text.split(separator)
-          selector.sc::index.map { |index| { :uri=>uri, :content=>content, :value=>slices[index.to_i].to_s.strip} }
-        end.flatten
+      # Take the selector's module name (e.g. "BaseUriSelector") from the fragment of its first rdf:type URI
+      class_name = selector.rdf::type.first.to_s.split('#').last
-      elsif selector.rdf::type.include?(Node('sc:BaseUriSelector'))
-        [ { :uri=>uri, :content=>content, :value=>uri } ]
+      # Process selector
+      results = Kernel.const_get(class_name).filter selector, doc
-      else
-        [ { :uri=>uri, :content=>content, :value=>content.text } ]
-      end
-      # Process nested selectors, if any
+      # Return results if no nested selectors
       return results if selector.sc::selector.empty?
+      # Process nested selectors
       results.map do |result|
         selector.sc::selector.map { |s| filter s, result }
       end.flatten

data/lib/scrappy/selectors/base_uri.rb ADDED Viewed

@@ -0,0 +1,5 @@
+module BaseUriSelector
+  def self.filter selector, doc
+    [ { :uri=>doc[:uri], :content=>doc[:content], :value=>doc[:uri] } ]
+  end
+end

data/lib/scrappy/selectors/css.rb ADDED Viewed

@@ -0,0 +1,6 @@
+module CssSelector
+  def self.filter selector, doc
+    # By using Nokogiri, CSS and XPath use the same search method
+    XPathSelector.filter selector, doc
+  end
+end

data/lib/scrappy/selectors/root.rb ADDED Viewed

@@ -0,0 +1,5 @@
+module RootSelector
+  def self.filter selector, doc
+    [ { :uri=>doc[:uri], :content=>doc[:content], :value=>doc[:content].text } ]
+  end
+end

data/lib/scrappy/selectors/slice.rb ADDED Viewed

@@ -0,0 +1,8 @@
+module SliceSelector
+  def self.filter selector, doc
+    selector.rdf::value.map do |separator|
+      slices = doc[:content].text.split(separator)
+      selector.sc::index.map { |index| { :uri=>doc[:uri], :content=>doc[:content], :value=>slices[index.to_i].to_s.strip} }
+    end.flatten
+  end
+end

data/lib/scrappy/selectors/uri.rb ADDED Viewed

@@ -0,0 +1,10 @@
+module UriSelector
+  def self.filter selector, doc
+    # Check if the UriSelector has this URI as value (without params: ?param1=value1&param2=value2)
+    if selector.rdf::value.include?(doc[:uri].match(/\A([^\?]*)(\?.*\Z)?/).captures.first)
+      [ { :uri=>doc[:uri], :content=>doc[:content], :value=>doc[:content].text } ]
+    else
+      []
+    end
+  end
+end

data/lib/scrappy/selectors/uri_pattern.rb ADDED Viewed

@@ -0,0 +1,10 @@
+module UriPatternSelector
+  def self.filter selector, doc
+    # Check if the uri fits the pattern
+    if selector.rdf::value.any? { |v| doc[:uri] =~ /\A#{v.gsub('.','\.').gsub('*', '.+')}\Z/ }
+      [ { :uri=>doc[:uri], :content=>doc[:content], :value=>doc[:content].text } ]
+    else
+      []
+    end
+  end
+end

data/lib/scrappy/selectors/xpath.rb ADDED Viewed

@@ -0,0 +1,15 @@
+module XPathSelector
+  def self.filter selector, doc
+    selector.rdf::value.map do |pattern|
+      doc[:content].search(pattern).map do |result|
+        if selector.sc::attribute.first
+          # Select node's attribute if given
+          selector.sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>result, :value=>result[attribute] } }
+        else
+          # Select node
+          [ { :uri=>doc[:uri], :content=>result, :value=>result.text } ]
+        end
+      end
+    end.flatten
+  end
+end

data/lib/scrappy.rb CHANGED Viewed

@@ -11,6 +11,7 @@ require 'tmpdir'
 require 'lightrdf'
 require 'scrappy/support'
 require 'scrappy/agent/extractor'
 require 'scrappy/agent/cluster'
 require 'scrappy/agent/agent'
@@ -18,5 +19,8 @@ require 'scrappy/agent/agent'
 Namespace :sc, 'http://lab.gsi.dit.upm.es/scraping.rdf#'
 module Scrappy
-  VERSION = '0.1.1'
+  VERSION = '0.1.2'
 end
+# Require selectors
+Dir["#{File.expand_path(File.dirname(__FILE__))}/scrappy/selectors/*.rb"].each { |f| require f }

data/scrappy.gemspec CHANGED Viewed

@@ -2,17 +2,17 @@
 Gem::Specification.new do |s|
   s.name = %q{scrappy}
-  s.version = "0.1.1"
+  s.version = "0.1.2"
   s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
   s.authors = ["Jose Ignacio"]
-  s.date = %q{2010-10-29}
+  s.date = %q{2010-11-03}
   s.default_executable = %q{scrappy}
   s.description = %q{RDF web scraper}
   s.email = %q{joseignacio.fernandez@gmail.com}
   s.executables = ["scrappy"]
-  s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cluster.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb"]
-  s.files = ["History.txt", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cluster.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb", "test/test_helper.rb", "test/test_scrappy.rb", "Manifest", "scrappy.gemspec"]
+  s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cluster.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb"]
+  s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cluster.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb", "test/test_helper.rb", "test/test_scrappy.rb", "scrappy.gemspec"]
   s.homepage = %q{http://github.com/josei/scrappy}
   s.post_install_message = %q{**(Optional) Remember to install rbwebkitgtk for visual parsing features**}
   s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Scrappy", "--main", "README.rdoc"]

metadata CHANGED Viewed

@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
   segments:
   - 0
   - 1
-  - 1
-  version: 0.1.1
+  - 2
+  version: 0.1.2
 platform: ruby
 authors:
 - Jose Ignacio
@@ -14,7 +14,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2010-10-29 00:00:00 +02:00
+date: 2010-11-03 00:00:00 +01:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -130,12 +130,20 @@ extra_rdoc_files:
 - lib/scrappy/agent/extractor.rb
 - lib/scrappy/agent/visual_agent.rb
 - lib/scrappy/proxy.rb
+- lib/scrappy/selectors/base_uri.rb
+- lib/scrappy/selectors/css.rb
+- lib/scrappy/selectors/root.rb
+- lib/scrappy/selectors/slice.rb
+- lib/scrappy/selectors/uri.rb
+- lib/scrappy/selectors/uri_pattern.rb
+- lib/scrappy/selectors/xpath.rb
 - lib/scrappy/server.rb
 - lib/scrappy/shell.rb
 - lib/scrappy/support.rb
 - lib/scrappy/webkit/webkit.rb
 files:
 - History.txt
+- Manifest
 - README.rdoc
 - Rakefile
 - bin/scrappy
@@ -148,13 +156,19 @@ files:
 - lib/scrappy/agent/extractor.rb
 - lib/scrappy/agent/visual_agent.rb
 - lib/scrappy/proxy.rb
+- lib/scrappy/selectors/base_uri.rb
+- lib/scrappy/selectors/css.rb
+- lib/scrappy/selectors/root.rb
+- lib/scrappy/selectors/slice.rb
+- lib/scrappy/selectors/uri.rb
+- lib/scrappy/selectors/uri_pattern.rb
+- lib/scrappy/selectors/xpath.rb
 - lib/scrappy/server.rb
 - lib/scrappy/shell.rb
 - lib/scrappy/support.rb
 - lib/scrappy/webkit/webkit.rb
 - test/test_helper.rb
 - test/test_scrappy.rb
-- Manifest
 - scrappy.gemspec
 has_rdoc: true
 homepage: http://github.com/josei/scrappy