RubyGems - scrappy - Versions diffs - 0.1.2 → 0.1.3 - Mend

scrappy 0.1.2 → 0.1.3

Files changed (10) hide show

data/History.txt CHANGED Viewed

@@ -1,3 +1,7 @@
+=== 0.1.3 2010-11-18
+* RDF node caching
 === 0.1.2 2010-11-03
 * Fix for script portability (shebang arguments)

data/README.rdoc CHANGED Viewed

@@ -129,7 +129,7 @@ scrappy offers many different interfaces to get RDF data from a web page:
     agent = scrappy::Agent.create :kb=>kb
     # Get RDF output
-    output = agent.request :get, 'http://www.example.com'
+    output = agent.request :method=>:get, :uri=>'http://www.example.com'
     # Output all titles from the web page
     titles = output.find([], Node('dc:title'), nil)

data/bin/scrappy CHANGED Viewed

@@ -57,7 +57,7 @@ module Scrappy
       onload
       if Options.url
         Options.quiet = true
-        puts Agent.create.proxy(:get, Options.url)
+        puts Agent.create.proxy(:http_method=>:get, :uri=>Options.url).output
       elsif Options.proxy
         puts "Launching Scrappy Web Proxy..."
         Camping::Server.new(OpenStruct.new(:host => 'localhost', :port => Options.port, :server=>'mongrel'), ["#{Scrappy::Root}/lib/scrappy/proxy.rb"]).start

data/lib/scrappy.rb CHANGED Viewed

@@ -19,7 +19,7 @@ require 'scrappy/agent/agent'
 Namespace :sc, 'http://lab.gsi.dit.upm.es/scraping.rdf#'
 module Scrappy
-  VERSION = '0.1.2'
+  VERSION = '0.1.3'
 end
 # Require selectors

data/lib/scrappy/agent/agent.rb CHANGED Viewed

@@ -14,6 +14,9 @@ module Scrappy
     def self.[] id
       pool[id] || Agent.create(:id=>id)
     end
+    def self.cache
+      @cache ||= {}
+    end
     def self.create args={}
       if (args[:agent] || Options.agent) == :visual
@@ -25,7 +28,7 @@ module Scrappy
       end
     end
-    attr_accessor :id, :output, :content_type, :status, :options, :kb
+    attr_accessor :id, :options, :kb
     def initialize args={}
       super()
@@ -35,56 +38,70 @@ module Scrappy
       @options = Options.clone
     end
-    def request http_method, uri, inputs={}, depth=options.depth
+    def request args={}
       synchronize do
-        uri = "#{uri}.com" if uri =~ /\A\w+\Z/
-        uri = "http://#{uri}" if uri.index(/\A\w*:/) != 0
+        depth = args[:depth]
+        request = { :method=>:get, :inputs=>{} }.merge :method=>args[:method], :uri=>complete_uri(args[:uri]), :inputs=>args[:inputs]||{}
+        # Expire cache
+        Agent::cache.keys.each { |req| Agent::cache.delete(req) if Time.now.to_i - Agent::cache[req][:time].to_i > 300 }
-        # Perform the request
-        if http_method == :get
-          self.uri = uri
-          return RDF::Graph.new unless self.html_data?
+        # Lookup in cache
+        triples = if Agent::cache[request]
+          Agent::cache[request][:response]
         else
-          raise Exception, 'POST requests not supported yet'
-        end
+          # Perform the request
+          if request[:method] == :get
+            self.uri = request[:uri]
+          else
+            raise Exception, 'POST requests not supported yet'
+          end
+          response = if self.html_data?
+            add_visual_data! if options.referenceable     # Adds tags including visual information
+            extract self.uri, html, options.referenceable # Extract data
+          else
+            []
+          end
-        # Adds tags including visual information
-        add_visual_data! if options.referenceable
+          # Cache the request
+          Agent::cache[request]                       = { :time=>Time.now, :response=>response }
+          Agent::cache[request.merge(:uri=>self.uri)] = { :time=>Time.now, :response=>response } unless self.uri.nil?
-        # Extract data
-        triples = extract self.uri, html, options.referenceable
+          response
+        end
         # Iterate through subresources
         if depth > 0
           uris = (triples.map{|t| [t[0],t[2]]}.flatten-[Node(self.uri)]).uniq.select{|n| n.is_a?(RDF::Node) and n.id.is_a?(URI)}.map(&:to_s)
           Agent.process(uris, :depth=>depth-1).each { |result| triples += result }
         end
         RDF::Graph.new(triples.uniq)
       end
     end
-    def proxy http_method, uri, inputs={}, format=options.format, depth=options.depth
+    def proxy args={}
       synchronize do
-        if @status == :redirect and uri == self.uri
-          @status = :ok
-        else
-          @output = request(http_method, uri, inputs, depth).serialize(format)
-          @content_type = ContentTypes[format] || 'text/plain'
-          @status = if self.html_data?
-            self.uri == uri ? :ok : :redirect
-          else
-            :error
-          end
-        end
+        request  = { :method=>:get, :inputs=>{}, :format=>options.format, :depth=>options.depth }.merge(args)
-        @output
+        OpenStruct.new :output => self.request(request).serialize(request[:format]),
+                       :content_type => ContentTypes[request[:format]] || 'text/plain',
+                       :uri => self.uri,
+                       :status => self.html_data? ? (self.uri == request[:uri] ? :ok : :redirect) : :error
       end
     end
     # Method used when consuming a list of uris
     def process uri, args={}
       sleep 0.001 * options.delay.to_f
-      request(:get, uri, {}, args[:depth]).triples
+      request(:method=>:get, :uri=>uri, :depth=>args[:depth]).triples
+    end
+    def complete_uri uri
+      uri = "#{uri}.com" if uri =~ /\A\w+\Z/
+      uri = "http://#{uri}" if uri.index(/\A\w*:/) != 0
+      uri
     end
   end
 end

data/lib/scrappy/proxy.rb CHANGED Viewed

@@ -18,15 +18,15 @@ module Scrappy
       end
       protected
-      def process_request http_method
-        agent.proxy http_method, request.env["REQUEST_URI"], @input
+      def process_request method
+        response = agent.proxy :method=>method, :uri=>request.env["REQUEST_URI"], :inputs=>@input
-        case agent.status
+        case response.status
         when :redirect
-          redirect agent.uri
+          redirect response.uri
         when :ok
-          @headers['Content-Type'] = agent.content_type
-          agent.output
+          @headers['Content-Type'] = response.content_type
+          response.output
         else
           @status = 500
           'Error'

data/lib/scrappy/server.rb CHANGED Viewed

@@ -48,16 +48,16 @@ module Scrappy
       end
       protected
-      def process_request http_method, format, url
+      def process_request method, format, url
         callback = @input['callback']
-        agent.proxy http_method, url, @input.reject{|k,v| k=='callback'}, format.to_sym
+        response = agent.proxy :method=>method, :uri=>url, :inputs=>@input.reject{|k,v| k=='callback'}, :format=>format.to_sym
-        case agent.status
+        case response.status
         when :redirect
-          redirect "/#{format}/#{agent.uri}#{inputs}"
+          redirect "/#{format}/#{response.uri}#{inputs}"
         when :ok
-          @headers['Content-Type'] = agent.content_type
-          callback ? "#{callback}(#{agent.output})" : agent.output
+          @headers['Content-Type'] = response.content_type
+          callback ? "#{callback}(#{response.output})" : response.output
         else
           @status = 500
           'Error'

data/lib/scrappy/shell.rb CHANGED Viewed

@@ -29,7 +29,7 @@ module Scrappy
       command = raw_command.strip
       code = if command =~ /\Aget\W(.*)\Z/
-        puts @agent.proxy :get, $1
+        puts @agent.proxy(:uri=>$1).output
         puts
       elsif command == 'help'
         puts 'Available commands:'

data/scrappy.gemspec CHANGED Viewed

@@ -2,11 +2,11 @@
 Gem::Specification.new do |s|
   s.name = %q{scrappy}
-  s.version = "0.1.2"
+  s.version = "0.1.3"
   s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
   s.authors = ["Jose Ignacio"]
-  s.date = %q{2010-11-03}
+  s.date = %q{2010-11-18}
   s.default_executable = %q{scrappy}
   s.description = %q{RDF web scraper}
   s.email = %q{joseignacio.fernandez@gmail.com}

metadata CHANGED Viewed

@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
   segments:
   - 0
   - 1
-  - 2
-  version: 0.1.2
+  - 3
+  version: 0.1.3
 platform: ruby
 authors:
 - Jose Ignacio
@@ -14,7 +14,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2010-11-03 00:00:00 +01:00
+date: 2010-11-18 00:00:00 +01:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency