scrappy 0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +3 -0
- data/Manifest.txt +19 -0
- data/README.rdoc +176 -0
- data/Rakefile +20 -0
- data/bin/scrappy +228 -0
- data/kb/elmundo.yarf +92 -0
- data/lib/scrappy.rb +22 -0
- data/lib/scrappy/agent/agent.rb +90 -0
- data/lib/scrappy/agent/blind_agent.rb +34 -0
- data/lib/scrappy/agent/cluster.rb +35 -0
- data/lib/scrappy/agent/extractor.rb +159 -0
- data/lib/scrappy/agent/visual_agent.rb +72 -0
- data/lib/scrappy/proxy.rb +41 -0
- data/lib/scrappy/server.rb +77 -0
- data/lib/scrappy/shell.rb +70 -0
- data/lib/scrappy/support.rb +18 -0
- data/lib/scrappy/webkit/webkit.rb +18 -0
- data/test/test_helper.rb +3 -0
- data/test/test_scrappy.rb +11 -0
- metadata +233 -0
data/lib/scrappy/proxy.rb ADDED
@@ -0,0 +1,41 @@
+require 'camping'
+require 'camping/session'
+require 'open3'
+
+Camping.goes :Scrappy
+
+module Scrappy
+  module Controllers
+    class Index < R '.*'
+      include InputEscaping
+
+      def get
+        process_request :get
+      end
+
+      def post
+        process_request :post
+      end
+
+      protected
+      def process_request http_method
+        agent.proxy http_method, request.env["REQUEST_URI"], @input
+
+        case agent.status
+        when :redirect
+          redirect agent.uri
+        when :ok
+          @headers['Content-Type'] = agent.content_type
+          agent.output
+        else
+          @status = 500
+          'Error'
+        end
+      end
+
+      def agent
+        Scrappy::Agent[@request.env["REMOTE_ADDR"]]
+      end
+    end
+  end
+end
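proxy.rb exposes each Scrappy agent as an HTTP proxy: the catch-all Index controller forwards whatever URL arrives in REQUEST_URI to an agent keyed by the client's REMOTE_ADDR, so the same client keeps reusing the same agent. A minimal client sketch, assuming the proxy app is running on localhost port 8080 (the port is hypothetical; it is chosen by bin/scrappy, which is not part of this hunk):

    require 'net/http'

    # Hypothetical host/port; bin/scrappy decides where the proxy listens.
    proxy_host, proxy_port = 'localhost', 8080

    # With a proxy configured, Net::HTTP puts the absolute URI in the request
    # line, which is what the Index controller reads from REQUEST_URI.
    response = Net::HTTP.new('www.elmundo.es', 80, proxy_host, proxy_port).get('/')
    puts response['Content-Type']  # content type reported by the agent
    puts response.body             # the agent's output for the proxied page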
data/lib/scrappy/server.rb ADDED
@@ -0,0 +1,77 @@
+require 'camping'
+require 'camping/session'
+require 'open3'
+
+Camping.goes :Scrappy
+
+module Scrappy
+  include Camping::Session
+  secret '1a36591bceec49c832079e270d7e8b73'
+
+  module Controllers
+    class Index
+      def get
+        mab do
+          html do
+            head {}
+            body do
+              h1 "Scrappy Web Server"
+              p "Use following URL format: http://[host]/[format]/[url]"
+              p do
+                "For example: " + a("http://localhost:3434/rdfxml/http://www.google.com",
+                                    :href=>"http://localhost:3434/rdfxml/http://www.google.com")
+              end
+              p do
+                "Remember to escape parameters: " +
+                "http://www.example.com/~user/%3Ftest%3D1%26test1%3D2<br/> or<br/> " +
+                "http%3A%2F%2Fwww.example.com%2F~user%2F%3Ftest%3D1%26test1%3D2<br/>" +
+                "instead of<br/> http://www.example.com/~user/?test=1&test1=2"
+              end
+              p do
+                "Available formats are png, yarf, rdfxml, ntriples, turtle, json, jsonrdf, ejson"
+              end
+            end
+          end
+        end
+      end
+    end
+
+    class Extract < R '/(\w+)/(.+)'
+      include InputEscaping
+
+      def get format, url
+        process_request :get, format, url
+      end
+
+      def post format, url
+        process_request :post, format, url
+      end
+
+      protected
+      def process_request http_method, format, url
+        callback = @input['callback']
+        agent.proxy http_method, url, @input.reject{|k,v| k=='callback'}, format.to_sym
+
+        case agent.status
+        when :redirect
+          redirect "/#{format}/#{agent.uri}#{inputs}"
+        when :ok
+          @headers['Content-Type'] = agent.content_type
+          callback ? "#{callback}(#{agent.output})" : agent.output
+        else
+          @status = 500
+          'Error'
+        end
+      end
+
+      def agent
+        return @agent if @agent
+        if @state[:agent].nil? || @state[:token] != SESSION_TOKEN
+          @state[:token] = SESSION_TOKEN
+          @state[:agent] = Scrappy::Agent.create.id
+        end
+        @agent = Scrappy::Agent[@state[:agent]]
+      end
+    end
+  end
+end
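server.rb is the REST front end: the Extract controller maps /[format]/[url] to an extraction, keeps one agent per session, and wraps the output in a JSONP callback when a callback parameter is supplied. A client sketch, assuming the server is running locally on port 3434 (the port used in its own example links):

    require 'open-uri'
    require 'cgi'

    # The target URL must be escaped, exactly as the index page explains.
    target = 'http://www.example.com/~user/?test=1&test1=2'

    # N-Triples output; any format listed on the index page works the same way.
    puts open("http://localhost:3434/ntriples/#{CGI.escape(target)}").read

    # JSON output wrapped in a JSONP callback via the 'callback' parameter.
    puts open("http://localhost:3434/json/#{CGI.escape(target)}?callback=handleTriples").read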
data/lib/scrappy/shell.rb ADDED
@@ -0,0 +1,70 @@
+module Scrappy
+  class Shell
+    def initialize file=nil
+      @agent = Agent.create
+      @file = file
+    end
+
+    def run
+      commands = ['get', 'put', 'help']
+
+      Readline.completion_append_character = " "
+      Readline.completer_word_break_characters = ""
+      Readline.completion_proc = proc { |line| commands.grep(/^#{Regexp.escape(line)}/).sort }
+
+      if @file
+        open(@file, 'r').lines.each do |line|
+          break if process(line) == :quit
+        end
+      else
+        begin
+          line = Readline.readline(bash, true)
+          code = process line.nil? ? (puts 'quit' unless Options.quiet; 'quit') : line
+        end while code != :quit
+      end
+    end
+
+    protected
+    def process raw_command
+      command = raw_command.strip
+
+      code = if command =~ /\Aget\W(.*)\Z/
+        puts @agent.proxy :get, $1
+        puts ''
+      elsif command == 'help'
+        puts 'Available commands:'
+        puts ' get URL: Visit the specified URL'
+        puts ' help: Show this information'
+        puts ' quit: Exit scrappy shell'
+        puts ''
+      elsif command == 'quit'
+        :quit
+      elsif command == '' or command[0..0] == '#'
+        nil
+      else
+        puts "ERROR: Unknown command '#{command}'"
+        puts ''
+      end
+      code
+    end
+
+    def bash
+      return '' if Options.quiet
+      location = if @agent.uri
+        uri = URI::parse(@agent.uri)
+        path = uri.path.to_s
+        path = path[0..0] + "..." + path[-16..-1] if path.size > 20
+        if uri.query
+          query = "?" + uri.query
+          query = "?..." + query[-10..-1] if query.size > 13
+        else
+          query = ""
+        end
+        "#{uri.base}#{path}#{query}"
+      else
+        ''
+      end
+      "#{location}$ "
+    end
+  end
+end
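shell.rb implements the interactive scrappy shell: Readline completion for its commands, a prompt built from the agent's current location, and a batch mode that replays commands from a file, skipping blank lines and '#' comments. A sketch of the batch mode, assuming lib/scrappy.rb (not shown in this hunk) has been required so that Scrappy::Shell, its agents, and Options are set up:

    require 'scrappy'

    # Write a small command script: 'get URL' visits a page, '#' starts a
    # comment, and 'quit' ends the session.
    File.open('titles.scrappy', 'w') do |f|
      f.puts '# dump RDF for the front page'
      f.puts 'get http://www.elmundo.es/'
      f.puts 'quit'
    end

    # Replays the script line by line through Shell#process.
    Scrappy::Shell.new('titles.scrappy').run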
data/lib/scrappy/support.rb ADDED
@@ -0,0 +1,18 @@
+require 'open-uri'
+require 'net/http'
+require 'net/https'
+
+module URI
+  def base
+    self.to_s.split('/')[0..2] * '/'
+  end
+end
+
+module Scrappy
+  module InputEscaping
+    def inputs
+      return '' if @input.empty?
+      "?" + (@input.map{|k,v| "#{CGI.escape(k)}=#{CGI.escape(v)}"}*'')
+    end
+  end
+end
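support.rb patches URI with a base helper (used by the shell's prompt) that keeps only the scheme and host of a URI string, and defines InputEscaping#inputs, which the server uses to rebuild a query string on redirects. What base computes, written out inline:

    require 'uri'

    uri = URI.parse('http://www.elmundo.es/america/2010/index.html?s=1')
    # Splitting on '/' and keeping the first three pieces leaves scheme + host:
    uri.to_s.split('/')[0..2] * '/'  #=> "http://www.elmundo.es"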
data/lib/scrappy/webkit/webkit.rb ADDED
@@ -0,0 +1,18 @@
+require 'gtk2'
+module Gtk
+  module WebKit
+  end
+end
+
+require 'rbwebkitgtk.so'
+
+class Gtk::WebKit::WebView
+  alias :load_html_string_no_defaults :load_html_string
+  def load_html_string(content, base_uri=nil)
+    load_html_string_no_defaults(content, base_uri)
+  end
+
+  def mark_text_matches(test, case_sensitive=false, limit=0)
+    mark_text_matches_with_limit(test, case_sensitive, limit)
+  end
+end
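webkit.rb only works when the native rbwebkitgtk binding is installed, which the gem's post-install message marks as optional. A defensive-loading sketch (structure hypothetical; the gem's actual loader lives in lib/scrappy.rb, which is not shown here) that keeps the non-visual agent usable without GTK:

    begin
      require 'scrappy/webkit/webkit'
      visual = true
    rescue LoadError
      # rbwebkitgtk (or gtk2) is missing; fall back to the blind agent.
      visual = false
    end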
data/test/test_helper.rb ADDED
metadata ADDED
@@ -0,0 +1,233 @@
+--- !ruby/object:Gem::Specification
+name: scrappy
+version: !ruby/object:Gem::Version
+  prerelease: false
+  segments:
+  - 0
+  - 1
+  version: "0.1"
+platform: ruby
+authors:
+- Jose Ignacio
+autorequire:
+bindir: bin
+cert_chain: []
+
+date: 2010-10-07 00:00:00 +02:00
+default_executable:
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: activesupport
+  prerelease: false
+  requirement: &id001 !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        segments:
+        - 2
+        - 3
+        - 5
+        version: 2.3.5
+  type: :runtime
+  version_requirements: *id001
+- !ruby/object:Gem::Dependency
+  name: markaby
+  prerelease: false
+  requirement: &id002 !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        segments:
+        - 0
+        - 7
+        - 1
+        version: 0.7.1
+  type: :runtime
+  version_requirements: *id002
+- !ruby/object:Gem::Dependency
+  name: camping
+  prerelease: false
+  requirement: &id003 !ruby/object:Gem::Requirement
+    requirements:
+    - - "="
+      - !ruby/object:Gem::Version
+        segments:
+        - 2
+        - 0
+        version: "2.0"
+  type: :runtime
+  version_requirements: *id003
+- !ruby/object:Gem::Dependency
+  name: nokogiri
+  prerelease: false
+  requirement: &id004 !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        segments:
+        - 1
+        - 4
+        - 1
+        version: 1.4.1
+  type: :runtime
+  version_requirements: *id004
+- !ruby/object:Gem::Dependency
+  name: mechanize
+  prerelease: false
+  requirement: &id005 !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        segments:
+        - 1
+        - 0
+        - 0
+        version: 1.0.0
+  type: :runtime
+  version_requirements: *id005
+- !ruby/object:Gem::Dependency
+  name: lightrdf
+  prerelease: false
+  requirement: &id006 !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        segments:
+        - 0
+        - 1
+        version: "0.1"
+  type: :runtime
+  version_requirements: *id006
+- !ruby/object:Gem::Dependency
+  name: rubyforge
+  prerelease: false
+  requirement: &id007 !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        segments:
+        - 2
+        - 0
+        - 4
+        version: 2.0.4
+  type: :development
+  version_requirements: *id007
+- !ruby/object:Gem::Dependency
+  name: hoe
+  prerelease: false
+  requirement: &id008 !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        segments:
+        - 2
+        - 6
+        - 0
+        version: 2.6.0
+  type: :development
+  version_requirements: *id008
+description: |-
+  Scrappy is a tool that allows extracting information from web pages and producing RDF data.
+  It uses the scraping ontology to define the mappings between HTML contents and RDF data.
+
+  An example of mapping is shown next, which allows extracting all titles from http://www.elmundo.es:
+
+  dc: http://purl.org/dc/elements/1.1/
+  rdf: http://www.w3.org/1999/02/22-rdf-syntax-ns#
+  sioc: http://rdfs.org/sioc/ns#
+  sc: http://lab.gsi.dit.upm.es/scraping.rdf#
+  *:
+    rdf:type: sc:Fragment
+    sc:selector:
+      *:
+        rdf:type: sc:UriSelector
+        rdf:value: "http://www.elmundo.es/"
+    sc:identifier:
+      *:
+        rdf:type: sc:BaseUriSelector
+    sc:subfragment:
+      *:
+        sc:type: sioc:Post
+        sc:selector:
+          *:
+            rdf:type: sc:CssSelector
+            rdf:value: ".noticia h2, .noticia h3, .noticia h4"
+        sc:identifier:
+          *:
+            rdf:type: sc:CssSelector
+            rdf:value: "a"
+            sc:attribute: "href"
+        sc:subfragment:
+          *:
+            sc:type: rdf:Literal
+            sc:relation: dc:title
+            sc:selector:
+              *:
+                rdf:type: sc:CssSelector
+                rdf:value: "a"
+
+  (The above code is serialized using YARF format, supported by LightRDF gem, as well as
+  RDFXML, JSON, NTriples formats, which can also be used to define the mappings).
+email:
+- joseignacio.fernandez@gmail.com
+executables:
+- scrappy
+extensions: []
+
+extra_rdoc_files:
+- History.txt
+- Manifest.txt
+files:
+- History.txt
+- Manifest.txt
+- README.rdoc
+- Rakefile
+- bin/scrappy
+- kb/elmundo.yarf
+- lib/scrappy.rb
+- lib/scrappy/agent/agent.rb
+- lib/scrappy/agent/blind_agent.rb
+- lib/scrappy/agent/cluster.rb
+- lib/scrappy/agent/extractor.rb
+- lib/scrappy/agent/visual_agent.rb
+- lib/scrappy/proxy.rb
+- lib/scrappy/server.rb
+- lib/scrappy/shell.rb
+- lib/scrappy/support.rb
+- lib/scrappy/webkit/webkit.rb
+- test/test_helper.rb
+- test/test_scrappy.rb
+has_rdoc: true
+homepage: http://github.com/josei/scrappy
+licenses: []
+
+post_install_message: "**(Optional) Remember to install rbwebkitgtk for visual parsing features**"
+rdoc_options:
+- --main
+- README.rdoc
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      segments:
+      - 0
+      version: "0"
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      segments:
+      - 0
+      version: "0"
+requirements: []
+
+rubyforge_project: scrappy
+rubygems_version: 1.3.6
+signing_key:
+specification_version: 3
+summary: Web scraper that allows producing RDF data out of plain web pages
+test_files:
+- test/test_scrappy.rb
+- test/test_helper.rb
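The gemspec's description documents the scraping ontology mapping shipped in kb/elmundo.yarf: pages matching the UriSelector become sc:Fragment resources, each '.noticia' heading becomes a sioc:Post identified by its link, and the link text becomes its dc:title. A hedged end-to-end sketch, assuming the gem is installed and its web server (lib/scrappy/server.rb) is running on port 3434 with that mapping loaded:

    require 'open-uri'
    require 'cgi'

    # Ask the running scrappy server for N-Triples extracted from elmundo.es
    # and keep only the dc:title statements produced by the mapping above.
    data = open("http://localhost:3434/ntriples/#{CGI.escape('http://www.elmundo.es/')}").read
    data.each_line do |triple|
      puts triple if triple.include?('http://purl.org/dc/elements/1.1/title')
    end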