scrappy 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/History.txt CHANGED
@@ -1,3 +1,7 @@
1
+ === 0.1.3 2010-11-18
2
+
3
+ * RDF node caching
4
+
1
5
  === 0.1.2 2010-11-03
2
6
 
3
7
  * Fix for script portability (shebang arguments)
data/README.rdoc CHANGED
@@ -129,7 +129,7 @@ scrappy offers many different interfaces to get RDF data from a web page:
129
129
  agent = scrappy::Agent.create :kb=>kb
130
130
 
131
131
  # Get RDF output
132
- output = agent.request :get, 'http://www.example.com'
132
+ output = agent.request :method=>:get, :uri=>'http://www.example.com'
133
133
 
134
134
  # Output all titles from the web page
135
135
  titles = output.find([], Node('dc:title'), nil)
data/bin/scrappy CHANGED
@@ -57,7 +57,7 @@ module Scrappy
57
57
  onload
58
58
  if Options.url
59
59
  Options.quiet = true
60
- puts Agent.create.proxy(:get, Options.url)
60
+ puts Agent.create.proxy(:http_method=>:get, :uri=>Options.url).output
61
61
  elsif Options.proxy
62
62
  puts "Launching Scrappy Web Proxy..."
63
63
  Camping::Server.new(OpenStruct.new(:host => 'localhost', :port => Options.port, :server=>'mongrel'), ["#{Scrappy::Root}/lib/scrappy/proxy.rb"]).start
data/lib/scrappy.rb CHANGED
@@ -19,7 +19,7 @@ require 'scrappy/agent/agent'
19
19
  Namespace :sc, 'http://lab.gsi.dit.upm.es/scraping.rdf#'
20
20
 
21
21
  module Scrappy
22
- VERSION = '0.1.2'
22
+ VERSION = '0.1.3'
23
23
  end
24
24
 
25
25
  # Require selectors
@@ -14,6 +14,9 @@ module Scrappy
14
14
  def self.[] id
15
15
  pool[id] || Agent.create(:id=>id)
16
16
  end
17
+ def self.cache
18
+ @cache ||= {}
19
+ end
17
20
 
18
21
  def self.create args={}
19
22
  if (args[:agent] || Options.agent) == :visual
@@ -25,7 +28,7 @@ module Scrappy
25
28
  end
26
29
  end
27
30
 
28
- attr_accessor :id, :output, :content_type, :status, :options, :kb
31
+ attr_accessor :id, :options, :kb
29
32
 
30
33
  def initialize args={}
31
34
  super()
@@ -35,56 +38,70 @@ module Scrappy
35
38
  @options = Options.clone
36
39
  end
37
40
 
38
- def request http_method, uri, inputs={}, depth=options.depth
41
+ def request args={}
39
42
  synchronize do
40
- uri = "#{uri}.com" if uri =~ /\A\w+\Z/
41
- uri = "http://#{uri}" if uri.index(/\A\w*:/) != 0
43
+ depth = args[:depth]
44
+ request = { :method=>:get, :inputs=>{} }.merge :method=>args[:method], :uri=>complete_uri(args[:uri]), :inputs=>args[:inputs]||{}
45
+
46
+ # Expire cache
47
+ Agent::cache.keys.each { |req| Agent::cache.delete(req) if Time.now.to_i - Agent::cache[req][:time].to_i > 300 }
42
48
 
43
- # Perform the request
44
- if http_method == :get
45
- self.uri = uri
46
- return RDF::Graph.new unless self.html_data?
49
+ # Lookup in cache
50
+ triples = if Agent::cache[request]
51
+ Agent::cache[request][:response]
47
52
  else
48
- raise Exception, 'POST requests not supported yet'
49
- end
53
+ # Perform the request
54
+ if request[:method] == :get
55
+ self.uri = request[:uri]
56
+ else
57
+ raise Exception, 'POST requests not supported yet'
58
+ end
59
+
60
+ response = if self.html_data?
61
+ add_visual_data! if options.referenceable # Adds tags including visual information
62
+ extract self.uri, html, options.referenceable # Extract data
63
+ else
64
+ []
65
+ end
50
66
 
51
- # Adds tags including visual information
52
- add_visual_data! if options.referenceable
67
+ # Cache the request
68
+ Agent::cache[request] = { :time=>Time.now, :response=>response }
69
+ Agent::cache[request.merge(:uri=>self.uri)] = { :time=>Time.now, :response=>response } unless self.uri.nil?
53
70
 
54
- # Extract data
55
- triples = extract self.uri, html, options.referenceable
71
+ response
72
+ end
56
73
 
57
74
  # Iterate through subresources
58
75
  if depth > 0
59
76
  uris = (triples.map{|t| [t[0],t[2]]}.flatten-[Node(self.uri)]).uniq.select{|n| n.is_a?(RDF::Node) and n.id.is_a?(URI)}.map(&:to_s)
60
77
  Agent.process(uris, :depth=>depth-1).each { |result| triples += result }
61
78
  end
79
+
62
80
  RDF::Graph.new(triples.uniq)
63
81
  end
64
82
  end
65
83
 
66
- def proxy http_method, uri, inputs={}, format=options.format, depth=options.depth
84
+ def proxy args={}
67
85
  synchronize do
68
- if @status == :redirect and uri == self.uri
69
- @status = :ok
70
- else
71
- @output = request(http_method, uri, inputs, depth).serialize(format)
72
- @content_type = ContentTypes[format] || 'text/plain'
73
- @status = if self.html_data?
74
- self.uri == uri ? :ok : :redirect
75
- else
76
- :error
77
- end
78
- end
86
+ request = { :method=>:get, :inputs=>{}, :format=>options.format, :depth=>options.depth }.merge(args)
79
87
 
80
- @output
88
+ OpenStruct.new :output => self.request(request).serialize(request[:format]),
89
+ :content_type => ContentTypes[request[:format]] || 'text/plain',
90
+ :uri => self.uri,
91
+ :status => self.html_data? ? (self.uri == request[:uri] ? :ok : :redirect) : :error
81
92
  end
82
93
  end
83
94
 
84
95
  # Method used when consuming a list of uris
85
96
  def process uri, args={}
86
97
  sleep 0.001 * options.delay.to_f
87
- request(:get, uri, {}, args[:depth]).triples
98
+ request(:method=>:get, :uri=>uri, :depth=>args[:depth]).triples
99
+ end
100
+
101
+ def complete_uri uri
102
+ uri = "#{uri}.com" if uri =~ /\A\w+\Z/
103
+ uri = "http://#{uri}" if uri.index(/\A\w*:/) != 0
104
+ uri
88
105
  end
89
106
  end
90
107
  end
data/lib/scrappy/proxy.rb CHANGED
@@ -18,15 +18,15 @@ module Scrappy
18
18
  end
19
19
 
20
20
  protected
21
- def process_request http_method
22
- agent.proxy http_method, request.env["REQUEST_URI"], @input
21
+ def process_request method
22
+ response = agent.proxy :method=>method, :uri=>request.env["REQUEST_URI"], :inputs=>@input
23
23
 
24
- case agent.status
24
+ case response.status
25
25
  when :redirect
26
- redirect agent.uri
26
+ redirect response.uri
27
27
  when :ok
28
- @headers['Content-Type'] = agent.content_type
29
- agent.output
28
+ @headers['Content-Type'] = response.content_type
29
+ response.output
30
30
  else
31
31
  @status = 500
32
32
  'Error'
@@ -48,16 +48,16 @@ module Scrappy
48
48
  end
49
49
 
50
50
  protected
51
- def process_request http_method, format, url
51
+ def process_request method, format, url
52
52
  callback = @input['callback']
53
- agent.proxy http_method, url, @input.reject{|k,v| k=='callback'}, format.to_sym
53
+ response = agent.proxy :method=>method, :uri=>url, :inputs=>@input.reject{|k,v| k=='callback'}, :format=>format.to_sym
54
54
 
55
- case agent.status
55
+ case response.status
56
56
  when :redirect
57
- redirect "/#{format}/#{agent.uri}#{inputs}"
57
+ redirect "/#{format}/#{response.uri}#{inputs}"
58
58
  when :ok
59
- @headers['Content-Type'] = agent.content_type
60
- callback ? "#{callback}(#{agent.output})" : agent.output
59
+ @headers['Content-Type'] = response.content_type
60
+ callback ? "#{callback}(#{response.output})" : response.output
61
61
  else
62
62
  @status = 500
63
63
  'Error'
data/lib/scrappy/shell.rb CHANGED
@@ -29,7 +29,7 @@ module Scrappy
29
29
  command = raw_command.strip
30
30
 
31
31
  code = if command =~ /\Aget\W(.*)\Z/
32
- puts @agent.proxy :get, $1
32
+ puts @agent.proxy(:uri=>$1).output
33
33
  puts
34
34
  elsif command == 'help'
35
35
  puts 'Available commands:'
data/scrappy.gemspec CHANGED
@@ -2,11 +2,11 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = %q{scrappy}
5
- s.version = "0.1.2"
5
+ s.version = "0.1.3"
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = ["Jose Ignacio"]
9
- s.date = %q{2010-11-03}
9
+ s.date = %q{2010-11-18}
10
10
  s.default_executable = %q{scrappy}
11
11
  s.description = %q{RDF web scraper}
12
12
  s.email = %q{joseignacio.fernandez@gmail.com}
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 1
8
- - 2
9
- version: 0.1.2
8
+ - 3
9
+ version: 0.1.3
10
10
  platform: ruby
11
11
  authors:
12
12
  - Jose Ignacio
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2010-11-03 00:00:00 +01:00
17
+ date: 2010-11-18 00:00:00 +01:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency