scrappy 0.1.2 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
data/History.txt CHANGED
@@ -1,3 +1,7 @@
1
+ === 0.1.3 2010-11-18
2
+
3
+ * RDF node caching (extracted triples are cached per request and expire after 300 seconds)
4
+
1
5
  === 0.1.2 2010-11-03
2
6
 
3
7
  * Fix for script portability (shebang arguments)
data/README.rdoc CHANGED
@@ -129,7 +129,7 @@ scrappy offers many different interfaces to get RDF data from a web page:
129
129
  agent = scrappy::Agent.create :kb=>kb
130
130
 
131
131
  # Get RDF output
132
- output = agent.request :get, 'http://www.example.com'
132
+ output = agent.request :method=>:get, :uri=>'http://www.example.com'
133
133
 
134
134
  # Output all titles from the web page
135
135
  titles = output.find([], Node('dc:title'), nil)
data/bin/scrappy CHANGED
@@ -57,7 +57,7 @@ module Scrappy
57
57
  onload
58
58
  if Options.url
59
59
  Options.quiet = true
60
- puts Agent.create.proxy(:get, Options.url)
60
+ puts Agent.create.proxy(:http_method=>:get, :uri=>Options.url).output
61
61
  elsif Options.proxy
62
62
  puts "Launching Scrappy Web Proxy..."
63
63
  Camping::Server.new(OpenStruct.new(:host => 'localhost', :port => Options.port, :server=>'mongrel'), ["#{Scrappy::Root}/lib/scrappy/proxy.rb"]).start
data/lib/scrappy.rb CHANGED
@@ -19,7 +19,7 @@ require 'scrappy/agent/agent'
19
19
  Namespace :sc, 'http://lab.gsi.dit.upm.es/scraping.rdf#'
20
20
 
21
21
  module Scrappy
22
- VERSION = '0.1.2'
22
+ VERSION = '0.1.3'
23
23
  end
24
24
 
25
25
  # Require selectors
@@ -14,6 +14,9 @@ module Scrappy
14
14
  def self.[] id
15
15
  pool[id] || Agent.create(:id=>id)
16
16
  end
17
+ def self.cache
18
+ @cache ||= {}
19
+ end
17
20
 
18
21
  def self.create args={}
19
22
  if (args[:agent] || Options.agent) == :visual
@@ -25,7 +28,7 @@ module Scrappy
25
28
  end
26
29
  end
27
30
 
28
- attr_accessor :id, :output, :content_type, :status, :options, :kb
31
+ attr_accessor :id, :options, :kb
29
32
 
30
33
  def initialize args={}
31
34
  super()
@@ -35,56 +38,70 @@ module Scrappy
35
38
  @options = Options.clone
36
39
  end
37
40
 
38
- def request http_method, uri, inputs={}, depth=options.depth
41
+ def request args={}
39
42
  synchronize do
40
- uri = "#{uri}.com" if uri =~ /\A\w+\Z/
41
- uri = "http://#{uri}" if uri.index(/\A\w*:/) != 0
43
+ depth = args[:depth]
44
+ request = { :method=>:get, :inputs=>{} }.merge :method=>args[:method], :uri=>complete_uri(args[:uri]), :inputs=>args[:inputs]||{}
45
+
46
+ # Expire cache
47
+ Agent::cache.keys.each { |req| Agent::cache.delete(req) if Time.now.to_i - Agent::cache[req][:time].to_i > 300 }
42
48
 
43
- # Perform the request
44
- if http_method == :get
45
- self.uri = uri
46
- return RDF::Graph.new unless self.html_data?
49
+ # Lookup in cache
50
+ triples = if Agent::cache[request]
51
+ Agent::cache[request][:response]
47
52
  else
48
- raise Exception, 'POST requests not supported yet'
49
- end
53
+ # Perform the request
54
+ if request[:method] == :get
55
+ self.uri = request[:uri]
56
+ else
57
+ raise Exception, 'POST requests not supported yet'
58
+ end
59
+
60
+ response = if self.html_data?
61
+ add_visual_data! if options.referenceable # Adds tags including visual information
62
+ extract self.uri, html, options.referenceable # Extract data
63
+ else
64
+ []
65
+ end
50
66
 
51
- # Adds tags including visual information
52
- add_visual_data! if options.referenceable
67
+ # Cache the request
68
+ Agent::cache[request] = { :time=>Time.now, :response=>response }
69
+ Agent::cache[request.merge(:uri=>self.uri)] = { :time=>Time.now, :response=>response } unless self.uri.nil?
53
70
 
54
- # Extract data
55
- triples = extract self.uri, html, options.referenceable
71
+ response
72
+ end
56
73
 
57
74
  # Iterate through subresources
58
75
  if depth > 0
59
76
  uris = (triples.map{|t| [t[0],t[2]]}.flatten-[Node(self.uri)]).uniq.select{|n| n.is_a?(RDF::Node) and n.id.is_a?(URI)}.map(&:to_s)
60
77
  Agent.process(uris, :depth=>depth-1).each { |result| triples += result }
61
78
  end
79
+
62
80
  RDF::Graph.new(triples.uniq)
63
81
  end
64
82
  end
65
83
 
66
- def proxy http_method, uri, inputs={}, format=options.format, depth=options.depth
84
+ def proxy args={}
67
85
  synchronize do
68
- if @status == :redirect and uri == self.uri
69
- @status = :ok
70
- else
71
- @output = request(http_method, uri, inputs, depth).serialize(format)
72
- @content_type = ContentTypes[format] || 'text/plain'
73
- @status = if self.html_data?
74
- self.uri == uri ? :ok : :redirect
75
- else
76
- :error
77
- end
78
- end
86
+ request = { :method=>:get, :inputs=>{}, :format=>options.format, :depth=>options.depth }.merge(args)
79
87
 
80
- @output
88
+ OpenStruct.new :output => self.request(request).serialize(request[:format]),
89
+ :content_type => ContentTypes[request[:format]] || 'text/plain',
90
+ :uri => self.uri,
91
+ :status => self.html_data? ? (self.uri == request[:uri] ? :ok : :redirect) : :error
81
92
  end
82
93
  end
83
94
 
84
95
  # Method used when consuming a list of uris
85
96
  def process uri, args={}
86
97
  sleep 0.001 * options.delay.to_f
87
- request(:get, uri, {}, args[:depth]).triples
98
+ request(:method=>:get, :uri=>uri, :depth=>args[:depth]).triples
99
+ end
100
+
101
+ def complete_uri uri
102
+ uri = "#{uri}.com" if uri =~ /\A\w+\Z/
103
+ uri = "http://#{uri}" if uri.index(/\A\w*:/) != 0
104
+ uri
88
105
  end
89
106
  end
90
107
  end
data/lib/scrappy/proxy.rb CHANGED
@@ -18,15 +18,15 @@ module Scrappy
18
18
  end
19
19
 
20
20
  protected
21
- def process_request http_method
22
- agent.proxy http_method, request.env["REQUEST_URI"], @input
21
+ def process_request method
22
+ response = agent.proxy :method=>method, :uri=>request.env["REQUEST_URI"], :inputs=>@input
23
23
 
24
- case agent.status
24
+ case response.status
25
25
  when :redirect
26
- redirect agent.uri
26
+ redirect response.uri
27
27
  when :ok
28
- @headers['Content-Type'] = agent.content_type
29
- agent.output
28
+ @headers['Content-Type'] = response.content_type
29
+ response.output
30
30
  else
31
31
  @status = 500
32
32
  'Error'
@@ -48,16 +48,16 @@ module Scrappy
48
48
  end
49
49
 
50
50
  protected
51
- def process_request http_method, format, url
51
+ def process_request method, format, url
52
52
  callback = @input['callback']
53
- agent.proxy http_method, url, @input.reject{|k,v| k=='callback'}, format.to_sym
53
+ response = agent.proxy :method=>method, :uri=>url, :inputs=>@input.reject{|k,v| k=='callback'}, :format=>format.to_sym
54
54
 
55
- case agent.status
55
+ case response.status
56
56
  when :redirect
57
- redirect "/#{format}/#{agent.uri}#{inputs}"
57
+ redirect "/#{format}/#{response.uri}#{inputs}"
58
58
  when :ok
59
- @headers['Content-Type'] = agent.content_type
60
- callback ? "#{callback}(#{agent.output})" : agent.output
59
+ @headers['Content-Type'] = response.content_type
60
+ callback ? "#{callback}(#{response.output})" : response.output
61
61
  else
62
62
  @status = 500
63
63
  'Error'
data/lib/scrappy/shell.rb CHANGED
@@ -29,7 +29,7 @@ module Scrappy
29
29
  command = raw_command.strip
30
30
 
31
31
  code = if command =~ /\Aget\W(.*)\Z/
32
- puts @agent.proxy :get, $1
32
+ puts @agent.proxy(:uri=>$1).output
33
33
  puts
34
34
  elsif command == 'help'
35
35
  puts 'Available commands:'
data/scrappy.gemspec CHANGED
@@ -2,11 +2,11 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = %q{scrappy}
5
- s.version = "0.1.2"
5
+ s.version = "0.1.3"
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = ["Jose Ignacio"]
9
- s.date = %q{2010-11-03}
9
+ s.date = %q{2010-11-18}
10
10
  s.default_executable = %q{scrappy}
11
11
  s.description = %q{RDF web scraper}
12
12
  s.email = %q{joseignacio.fernandez@gmail.com}
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 1
8
- - 2
9
- version: 0.1.2
8
+ - 3
9
+ version: 0.1.3
10
10
  platform: ruby
11
11
  authors:
12
12
  - Jose Ignacio
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2010-11-03 00:00:00 +01:00
17
+ date: 2010-11-18 00:00:00 +01:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency