scrappy 0.1.2 → 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +4 -0
- data/README.rdoc +1 -1
- data/bin/scrappy +1 -1
- data/lib/scrappy.rb +1 -1
- data/lib/scrappy/agent/agent.rb +45 -28
- data/lib/scrappy/proxy.rb +6 -6
- data/lib/scrappy/server.rb +6 -6
- data/lib/scrappy/shell.rb +1 -1
- data/scrappy.gemspec +2 -2
- metadata +3 -3
data/History.txt
CHANGED
data/README.rdoc
CHANGED
@@ -129,7 +129,7 @@ scrappy offers many different interfaces to get RDF data from a web page:
|
|
129
129
|
agent = Scrappy::Agent.create :kb=>kb
|
130
130
|
|
131
131
|
# Get RDF output
|
132
|
-
output = agent.request :get, 'http://www.example.com'
|
132
|
+
output = agent.request :method=>:get, :uri=>'http://www.example.com'
|
133
133
|
|
134
134
|
# Output all titles from the web page
|
135
135
|
titles = output.find([], Node('dc:title'), nil)
|
data/bin/scrappy
CHANGED
@@ -57,7 +57,7 @@ module Scrappy
|
|
57
57
|
onload
|
58
58
|
if Options.url
|
59
59
|
Options.quiet = true
|
60
|
-
puts Agent.create.proxy(:get, Options.url)
|
60
|
+
puts Agent.create.proxy(:http_method=>:get, :uri=>Options.url).output
|
61
61
|
elsif Options.proxy
|
62
62
|
puts "Launching Scrappy Web Proxy..."
|
63
63
|
Camping::Server.new(OpenStruct.new(:host => 'localhost', :port => Options.port, :server=>'mongrel'), ["#{Scrappy::Root}/lib/scrappy/proxy.rb"]).start
|
data/lib/scrappy.rb
CHANGED
data/lib/scrappy/agent/agent.rb
CHANGED
@@ -14,6 +14,9 @@ module Scrappy
|
|
14
14
|
def self.[] id
|
15
15
|
pool[id] || Agent.create(:id=>id)
|
16
16
|
end
|
17
|
+
def self.cache
|
18
|
+
@cache ||= {}
|
19
|
+
end
|
17
20
|
|
18
21
|
def self.create args={}
|
19
22
|
if (args[:agent] || Options.agent) == :visual
|
@@ -25,7 +28,7 @@ module Scrappy
|
|
25
28
|
end
|
26
29
|
end
|
27
30
|
|
28
|
-
attr_accessor :id, :
|
31
|
+
attr_accessor :id, :options, :kb
|
29
32
|
|
30
33
|
def initialize args={}
|
31
34
|
super()
|
@@ -35,56 +38,70 @@ module Scrappy
|
|
35
38
|
@options = Options.clone
|
36
39
|
end
|
37
40
|
|
38
|
-
def request
|
41
|
+
def request args={}
|
39
42
|
synchronize do
|
40
|
-
|
41
|
-
|
43
|
+
depth = args[:depth]
|
44
|
+
request = { :method=>:get, :inputs=>{} }.merge :method=>args[:method], :uri=>complete_uri(args[:uri]), :inputs=>args[:inputs]||{}
|
45
|
+
|
46
|
+
# Expire cache
|
47
|
+
Agent::cache.keys.each { |req| Agent::cache.delete(req) if Time.now.to_i - Agent::cache[req][:time].to_i > 300 }
|
42
48
|
|
43
|
-
#
|
44
|
-
|
45
|
-
|
46
|
-
return RDF::Graph.new unless self.html_data?
|
49
|
+
# Lookup in cache
|
50
|
+
triples = if Agent::cache[request]
|
51
|
+
Agent::cache[request][:response]
|
47
52
|
else
|
48
|
-
|
49
|
-
|
53
|
+
# Perform the request
|
54
|
+
if request[:method] == :get
|
55
|
+
self.uri = request[:uri]
|
56
|
+
else
|
57
|
+
raise Exception, 'POST requests not supported yet'
|
58
|
+
end
|
59
|
+
|
60
|
+
response = if self.html_data?
|
61
|
+
add_visual_data! if options.referenceable # Adds tags including visual information
|
62
|
+
extract self.uri, html, options.referenceable # Extract data
|
63
|
+
else
|
64
|
+
[]
|
65
|
+
end
|
50
66
|
|
51
|
-
|
52
|
-
|
67
|
+
# Cache the request
|
68
|
+
Agent::cache[request] = { :time=>Time.now, :response=>response }
|
69
|
+
Agent::cache[request.merge(:uri=>self.uri)] = { :time=>Time.now, :response=>response } unless self.uri.nil?
|
53
70
|
|
54
|
-
|
55
|
-
|
71
|
+
response
|
72
|
+
end
|
56
73
|
|
57
74
|
# Iterate through subresources
|
58
75
|
if depth > 0
|
59
76
|
uris = (triples.map{|t| [t[0],t[2]]}.flatten-[Node(self.uri)]).uniq.select{|n| n.is_a?(RDF::Node) and n.id.is_a?(URI)}.map(&:to_s)
|
60
77
|
Agent.process(uris, :depth=>depth-1).each { |result| triples += result }
|
61
78
|
end
|
79
|
+
|
62
80
|
RDF::Graph.new(triples.uniq)
|
63
81
|
end
|
64
82
|
end
|
65
83
|
|
66
|
-
def proxy
|
84
|
+
def proxy args={}
|
67
85
|
synchronize do
|
68
|
-
|
69
|
-
@status = :ok
|
70
|
-
else
|
71
|
-
@output = request(http_method, uri, inputs, depth).serialize(format)
|
72
|
-
@content_type = ContentTypes[format] || 'text/plain'
|
73
|
-
@status = if self.html_data?
|
74
|
-
self.uri == uri ? :ok : :redirect
|
75
|
-
else
|
76
|
-
:error
|
77
|
-
end
|
78
|
-
end
|
86
|
+
request = { :method=>:get, :inputs=>{}, :format=>options.format, :depth=>options.depth }.merge(args)
|
79
87
|
|
80
|
-
|
88
|
+
OpenStruct.new :output => self.request(request).serialize(request[:format]),
|
89
|
+
:content_type => ContentTypes[request[:format]] || 'text/plain',
|
90
|
+
:uri => self.uri,
|
91
|
+
:status => self.html_data? ? (self.uri == request[:uri] ? :ok : :redirect) : :error
|
81
92
|
end
|
82
93
|
end
|
83
94
|
|
84
95
|
# Method used when consuming a list of uris
|
85
96
|
def process uri, args={}
|
86
97
|
sleep 0.001 * options.delay.to_f
|
87
|
-
request(:get, uri,
|
98
|
+
request(:method=>:get, :uri=>uri, :depth=>args[:depth]).triples
|
99
|
+
end
|
100
|
+
|
101
|
+
def complete_uri uri
|
102
|
+
uri = "#{uri}.com" if uri =~ /\A\w+\Z/
|
103
|
+
uri = "http://#{uri}" if uri.index(/\A\w*:/) != 0
|
104
|
+
uri
|
88
105
|
end
|
89
106
|
end
|
90
107
|
end
|
data/lib/scrappy/proxy.rb
CHANGED
@@ -18,15 +18,15 @@ module Scrappy
|
|
18
18
|
end
|
19
19
|
|
20
20
|
protected
|
21
|
-
def process_request
|
22
|
-
agent.proxy
|
21
|
+
def process_request method
|
22
|
+
response = agent.proxy :method=>method, :uri=>request.env["REQUEST_URI"], :inputs=>@input
|
23
23
|
|
24
|
-
case
|
24
|
+
case response.status
|
25
25
|
when :redirect
|
26
|
-
redirect
|
26
|
+
redirect response.uri
|
27
27
|
when :ok
|
28
|
-
@headers['Content-Type'] =
|
29
|
-
|
28
|
+
@headers['Content-Type'] = response.content_type
|
29
|
+
response.output
|
30
30
|
else
|
31
31
|
@status = 500
|
32
32
|
'Error'
|
data/lib/scrappy/server.rb
CHANGED
@@ -48,16 +48,16 @@ module Scrappy
|
|
48
48
|
end
|
49
49
|
|
50
50
|
protected
|
51
|
-
def process_request
|
51
|
+
def process_request method, format, url
|
52
52
|
callback = @input['callback']
|
53
|
-
agent.proxy
|
53
|
+
response = agent.proxy :method=>method, :uri=>url, :inputs=>@input.reject{|k,v| k=='callback'}, :format=>format.to_sym
|
54
54
|
|
55
|
-
case
|
55
|
+
case response.status
|
56
56
|
when :redirect
|
57
|
-
redirect "/#{format}/#{
|
57
|
+
redirect "/#{format}/#{response.uri}#{inputs}"
|
58
58
|
when :ok
|
59
|
-
@headers['Content-Type'] =
|
60
|
-
callback ? "#{callback}(#{
|
59
|
+
@headers['Content-Type'] = response.content_type
|
60
|
+
callback ? "#{callback}(#{response.output})" : response.output
|
61
61
|
else
|
62
62
|
@status = 500
|
63
63
|
'Error'
|
data/lib/scrappy/shell.rb
CHANGED
data/scrappy.gemspec
CHANGED
@@ -2,11 +2,11 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = %q{scrappy}
|
5
|
-
s.version = "0.1.2"
|
5
|
+
s.version = "0.1.3"
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Jose Ignacio"]
|
9
|
-
s.date = %q{2010-11-
|
9
|
+
s.date = %q{2010-11-18}
|
10
10
|
s.default_executable = %q{scrappy}
|
11
11
|
s.description = %q{RDF web scraper}
|
12
12
|
s.email = %q{joseignacio.fernandez@gmail.com}
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 1
|
8
|
-
-
|
9
|
-
version: 0.1.2
|
8
|
+
- 3
|
9
|
+
version: 0.1.3
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Jose Ignacio
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2010-11-
|
17
|
+
date: 2010-11-18 00:00:00 +01:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|