scrappy 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +4 -0
- data/README.rdoc +1 -1
- data/bin/scrappy +1 -1
- data/lib/scrappy.rb +1 -1
- data/lib/scrappy/agent/agent.rb +45 -28
- data/lib/scrappy/proxy.rb +6 -6
- data/lib/scrappy/server.rb +6 -6
- data/lib/scrappy/shell.rb +1 -1
- data/scrappy.gemspec +2 -2
- metadata +3 -3
data/History.txt
CHANGED
data/README.rdoc
CHANGED
|
@@ -129,7 +129,7 @@ scrappy offers many different interfaces to get RDF data from a web page:
|
|
|
129
129
|
agent = scrappy::Agent.create :kb=>kb
|
|
130
130
|
|
|
131
131
|
# Get RDF output
|
|
132
|
-
output = agent.request :get, 'http://www.example.com'
|
|
132
|
+
output = agent.request :method=>:get, :uri=>'http://www.example.com'
|
|
133
133
|
|
|
134
134
|
# Output all titles from the web page
|
|
135
135
|
titles = output.find([], Node('dc:title'), nil)
|
data/bin/scrappy
CHANGED
|
@@ -57,7 +57,7 @@ module Scrappy
|
|
|
57
57
|
onload
|
|
58
58
|
if Options.url
|
|
59
59
|
Options.quiet = true
|
|
60
|
-
puts Agent.create.proxy(:get, Options.url)
|
|
60
|
+
puts Agent.create.proxy(:http_method=>:get, :uri=>Options.url).output
|
|
61
61
|
elsif Options.proxy
|
|
62
62
|
puts "Launching Scrappy Web Proxy..."
|
|
63
63
|
Camping::Server.new(OpenStruct.new(:host => 'localhost', :port => Options.port, :server=>'mongrel'), ["#{Scrappy::Root}/lib/scrappy/proxy.rb"]).start
|
data/lib/scrappy.rb
CHANGED
data/lib/scrappy/agent/agent.rb
CHANGED
|
@@ -14,6 +14,9 @@ module Scrappy
|
|
|
14
14
|
def self.[] id
|
|
15
15
|
pool[id] || Agent.create(:id=>id)
|
|
16
16
|
end
|
|
17
|
+
def self.cache
|
|
18
|
+
@cache ||= {}
|
|
19
|
+
end
|
|
17
20
|
|
|
18
21
|
def self.create args={}
|
|
19
22
|
if (args[:agent] || Options.agent) == :visual
|
|
@@ -25,7 +28,7 @@ module Scrappy
|
|
|
25
28
|
end
|
|
26
29
|
end
|
|
27
30
|
|
|
28
|
-
attr_accessor :id, :
|
|
31
|
+
attr_accessor :id, :options, :kb
|
|
29
32
|
|
|
30
33
|
def initialize args={}
|
|
31
34
|
super()
|
|
@@ -35,56 +38,70 @@ module Scrappy
|
|
|
35
38
|
@options = Options.clone
|
|
36
39
|
end
|
|
37
40
|
|
|
38
|
-
def request
|
|
41
|
+
def request args={}
|
|
39
42
|
synchronize do
|
|
40
|
-
|
|
41
|
-
|
|
43
|
+
depth = args[:depth]
|
|
44
|
+
request = { :method=>:get, :inputs=>{} }.merge :method=>args[:method], :uri=>complete_uri(args[:uri]), :inputs=>args[:inputs]||{}
|
|
45
|
+
|
|
46
|
+
# Expire cache
|
|
47
|
+
Agent::cache.keys.each { |req| Agent::cache.delete(req) if Time.now.to_i - Agent::cache[req][:time].to_i > 300 }
|
|
42
48
|
|
|
43
|
-
#
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
return RDF::Graph.new unless self.html_data?
|
|
49
|
+
# Lookup in cache
|
|
50
|
+
triples = if Agent::cache[request]
|
|
51
|
+
Agent::cache[request][:response]
|
|
47
52
|
else
|
|
48
|
-
|
|
49
|
-
|
|
53
|
+
# Perform the request
|
|
54
|
+
if request[:method] == :get
|
|
55
|
+
self.uri = request[:uri]
|
|
56
|
+
else
|
|
57
|
+
raise Exception, 'POST requests not supported yet'
|
|
58
|
+
end
|
|
59
|
+
|
|
60
|
+
response = if self.html_data?
|
|
61
|
+
add_visual_data! if options.referenceable # Adds tags including visual information
|
|
62
|
+
extract self.uri, html, options.referenceable # Extract data
|
|
63
|
+
else
|
|
64
|
+
[]
|
|
65
|
+
end
|
|
50
66
|
|
|
51
|
-
|
|
52
|
-
|
|
67
|
+
# Cache the request
|
|
68
|
+
Agent::cache[request] = { :time=>Time.now, :response=>response }
|
|
69
|
+
Agent::cache[request.merge(:uri=>self.uri)] = { :time=>Time.now, :response=>response } unless self.uri.nil?
|
|
53
70
|
|
|
54
|
-
|
|
55
|
-
|
|
71
|
+
response
|
|
72
|
+
end
|
|
56
73
|
|
|
57
74
|
# Iterate through subresources
|
|
58
75
|
if depth > 0
|
|
59
76
|
uris = (triples.map{|t| [t[0],t[2]]}.flatten-[Node(self.uri)]).uniq.select{|n| n.is_a?(RDF::Node) and n.id.is_a?(URI)}.map(&:to_s)
|
|
60
77
|
Agent.process(uris, :depth=>depth-1).each { |result| triples += result }
|
|
61
78
|
end
|
|
79
|
+
|
|
62
80
|
RDF::Graph.new(triples.uniq)
|
|
63
81
|
end
|
|
64
82
|
end
|
|
65
83
|
|
|
66
|
-
def proxy
|
|
84
|
+
def proxy args={}
|
|
67
85
|
synchronize do
|
|
68
|
-
|
|
69
|
-
@status = :ok
|
|
70
|
-
else
|
|
71
|
-
@output = request(http_method, uri, inputs, depth).serialize(format)
|
|
72
|
-
@content_type = ContentTypes[format] || 'text/plain'
|
|
73
|
-
@status = if self.html_data?
|
|
74
|
-
self.uri == uri ? :ok : :redirect
|
|
75
|
-
else
|
|
76
|
-
:error
|
|
77
|
-
end
|
|
78
|
-
end
|
|
86
|
+
request = { :method=>:get, :inputs=>{}, :format=>options.format, :depth=>options.depth }.merge(args)
|
|
79
87
|
|
|
80
|
-
|
|
88
|
+
OpenStruct.new :output => self.request(request).serialize(request[:format]),
|
|
89
|
+
:content_type => ContentTypes[request[:format]] || 'text/plain',
|
|
90
|
+
:uri => self.uri,
|
|
91
|
+
:status => self.html_data? ? (self.uri == request[:uri] ? :ok : :redirect) : :error
|
|
81
92
|
end
|
|
82
93
|
end
|
|
83
94
|
|
|
84
95
|
# Method used when consuming a list of uris
|
|
85
96
|
def process uri, args={}
|
|
86
97
|
sleep 0.001 * options.delay.to_f
|
|
87
|
-
request(:get, uri,
|
|
98
|
+
request(:method=>:get, :uri=>uri, :depth=>args[:depth]).triples
|
|
99
|
+
end
|
|
100
|
+
|
|
101
|
+
def complete_uri uri
|
|
102
|
+
uri = "#{uri}.com" if uri =~ /\A\w+\Z/
|
|
103
|
+
uri = "http://#{uri}" if uri.index(/\A\w*:/) != 0
|
|
104
|
+
uri
|
|
88
105
|
end
|
|
89
106
|
end
|
|
90
107
|
end
|
data/lib/scrappy/proxy.rb
CHANGED
|
@@ -18,15 +18,15 @@ module Scrappy
|
|
|
18
18
|
end
|
|
19
19
|
|
|
20
20
|
protected
|
|
21
|
-
def process_request
|
|
22
|
-
agent.proxy
|
|
21
|
+
def process_request method
|
|
22
|
+
response = agent.proxy :method=>method, :uri=>request.env["REQUEST_URI"], :inputs=>@input
|
|
23
23
|
|
|
24
|
-
case
|
|
24
|
+
case response.status
|
|
25
25
|
when :redirect
|
|
26
|
-
redirect
|
|
26
|
+
redirect response.uri
|
|
27
27
|
when :ok
|
|
28
|
-
@headers['Content-Type'] =
|
|
29
|
-
|
|
28
|
+
@headers['Content-Type'] = response.content_type
|
|
29
|
+
response.output
|
|
30
30
|
else
|
|
31
31
|
@status = 500
|
|
32
32
|
'Error'
|
data/lib/scrappy/server.rb
CHANGED
|
@@ -48,16 +48,16 @@ module Scrappy
|
|
|
48
48
|
end
|
|
49
49
|
|
|
50
50
|
protected
|
|
51
|
-
def process_request
|
|
51
|
+
def process_request method, format, url
|
|
52
52
|
callback = @input['callback']
|
|
53
|
-
agent.proxy
|
|
53
|
+
response = agent.proxy :method=>method, :uri=>url, :inputs=>@input.reject{|k,v| k=='callback'}, :format=>format.to_sym
|
|
54
54
|
|
|
55
|
-
case
|
|
55
|
+
case response.status
|
|
56
56
|
when :redirect
|
|
57
|
-
redirect "/#{format}/#{
|
|
57
|
+
redirect "/#{format}/#{response.uri}#{inputs}"
|
|
58
58
|
when :ok
|
|
59
|
-
@headers['Content-Type'] =
|
|
60
|
-
callback ? "#{callback}(#{
|
|
59
|
+
@headers['Content-Type'] = response.content_type
|
|
60
|
+
callback ? "#{callback}(#{response.output})" : response.output
|
|
61
61
|
else
|
|
62
62
|
@status = 500
|
|
63
63
|
'Error'
|
data/lib/scrappy/shell.rb
CHANGED
data/scrappy.gemspec
CHANGED
|
@@ -2,11 +2,11 @@
|
|
|
2
2
|
|
|
3
3
|
Gem::Specification.new do |s|
|
|
4
4
|
s.name = %q{scrappy}
|
|
5
|
-
s.version = "0.1.2"
|
|
5
|
+
s.version = "0.1.3"
|
|
6
6
|
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
|
|
8
8
|
s.authors = ["Jose Ignacio"]
|
|
9
|
-
s.date = %q{2010-11-
|
|
9
|
+
s.date = %q{2010-11-18}
|
|
10
10
|
s.default_executable = %q{scrappy}
|
|
11
11
|
s.description = %q{RDF web scraper}
|
|
12
12
|
s.email = %q{joseignacio.fernandez@gmail.com}
|
metadata
CHANGED
|
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
|
5
5
|
segments:
|
|
6
6
|
- 0
|
|
7
7
|
- 1
|
|
8
|
-
- 2
|
|
9
|
-
version: 0.1.2
|
|
8
|
+
- 3
|
|
9
|
+
version: 0.1.3
|
|
10
10
|
platform: ruby
|
|
11
11
|
authors:
|
|
12
12
|
- Jose Ignacio
|
|
@@ -14,7 +14,7 @@ autorequire:
|
|
|
14
14
|
bindir: bin
|
|
15
15
|
cert_chain: []
|
|
16
16
|
|
|
17
|
-
date: 2010-11-
|
|
17
|
+
date: 2010-11-18 00:00:00 +01:00
|
|
18
18
|
default_executable:
|
|
19
19
|
dependencies:
|
|
20
20
|
- !ruby/object:Gem::Dependency
|