scrappy 0.1.10 → 0.1.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +7 -0
- data/Rakefile +1 -1
- data/bin/scrappy +2 -0
- data/lib/scrappy.rb +1 -1
- data/lib/scrappy/agent/agent.rb +46 -8
- data/lib/scrappy/agent/blind_agent.rb +2 -0
- data/lib/scrappy/agent/extractor.rb +11 -7
- data/scrappy.gemspec +5 -2
- metadata +17 -3
data/History.txt
CHANGED
data/Rakefile
CHANGED
@@ -11,7 +11,7 @@ Echoe.new('scrappy', Scrappy::VERSION) do |p|
|
|
11
11
|
p.email = "joseignacio.fernandez@gmail.com"
|
12
12
|
p.install_message = '**(Optional) Remember to install rbwebkitgtk for visual parsing features**'
|
13
13
|
p.ignore_pattern = ["pkg/*"]
|
14
|
-
p.dependencies = [['activesupport','>= 2.3.5'], ['markaby', '>= 0.7.1'], ['camping', '= 2.0'], ['nokogiri', '>= 1.4.1'], ['mechanize','>= 1.0.0'], ['lightrdf','>= 0.1'], ['mongrel', '>= 1.1.5']]
|
14
|
+
p.dependencies = [['activesupport','>= 2.3.5'], ['markaby', '>= 0.7.1'], ['camping', '= 2.0'], ['nokogiri', '>= 1.4.1'], ['mechanize','>= 1.0.0'], ['lightrdf','>= 0.1'], ['mongrel', '>= 1.1.5'], ['i18n', '>= 0.4.2']]
|
15
15
|
end
|
16
16
|
|
17
17
|
Rake::RDocTask.new(:rdoc) do |rdoc|
|
data/bin/scrappy
CHANGED
@@ -37,6 +37,7 @@ module Scrappy
|
|
37
37
|
opts.on('-h', '--help') { output_help; exit 0 }
|
38
38
|
opts.on('-g URL', '--get URL') { |url| Options.url = url; Options.http_method=:get }
|
39
39
|
opts.on('-p URL', '--post URL') { |url| Options.url = url; Options.http_method=:post }
|
40
|
+
opts.on('-u', '--debug') { Agent::Options.debug = true }
|
40
41
|
opts.on('-i', '--interactive') { Options.shell = true }
|
41
42
|
opts.on('-s', '--server') { Options.server = true }
|
42
43
|
opts.on('-S', '--proxy-server') { Options.proxy = true }
|
@@ -98,6 +99,7 @@ Options
|
|
98
99
|
-c, --concurrence VALUE Sets number of concurrent connections for crawling (default is 10)
|
99
100
|
-l, --levels VALUE Sets recursion levels for resource crawling (default is 1)
|
100
101
|
-d, --delay VALUE Sets delay (in ms) between requests (default is 0)
|
102
|
+
-u, --debug Shows debugging traces
|
101
103
|
-i, --interactive Runs interactive shell
|
102
104
|
-s, --server Runs web server
|
103
105
|
-S, --proxy-server Runs web proxy
|
data/lib/scrappy.rb
CHANGED
data/lib/scrappy/agent/agent.rb
CHANGED
@@ -44,23 +44,28 @@ module Scrappy
|
|
44
44
|
depth = args[:depth]
|
45
45
|
request = { :method=>args[:method]||:get, :uri=>complete_uri(args[:uri]), :inputs=>args[:inputs]||{} }
|
46
46
|
|
47
|
-
# Expire cache
|
48
|
-
cache.expire! 300 # 5 minutes
|
49
|
-
|
50
47
|
# Lookup in cache
|
51
48
|
triples = if cache[request]
|
49
|
+
puts "Retrieving cached #{request[:uri]}...done!" if options.debug
|
50
|
+
|
52
51
|
cache[request][:response]
|
53
52
|
else
|
54
53
|
# Perform the request
|
55
54
|
|
56
55
|
sleep 0.001 * options.delay.to_f # Sleep if requested
|
57
|
-
|
56
|
+
|
57
|
+
if options.debug
|
58
|
+
print "Opening #{request[:uri]}..."; $stdout.flush
|
59
|
+
end
|
60
|
+
|
58
61
|
if request[:method] == :get
|
59
62
|
self.uri = request[:uri]
|
60
63
|
else
|
61
64
|
raise Exception, 'POST requests not supported yet'
|
62
65
|
end
|
63
66
|
|
67
|
+
puts 'done!' if options.debug
|
68
|
+
|
64
69
|
response = if self.html_data?
|
65
70
|
add_visual_data! if options.referenceable # Adds tags including visual information
|
66
71
|
extract self.uri, html, options.referenceable # Extract data
|
@@ -76,8 +81,21 @@ module Scrappy
|
|
76
81
|
end
|
77
82
|
|
78
83
|
# Enqueue subresources
|
79
|
-
if depth
|
80
|
-
|
84
|
+
if depth >= 0
|
85
|
+
# Pages are enqueued without reducing depth
|
86
|
+
pages = triples.select { |s,p,o| p==Node("rdf:type") and o==Node("sc:Page") }.map{|s,p,o| s}.select{|n| n.is_a?(RDF::Node) and n.id.is_a?(URI)}
|
87
|
+
|
88
|
+
# All other URIS are enqueued with depth reduced
|
89
|
+
uris = if depth > 0
|
90
|
+
(triples.map { |s, p, o| [s,o] }.flatten - [Node(self.uri)] - pages).select{|n| n.is_a?(RDF::Node) and n.id.is_a?(URI)}
|
91
|
+
else
|
92
|
+
[]
|
93
|
+
end
|
94
|
+
|
95
|
+
items = (pages.map { |uri| {:uri=>uri.to_s, :depth=>depth} } + uris.map { |uri| {:uri=>uri.to_s, :depth=>depth-1} }).uniq
|
96
|
+
|
97
|
+
items.each { |item| puts "Enqueuing (depth = #{item[:depth]}): #{item[:uri]}" } if options.debug
|
98
|
+
|
81
99
|
if queue.nil?
|
82
100
|
triples += process items
|
83
101
|
else
|
@@ -89,18 +107,38 @@ module Scrappy
|
|
89
107
|
end
|
90
108
|
|
91
109
|
def reduce results
|
110
|
+
if options.debug
|
111
|
+
print "Merging results..."; $stdout.flush
|
112
|
+
end
|
113
|
+
|
92
114
|
triples = []; results.each { |result| triples += result }
|
115
|
+
|
116
|
+
puts 'done!'if options.debug
|
117
|
+
|
93
118
|
triples
|
94
119
|
end
|
95
120
|
|
96
121
|
def request args={}
|
97
|
-
|
122
|
+
# Expire cache
|
123
|
+
cache.expire! 300 # 5 minutes
|
124
|
+
|
125
|
+
RDF::Graph.new(map(args).uniq.select { |s,p,o| p!=Node('rdf:type') or ![Node('sc:Index'), Node('sc:Page')].include?(o) })
|
98
126
|
end
|
99
127
|
|
100
128
|
def proxy args={}
|
101
129
|
request = { :method=>:get, :inputs=>{}, :format=>options.format, :depth=>options.depth }.merge(args)
|
130
|
+
|
131
|
+
response = self.request(request)
|
132
|
+
|
133
|
+
if options.debug
|
134
|
+
print "Serializing..."; $stdout.flush
|
135
|
+
end
|
136
|
+
|
137
|
+
output = response.serialize(request[:format])
|
138
|
+
|
139
|
+
puts 'done!'if options.debug
|
102
140
|
|
103
|
-
OpenStruct.new :output =>
|
141
|
+
OpenStruct.new :output => output,
|
104
142
|
:content_type => ContentTypes[request[:format]] || 'text/plain',
|
105
143
|
:uri => self.uri,
|
106
144
|
:status => self.html_data? ? (self.uri == request[:uri] ? :ok : :redirect) : :error
|
@@ -3,6 +3,10 @@ require 'digest/md5'
|
|
3
3
|
module Scrappy
|
4
4
|
module Extractor
|
5
5
|
def extract uri, html, referenceable=nil
|
6
|
+
if options.debug
|
7
|
+
print "Extracting #{uri}..."; $stdout.flush
|
8
|
+
end
|
9
|
+
|
6
10
|
triples = []
|
7
11
|
content = Nokogiri::HTML(html, nil, 'utf-8')
|
8
12
|
|
@@ -21,6 +25,8 @@ module Scrappy
|
|
21
25
|
|
22
26
|
add_referenceable_data content, triples, referenceable if referenceable
|
23
27
|
|
28
|
+
puts "done!" if options.debug
|
29
|
+
|
24
30
|
triples
|
25
31
|
end
|
26
32
|
|
@@ -40,7 +46,7 @@ module Scrappy
|
|
40
46
|
|
41
47
|
nodes.each do |node|
|
42
48
|
# Build the object
|
43
|
-
object = if fragment.sc::type.
|
49
|
+
object = if fragment.sc::type.include?(Node('rdf:Literal'))
|
44
50
|
value = doc[:value].to_s.strip
|
45
51
|
if options[:referenceable]
|
46
52
|
bnode = Node(nil)
|
@@ -52,15 +58,13 @@ module Scrappy
|
|
52
58
|
value
|
53
59
|
end
|
54
60
|
else
|
55
|
-
|
56
|
-
options[:triples] << [node, Node('rdf:type'), fragment.sc::type.first]
|
57
|
-
end
|
61
|
+
fragment.sc::type.each { |type| options[:triples] << [node, Node('rdf:type'), type] if type != Node('rdf:Resource') }
|
58
62
|
fragment.sc::superclass.each { |superclass| options[:triples] << [node, Node('rdfs:subClassOf'), superclass] }
|
59
63
|
fragment.sc::sameas.each { |samenode| options[:triples] << [node, Node('owl:sameAs'), samenode] }
|
60
64
|
node
|
61
65
|
end
|
62
66
|
fragment.sc::relation.each { |relation| options[:triples] << [options[:parent], relation, object] }
|
63
|
-
|
67
|
+
|
64
68
|
# Add referenceable data if requested
|
65
69
|
if options[:referenceable]
|
66
70
|
sources = [doc[:content]].flatten.map { |node| Node(node_hash(doc[:uri], node.path)) }
|
@@ -81,7 +85,7 @@ module Scrappy
|
|
81
85
|
# From "BaseUriSelector" to "base_uri"
|
82
86
|
class_name = selector.rdf::type.first.to_s.split('#').last
|
83
87
|
|
84
|
-
if !selector.sc::debug.empty?
|
88
|
+
if !selector.sc::debug.empty? and options.debug
|
85
89
|
puts '== DEBUG'
|
86
90
|
puts '== Selector:'
|
87
91
|
puts selector.serialize(:yarf, false)
|
@@ -94,7 +98,7 @@ module Scrappy
|
|
94
98
|
# Process selector
|
95
99
|
results = Kernel.const_get(class_name).filter selector, doc
|
96
100
|
|
97
|
-
if !selector.sc::debug.empty?
|
101
|
+
if !selector.sc::debug.empty? and options.debug
|
98
102
|
puts "== No results" if results.empty?
|
99
103
|
results.each_with_index do |result, i|
|
100
104
|
puts "== Result ##{i}:"
|
data/scrappy.gemspec
CHANGED
@@ -2,11 +2,11 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = %q{scrappy}
|
5
|
-
s.version = "0.1.10"
|
5
|
+
s.version = "0.1.11"
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Jose Ignacio"]
|
9
|
-
s.date = %q{2010-12-
|
9
|
+
s.date = %q{2010-12-23}
|
10
10
|
s.default_executable = %q{scrappy}
|
11
11
|
s.description = %q{RDF web scraper}
|
12
12
|
s.email = %q{joseignacio.fernandez@gmail.com}
|
@@ -34,6 +34,7 @@ Gem::Specification.new do |s|
|
|
34
34
|
s.add_runtime_dependency(%q<mechanize>, [">= 1.0.0"])
|
35
35
|
s.add_runtime_dependency(%q<lightrdf>, [">= 0.1"])
|
36
36
|
s.add_runtime_dependency(%q<mongrel>, [">= 1.1.5"])
|
37
|
+
s.add_runtime_dependency(%q<i18n>, [">= 0.4.2"])
|
37
38
|
else
|
38
39
|
s.add_dependency(%q<activesupport>, [">= 2.3.5"])
|
39
40
|
s.add_dependency(%q<markaby>, [">= 0.7.1"])
|
@@ -42,6 +43,7 @@ Gem::Specification.new do |s|
|
|
42
43
|
s.add_dependency(%q<mechanize>, [">= 1.0.0"])
|
43
44
|
s.add_dependency(%q<lightrdf>, [">= 0.1"])
|
44
45
|
s.add_dependency(%q<mongrel>, [">= 1.1.5"])
|
46
|
+
s.add_dependency(%q<i18n>, [">= 0.4.2"])
|
45
47
|
end
|
46
48
|
else
|
47
49
|
s.add_dependency(%q<activesupport>, [">= 2.3.5"])
|
@@ -51,5 +53,6 @@ Gem::Specification.new do |s|
|
|
51
53
|
s.add_dependency(%q<mechanize>, [">= 1.0.0"])
|
52
54
|
s.add_dependency(%q<lightrdf>, [">= 0.1"])
|
53
55
|
s.add_dependency(%q<mongrel>, [">= 1.1.5"])
|
56
|
+
s.add_dependency(%q<i18n>, [">= 0.4.2"])
|
54
57
|
end
|
55
58
|
end
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 1
|
8
|
-
-
|
9
|
-
version: 0.1.10
|
8
|
+
- 11
|
9
|
+
version: 0.1.11
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Jose Ignacio
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2010-12-
|
17
|
+
date: 2010-12-23 00:00:00 +01:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
@@ -113,6 +113,20 @@ dependencies:
|
|
113
113
|
version: 1.1.5
|
114
114
|
type: :runtime
|
115
115
|
version_requirements: *id007
|
116
|
+
- !ruby/object:Gem::Dependency
|
117
|
+
name: i18n
|
118
|
+
prerelease: false
|
119
|
+
requirement: &id008 !ruby/object:Gem::Requirement
|
120
|
+
requirements:
|
121
|
+
- - ">="
|
122
|
+
- !ruby/object:Gem::Version
|
123
|
+
segments:
|
124
|
+
- 0
|
125
|
+
- 4
|
126
|
+
- 2
|
127
|
+
version: 0.4.2
|
128
|
+
type: :runtime
|
129
|
+
version_requirements: *id008
|
116
130
|
description: RDF web scraper
|
117
131
|
email: joseignacio.fernandez@gmail.com
|
118
132
|
executables:
|