scrappy 0.1.10 → 0.1.11
- data/History.txt +7 -0
- data/Rakefile +1 -1
- data/bin/scrappy +2 -0
- data/lib/scrappy.rb +1 -1
- data/lib/scrappy/agent/agent.rb +46 -8
- data/lib/scrappy/agent/blind_agent.rb +2 -0
- data/lib/scrappy/agent/extractor.rb +11 -7
- data/scrappy.gemspec +5 -2
- metadata +17 -3
data/History.txt
CHANGED
data/Rakefile
CHANGED
@@ -11,7 +11,7 @@ Echoe.new('scrappy', Scrappy::VERSION) do |p|
   p.email = "joseignacio.fernandez@gmail.com"
   p.install_message = '**(Optional) Remember to install rbwebkitgtk for visual parsing features**'
   p.ignore_pattern = ["pkg/*"]
-  p.dependencies = [['activesupport','>= 2.3.5'], ['markaby', '>= 0.7.1'], ['camping', '= 2.0'], ['nokogiri', '>= 1.4.1'], ['mechanize','>= 1.0.0'], ['lightrdf','>= 0.1'], ['mongrel', '>= 1.1.5']]
+  p.dependencies = [['activesupport','>= 2.3.5'], ['markaby', '>= 0.7.1'], ['camping', '= 2.0'], ['nokogiri', '>= 1.4.1'], ['mechanize','>= 1.0.0'], ['lightrdf','>= 0.1'], ['mongrel', '>= 1.1.5'], ['i18n', '>= 0.4.2']]
 end
 
 Rake::RDocTask.new(:rdoc) do |rdoc|
data/bin/scrappy
CHANGED
@@ -37,6 +37,7 @@ module Scrappy
       opts.on('-h', '--help') { output_help; exit 0 }
       opts.on('-g URL', '--get URL') { |url| Options.url = url; Options.http_method=:get }
       opts.on('-p URL', '--post URL') { |url| Options.url = url; Options.http_method=:post }
+      opts.on('-u', '--debug') { Agent::Options.debug = true }
       opts.on('-i', '--interactive') { Options.shell = true }
       opts.on('-s', '--server') { Options.server = true }
       opts.on('-S', '--proxy-server') { Options.proxy = true }
@@ -98,6 +99,7 @@ Options
   -c, --concurrence VALUE    Sets number of concurrent connections for crawling (default is 10)
   -l, --levels VALUE         Sets recursion levels for resource crawling (default is 1)
   -d, --delay VALUE          Sets delay (in ms) between requests (default is 0)
+  -u, --debug                Shows debugging traces
   -i, --interactive          Runs interactive shell
   -s, --server               Runs web server
   -S, --proxy-server         Runs web proxy
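Note: the new -u/--debug switch only flips a flag on the shared options object; each slow step in this release then brackets its work with a print/puts pair. A minimal, self-contained sketch of that gating pattern (plain Ruby, with an OpenStruct standing in for the gem's Options object):

    require 'ostruct'

    options = OpenStruct.new(:debug => true)        # what -u effectively sets

    def open_uri(uri, options)
      if options.debug
        print "Opening #{uri}..."; $stdout.flush    # open the trace, no newline yet
      end
      # ... perform the actual request here ...
      puts 'done!' if options.debug                 # close the trace line
    end

    open_uri('http://example.org', options)         # prints: Opening http://example.org...done!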
data/lib/scrappy.rb
CHANGED
data/lib/scrappy/agent/agent.rb
CHANGED
@@ -44,23 +44,28 @@ module Scrappy
       depth = args[:depth]
       request = { :method=>args[:method]||:get, :uri=>complete_uri(args[:uri]), :inputs=>args[:inputs]||{} }
 
-      # Expire cache
-      cache.expire! 300 # 5 minutes
-
       # Lookup in cache
       triples = if cache[request]
+        puts "Retrieving cached #{request[:uri]}...done!" if options.debug
+
         cache[request][:response]
       else
         # Perform the request
         sleep 0.001 * options.delay.to_f # Sleep if requested
 
-
+
+        if options.debug
+          print "Opening #{request[:uri]}..."; $stdout.flush
+        end
+
         if request[:method] == :get
           self.uri = request[:uri]
         else
           raise Exception, 'POST requests not supported yet'
         end
 
+        puts 'done!' if options.debug
+
         response = if self.html_data?
           add_visual_data! if options.referenceable # Adds tags including visual information
           extract self.uri, html, options.referenceable # Extract data
@@ -76,8 +81,21 @@ module Scrappy
       end
 
       # Enqueue subresources
-      if depth
-
+      if depth >= 0
+        # Pages are enqueued without reducing depth
+        pages = triples.select { |s,p,o| p==Node("rdf:type") and o==Node("sc:Page") }.map{|s,p,o| s}.select{|n| n.is_a?(RDF::Node) and n.id.is_a?(URI)}
+
+        # All other URIs are enqueued with depth reduced
+        uris = if depth > 0
+          (triples.map { |s, p, o| [s,o] }.flatten - [Node(self.uri)] - pages).select{|n| n.is_a?(RDF::Node) and n.id.is_a?(URI)}
+        else
+          []
+        end
+
+        items = (pages.map { |uri| {:uri=>uri.to_s, :depth=>depth} } + uris.map { |uri| {:uri=>uri.to_s, :depth=>depth-1} }).uniq
+
+        items.each { |item| puts "Enqueuing (depth = #{item[:depth]}): #{item[:uri]}" } if options.debug
+
         if queue.nil?
           triples += process items
         else
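Note: the crawling change above replaces the old bare "if depth" test with explicit depth arithmetic: resources typed sc:Page are re-enqueued at the current depth, every other URI descends one level, and nothing is enqueued once depth goes negative. A simplified sketch of that rule (hypothetical helper, plain strings instead of RDF nodes):

    def enqueue_candidates(pages, uris, depth)
      return [] if depth < 0                                            # crawl exhausted
      items  = pages.map { |uri| { :uri => uri, :depth => depth } }     # pages keep their depth
      items += uris.map { |uri| { :uri => uri, :depth => depth - 1 } } if depth > 0
      items.uniq
    end

    p enqueue_candidates(['http://example.org/?page=2'], ['http://example.org/a'], 1)
    # => [{:uri=>"http://example.org/?page=2", :depth=>1}, {:uri=>"http://example.org/a", :depth=>0}]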
@@ -89,18 +107,38 @@ module Scrappy
     end
 
     def reduce results
+      if options.debug
+        print "Merging results..."; $stdout.flush
+      end
+
       triples = []; results.each { |result| triples += result }
+
+      puts 'done!' if options.debug
+
       triples
     end
 
     def request args={}
-
+      # Expire cache
+      cache.expire! 300 # 5 minutes
+
+      RDF::Graph.new(map(args).uniq.select { |s,p,o| p!=Node('rdf:type') or ![Node('sc:Index'), Node('sc:Page')].include?(o) })
     end
 
     def proxy args={}
       request = { :method=>:get, :inputs=>{}, :format=>options.format, :depth=>options.depth }.merge(args)
+
+      response = self.request(request)
+
+      if options.debug
+        print "Serializing..."; $stdout.flush
+      end
+
+      output = response.serialize(request[:format])
+
+      puts 'done!' if options.debug
 
-      OpenStruct.new :output =>
+      OpenStruct.new :output => output,
                      :content_type => ContentTypes[request[:format]] || 'text/plain',
                      :uri => self.uri,
                      :status => self.html_data? ? (self.uri == request[:uri] ? :ok : :redirect) : :error
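Note: two behaviours move in this last hunk. The five-minute cache expiry now runs once per top-level request instead of on every recursive fetch, and the graph returned by request drops the bookkeeping triples that merely mark crawl artifacts. A sketch of that output filter, with plain strings standing in for lightrdf nodes:

    triples = [
      ['http://example.org/?page=2', 'rdf:type', 'sc:Page'],
      ['http://example.org/',        'dc:title', 'Example']
    ]
    internal = ['sc:Index', 'sc:Page']

    output = triples.select { |s, p, o| p != 'rdf:type' or !internal.include?(o) }
    # => [["http://example.org/", "dc:title", "Example"]]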
data/lib/scrappy/agent/extractor.rb
CHANGED
@@ -3,6 +3,10 @@ require 'digest/md5'
 module Scrappy
   module Extractor
     def extract uri, html, referenceable=nil
+      if options.debug
+        print "Extracting #{uri}..."; $stdout.flush
+      end
+
       triples = []
       content = Nokogiri::HTML(html, nil, 'utf-8')
 
@@ -21,6 +25,8 @@ module Scrappy
 
       add_referenceable_data content, triples, referenceable if referenceable
 
+      puts "done!" if options.debug
+
       triples
     end
 
@@ -40,7 +46,7 @@ module Scrappy
 
       nodes.each do |node|
         # Build the object
-        object = if fragment.sc::type.
+        object = if fragment.sc::type.include?(Node('rdf:Literal'))
           value = doc[:value].to_s.strip
           if options[:referenceable]
             bnode = Node(nil)
@@ -52,15 +58,13 @@ module Scrappy
             value
           end
         else
-
-          options[:triples] << [node, Node('rdf:type'), fragment.sc::type.first]
-          end
+          fragment.sc::type.each { |type| options[:triples] << [node, Node('rdf:type'), type] if type != Node('rdf:Resource') }
           fragment.sc::superclass.each { |superclass| options[:triples] << [node, Node('rdfs:subClassOf'), superclass] }
           fragment.sc::sameas.each { |samenode| options[:triples] << [node, Node('owl:sameAs'), samenode] }
           node
         end
         fragment.sc::relation.each { |relation| options[:triples] << [options[:parent], relation, object] }
-
+
         # Add referenceable data if requested
         if options[:referenceable]
           sources = [doc[:content]].flatten.map { |node| Node(node_hash(doc[:uri], node.path)) }
@@ -81,7 +85,7 @@ module Scrappy
       # From "BaseUriSelector" to "base_uri"
       class_name = selector.rdf::type.first.to_s.split('#').last
 
-      if !selector.sc::debug.empty?
+      if !selector.sc::debug.empty? and options.debug
         puts '== DEBUG'
         puts '== Selector:'
         puts selector.serialize(:yarf, false)
@@ -94,7 +98,7 @@ module Scrappy
       # Process selector
       results = Kernel.const_get(class_name).filter selector, doc
 
-      if !selector.sc::debug.empty?
+      if !selector.sc::debug.empty? and options.debug
         puts "== No results" if results.empty?
         results.each_with_index do |result, i|
           puts "== Result ##{i}:"
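Note: the typing change in the extractor now asserts every declared sc::type except the catch-all rdf:Resource, where it previously wrote only the first one. Reduced to plain strings (the real code wraps these in lightrdf Node objects, and the type name below is made up):

    node    = '_:fragment1'
    types   = ['sc:Post', 'rdf:Resource']
    triples = []

    types.each { |type| triples << [node, 'rdf:type', type] if type != 'rdf:Resource' }
    # triples == [['_:fragment1', 'rdf:type', 'sc:Post']]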
data/scrappy.gemspec
CHANGED
@@ -2,11 +2,11 @@
 
 Gem::Specification.new do |s|
   s.name = %q{scrappy}
-  s.version = "0.1.10"
+  s.version = "0.1.11"
 
   s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
   s.authors = ["Jose Ignacio"]
-  s.date = %q{2010-12-
+  s.date = %q{2010-12-23}
   s.default_executable = %q{scrappy}
   s.description = %q{RDF web scraper}
   s.email = %q{joseignacio.fernandez@gmail.com}
@@ -34,6 +34,7 @@ Gem::Specification.new do |s|
       s.add_runtime_dependency(%q<mechanize>, [">= 1.0.0"])
       s.add_runtime_dependency(%q<lightrdf>, [">= 0.1"])
       s.add_runtime_dependency(%q<mongrel>, [">= 1.1.5"])
+      s.add_runtime_dependency(%q<i18n>, [">= 0.4.2"])
     else
       s.add_dependency(%q<activesupport>, [">= 2.3.5"])
       s.add_dependency(%q<markaby>, [">= 0.7.1"])
@@ -42,6 +43,7 @@ Gem::Specification.new do |s|
       s.add_dependency(%q<mechanize>, [">= 1.0.0"])
       s.add_dependency(%q<lightrdf>, [">= 0.1"])
       s.add_dependency(%q<mongrel>, [">= 1.1.5"])
+      s.add_dependency(%q<i18n>, [">= 0.4.2"])
     end
   else
     s.add_dependency(%q<activesupport>, [">= 2.3.5"])
@@ -51,5 +53,6 @@ Gem::Specification.new do |s|
     s.add_dependency(%q<mechanize>, [">= 1.0.0"])
     s.add_dependency(%q<lightrdf>, [">= 0.1"])
     s.add_dependency(%q<mongrel>, [">= 1.1.5"])
+    s.add_dependency(%q<i18n>, [">= 0.4.2"])
   end
 end
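Note: the only dependency change in this release is the new i18n floor, declared three times because the generated gemspec branches on the RubyGems version it is loaded under. The constraint itself can be checked with the stock RubyGems classes:

    require 'rubygems'

    req = Gem::Requirement.new('>= 0.4.2')             # the new i18n constraint
    puts req.satisfied_by?(Gem::Version.new('0.4.2'))  # => true
    puts req.satisfied_by?(Gem::Version.new('0.4.1'))  # => false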
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
   segments:
   - 0
   - 1
-  - 10
-  version: 0.1.10
+  - 11
+  version: 0.1.11
 platform: ruby
 authors:
 - Jose Ignacio
@@ -14,7 +14,7 @@ autorequire:
 bindir: bin
 cert_chain: []
 
-date: 2010-12-
+date: 2010-12-23 00:00:00 +01:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -113,6 +113,20 @@ dependencies:
       version: 1.1.5
   type: :runtime
   version_requirements: *id007
+- !ruby/object:Gem::Dependency
+  name: i18n
+  prerelease: false
+  requirement: &id008 !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        segments:
+        - 0
+        - 4
+        - 2
+        version: 0.4.2
+  type: :runtime
+  version_requirements: *id008
 description: RDF web scraper
 email: joseignacio.fernandez@gmail.com
 executables: