scrappy 0.1.4 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +4 -0
- data/Manifest +2 -1
- data/bin/scrappy +4 -8
- data/lib/scrappy.rb +3 -3
- data/lib/scrappy/agent/agent.rb +60 -52
- data/lib/scrappy/agent/cache.rb +37 -0
- data/lib/scrappy/agent/extractor.rb +8 -4
- data/lib/scrappy/agent/map_reduce.rb +66 -0
- data/lib/scrappy/selectors/xpath.rb +2 -1
- data/scrappy.gemspec +4 -4
- metadata +7 -5
- data/lib/scrappy/agent/cluster.rb +0 -35
data/History.txt
CHANGED
data/Manifest
CHANGED
@@ -8,7 +8,8 @@ lib/js/annotator.js
 lib/scrappy.rb
 lib/scrappy/agent/agent.rb
 lib/scrappy/agent/blind_agent.rb
-lib/scrappy/agent/cluster.rb
+lib/scrappy/agent/cache.rb
+lib/scrappy/agent/map_reduce.rb
 lib/scrappy/agent/extractor.rb
 lib/scrappy/agent/visual_agent.rb
 lib/scrappy/proxy.rb
data/bin/scrappy
CHANGED
@@ -28,7 +28,7 @@ module Scrappy
 
     def initialize
       Options.port = 3434
-      Options.concurrence = 10
+      Agent::Options.workers = 10
       Agent::Options.depth = 1
       args = ARGV.map { |arg| arg.split(" ") }.flatten
 
@@ -41,10 +41,10 @@ module Scrappy
       opts.on('-s', '--server') { Options.server = true }
       opts.on('-S', '--proxy-server') { Options.proxy = true }
       opts.on('-P P', '--port P') { |p| Options.port = p }
-      opts.on('-c C', '--concurrence C') { |c| Options.concurrence = c.to_i }
-      opts.on('-d D', '--delay D') { |d| Agent::Options.delay = d; Options.concurrence = 1 }
+      opts.on('-c C', '--concurrence C') { |c| Agent::Options.workers = c.to_i }
+      opts.on('-d D', '--delay D') { |d| Agent::Options.delay = d; Agent::Options.workers = 1 }
       opts.on('-l L', '--levels L') { |l| Agent::Options.depth = l.to_i }
-      opts.on('-v', '--visual') { Agent::Options.agent = :visual; Options.concurrence = 1 }
+      opts.on('-v', '--visual') { Agent::Options.agent = :visual; Agent::Options.workers = 1 }
       opts.on('-r', '--reference') { Agent::Options.referenceable = :minimum }
       opts.on('-R', '--reference-all') { Agent::Options.referenceable = :dump }
       opts.on('-w', '--window') { Agent::Options.window = true }
@@ -139,10 +139,6 @@ Copyright
       open(cache_file, "w") { |f| Marshal.dump(data, f) }
       data
     end
-
-    # Create cluster of agents
-    Agent.create_cluster Options.concurrence, :referenceable=>Agent::Options.referenceable,
-                         :agent=>Agent::Options.agent, :window=>false
   end
 end
 
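The upshot of these bin/scrappy changes: the worker count moves from a CLI-local Options.concurrence field into Agent::Options.workers, so agents own their concurrency, and --delay and --visual both force a single worker. A minimal sketch of that option plumbing, using OpenStruct as the gem does (variable names and flag values here are illustrative, not from the release):

  require 'ostruct'

  # One mutable options struct, shared by the CLI parser and the agents.
  options = OpenStruct.new :workers => 10, :delay => 0

  # --concurrence 4: option parsers hand values over as strings
  options.workers = '4'.to_i

  # --delay 500: a per-request delay only makes sense serially,
  # so the flag also drops the pool to a single worker
  options.delay = '500'
  options.workers = 1

  puts options.workers   # => 1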
data/lib/scrappy.rb
CHANGED
@@ -3,7 +3,6 @@ $:.unshift(File.dirname(__FILE__)) unless
 
 require 'nokogiri'
 require 'thread'
-require 'monitor'
 require 'mechanize'
 require 'ostruct'
 require 'active_support'
@@ -13,13 +12,14 @@ require 'lightrdf'
 require 'scrappy/support'
 
 require 'scrappy/agent/extractor'
-require 'scrappy/agent/cluster'
+require 'scrappy/agent/map_reduce'
+require 'scrappy/agent/cache'
 require 'scrappy/agent/agent'
 
 Namespace :sc, 'http://lab.gsi.dit.upm.es/scraping.rdf#'
 
 module Scrappy
-  VERSION = '0.1.4'
+  VERSION = '0.1.5'
 end
 
 # Require selectors
data/lib/scrappy/agent/agent.rb
CHANGED
@@ -1,10 +1,11 @@
 module Scrappy
   class Agent
-    include Extractor
     include MonitorMixin
-    include Cluster
+    include Extractor
+    include MapReduce
+    include Cached
 
-    Options = OpenStruct.new :format=>:yarf, :depth=>0, :agent=>:blind, :delay=>0
+    Options = OpenStruct.new :format=>:yarf, :depth=>0, :agent=>:blind, :delay=>0, :workers=>10
     ContentTypes = { :png => 'image/png', :rdfxml => 'application/rdf+xml',
                      :rdf => 'application/rdf+xml' }
 
@@ -14,9 +15,6 @@ module Scrappy
     def self.[] id
       pool[id] || Agent.create(:id=>id)
     end
-    def self.cache
-      @cache ||= {}
-    end
 
     def self.create args={}
       if (args[:agent] || Options.agent) == :visual
@@ -32,72 +30,82 @@ module Scrappy
 
     def initialize args={}
      super()
+      @cluster_count = args[:workers] || Options.workers
+      @cluster_options = [ { :referenceable=>Options.referenceable, :agent=>Options.agent,
+                             :workers=>1, :window=>false } ]
+      @cluster = args[:parent]
       @id = args[:id] || Agent.pool.keys.size
       Agent.pool[@id] = self
       @kb = args[:kb] || Options.kb
       @options = Options.clone
     end
 
-    def request args={}
-      …
-      request = { :method=>:get, :inputs=>{} }.merge :method=>args[:method], :uri=>complete_uri(args[:uri]), :inputs=>args[:inputs]||{}
-      …
-      if request[:method] == :get
-        self.uri = request[:uri]
-      else
-        raise Exception, 'POST requests not supported yet'
-      end
-
-      response = if self.html_data?
-        add_visual_data! if options.referenceable # Adds tags including visual information
-        extract self.uri, html, options.referenceable # Extract data
-      else
-        []
-      end
-      …
-      RDF::Graph.new(triples.uniq)
-    end
+    def map args, queue=nil
+      depth = args[:depth]
+      request = { :method=>args[:method]||:get, :uri=>complete_uri(args[:uri]), :inputs=>args[:inputs]||{} }
+
+      # Expire cache
+      cache.expire! 300 # 5 minutes
+
+      # Lookup in cache
+      triples = if cache[request]
+        cache[request][:response]
+      else
+        # Perform the request
+        sleep 0.001 * options.delay.to_f # Sleep if requested
+
+        if request[:method] == :get
+          self.uri = request[:uri]
+        else
+          raise Exception, 'POST requests not supported yet'
+        end
+
+        response = if self.html_data?
+          add_visual_data! if options.referenceable # Adds tags including visual information
+          extract self.uri, html, options.referenceable # Extract data
+        else
+          []
+        end
+
+        # Cache the request
+        cache[request] = { :time=>Time.now, :response=>response }
+        cache[request.merge(:uri=>self.uri)] = { :time=>Time.now, :response=>response } unless self.uri.nil?
+
+        response
+      end
+
+      # Enqueue subresources
+      if depth > 0
+        items = (triples.map{|t| [t[0],t[2]]}.flatten-[Node(self.uri)]).uniq.select{|n| n.is_a?(RDF::Node) and n.id.is_a?(URI)}.map { |uri| {:uri=>uri.to_s, :depth=>depth-1} }
+        if queue.nil?
+          triples += process items
+        else
+          items.each { |item| queue << item }
+        end
+      end
+
+      triples
+    end
+
+    def reduce results
+      triples = []; results.each { |result| triples += result }
+      triples
+    end
+
+    def request args={}
+      RDF::Graph.new map(args).uniq
+    end
 
     def proxy args={}
-      …
       request = { :method=>:get, :inputs=>{}, :format=>options.format, :depth=>options.depth }.merge(args)
 
-      …
-      end
+      OpenStruct.new :output => self.request(request).serialize(request[:format]),
+                     :content_type => ContentTypes[request[:format]] || 'text/plain',
+                     :uri => self.uri,
+                     :status => self.html_data? ? (self.uri == request[:uri] ? :ok : :redirect) : :error
     end
 
-    # Method used when consuming a list of uris
-    def process uri, args={}
-      sleep 0.001 * options.delay.to_f
-      request(:method=>:get, :uri=>uri, :depth=>args[:depth]).triples
-    end
-
     def complete_uri uri
       uri = "#{uri}.com" if uri =~ /\A\w+\Z/
       uri = "http://#{uri}" if uri.index(/\A\w*:/) != 0
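With this rewrite, request is a thin wrapper: it calls map once and wraps the unique triples in an RDF::Graph. map itself fetches one URI (through the cache added in this release) and, while depth > 0, hands subresource URIs either to process (the MapReduce worker pool) or to the shared queue it was given. A toy, self-contained model of that depth bookkeeping (the link graph and names are invented; no HTTP or RDF involved):

  # Hypothetical stand-in for Agent#map: one 'fetch' per item, children enqueued.
  LINKS = { 'a' => ['b', 'c'], 'b' => ['c'], 'c' => [] }

  def map_item args, queue
    uri, depth = args[:uri], args[:depth]
    # As in the diff, subresources go back on the queue with depth - 1
    LINKS[uri].each { |sub| queue << { :uri => sub, :depth => depth - 1 } } if depth > 0
    [[uri, 'links_to', LINKS[uri]]]   # this item's 'triples'
  end

  queue, results = [{ :uri => 'a', :depth => 1 }], []
  results += map_item(queue.shift, queue) until queue.empty?
  p results.map(&:first)   # => ["a", "b", "c"]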
data/lib/scrappy/agent/cache.rb
ADDED
@@ -0,0 +1,37 @@
+require 'monitor'
+
+module Scrappy
+  module Cached
+    def self.included base
+      base.extend Cached::ClassMethods
+    end
+
+    module ClassMethods
+      def cache
+        @cache ||= Cache.new
+      end
+    end
+
+    def cache
+      self.class.cache
+    end
+  end
+
+  class Cache < Hash
+    include MonitorMixin
+
+    def expire! timeout
+      synchronize do
+        keys.each { |req| delete(req) if Time.now.to_i - self[req][:time].to_i > timeout }
+      end
+    end
+
+    def []= key, value
+      synchronize { super }
+    end
+
+    def [] key
+      synchronize { super }
+    end
+  end
+end
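Cached gives every instance access to a single cache shared at the class level, and Cache wraps Hash reads and writes in a monitor so the worker threads introduced in this release can use it concurrently. A short usage sketch reusing the Cache class above (Fetcher is a hypothetical client, not part of the gem):

  require 'monitor'

  class Cache < Hash                 # as defined in cache.rb above
    include MonitorMixin
    def expire! timeout
      synchronize { keys.each { |k| delete(k) if Time.now.to_i - self[k][:time].to_i > timeout } }
    end
    def []=(key, value); synchronize { super }; end
    def [](key);          synchronize { super }; end
  end

  class Fetcher                      # hypothetical; stands in for Scrappy::Agent
    def self.cache; @cache ||= Cache.new; end   # what Cached::ClassMethods provides
    def cache; self.class.cache; end            # what including Cached provides
  end

  Fetcher.new.cache[:req] = { :time => Time.now, :response => 'cached!' }
  Fetcher.cache.expire! 300                     # drop entries older than 5 minutes
  puts Fetcher.new.cache[:req][:response]       # => "cached!" (one cache, class-wide)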
data/lib/scrappy/agent/extractor.rb
CHANGED
@@ -79,17 +79,21 @@ module Scrappy
       # From "BaseUriSelector" to "base_uri"
       class_name = selector.rdf::type.first.to_s.split('#').last
 
-      # Process selector
-      results = Kernel.const_get(class_name).filter selector, doc
-
       if !selector.sc::debug.empty?
         puts '== DEBUG'
         puts '== Selector:'
         puts selector.serialize(:yarf, false)
-        puts '== …
+        puts '== On fragment:'
         puts "URI: #{doc[:uri]}"
         puts "Content: #{doc[:content]}"
         puts "Value: #{doc[:value]}"
+      end
+
+      # Process selector
+      results = Kernel.const_get(class_name).filter selector, doc
+
+      if !selector.sc::debug.empty?
+        puts "== No results" if results.empty?
       results.each_with_index do |result, i|
         puts "== Result ##{i}:"
         puts "URI: #{result[:uri]}"
data/lib/scrappy/agent/map_reduce.rb
ADDED
@@ -0,0 +1,66 @@
+require 'thread'
+require 'monitor'
+
+module MapReduce
+
+  class Queue
+    def initialize
+      @items = []
+      @items.extend MonitorMixin
+    end
+
+    def pop
+      yielded = false
+      item = nil
+      @items.synchronize do
+        item = @items.shift
+        if @items.empty?
+          yield item if (block_given? and item)
+          yielded = true
+        end
+      end
+      yield item if (block_given? and not yielded)
+      item
+    end
+
+    def << value
+      @items << value
+    end
+
+    def push value
+      self << value
+    end
+
+    def empty?
+      @items.synchronize { @items.empty? }
+    end
+  end
+
+
+  def cluster
+    @cluster ||= (1..@cluster_count || 1).map { self.class.new(*(@cluster_options || [])) }
+  end
+
+  def process list
+    results = []
+    results.extend MonitorMixin
+
+    queue = Queue.new
+    list.each { |element| queue << element }
+
+    cluster.map { |obj| Thread.new { obj.work queue, results } }.each { |t| t.join }
+
+    reduce results
+  end
+
+
+  def work queue, results
+    begin
+      queue.pop do |item|
+        result = map item, queue
+        results.synchronize { results << result }
+      end
+    end until queue.empty?
+  end
+
+end
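Any class that includes MapReduce and supplies map and reduce gets the thread pool for free: process builds @cluster_count sibling instances, each worker pops items until the shared queue drains, and reduce folds the collected results. A hypothetical include (Squarer and its numbers are invented; with the module above loaded, this runs as written):

  class Squarer
    include MapReduce

    def initialize
      @cluster_count = 4        # four parallel workers, like Agent's :workers
    end

    def map item, queue=nil
      [item * item]             # one result array per queue item
    end

    def reduce results
      results.inject([]) { |all, r| all + r }
    end
  end

  p Squarer.new.process([1, 2, 3, 4]).sort   # => [1, 4, 9, 16]

Note how Queue#pop runs the block inside the monitor when it hands out the last item: anything the block pushes back (as Agent#map does with subresources) lands before other workers' queue.empty? checks can let them exit.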
data/lib/scrappy/selectors/xpath.rb
CHANGED
@@ -6,7 +6,8 @@ module XPathSelector
     else
       (0..-1)
     end
-    (doc[:content].search(pattern)[interval] || []).map do |result|
+    patterns = selector.sc::keyword
+    (doc[:content].search(pattern)[interval] || []).select { |node| patterns.any? ? patterns.include?(node.text.downcase.strip) : true }.map do |result|
       if selector.sc::attribute.first
         # Select node's attribute if given
         selector.sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>result, :value=>result[attribute] } }
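The new sc:keyword filter discards matched nodes whose text (downcased and stripped) is not among the selector's keywords, before the usual attribute or content extraction. A standalone illustration with Nokogiri (HTML and keyword list invented; in the gem the patterns come from the selector's RDF description via selector.sc::keyword):

  require 'nokogiri'

  html     = Nokogiri::HTML('<ul><li>News</li><li>Sports</li><li> weather </li></ul>')
  patterns = ['news', 'weather']     # an empty list would mean: keep everything

  nodes = html.search('//li').select do |node|
    patterns.any? ? patterns.include?(node.text.downcase.strip) : true
  end
  p nodes.map { |n| n.text.strip }   # => ["News", "weather"]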
data/scrappy.gemspec
CHANGED
@@ -2,17 +2,17 @@
 
 Gem::Specification.new do |s|
   s.name = %q{scrappy}
-  s.version = "0.1.4"
+  s.version = "0.1.5"
 
   s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
   s.authors = ["Jose Ignacio"]
-  s.date = %q{2010-11-…}
+  s.date = %q{2010-11-29}
   s.default_executable = %q{scrappy}
   s.description = %q{RDF web scraper}
   s.email = %q{joseignacio.fernandez@gmail.com}
   s.executables = ["scrappy"]
-  s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/…
-  s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/…
+  s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb"]
+  s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb", "test/test_helper.rb", "test/test_scrappy.rb", "scrappy.gemspec"]
   s.homepage = %q{http://github.com/josei/scrappy}
   s.post_install_message = %q{**(Optional) Remember to install rbwebkitgtk for visual parsing features**}
   s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Scrappy", "--main", "README.rdoc"]
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
 segments:
 - 0
 - 1
-- 4
-version: 0.1.4
+- 5
+version: 0.1.5
 platform: ruby
 authors:
 - Jose Ignacio
@@ -14,7 +14,7 @@ autorequire:
 bindir: bin
 cert_chain: []
 
-date: 2010-11-…
+date: 2010-11-29 00:00:00 +01:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -126,7 +126,8 @@ extra_rdoc_files:
 - lib/scrappy.rb
 - lib/scrappy/agent/agent.rb
 - lib/scrappy/agent/blind_agent.rb
-- lib/scrappy/agent/cluster.rb
+- lib/scrappy/agent/cache.rb
+- lib/scrappy/agent/map_reduce.rb
 - lib/scrappy/agent/extractor.rb
 - lib/scrappy/agent/visual_agent.rb
 - lib/scrappy/proxy.rb
@@ -152,7 +153,8 @@ files:
 - lib/scrappy.rb
 - lib/scrappy/agent/agent.rb
 - lib/scrappy/agent/blind_agent.rb
-- lib/scrappy/agent/cluster.rb
+- lib/scrappy/agent/cache.rb
+- lib/scrappy/agent/map_reduce.rb
 - lib/scrappy/agent/extractor.rb
 - lib/scrappy/agent/visual_agent.rb
 - lib/scrappy/proxy.rb
data/lib/scrappy/agent/cluster.rb
DELETED
@@ -1,35 +0,0 @@
-module Cluster
-
-  def self.included(klass)
-    klass.extend ClassMethods
-    klass.extend MonitorMixin
-  end
-
-  def consume(list, results, args={})
-    begin
-      element = list.synchronize { list.pop }
-      unless element.nil?
-        result = process(element, args)
-        results.synchronize { results << result }
-      end
-    end until element.nil?
-  end
-
-  module ClassMethods
-    def cluster; @cluster; end
-    def cluster= value; @cluster=value; end
-
-    def create_cluster count, *args
-      self.cluster = (1..count).map { args.nil? ? create : create(*args) }
-    end
-
-    def process(list=[], args={})
-      results = []
-      list.extend MonitorMixin
-      results.extend MonitorMixin
-      cluster.map { |o| Thread.new { o.consume(list, results, args) } }.each { |t| t.join }
-      results
-    end
-  end
-
-end