scrappy 0.1.4 → 0.1.5
- data/History.txt +4 -0
- data/Manifest +2 -1
- data/bin/scrappy +4 -8
- data/lib/scrappy.rb +3 -3
- data/lib/scrappy/agent/agent.rb +60 -52
- data/lib/scrappy/agent/cache.rb +37 -0
- data/lib/scrappy/agent/extractor.rb +8 -4
- data/lib/scrappy/agent/map_reduce.rb +66 -0
- data/lib/scrappy/selectors/xpath.rb +2 -1
- data/scrappy.gemspec +4 -4
- metadata +7 -5
- data/lib/scrappy/agent/cluster.rb +0 -35
data/History.txt
CHANGED
data/Manifest
CHANGED
@@ -8,7 +8,8 @@ lib/js/annotator.js
 lib/scrappy.rb
 lib/scrappy/agent/agent.rb
 lib/scrappy/agent/blind_agent.rb
-lib/scrappy/agent/cluster.rb
+lib/scrappy/agent/cache.rb
+lib/scrappy/agent/map_reduce.rb
 lib/scrappy/agent/extractor.rb
 lib/scrappy/agent/visual_agent.rb
 lib/scrappy/proxy.rb
data/bin/scrappy
CHANGED
@@ -28,7 +28,7 @@ module Scrappy
 
   def initialize
     Options.port = 3434
-    Options.concurrence = 10
+    Agent::Options.workers = 10
     Agent::Options.depth = 1
     args = ARGV.map { |arg| arg.split(" ") }.flatten
 
@@ -41,10 +41,10 @@ module Scrappy
       opts.on('-s', '--server') { Options.server = true }
       opts.on('-S', '--proxy-server') { Options.proxy = true }
       opts.on('-P P', '--port P') { |p| Options.port = p }
-      opts.on('-c C', '--concurrence C') { |c| Options.concurrence = c.to_i }
-      opts.on('-d D', '--delay D') { |d| Agent::Options.delay = d; Options.concurrence = 1 }
+      opts.on('-c C', '--concurrence C') { |c| Agent::Options.workers = c.to_i }
+      opts.on('-d D', '--delay D') { |d| Agent::Options.delay = d; Agent::Options.workers = 1 }
       opts.on('-l L', '--levels L') { |l| Agent::Options.depth = l.to_i }
-      opts.on('-v', '--visual') { Agent::Options.agent = :visual; Options.concurrence = 1 }
+      opts.on('-v', '--visual') { Agent::Options.agent = :visual; Agent::Options.workers = 1 }
      opts.on('-r', '--reference') { Agent::Options.referenceable = :minimum }
       opts.on('-R', '--reference-all') { Agent::Options.referenceable = :dump }
       opts.on('-w', '--window') { Agent::Options.window = true }
@@ -139,10 +139,6 @@ Copyright
       open(cache_file, "w") { |f| Marshal.dump(data, f) }
       data
     end
-
-    # Create cluster of agents
-    Agent.create_cluster Options.concurrence, :referenceable=>Agent::Options.referenceable,
-                         :agent=>Agent::Options.agent, :window=>false
   end
 end
 
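The net effect of the bin/scrappy changes: the worker count moved from the executable's own Options.concurrence into Agent::Options.workers, so the same knob is now reachable from library code, and the eager Agent.create_cluster call is gone (workers are built lazily, see map_reduce.rb below). A hypothetical programmatic equivalent of the new CLI flags — not from the gem's docs, just the OpenStruct options the diff shows:

  require 'scrappy'

  Scrappy::Agent::Options.workers = 10   # -c 10; was Options.concurrence in 0.1.4
  Scrappy::Agent::Options.depth   = 1    # -l 1
  Scrappy::Agent::Options.delay   = 500  # -d 500 (the CLI additionally forces workers = 1)
  agent = Scrappy::Agent.create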
data/lib/scrappy.rb
CHANGED
@@ -3,7 +3,6 @@ $:.unshift(File.dirname(__FILE__)) unless
 
 require 'nokogiri'
 require 'thread'
-require 'monitor'
 require 'mechanize'
 require 'ostruct'
 require 'active_support'
@@ -13,13 +12,14 @@ require 'lightrdf'
 require 'scrappy/support'
 
 require 'scrappy/agent/extractor'
-require 'scrappy/agent/cluster'
+require 'scrappy/agent/map_reduce'
+require 'scrappy/agent/cache'
 require 'scrappy/agent/agent'
 
 Namespace :sc, 'http://lab.gsi.dit.upm.es/scraping.rdf#'
 
 module Scrappy
-  VERSION = '0.1.4'
+  VERSION = '0.1.5'
 end
 
 # Require selectors
data/lib/scrappy/agent/agent.rb
CHANGED
@@ -1,10 +1,11 @@
 module Scrappy
   class Agent
-    include Extractor
     include MonitorMixin
-    include Cluster
+    include Extractor
+    include MapReduce
+    include Cached
 
-    Options = OpenStruct.new :format=>:yarf, :depth=>0, :agent=>:blind, :delay=>0
+    Options = OpenStruct.new :format=>:yarf, :depth=>0, :agent=>:blind, :delay=>0, :workers=>10
     ContentTypes = { :png => 'image/png', :rdfxml => 'application/rdf+xml',
                      :rdf => 'application/rdf+xml' }
 
@@ -14,9 +15,6 @@ module Scrappy
     def self.[] id
       pool[id] || Agent.create(:id=>id)
     end
-    def self.cache
-      @cache ||= {}
-    end
 
     def self.create args={}
       if (args[:agent] || Options.agent) == :visual
@@ -32,72 +30,82 @@ module Scrappy
 
     def initialize args={}
       super()
+      @cluster_count = args[:workers] || Options.workers
+      @cluster_options = [ { :referenceable=>Options.referenceable, :agent=>Options.agent,
+                             :workers=>1, :window=>false } ]
+      @cluster = args[:parent]
       @id = args[:id] || Agent.pool.keys.size
       Agent.pool[@id] = self
       @kb = args[:kb] || Options.kb
       @options = Options.clone
     end
 
-    def request args={}
-      …
-      …
-      request = { :method=>:get, :inputs=>{} }.merge :method=>args[:method], :uri=>complete_uri(args[:uri]), :inputs=>args[:inputs]||{}
+    def map args, queue=nil
+      depth = args[:depth]
+      request = { :method=>args[:method]||:get, :uri=>complete_uri(args[:uri]), :inputs=>args[:inputs]||{} }
 
-      …
-      …
+      # Expire cache
+      cache.expire! 300 # 5 minutes
 
-      …
-      …
-      …
+      # Lookup in cache
+      triples = if cache[request]
+        cache[request][:response]
+      else
+        # Perform the request
+
+        sleep 0.001 * options.delay.to_f # Sleep if requested
+
+        if request[:method] == :get
+          self.uri = request[:uri]
+        else
+          raise Exception, 'POST requests not supported yet'
+        end
+
+        response = if self.html_data?
+          add_visual_data! if options.referenceable # Adds tags including visual information
+          extract self.uri, html, options.referenceable # Extract data
        else
-        …
-        …
-        self.uri = request[:uri]
-      else
-        raise Exception, 'POST requests not supported yet'
-      end
-
-      response = if self.html_data?
-        add_visual_data! if options.referenceable # Adds tags including visual information
-        extract self.uri, html, options.referenceable # Extract data
-      else
-        []
-      end
+          []
+        end
 
-      …
-      …
-      …
+        # Cache the request
+        cache[request] = { :time=>Time.now, :response=>response }
+        cache[request.merge(:uri=>self.uri)] = { :time=>Time.now, :response=>response } unless self.uri.nil?
 
-      …
-      …
+        response
+      end
 
-      …
-      …
-      …
-      …
+      # Enqueue subresources
+      if depth > 0
+        items = (triples.map{|t| [t[0],t[2]]}.flatten-[Node(self.uri)]).uniq.select{|n| n.is_a?(RDF::Node) and n.id.is_a?(URI)}.map { |uri| {:uri=>uri.to_s, :depth=>depth-1} }
+        if queue.nil?
+          triples += process items
+        else
+          items.each { |item| queue << item }
        end
-      …
-      RDF::Graph.new(triples.uniq)
       end
+
+      triples
+    end
+
+    def reduce results
+      triples = []; results.each { |result| triples += result }
+      triples
+    end
+
+    def request args={}
+      RDF::Graph.new map(args).uniq
     end
 
     def proxy args={}
-      …
-      request = { :method=>:get, :inputs=>{}, :format=>options.format, :depth=>options.depth }.merge(args)
+      request = { :method=>:get, :inputs=>{}, :format=>options.format, :depth=>options.depth }.merge(args)
 
-      …
-      …
-      …
-      …
-      end
+      OpenStruct.new :output => self.request(request).serialize(request[:format]),
+                     :content_type => ContentTypes[request[:format]] || 'text/plain',
+                     :uri => self.uri,
+                     :status => self.html_data? ? (self.uri == request[:uri] ? :ok : :redirect) : :error
     end
 
-    # Method used when consuming a list of uris
-    def process uri, args={}
-      sleep 0.001 * options.delay.to_f
-      request(:method=>:get, :uri=>uri, :depth=>args[:depth]).triples
-    end
-
     def complete_uri uri
       uri = "#{uri}.com" if uri =~ /\A\w+\Z/
      uri = "http://#{uri}" if uri.index(/\A\w*:/) != 0
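The reworked Agent in short: request is now a thin wrapper that builds an RDF graph from a single map pass; map answers from the shared cache or fetches-and-extracts; and subresource URIs are either recursed into directly (process items) or pushed onto a worker queue. A hypothetical call, with knowledge-base setup elided (method names and the :depth option are taken from the diff above):

  agent = Scrappy::Agent.create
  graph = agent.request :method=>:get, :uri=>'example.com', :depth=>1  # also crawls linked subresources one level deep
  puts graph.serialize(:yarf)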
data/lib/scrappy/agent/cache.rb
ADDED
@@ -0,0 +1,37 @@
+require 'monitor'
+
+module Scrappy
+  module Cached
+    def self.included base
+      base.extend Cached::ClassMethods
+    end
+
+    module ClassMethods
+      def cache
+        @cache ||= Cache.new
+      end
+    end
+
+    def cache
+      self.class.cache
+    end
+  end
+
+  class Cache < Hash
+    include MonitorMixin
+
+    def expire! timeout
+      synchronize do
+        keys.each { |req| delete(req) if Time.now.to_i - self[req][:time].to_i > timeout }
+      end
+    end
+
+    def []= key, value
+      synchronize { super }
+    end
+
+    def [] key
+      synchronize { super }
+    end
+  end
+end
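Cached gives the including class a single class-level Cache — a Hash guarded by a monitor — that all instances share, which is how every Agent (and its workers) sees the same cached responses. A minimal sketch of a hypothetical consumer (Fetcher is not part of the gem); note that entries must carry the :time key that expire! inspects:

  class Fetcher
    include Scrappy::Cached            # one cache per class, shared by all instances

    def fetch uri
      cache.expire! 300                # drop entries older than 5 minutes, as Agent#map does
      if (hit = cache[uri])
        hit[:response]                 # cache hit
      else
        response = "<html>...</html>"  # stand-in for a real HTTP request
        cache[uri] = { :time=>Time.now, :response=>response }
        response
      end
    end
  end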
data/lib/scrappy/agent/extractor.rb
CHANGED
@@ -79,17 +79,21 @@ module Scrappy
       # From "BaseUriSelector" to "base_uri"
       class_name = selector.rdf::type.first.to_s.split('#').last
 
-      # Process selector
-      results = Kernel.const_get(class_name).filter selector, doc
-
       if !selector.sc::debug.empty?
         puts '== DEBUG'
         puts '== Selector:'
         puts selector.serialize(:yarf, false)
-        puts '== …
+        puts '== On fragment:'
         puts "URI: #{doc[:uri]}"
         puts "Content: #{doc[:content]}"
         puts "Value: #{doc[:value]}"
+      end
+
+      # Process selector
+      results = Kernel.const_get(class_name).filter selector, doc
+
+      if !selector.sc::debug.empty?
+        puts "== No results" if results.empty?
       results.each_with_index do |result, i|
         puts "== Result ##{i}:"
         puts "URI: #{result[:uri]}"
data/lib/scrappy/agent/map_reduce.rb
ADDED
@@ -0,0 +1,66 @@
+require 'thread'
+require 'monitor'
+
+module MapReduce
+
+  class Queue
+    def initialize
+      @items = []
+      @items.extend MonitorMixin
+    end
+
+    def pop
+      yielded = false
+      item = nil
+      @items.synchronize do
+        item = @items.shift
+        if @items.empty?
+          yield item if (block_given? and item)
+          yielded = true
+        end
+      end
+      yield item if (block_given? and not yielded)
+      item
+    end
+
+    def << value
+      @items << value
+    end
+
+    def push value
+      self << value
+    end
+
+    def empty?
+      @items.synchronize { @items.empty? }
+    end
+  end
+
+
+  def cluster
+    @cluster ||= (1..@cluster_count || 1).map { self.class.new(*(@cluster_options || [])) }
+  end
+
+  def process list
+    results = []
+    results.extend MonitorMixin
+
+    queue = Queue.new
+    list.each { |element| queue << element }
+
+    cluster.map { |obj| Thread.new { obj.work queue, results } }.each { |t| t.join }
+
+    reduce results
+  end
+
+
+  def work queue, results
+    begin
+      queue.pop do |item|
+        result = map item, queue
+        results.synchronize { results << result }
+      end
+    end until queue.empty?
+  end
+
+end
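The contract: the including class supplies map(item, queue) and reduce(results), and sets @cluster_count / @cluster_options so the mixin can lazily build its pool of worker instances (Agent does this in its initializer, passing the options hash shown above). A self-contained toy example — WordCounter is illustrative only, not gem code:

  class WordCounter
    include MapReduce

    def initialize *args
      @cluster_count   = 4   # size of the worker pool, like Agent's :workers option
      @cluster_options = []  # constructor arguments for each worker instance
    end

    # Each worker calls map with one queue item; it could also push
    # follow-up items onto `queue`, the way Agent#map enqueues subresources.
    def map item, queue=nil
      item.split.size
    end

    def reduce results
      results.inject(0) { |sum, n| sum + n }
    end
  end

  WordCounter.new.process ["a b c", "d e"]  # => 5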
data/lib/scrappy/selectors/xpath.rb
CHANGED
@@ -6,7 +6,8 @@ module XPathSelector
     else
       (0..-1)
     end
-    (doc[:content].search(pattern)[interval] || []).map do |result|
+    patterns = selector.sc::keyword
+    (doc[:content].search(pattern)[interval] || []).select { |node| patterns.any? ? patterns.include?(node.text.downcase.strip) : true }.map do |result|
       if selector.sc::attribute.first
         # Select node's attribute if given
         selector.sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>result, :value=>result[attribute] } }
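The new sc::keyword filter keeps only matched nodes whose stripped, downcased text equals one of the given keywords; with no keywords, everything passes. A standalone illustration of that select (sample HTML and the patterns array are invented for the example):

  require 'nokogiri'

  doc      = Nokogiri::HTML('<ul><li>News </li><li>Sports</li><li>Opinion</li></ul>')
  patterns = ['news', 'sports']  # plays the role of selector.sc::keyword
  nodes    = doc.search('//li').select { |node| patterns.any? ? patterns.include?(node.text.downcase.strip) : true }
  puts nodes.map { |n| n.text.strip }.inspect  # => ["News", "Sports"]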
data/scrappy.gemspec
CHANGED
@@ -2,17 +2,17 @@
 
 Gem::Specification.new do |s|
   s.name = %q{scrappy}
-  s.version = "0.1.4"
+  s.version = "0.1.5"
 
   s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
   s.authors = ["Jose Ignacio"]
-  s.date = %q{2010-11-…}
+  s.date = %q{2010-11-29}
   s.default_executable = %q{scrappy}
   s.description = %q{RDF web scraper}
   s.email = %q{joseignacio.fernandez@gmail.com}
   s.executables = ["scrappy"]
-  s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cluster.rb", …]
-  s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cluster.rb", …]
+  s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb"]
+  s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb", "test/test_helper.rb", "test/test_scrappy.rb", "scrappy.gemspec"]
   s.homepage = %q{http://github.com/josei/scrappy}
   s.post_install_message = %q{**(Optional) Remember to install rbwebkitgtk for visual parsing features**}
   s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Scrappy", "--main", "README.rdoc"]
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
   segments:
   - 0
   - 1
-  - 4
-  version: 0.1.4
+  - 5
+  version: 0.1.5
 platform: ruby
 authors:
 - Jose Ignacio
@@ -14,7 +14,7 @@ autorequire:
 bindir: bin
 cert_chain: []
 
-date: 2010-11-…
+date: 2010-11-29 00:00:00 +01:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -126,7 +126,8 @@ extra_rdoc_files:
 - lib/scrappy.rb
 - lib/scrappy/agent/agent.rb
 - lib/scrappy/agent/blind_agent.rb
-- lib/scrappy/agent/cluster.rb
+- lib/scrappy/agent/cache.rb
+- lib/scrappy/agent/map_reduce.rb
 - lib/scrappy/agent/extractor.rb
 - lib/scrappy/agent/visual_agent.rb
 - lib/scrappy/proxy.rb
@@ -152,7 +153,8 @@ files:
 - lib/scrappy.rb
 - lib/scrappy/agent/agent.rb
 - lib/scrappy/agent/blind_agent.rb
-- lib/scrappy/agent/cluster.rb
+- lib/scrappy/agent/cache.rb
+- lib/scrappy/agent/map_reduce.rb
 - lib/scrappy/agent/extractor.rb
 - lib/scrappy/agent/visual_agent.rb
 - lib/scrappy/proxy.rb
data/lib/scrappy/agent/cluster.rb
REMOVED
@@ -1,35 +0,0 @@
-module Cluster
-
-  def self.included(klass)
-    klass.extend ClassMethods
-    klass.extend MonitorMixin
-  end
-
-  def consume(list, results, args={})
-    begin
-      element = list.synchronize { list.pop }
-      unless element.nil?
-        result = process(element, args)
-        results.synchronize { results << result }
-      end
-    end until element.nil?
-  end
-
-  module ClassMethods
-    def cluster; @cluster; end
-    def cluster= value; @cluster=value; end
-
-    def create_cluster count, *args
-      self.cluster = (1..count).map { args.nil? ? create : create(*args) }
-    end
-
-    def process(list=[], args={})
-      results = []
-      list.extend MonitorMixin
-      results.extend MonitorMixin
-      cluster.map { |o| Thread.new { o.consume(list, results, args) } }.each { |t| t.join }
-      results
-    end
-  end
-
-end