scrappy 0.1.4 → 0.1.5

data/History.txt CHANGED
@@ -1,3 +1,7 @@
+=== 0.1.5 2010-11-29
+
+* Better map-reduce-based design
+
 === 0.1.4 2010-11-24
 
 * Support for node sets in extractions
data/Manifest CHANGED
@@ -8,7 +8,8 @@ lib/js/annotator.js
 lib/scrappy.rb
 lib/scrappy/agent/agent.rb
 lib/scrappy/agent/blind_agent.rb
-lib/scrappy/agent/cluster.rb
+lib/scrappy/agent/cache.rb
+lib/scrappy/agent/map_reduce.rb
 lib/scrappy/agent/extractor.rb
 lib/scrappy/agent/visual_agent.rb
 lib/scrappy/proxy.rb
data/bin/scrappy CHANGED
@@ -28,7 +28,7 @@ module Scrappy
 
   def initialize
     Options.port = 3434
-    Options.concurrence = 10
+    Agent::Options.workers = 10
     Agent::Options.depth = 1
     args = ARGV.map { |arg| arg.split(" ") }.flatten
 
@@ -41,10 +41,10 @@ module Scrappy
     opts.on('-s', '--server')          { Options.server = true }
     opts.on('-S', '--proxy-server')    { Options.proxy = true }
     opts.on('-P P', '--port P')        { |p| Options.port = p }
-    opts.on('-c C', '--concurrence C') { |c| Options.concurrence = c.to_i }
-    opts.on('-d D', '--delay D')       { |d| Agent::Options.delay = d; Options.concurrence = 1 }
+    opts.on('-c C', '--concurrence C') { |c| Agent::Options.workers = c.to_i }
+    opts.on('-d D', '--delay D')       { |d| Agent::Options.delay = d; Agent::Options.workers = 1 }
     opts.on('-l L', '--levels L')      { |l| Agent::Options.depth = l.to_i }
-    opts.on('-v', '--visual')          { Agent::Options.agent = :visual; Options.concurrence = 1 }
+    opts.on('-v', '--visual')          { Agent::Options.agent = :visual; Agent::Options.workers = 1 }
     opts.on('-r', '--reference')       { Agent::Options.referenceable = :minimum }
     opts.on('-R', '--reference-all')   { Agent::Options.referenceable = :dump }
     opts.on('-w', '--window')          { Agent::Options.window = true }
@@ -139,10 +139,6 @@ Copyright
       open(cache_file, "w") { |f| Marshal.dump(data, f) }
       data
     end
-
-    # Create cluster of agents
-    Agent.create_cluster Options.concurrence, :referenceable=>Agent::Options.referenceable,
-                                              :agent=>Agent::Options.agent, :window=>false
   end
 end
 
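Note on usage: the `-c` flag now sets the shared worker count (`Agent::Options.workers`) rather than a separate concurrence value, and both the visual agent and a request delay force a single worker. A hypothetical invocation; `<uri>` is a placeholder, and the exact fetch syntax is not shown in this diff:

    scrappy -c 8 -l 1 <uri>    # 8 workers, follow links 1 level deep
    scrappy -d 500 <uri>       # 500 ms between requests; workers forced to 1
    scrappy -v <uri>           # visual agent; workers forced to 1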
data/lib/scrappy.rb CHANGED
@@ -3,7 +3,6 @@ $:.unshift(File.dirname(__FILE__)) unless
 
 require 'nokogiri'
 require 'thread'
-require 'monitor'
 require 'mechanize'
 require 'ostruct'
 require 'active_support'
@@ -13,13 +12,14 @@ require 'lightrdf'
 require 'scrappy/support'
 
 require 'scrappy/agent/extractor'
-require 'scrappy/agent/cluster'
+require 'scrappy/agent/map_reduce'
+require 'scrappy/agent/cache'
 require 'scrappy/agent/agent'
 
 Namespace :sc, 'http://lab.gsi.dit.upm.es/scraping.rdf#'
 
 module Scrappy
-  VERSION = '0.1.4'
+  VERSION = '0.1.5'
 end
 
 # Require selectors
data/lib/scrappy/agent/agent.rb CHANGED
@@ -1,10 +1,11 @@
 module Scrappy
   class Agent
-    include Extractor
     include MonitorMixin
-    include Cluster
+    include Extractor
+    include MapReduce
+    include Cached
 
-    Options = OpenStruct.new :format=>:yarf, :depth=>0, :agent=>:blind, :delay=>0
+    Options = OpenStruct.new :format=>:yarf, :depth=>0, :agent=>:blind, :delay=>0, :workers=>10
     ContentTypes = { :png => 'image/png', :rdfxml => 'application/rdf+xml',
                      :rdf => 'application/rdf+xml' }
 
@@ -14,9 +15,6 @@ module Scrappy
     def self.[] id
       pool[id] || Agent.create(:id=>id)
     end
-    def self.cache
-      @cache ||= {}
-    end
 
     def self.create args={}
       if (args[:agent] || Options.agent) == :visual
@@ -32,72 +30,82 @@ module Scrappy
 
     def initialize args={}
       super()
+      @cluster_count = args[:workers] || Options.workers
+      @cluster_options = [ { :referenceable=>Options.referenceable, :agent=>Options.agent,
+                             :workers=>1, :window=>false } ]
+      @cluster = args[:parent]
       @id = args[:id] || Agent.pool.keys.size
       Agent.pool[@id] = self
       @kb = args[:kb] || Options.kb
       @options = Options.clone
     end
 
-    def request args={}
-      synchronize do
-        depth = args[:depth]
-        request = { :method=>:get, :inputs=>{} }.merge :method=>args[:method], :uri=>complete_uri(args[:uri]), :inputs=>args[:inputs]||{}
+    def map args, queue=nil
+      depth = args[:depth]
+      request = { :method=>args[:method]||:get, :uri=>complete_uri(args[:uri]), :inputs=>args[:inputs]||{} }
 
-        # Expire cache
-        Agent::cache.keys.each { |req| Agent::cache.delete(req) if Time.now.to_i - Agent::cache[req][:time].to_i > 300 }
+      # Expire cache
+      cache.expire! 300 # 5 minutes
 
-        # Lookup in cache
-        triples = if Agent::cache[request]
-          Agent::cache[request][:response]
+      # Lookup in cache
+      triples = if cache[request]
+        cache[request][:response]
+      else
+        # Perform the request
+
+        sleep 0.001 * options.delay.to_f # Sleep if requested
+
+        if request[:method] == :get
+          self.uri = request[:uri]
+        else
+          raise Exception, 'POST requests not supported yet'
+        end
+
+        response = if self.html_data?
+          add_visual_data! if options.referenceable # Adds tags including visual information
+          extract self.uri, html, options.referenceable # Extract data
         else
-          # Perform the request
-          if request[:method] == :get
-            self.uri = request[:uri]
-          else
-            raise Exception, 'POST requests not supported yet'
-          end
-
-          response = if self.html_data?
-            add_visual_data! if options.referenceable # Adds tags including visual information
-            extract self.uri, html, options.referenceable # Extract data
-          else
-            []
-          end
+          []
+        end
 
-          # Cache the request
-          Agent::cache[request] = { :time=>Time.now, :response=>response }
-          Agent::cache[request.merge(:uri=>self.uri)] = { :time=>Time.now, :response=>response } unless self.uri.nil?
+        # Cache the request
+        cache[request] = { :time=>Time.now, :response=>response }
+        cache[request.merge(:uri=>self.uri)] = { :time=>Time.now, :response=>response } unless self.uri.nil?
 
-          response
-        end
+        response
+      end
 
-        # Iterate through subresources
-        if depth > 0
-          uris = (triples.map{|t| [t[0],t[2]]}.flatten-[Node(self.uri)]).uniq.select{|n| n.is_a?(RDF::Node) and n.id.is_a?(URI)}.map(&:to_s)
-          Agent.process(uris, :depth=>depth-1).each { |result| triples += result }
+      # Enqueue subresources
+      if depth > 0
+        items = (triples.map{|t| [t[0],t[2]]}.flatten-[Node(self.uri)]).uniq.select{|n| n.is_a?(RDF::Node) and n.id.is_a?(URI)}.map { |uri| {:uri=>uri.to_s, :depth=>depth-1} }
+        if queue.nil?
+          triples += process items
+        else
+          items.each { |item| queue << item }
        end
-
-        RDF::Graph.new(triples.uniq)
      end
+
+      triples
+    end
+
+    def reduce results
+      triples = []; results.each { |result| triples += result }
+      triples
+    end
+
+    def request args={}
+      RDF::Graph.new map(args).uniq
    end
 
    def proxy args={}
-      synchronize do
-        request = { :method=>:get, :inputs=>{}, :format=>options.format, :depth=>options.depth }.merge(args)
+      request = { :method=>:get, :inputs=>{}, :format=>options.format, :depth=>options.depth }.merge(args)
 
-        OpenStruct.new :output => self.request(request).serialize(request[:format]),
-                       :content_type => ContentTypes[request[:format]] || 'text/plain',
-                       :uri => self.uri,
-                       :status => self.html_data? ? (self.uri == request[:uri] ? :ok : :redirect) : :error
-      end
+      OpenStruct.new :output => self.request(request).serialize(request[:format]),
+                     :content_type => ContentTypes[request[:format]] || 'text/plain',
+                     :uri => self.uri,
+                     :status => self.html_data? ? (self.uri == request[:uri] ? :ok : :redirect) : :error
    end
 
-    # Method used when consuming a list of uris
-    def process uri, args={}
-      sleep 0.001 * options.delay.to_f
-      request(:method=>:get, :uri=>uri, :depth=>args[:depth]).triples
-    end
-
    def complete_uri uri
      uri = "#{uri}.com" if uri =~ /\A\w+\Z/
      uri = "http://#{uri}" if uri.index(/\A\w*:/) != 0
data/lib/scrappy/agent/cache.rb ADDED
@@ -0,0 +1,37 @@
+require 'monitor'
+
+module Scrappy
+  module Cached
+    def self.included base
+      base.extend Cached::ClassMethods
+    end
+
+    module ClassMethods
+      def cache
+        @cache ||= Cache.new
+      end
+    end
+
+    def cache
+      self.class.cache
+    end
+  end
+
+  class Cache < Hash
+    include MonitorMixin
+
+    def expire! timeout
+      synchronize do
+        keys.each { |req| delete(req) if Time.now.to_i - self[req][:time].to_i > timeout }
+      end
+    end
+
+    def []= key, value
+      synchronize { super }
+    end
+
+    def [] key
+      synchronize { super }
+    end
+  end
+end
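The new `Cache` is a `Hash` guarded by a monitor, so concurrent workers can read and write it safely, and `expire!` sweeps entries whose `:time` stamp is older than the timeout. A standalone sketch; the key is arbitrary, and entries must carry a `:time` as they do in agent.rb:

    cache = Scrappy::Cache.new
    key   = { :method=>:get, :uri=>'http://example.com' }  # illustrative key
    cache[key] = { :time=>Time.now, :response=>[] }        # thread-safe write
    cache.expire! 300                                      # drop entries older than 300 s
    cache[key]                                             # thread-safe read; nil once expired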
data/lib/scrappy/agent/extractor.rb CHANGED
@@ -79,17 +79,21 @@ module Scrappy
     # From "BaseUriSelector" to "base_uri"
     class_name = selector.rdf::type.first.to_s.split('#').last
 
-    # Process selector
-    results = Kernel.const_get(class_name).filter selector, doc
-
     if !selector.sc::debug.empty?
       puts '== DEBUG'
       puts '== Selector:'
       puts selector.serialize(:yarf, false)
-      puts '== Applied on fragment:'
+      puts '== On fragment:'
       puts "URI: #{doc[:uri]}"
       puts "Content: #{doc[:content]}"
       puts "Value: #{doc[:value]}"
+    end
+
+    # Process selector
+    results = Kernel.const_get(class_name).filter selector, doc
+
+    if !selector.sc::debug.empty?
+      puts "== No results" if results.empty?
       results.each_with_index do |result, i|
         puts "== Result ##{i}:"
         puts "URI: #{result[:uri]}"
data/lib/scrappy/agent/map_reduce.rb ADDED
@@ -0,0 +1,66 @@
+require 'thread'
+require 'monitor'
+
+module MapReduce
+
+  class Queue
+    def initialize
+      @items = []
+      @items.extend MonitorMixin
+    end
+
+    def pop
+      yielded = false
+      item = nil
+      @items.synchronize do
+        item = @items.shift
+        if @items.empty?
+          yield item if (block_given? and item)
+          yielded = true
+        end
+      end
+      yield item if (block_given? and not yielded)
+      item
+    end
+
+    def << value
+      @items << value
+    end
+
+    def push value
+      self << value
+    end
+
+    def empty?
+      @items.synchronize { @items.empty? }
+    end
+  end
+
+
+  def cluster
+    @cluster ||= (1..@cluster_count || 1).map { self.class.new(*(@cluster_options || [])) }
+  end
+
+  def process list
+    results = []
+    results.extend MonitorMixin
+
+    queue = Queue.new
+    list.each { |element| queue << element }
+
+    cluster.map { |obj| Thread.new { obj.work queue, results } }.each { |t| t.join }
+
+    reduce results
+  end
+
+
+  def work queue, results
+    begin
+      queue.pop do |item|
+        result = map item, queue
+        results.synchronize { results << result }
+      end
+    end until queue.empty?
+  end
+
+end
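Any class that mixes in `MapReduce`, defines `map(item, queue)` and `reduce(results)`, and sets `@cluster_count`/`@cluster_options` gets a thread pool for free: `process` pushes the work list onto a queue, spawns one thread per cluster member, and reduces the collected results. A self-contained sketch with a hypothetical `Squarer` class, not part of scrappy:

    class Squarer
      include MapReduce

      def initialize
        @cluster_count   = 4    # cluster builds 4 workers via self.class.new
        @cluster_options = []
      end

      def map item, queue=nil
        item * item             # a worker may also push follow-up items onto queue
      end

      def reduce results
        results.inject(0) { |sum, r| sum + r }
      end
    end

    Squarer.new.process((1..10).to_a)   # => 385, computed across 4 threads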
data/lib/scrappy/selectors/xpath.rb CHANGED
@@ -6,7 +6,8 @@ module XPathSelector
     else
       (0..-1)
     end
-    (doc[:content].search(pattern)[interval] || []).map do |result|
+    patterns = selector.sc::keyword
+    (doc[:content].search(pattern)[interval] || []).select { |node| patterns.any? ? patterns.include?(node.text.downcase.strip) : true }.map do |result|
       if selector.sc::attribute.first
         # Select node's attribute if given
         selector.sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>result, :value=>result[attribute] } }
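The selector now honors `sc:keyword`: when a selector lists keywords, only nodes whose downcased, stripped text equals one of them pass through; with no keywords, every matched node survives as before. An illustration of the filter semantics on plain strings; the keywords and texts are hypothetical:

    patterns = ['price']                     # values from selector.sc::keyword
    texts    = ['Price', '  price  ', 'Total']
    texts.select { |t| patterns.any? ? patterns.include?(t.downcase.strip) : true }
    # => ["Price", "  price  "]  (text is downcased and stripped before comparison)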
data/scrappy.gemspec CHANGED
@@ -2,17 +2,17 @@
 
 Gem::Specification.new do |s|
   s.name = %q{scrappy}
-  s.version = "0.1.4"
+  s.version = "0.1.5"
 
   s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
   s.authors = ["Jose Ignacio"]
-  s.date = %q{2010-11-24}
+  s.date = %q{2010-11-29}
   s.default_executable = %q{scrappy}
   s.description = %q{RDF web scraper}
   s.email = %q{joseignacio.fernandez@gmail.com}
   s.executables = ["scrappy"]
-  s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cluster.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb"]
-  s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cluster.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb", "test/test_helper.rb", "test/test_scrappy.rb", "scrappy.gemspec"]
+  s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb"]
+  s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb", "test/test_helper.rb", "test/test_scrappy.rb", "scrappy.gemspec"]
   s.homepage = %q{http://github.com/josei/scrappy}
   s.post_install_message = %q{**(Optional) Remember to install rbwebkitgtk for visual parsing features**}
   s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Scrappy", "--main", "README.rdoc"]
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
   segments:
   - 0
   - 1
-  - 4
-  version: 0.1.4
+  - 5
+  version: 0.1.5
 platform: ruby
 authors:
 - Jose Ignacio
@@ -14,7 +14,7 @@ autorequire:
 bindir: bin
 cert_chain: []
 
-date: 2010-11-24 00:00:00 +01:00
+date: 2010-11-29 00:00:00 +01:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -126,7 +126,8 @@ extra_rdoc_files:
 - lib/scrappy.rb
 - lib/scrappy/agent/agent.rb
 - lib/scrappy/agent/blind_agent.rb
-- lib/scrappy/agent/cluster.rb
+- lib/scrappy/agent/cache.rb
+- lib/scrappy/agent/map_reduce.rb
 - lib/scrappy/agent/extractor.rb
 - lib/scrappy/agent/visual_agent.rb
 - lib/scrappy/proxy.rb
@@ -152,7 +153,8 @@ files:
 - lib/scrappy.rb
 - lib/scrappy/agent/agent.rb
 - lib/scrappy/agent/blind_agent.rb
-- lib/scrappy/agent/cluster.rb
+- lib/scrappy/agent/cache.rb
+- lib/scrappy/agent/map_reduce.rb
 - lib/scrappy/agent/extractor.rb
 - lib/scrappy/agent/visual_agent.rb
 - lib/scrappy/proxy.rb
data/lib/scrappy/agent/cluster.rb DELETED
@@ -1,35 +0,0 @@
-module Cluster
-
-  def self.included(klass)
-    klass.extend ClassMethods
-    klass.extend MonitorMixin
-  end
-
-  def consume(list, results, args={})
-    begin
-      element = list.synchronize { list.pop }
-      unless element.nil?
-        result = process(element, args)
-        results.synchronize { results << result }
-      end
-    end until element.nil?
-  end
-
-  module ClassMethods
-    def cluster; @cluster; end
-    def cluster= value; @cluster=value; end
-
-    def create_cluster count, *args
-      self.cluster = (1..count).map { args.nil? ? create : create(*args) }
-    end
-
-    def process(list=[], args={})
-      results = []
-      list.extend MonitorMixin
-      results.extend MonitorMixin
-      cluster.map { |o| Thread.new { o.consume(list, results, args) } }.each { |t| t.join }
-      results
-    end
-  end
-
-end