scrappy 0.1.4 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/History.txt CHANGED
@@ -1,3 +1,7 @@
1
+ === 0.1.5 2010-11-29
2
+
3
+ * Better map-reduce-based design
4
+
1
5
  === 0.1.4 2010-11-24
2
6
 
3
7
  * Support for node sets in extractions
data/Manifest CHANGED
@@ -8,7 +8,8 @@ lib/js/annotator.js
8
8
  lib/scrappy.rb
9
9
  lib/scrappy/agent/agent.rb
10
10
  lib/scrappy/agent/blind_agent.rb
11
- lib/scrappy/agent/cluster.rb
11
+ lib/scrappy/agent/cache.rb
12
+ lib/scrappy/agent/map_reduce.rb
12
13
  lib/scrappy/agent/extractor.rb
13
14
  lib/scrappy/agent/visual_agent.rb
14
15
  lib/scrappy/proxy.rb
data/bin/scrappy CHANGED
@@ -28,7 +28,7 @@ module Scrappy
28
28
 
29
29
  def initialize
30
30
  Options.port = 3434
31
- Options.concurrence = 10
31
+ Agent::Options.workers = 10
32
32
  Agent::Options.depth = 1
33
33
  args = ARGV.map { |arg| arg.split(" ") }.flatten
34
34
 
@@ -41,10 +41,10 @@ module Scrappy
41
41
  opts.on('-s', '--server') { Options.server = true }
42
42
  opts.on('-S', '--proxy-server') { Options.proxy = true }
43
43
  opts.on('-P P', '--port P') { |p| Options.port = p }
44
- opts.on('-c C', '--concurrence C') { |c| Options.concurrence = c.to_i }
45
- opts.on('-d D', '--delay D') { |d| Agent::Options.delay = d; Options.concurrence = 1 }
44
+ opts.on('-c C', '--concurrence C') { |c| Agent::Options.workers = c.to_i }
45
+ opts.on('-d D', '--delay D') { |d| Agent::Options.delay = d; Agent::Options.workers = 1 }
46
46
  opts.on('-l L', '--levels L') { |l| Agent::Options.depth = l.to_i }
47
- opts.on('-v', '--visual') { Agent::Options.agent = :visual; Options.concurrence = 1 }
47
+ opts.on('-v', '--visual') { Agent::Options.agent = :visual; Agent::Options.workers = 1 }
48
48
  opts.on('-r', '--reference') { Agent::Options.referenceable = :minimum }
49
49
  opts.on('-R', '--reference-all') { Agent::Options.referenceable = :dump }
50
50
  opts.on('-w', '--window') { Agent::Options.window = true }
@@ -139,10 +139,6 @@ Copyright
139
139
  open(cache_file, "w") { |f| Marshal.dump(data, f) }
140
140
  data
141
141
  end
142
-
143
- # Create cluster of agents
144
- Agent.create_cluster Options.concurrence, :referenceable=>Agent::Options.referenceable,
145
- :agent=>Agent::Options.agent, :window=>false
146
142
  end
147
143
  end
148
144
 
data/lib/scrappy.rb CHANGED
@@ -3,7 +3,6 @@ $:.unshift(File.dirname(__FILE__)) unless
3
3
 
4
4
  require 'nokogiri'
5
5
  require 'thread'
6
- require 'monitor'
7
6
  require 'mechanize'
8
7
  require 'ostruct'
9
8
  require 'active_support'
@@ -13,13 +12,14 @@ require 'lightrdf'
13
12
  require 'scrappy/support'
14
13
 
15
14
  require 'scrappy/agent/extractor'
16
- require 'scrappy/agent/cluster'
15
+ require 'scrappy/agent/map_reduce'
16
+ require 'scrappy/agent/cache'
17
17
  require 'scrappy/agent/agent'
18
18
 
19
19
  Namespace :sc, 'http://lab.gsi.dit.upm.es/scraping.rdf#'
20
20
 
21
21
  module Scrappy
22
- VERSION = '0.1.4'
22
+ VERSION = '0.1.5'
23
23
  end
24
24
 
25
25
  # Require selectors
@@ -1,10 +1,11 @@
1
1
  module Scrappy
2
2
  class Agent
3
- include Extractor
4
3
  include MonitorMixin
5
- include Cluster
4
+ include Extractor
5
+ include MapReduce
6
+ include Cached
6
7
 
7
- Options = OpenStruct.new :format=>:yarf, :depth=>0, :agent=>:blind, :delay=>0
8
+ Options = OpenStruct.new :format=>:yarf, :depth=>0, :agent=>:blind, :delay=>0, :workers=>10
8
9
  ContentTypes = { :png => 'image/png', :rdfxml => 'application/rdf+xml',
9
10
  :rdf => 'application/rdf+xml' }
10
11
 
@@ -14,9 +15,6 @@ module Scrappy
14
15
  def self.[] id
15
16
  pool[id] || Agent.create(:id=>id)
16
17
  end
17
- def self.cache
18
- @cache ||= {}
19
- end
20
18
 
21
19
  def self.create args={}
22
20
  if (args[:agent] || Options.agent) == :visual
@@ -32,72 +30,82 @@ module Scrappy
32
30
 
33
31
  def initialize args={}
34
32
  super()
33
+ @cluster_count = args[:workers] || Options.workers
34
+ @cluster_options = [ { :referenceable=>Options.referenceable, :agent=>Options.agent,
35
+ :workers=>1, :window=>false } ]
36
+ @cluster = args[:parent]
35
37
  @id = args[:id] || Agent.pool.keys.size
36
38
  Agent.pool[@id] = self
37
39
  @kb = args[:kb] || Options.kb
38
40
  @options = Options.clone
39
41
  end
40
42
 
41
- def request args={}
42
- synchronize do
43
- depth = args[:depth]
44
- request = { :method=>:get, :inputs=>{} }.merge :method=>args[:method], :uri=>complete_uri(args[:uri]), :inputs=>args[:inputs]||{}
43
+ def map args, queue=nil
44
+ depth = args[:depth]
45
+ request = { :method=>args[:method]||:get, :uri=>complete_uri(args[:uri]), :inputs=>args[:inputs]||{} }
45
46
 
46
- # Expire cache
47
- Agent::cache.keys.each { |req| Agent::cache.delete(req) if Time.now.to_i - Agent::cache[req][:time].to_i > 300 }
47
+ # Expire cache
48
+ cache.expire! 300 # 5 minutes
48
49
 
49
- # Lookup in cache
50
- triples = if Agent::cache[request]
51
- Agent::cache[request][:response]
50
+ # Lookup in cache
51
+ triples = if cache[request]
52
+ cache[request][:response]
53
+ else
54
+ # Perform the request
55
+
56
+ sleep 0.001 * options.delay.to_f # Sleep if requested
57
+
58
+ if request[:method] == :get
59
+ self.uri = request[:uri]
60
+ else
61
+ raise Exception, 'POST requests not supported yet'
62
+ end
63
+
64
+ response = if self.html_data?
65
+ add_visual_data! if options.referenceable # Adds tags including visual information
66
+ extract self.uri, html, options.referenceable # Extract data
52
67
  else
53
- # Perform the request
54
- if request[:method] == :get
55
- self.uri = request[:uri]
56
- else
57
- raise Exception, 'POST requests not supported yet'
58
- end
59
-
60
- response = if self.html_data?
61
- add_visual_data! if options.referenceable # Adds tags including visual information
62
- extract self.uri, html, options.referenceable # Extract data
63
- else
64
- []
65
- end
68
+ []
69
+ end
66
70
 
67
- # Cache the request
68
- Agent::cache[request] = { :time=>Time.now, :response=>response }
69
- Agent::cache[request.merge(:uri=>self.uri)] = { :time=>Time.now, :response=>response } unless self.uri.nil?
71
+ # Cache the request
72
+ cache[request] = { :time=>Time.now, :response=>response }
73
+ cache[request.merge(:uri=>self.uri)] = { :time=>Time.now, :response=>response } unless self.uri.nil?
70
74
 
71
- response
72
- end
75
+ response
76
+ end
73
77
 
74
- # Iterate through subresources
75
- if depth > 0
76
- uris = (triples.map{|t| [t[0],t[2]]}.flatten-[Node(self.uri)]).uniq.select{|n| n.is_a?(RDF::Node) and n.id.is_a?(URI)}.map(&:to_s)
77
- Agent.process(uris, :depth=>depth-1).each { |result| triples += result }
78
+ # Enqueue subresources
79
+ if depth > 0
80
+ items = (triples.map{|t| [t[0],t[2]]}.flatten-[Node(self.uri)]).uniq.select{|n| n.is_a?(RDF::Node) and n.id.is_a?(URI)}.map { |uri| {:uri=>uri.to_s, :depth=>depth-1} }
81
+ if queue.nil?
82
+ triples += process items
83
+ else
84
+ items.each { |item| queue << item }
78
85
  end
79
-
80
- RDF::Graph.new(triples.uniq)
81
86
  end
87
+
88
+ triples
89
+ end
90
+
91
+ def reduce results
92
+ triples = []; results.each { |result| triples += result }
93
+ triples
94
+ end
95
+
96
+ def request args={}
97
+ RDF::Graph.new map(args).uniq
82
98
  end
83
99
 
84
100
  def proxy args={}
85
- synchronize do
86
- request = { :method=>:get, :inputs=>{}, :format=>options.format, :depth=>options.depth }.merge(args)
101
+ request = { :method=>:get, :inputs=>{}, :format=>options.format, :depth=>options.depth }.merge(args)
87
102
 
88
- OpenStruct.new :output => self.request(request).serialize(request[:format]),
89
- :content_type => ContentTypes[request[:format]] || 'text/plain',
90
- :uri => self.uri,
91
- :status => self.html_data? ? (self.uri == request[:uri] ? :ok : :redirect) : :error
92
- end
103
+ OpenStruct.new :output => self.request(request).serialize(request[:format]),
104
+ :content_type => ContentTypes[request[:format]] || 'text/plain',
105
+ :uri => self.uri,
106
+ :status => self.html_data? ? (self.uri == request[:uri] ? :ok : :redirect) : :error
93
107
  end
94
108
 
95
- # Method used when consuming a list of uris
96
- def process uri, args={}
97
- sleep 0.001 * options.delay.to_f
98
- request(:method=>:get, :uri=>uri, :depth=>args[:depth]).triples
99
- end
100
-
101
109
  def complete_uri uri
102
110
  uri = "#{uri}.com" if uri =~ /\A\w+\Z/
103
111
  uri = "http://#{uri}" if uri.index(/\A\w*:/) != 0
@@ -0,0 +1,37 @@
1
+ require 'monitor'
2
+
3
+ module Scrappy
4
+ module Cached
5
+ def self.included base
6
+ base.extend Cached::ClassMethods
7
+ end
8
+
9
+ module ClassMethods
10
+ def cache
11
+ @cache ||= Cache.new
12
+ end
13
+ end
14
+
15
+ def cache
16
+ self.class.cache
17
+ end
18
+ end
19
+
20
+ class Cache < Hash
21
+ include MonitorMixin
22
+
23
+ def expire! timeout
24
+ synchronize do
25
+ keys.each { |req| delete(req) if Time.now.to_i - self[req][:time].to_i > timeout }
26
+ end
27
+ end
28
+
29
+ def []= key, value
30
+ synchronize { super }
31
+ end
32
+
33
+ def [] key
34
+ synchronize { super }
35
+ end
36
+ end
37
+ end
@@ -79,17 +79,21 @@ module Scrappy
79
79
  # From "BaseUriSelector" to "base_uri"
80
80
  class_name = selector.rdf::type.first.to_s.split('#').last
81
81
 
82
- # Process selector
83
- results = Kernel.const_get(class_name).filter selector, doc
84
-
85
82
  if !selector.sc::debug.empty?
86
83
  puts '== DEBUG'
87
84
  puts '== Selector:'
88
85
  puts selector.serialize(:yarf, false)
89
- puts '== Applied on fragment:'
86
+ puts '== On fragment:'
90
87
  puts "URI: #{doc[:uri]}"
91
88
  puts "Content: #{doc[:content]}"
92
89
  puts "Value: #{doc[:value]}"
90
+ end
91
+
92
+ # Process selector
93
+ results = Kernel.const_get(class_name).filter selector, doc
94
+
95
+ if !selector.sc::debug.empty?
96
+ puts "== No results" if results.empty?
93
97
  results.each_with_index do |result, i|
94
98
  puts "== Result ##{i}:"
95
99
  puts "URI: #{result[:uri]}"
@@ -0,0 +1,66 @@
1
+ require 'thread'
2
+ require 'monitor'
3
+
4
+ module MapReduce
5
+
6
+ class Queue
7
+ def initialize
8
+ @items = []
9
+ @items.extend MonitorMixin
10
+ end
11
+
12
+ def pop
13
+ yielded = false
14
+ item = nil
15
+ @items.synchronize do
16
+ item = @items.shift
17
+ if @items.empty?
18
+ yield item if (block_given? and item)
19
+ yielded = true
20
+ end
21
+ end
22
+ yield item if (block_given? and not yielded)
23
+ item
24
+ end
25
+
26
+ def << value
27
+ @items << value
28
+ end
29
+
30
+ def push value
31
+ self << value
32
+ end
33
+
34
+ def empty?
35
+ @items.synchronize { @items.empty? }
36
+ end
37
+ end
38
+
39
+
40
+ def cluster
41
+ @cluster ||= (1..@cluster_count || 1).map { self.class.new(*(@cluster_options || [])) }
42
+ end
43
+
44
+ def process list
45
+ results = []
46
+ results.extend MonitorMixin
47
+
48
+ queue = Queue.new
49
+ list.each { |element| queue << element }
50
+
51
+ cluster.map { |obj| Thread.new { obj.work queue, results } }.each { |t| t.join }
52
+
53
+ reduce results
54
+ end
55
+
56
+
57
+ def work queue, results
58
+ begin
59
+ queue.pop do |item|
60
+ result = map item, queue
61
+ results.synchronize { results << result }
62
+ end
63
+ end until queue.empty?
64
+ end
65
+
66
+ end
@@ -6,7 +6,8 @@ module XPathSelector
6
6
  else
7
7
  (0..-1)
8
8
  end
9
- (doc[:content].search(pattern)[interval] || []).map do |result|
9
+ patterns = selector.sc::keyword
10
+ (doc[:content].search(pattern)[interval] || []).select { |node| patterns.any? ? patterns.include?(node.text.downcase.strip) : true }.map do |result|
10
11
  if selector.sc::attribute.first
11
12
  # Select node's attribute if given
12
13
  selector.sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>result, :value=>result[attribute] } }
data/scrappy.gemspec CHANGED
@@ -2,17 +2,17 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = %q{scrappy}
5
- s.version = "0.1.4"
5
+ s.version = "0.1.5"
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = ["Jose Ignacio"]
9
- s.date = %q{2010-11-24}
9
+ s.date = %q{2010-11-29}
10
10
  s.default_executable = %q{scrappy}
11
11
  s.description = %q{RDF web scraper}
12
12
  s.email = %q{joseignacio.fernandez@gmail.com}
13
13
  s.executables = ["scrappy"]
14
- s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cluster.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb"]
15
- s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cluster.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb", "test/test_helper.rb", "test/test_scrappy.rb", "scrappy.gemspec"]
14
+ s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb"]
15
+ s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb", "test/test_helper.rb", "test/test_scrappy.rb", "scrappy.gemspec"]
16
16
  s.homepage = %q{http://github.com/josei/scrappy}
17
17
  s.post_install_message = %q{**(Optional) Remember to install rbwebkitgtk for visual parsing features**}
18
18
  s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Scrappy", "--main", "README.rdoc"]
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 1
8
- - 4
9
- version: 0.1.4
8
+ - 5
9
+ version: 0.1.5
10
10
  platform: ruby
11
11
  authors:
12
12
  - Jose Ignacio
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2010-11-24 00:00:00 +01:00
17
+ date: 2010-11-29 00:00:00 +01:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
@@ -126,7 +126,8 @@ extra_rdoc_files:
126
126
  - lib/scrappy.rb
127
127
  - lib/scrappy/agent/agent.rb
128
128
  - lib/scrappy/agent/blind_agent.rb
129
- - lib/scrappy/agent/cluster.rb
129
+ - lib/scrappy/agent/cache.rb
130
+ - lib/scrappy/agent/map_reduce.rb
130
131
  - lib/scrappy/agent/extractor.rb
131
132
  - lib/scrappy/agent/visual_agent.rb
132
133
  - lib/scrappy/proxy.rb
@@ -152,7 +153,8 @@ files:
152
153
  - lib/scrappy.rb
153
154
  - lib/scrappy/agent/agent.rb
154
155
  - lib/scrappy/agent/blind_agent.rb
155
- - lib/scrappy/agent/cluster.rb
156
+ - lib/scrappy/agent/cache.rb
157
+ - lib/scrappy/agent/map_reduce.rb
156
158
  - lib/scrappy/agent/extractor.rb
157
159
  - lib/scrappy/agent/visual_agent.rb
158
160
  - lib/scrappy/proxy.rb
@@ -1,35 +0,0 @@
1
- module Cluster
2
-
3
- def self.included(klass)
4
- klass.extend ClassMethods
5
- klass.extend MonitorMixin
6
- end
7
-
8
- def consume(list, results, args={})
9
- begin
10
- element = list.synchronize { list.pop }
11
- unless element.nil?
12
- result = process(element, args)
13
- results.synchronize { results << result }
14
- end
15
- end until element.nil?
16
- end
17
-
18
- module ClassMethods
19
- def cluster; @cluster; end
20
- def cluster= value; @cluster=value; end
21
-
22
- def create_cluster count, *args
23
- self.cluster = (1..count).map { args.nil? ? create : create(*args) }
24
- end
25
-
26
- def process(list=[], args={})
27
- results = []
28
- list.extend MonitorMixin
29
- results.extend MonitorMixin
30
- cluster.map { |o| Thread.new { o.consume(list, results, args) } }.each { |t| t.join }
31
- results
32
- end
33
- end
34
-
35
- end