scrappy 0.1.14 → 0.1.15

This diff shows the changes between two publicly released versions of the package, as they appear in the public registry, and is provided for informational purposes only.
data/History.txt CHANGED
@@ -1,3 +1,8 @@
+=== 0.1.15 2011-02-04
+
+* Added literal output formats such as HTML or Wikitext
+* Added support for infinite crawling
+
 === 0.1.14 2011-01-30
 
 * Added missing file to Manifest
data/Manifest CHANGED
@@ -10,6 +10,7 @@ lib/scrappy/agent/agent.rb
 lib/scrappy/agent/blind_agent.rb
 lib/scrappy/agent/cache.rb
 lib/scrappy/agent/dumper.rb
+lib/scrappy/agent/formats.rb
 lib/scrappy/agent/map_reduce.rb
 lib/scrappy/agent/extractor.rb
 lib/scrappy/agent/visual_agent.rb
data/bin/scrappy CHANGED
@@ -31,7 +31,7 @@ module Scrappy
   def initialize
     Options.port = 3434
     Agent::Options.workers = 10
-    Agent::Options.depth = 1
+    Agent::Options.depth = -1
     args = ARGV.map { |arg| arg.split(" ") }.flatten
 
     OptionParser.new do |opts|
@@ -100,7 +100,7 @@ Options
   -g, --get URL              Gets requested URL
   -p, --post URL             Posts requested URL
   -c, --concurrence VALUE    Sets number of concurrent connections for crawling (default is 10)
-  -l, --levels VALUE         Sets recursion levels for resource crawling (default is 1)
+  -l, --levels VALUE         Sets recursion levels for resource crawling (default is infinite crawling)
   -d, --delay VALUE          Sets delay (in ms) between requests (default is 0)
   -D, --dump                 Dumps RDF data to disk
   -u, --debug                Shows debugging traces
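
The change above moves the default crawl depth from 1 to -1, which the agent treats as unbounded recursion (see the enqueueing hunk further down); an explicit -l/--levels value still bounds the crawl. A minimal sketch of the two settings, for illustration only and assuming the scrappy library is loaded (the URL is a placeholder):

# Illustration only -- not part of the diff.
# New default: keep crawling until no new resources are discovered,
# i.e. what a plain `scrappy -g http://example.org` now does.
Scrappy::Agent::Options.depth = -1
# The old bounded behaviour can be restored explicitly,
# e.g. `scrappy -g http://example.org -l 1`:
Scrappy::Agent::Options.depth = 1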
data/lib/scrappy.rb CHANGED
@@ -15,12 +15,13 @@ require 'scrappy/agent/extractor'
 require 'scrappy/agent/map_reduce'
 require 'scrappy/agent/cache'
 require 'scrappy/agent/dumper'
+require 'scrappy/agent/formats'
 require 'scrappy/agent/agent'
 
 Namespace :sc, 'http://lab.gsi.dit.upm.es/scraping.rdf#'
 
 module Scrappy
-  VERSION = '0.1.14'
+  VERSION = '0.1.15'
 end
 
 # Require selectors
@@ -86,26 +86,24 @@ module Scrappy
       end
 
       # Enqueue subresources
-      if depth >= 0
-        # Pages are enqueued without reducing depth
-        pages = triples.select { |s,p,o| p==Node("rdf:type") and o==Node("sc:Page") }.map{|s,p,o| s}.select{|n| n.is_a?(RDF::Node) and n.id.is_a?(URI)}
+      # Pages are enqueued without reducing depth
+      pages = triples.select { |s,p,o| p==Node("rdf:type") and o==Node("sc:Page") }.map{|s,p,o| s}.select{|n| n.is_a?(RDF::Node) and n.id.is_a?(URI)}
 
-        # All other URIS are enqueued with depth reduced
-        uris = if depth > 0
-          (triples.map { |s, p, o| [s,o] }.flatten - [Node(self.uri)] - pages).select{|n| n.is_a?(RDF::Node) and n.id.is_a?(URI)}
-        else
-          []
-        end
-
-        items = (pages.map { |uri| {:uri=>uri.to_s, :depth=>depth} } + uris.map { |uri| {:uri=>uri.to_s, :depth=>depth-1} }).uniq
-
-        items.each { |item| puts "Enqueuing (depth = #{item[:depth]}): #{item[:uri]}" } if options.debug
-
-        if queue.nil?
-          triples += process items
-        else
-          items.each { |item| queue << item }
-        end
+      # All other URIS are enqueued with depth reduced
+      uris = if depth != 0
+        (triples.map { |s, p, o| [s,o] }.flatten - [Node(self.uri)] - pages).select{|n| n.is_a?(RDF::Node) and n.id.is_a?(URI)}
+      else
+        []
+      end
+
+      items = (pages.map { |uri| {:uri=>uri.to_s, :depth=>[-1, depth].max} } + uris.map { |uri| {:uri=>uri.to_s, :depth=>[-1, depth-1].max} }).uniq
+
+      items.each { |item| puts "Enqueuing (depth = #{item[:depth]}): #{item[:uri]}" } if options.debug
+
+      if queue.nil?
+        triples += process items
+      else
+        items.each { |item| queue.push_unless_done item }
       end
 
       triples unless options.dump
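
The rewritten enqueueing above is what implements infinite crawling: sc:Page resources keep the current depth, every other URI gets depth - 1, and both values are clamped at -1, so a depth of -1 never decays to 0 and the crawl only stops when the queue rejects already-seen items. A standalone sketch of that bookkeeping, using a hypothetical helper that mirrors the expressions in the hunk:

# Illustration only -- next_depths is a made-up helper mirroring the
# [-1, depth].max and [-1, depth - 1].max expressions above.
def next_depths(depth)
  page_depth  = [-1, depth].max       # sc:Page resources keep the current depth
  other_depth = [-1, depth - 1].max   # all other URIs lose one level per hop
  [page_depth, other_depth]
end

next_depths(2)    # => [2, 1]    a bounded crawl decays towards 0
next_depths(0)    # => [0, -1]   at depth 0 only sc:Page resources are enqueued (uris is [])
next_depths(-1)   # => [-1, -1]  -1 never decays, so crawling is unbounded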
@@ -136,7 +136,6 @@ module Scrappy
 
       selector.rdf::type = Node('sc:UnivocalSelector')
       selector.sc::path = '/'
-      selector.sc::children = content.search('*').size.to_s
       selector.sc::uri = uri
 
       fragment.sc::selector = selector
@@ -0,0 +1,27 @@
+module Scrappy
+  module Formats
+
+    def format node, formats
+      case formats.first
+      when Node('sc:WikiText') then
+        doc = Nokogiri::XML(node.to_html)
+        doc.search("h1").each {|n| n.replace(Nokogiri::XML::Text.new("= #{n.text.strip} =", n.document)) }
+        doc.search("h2").each {|n| n.replace(Nokogiri::XML::Text.new("== #{n.text.strip} ==", n.document)) }
+        doc.search("h3").each {|n| n.replace(Nokogiri::XML::Text.new("=== #{n.text.strip} ===", n.document)) }
+        doc.search("h4").each {|n| n.replace(Nokogiri::XML::Text.new("==== #{n.text.strip} ====", n.document)) }
+        doc.search("h5").each {|n| n.replace(Nokogiri::XML::Text.new("===== #{n.text.strip} =====", n.document)) }
+        doc.search("b").each {|n| n.replace(Nokogiri::XML::Text.new("'''#{n.text.strip}'''", n.document)) }
+        doc.search("li").each {|n| n.replace(Nokogiri::XML::Text.new("* #{n.text.strip}", n.document)) }
+        doc.search("ul").each {|n| n.replace(Nokogiri::XML::Text.new(n.text.strip, n.document)) }
+        doc.search("pre, code").each {|n| n.replace(Nokogiri::XML::Text.new("<pre>\n#{n.text.strip}\n</pre>", n.document)) }
+        doc.search("p").each {|n| n.replace(Nokogiri::XML::Text.new("#{n.text.strip}\n", n.document)) }
+        doc.text.strip
+      when Node('sc:Html') then
+        node.to_html
+      else
+        node.text
+      end
+    end
+
+  end
+end
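
The new Formats module renders an extracted Nokogiri node as MediaWiki-style text, raw HTML, or plain text, keyed on the first sc: format term it receives. A usage sketch, for illustration only, assuming scrappy is loaded so that Node() and the sc: namespace are available; the HTML snippet is made up:

# Illustration only -- not part of the diff.
formatter = Object.new.extend(Scrappy::Formats)
node = Nokogiri::HTML("<h2>Summary</h2><p>Hello <b>world</b></p>").at("body")

formatter.format(node, [Node('sc:WikiText')])  # "== Summary ==" heading, '''world''' bold, etc.
formatter.format(node, [Node('sc:Html')])      # returns node.to_html unchanged
formatter.format(node, [])                     # no sc:format declared: falls back to node.text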
@@ -4,16 +4,20 @@ require 'monitor'
 module MapReduce
 
   class Queue
+    include MonitorMixin
+
     def initialize
-      @items = []
-      @items.extend MonitorMixin
+      super
+      @items = []
+      @history = []
     end
 
     def pop
       yielded = false
       item = nil
-      @items.synchronize do
+      synchronize do
         item = @items.shift
+        @history << item
         if @items.empty?
           yield item if (block_given? and item)
           yielded = true
@@ -24,15 +28,19 @@ module MapReduce
     end
 
     def << value
-      @items << value
+      push value
     end
 
     def push value
-      self << value
+      synchronize { @items << value }
    end
 
+    def push_unless_done value
+      synchronize { @items << value unless @history.include?(value) }
+    end
+
     def empty?
-      @items.synchronize { @items.empty? }
+      synchronize { @items.empty? }
     end
   end
 
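
With infinite crawling, something has to stop the agent from revisiting pages: the Queue now mixes in MonitorMixin itself, synchronizes push, and remembers every popped item in @history so that push_unless_done can drop work that has already been done. A behaviour sketch, for illustration only (the hash literal stands in for the items the agent enqueues):

# Illustration only -- not part of the diff.
q = MapReduce::Queue.new
item = { :uri => "http://example.org/", :depth => -1 }

q.push_unless_done item   # enqueued: nothing equal has been popped yet
q.pop                     # dequeues the item and records it in @history
q.push_unless_done item   # skipped: an equal item is already in @history
q.empty?                  # => true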
@@ -1,10 +1,13 @@
 module SectionSelector
+  extend Scrappy::Formats
+
   def self.filter selector, doc
     selector.rdf::value.map do |pattern|
       doc[:content].search('h1, h2, h3, h4, h5, h6, h7, h8, h9, h10').select { |n| n.parent.name!='script' and n.text.downcase.strip == pattern }.map do |node|
         found = false
         content = node.parent.children[node.parent.children.index(node)+1..-1].select { |n| found ||= (n.name==node.name); !found }
-        [ { :uri=>doc[:uri], :content=>content, :value=>content.map{|t| t.text.strip}.select{|t| t.to_s.strip!=''}*"\n" } ]
+
+        [ { :uri=>doc[:uri], :content=>content, :value=>content.map{|t| format(t, selector.sc::format)}.select{|t| t.to_s.strip!=''}*"\n\n" } ]
       end
     end.flatten
   end
@@ -1,4 +1,6 @@
 module XPathSelector
+  extend Scrappy::Formats
+
   def self.filter selector, doc
     selector.rdf::value.map do |pattern|
       interval = if selector.sc::index.first
@@ -13,7 +15,7 @@ module XPathSelector
          selector.sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>result, :value=>result[attribute] } }
        else
          # Select node
-          [ { :uri=>doc[:uri], :content=>result, :value=>result.text } ]
+          [ { :uri=>doc[:uri], :content=>result, :value=>format(result, selector.sc::format) } ]
        end
      end
    end.flatten
data/scrappy.gemspec CHANGED
@@ -2,17 +2,17 @@
 
 Gem::Specification.new do |s|
   s.name = %q{scrappy}
-  s.version = "0.1.14"
+  s.version = "0.1.15"
 
   s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
   s.authors = ["Jose Ignacio"]
-  s.date = %q{2011-01-30}
+  s.date = %q{2011-02-04}
   s.default_executable = %q{scrappy}
   s.description = %q{RDF web scraper}
   s.email = %q{joseignacio.fernandez@gmail.com}
   s.executables = ["scrappy"]
-  s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb"]
-  s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb", "test/test_helper.rb", "test/test_scrappy.rb", "scrappy.gemspec"]
+  s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/formats.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb"]
+  s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/formats.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb", "test/test_helper.rb", "test/test_scrappy.rb", "scrappy.gemspec"]
   s.homepage = %q{http://github.com/josei/scrappy}
   s.post_install_message = %q{**(Optional) Remember to install rbwebkitgtk for visual parsing features**}
   s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Scrappy", "--main", "README.rdoc"]
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
   segments:
   - 0
   - 1
-  - 14
-  version: 0.1.14
+  - 15
+  version: 0.1.15
 platform: ruby
 authors:
 - Jose Ignacio
@@ -14,7 +14,7 @@ autorequire:
 bindir: bin
 cert_chain: []
 
-date: 2011-01-30 00:00:00 +01:00
+date: 2011-02-04 00:00:00 +01:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -142,6 +142,7 @@ extra_rdoc_files:
 - lib/scrappy/agent/blind_agent.rb
 - lib/scrappy/agent/cache.rb
 - lib/scrappy/agent/dumper.rb
+- lib/scrappy/agent/formats.rb
 - lib/scrappy/agent/map_reduce.rb
 - lib/scrappy/agent/extractor.rb
 - lib/scrappy/agent/visual_agent.rb
@@ -172,6 +173,7 @@ files:
 - lib/scrappy/agent/blind_agent.rb
 - lib/scrappy/agent/cache.rb
 - lib/scrappy/agent/dumper.rb
+- lib/scrappy/agent/formats.rb
 - lib/scrappy/agent/map_reduce.rb
 - lib/scrappy/agent/extractor.rb
 - lib/scrappy/agent/visual_agent.rb