scrappy 0.1.14 → 0.1.15

Sign up to get free protection for your applications and to get access to all the features.
data/History.txt CHANGED
@@ -1,3 +1,8 @@
1
+ === 0.1.15 2011-02-04
2
+
3
+ * Added literal output formats such as HTML or Wikitext
4
+ * Added support for infinite crawling
5
+
1
6
  === 0.1.14 2011-01-30
2
7
 
3
8
  * Added missing file to Manifest
data/Manifest CHANGED
@@ -10,6 +10,7 @@ lib/scrappy/agent/agent.rb
10
10
  lib/scrappy/agent/blind_agent.rb
11
11
  lib/scrappy/agent/cache.rb
12
12
  lib/scrappy/agent/dumper.rb
13
+ lib/scrappy/agent/formats.rb
13
14
  lib/scrappy/agent/map_reduce.rb
14
15
  lib/scrappy/agent/extractor.rb
15
16
  lib/scrappy/agent/visual_agent.rb
data/bin/scrappy CHANGED
@@ -31,7 +31,7 @@ module Scrappy
31
31
  def initialize
32
32
  Options.port = 3434
33
33
  Agent::Options.workers = 10
34
- Agent::Options.depth = 1
34
+ Agent::Options.depth = -1
35
35
  args = ARGV.map { |arg| arg.split(" ") }.flatten
36
36
 
37
37
  OptionParser.new do |opts|
@@ -100,7 +100,7 @@ Options
100
100
  -g, --get URL Gets requested URL
101
101
  -p, --post URL Posts requested URL
102
102
  -c, --concurrence VALUE Sets number of concurrent connections for crawling (default is 10)
103
- -l, --levels VALUE Sets recursion levels for resource crawling (default is 1)
103
+ -l, --levels VALUE Sets recursion levels for resource crawling (default is infinite crawling)
104
104
  -d, --delay VALUE Sets delay (in ms) between requests (default is 0)
105
105
  -D, --dump Dumps RDF data to disk
106
106
  -u, --debug Shows debugging traces
data/lib/scrappy.rb CHANGED
@@ -15,12 +15,13 @@ require 'scrappy/agent/extractor'
15
15
  require 'scrappy/agent/map_reduce'
16
16
  require 'scrappy/agent/cache'
17
17
  require 'scrappy/agent/dumper'
18
+ require 'scrappy/agent/formats'
18
19
  require 'scrappy/agent/agent'
19
20
 
20
21
  Namespace :sc, 'http://lab.gsi.dit.upm.es/scraping.rdf#'
21
22
 
22
23
  module Scrappy
23
- VERSION = '0.1.14'
24
+ VERSION = '0.1.15'
24
25
  end
25
26
 
26
27
  # Require selectors
@@ -86,26 +86,24 @@ module Scrappy
86
86
  end
87
87
 
88
88
  # Enqueue subresources
89
- if depth >= 0
90
- # Pages are enqueued without reducing depth
91
- pages = triples.select { |s,p,o| p==Node("rdf:type") and o==Node("sc:Page") }.map{|s,p,o| s}.select{|n| n.is_a?(RDF::Node) and n.id.is_a?(URI)}
89
+ # Pages are enqueued without reducing depth
90
+ pages = triples.select { |s,p,o| p==Node("rdf:type") and o==Node("sc:Page") }.map{|s,p,o| s}.select{|n| n.is_a?(RDF::Node) and n.id.is_a?(URI)}
92
91
 
93
- # All other URIS are enqueued with depth reduced
94
- uris = if depth > 0
95
- (triples.map { |s, p, o| [s,o] }.flatten - [Node(self.uri)] - pages).select{|n| n.is_a?(RDF::Node) and n.id.is_a?(URI)}
96
- else
97
- []
98
- end
99
-
100
- items = (pages.map { |uri| {:uri=>uri.to_s, :depth=>depth} } + uris.map { |uri| {:uri=>uri.to_s, :depth=>depth-1} }).uniq
101
-
102
- items.each { |item| puts "Enqueuing (depth = #{item[:depth]}): #{item[:uri]}" } if options.debug
103
-
104
- if queue.nil?
105
- triples += process items
106
- else
107
- items.each { |item| queue << item }
108
- end
92
+ # All other URIS are enqueued with depth reduced
93
+ uris = if depth != 0
94
+ (triples.map { |s, p, o| [s,o] }.flatten - [Node(self.uri)] - pages).select{|n| n.is_a?(RDF::Node) and n.id.is_a?(URI)}
95
+ else
96
+ []
97
+ end
98
+
99
+ items = (pages.map { |uri| {:uri=>uri.to_s, :depth=>[-1, depth].max} } + uris.map { |uri| {:uri=>uri.to_s, :depth=>[-1, depth-1].max} }).uniq
100
+
101
+ items.each { |item| puts "Enqueuing (depth = #{item[:depth]}): #{item[:uri]}" } if options.debug
102
+
103
+ if queue.nil?
104
+ triples += process items
105
+ else
106
+ items.each { |item| queue.push_unless_done item }
109
107
  end
110
108
 
111
109
  triples unless options.dump
@@ -136,7 +136,6 @@ module Scrappy
136
136
 
137
137
  selector.rdf::type = Node('sc:UnivocalSelector')
138
138
  selector.sc::path = '/'
139
- selector.sc::children = content.search('*').size.to_s
140
139
  selector.sc::uri = uri
141
140
 
142
141
  fragment.sc::selector = selector
@@ -0,0 +1,27 @@
1
+ module Scrappy
2
+ module Formats
3
+
4
+ def format node, formats
5
+ case formats.first
6
+ when Node('sc:WikiText') then
7
+ doc = Nokogiri::XML(node.to_html)
8
+ doc.search("h1").each {|n| n.replace(Nokogiri::XML::Text.new("= #{n.text.strip} =", n.document)) }
9
+ doc.search("h2").each {|n| n.replace(Nokogiri::XML::Text.new("== #{n.text.strip} ==", n.document)) }
10
+ doc.search("h3").each {|n| n.replace(Nokogiri::XML::Text.new("=== #{n.text.strip} ===", n.document)) }
11
+ doc.search("h4").each {|n| n.replace(Nokogiri::XML::Text.new("==== #{n.text.strip} ====", n.document)) }
12
+ doc.search("h5").each {|n| n.replace(Nokogiri::XML::Text.new("===== #{n.text.strip} =====", n.document)) }
13
+ doc.search("b").each {|n| n.replace(Nokogiri::XML::Text.new("'''#{n.text.strip}'''", n.document)) }
14
+ doc.search("li").each {|n| n.replace(Nokogiri::XML::Text.new("* #{n.text.strip}", n.document)) }
15
+ doc.search("ul").each {|n| n.replace(Nokogiri::XML::Text.new(n.text.strip, n.document)) }
16
+ doc.search("pre, code").each {|n| n.replace(Nokogiri::XML::Text.new("<pre>\n#{n.text.strip}\n</pre>", n.document)) }
17
+ doc.search("p").each {|n| n.replace(Nokogiri::XML::Text.new("#{n.text.strip}\n", n.document)) }
18
+ doc.text.strip
19
+ when Node('sc:Html') then
20
+ node.to_html
21
+ else
22
+ node.text
23
+ end
24
+ end
25
+
26
+ end
27
+ end
@@ -4,16 +4,20 @@ require 'monitor'
4
4
  module MapReduce
5
5
 
6
6
  class Queue
7
+ include MonitorMixin
8
+
7
9
  def initialize
8
- @items = []
9
- @items.extend MonitorMixin
10
+ super
11
+ @items = []
12
+ @history = []
10
13
  end
11
14
 
12
15
  def pop
13
16
  yielded = false
14
17
  item = nil
15
- @items.synchronize do
18
+ synchronize do
16
19
  item = @items.shift
20
+ @history << item
17
21
  if @items.empty?
18
22
  yield item if (block_given? and item)
19
23
  yielded = true
@@ -24,15 +28,19 @@ module MapReduce
24
28
  end
25
29
 
26
30
  def << value
27
- @items << value
31
+ push value
28
32
  end
29
33
 
30
34
  def push value
31
- self << value
35
+ synchronize { @items << value }
32
36
  end
33
37
 
38
+ def push_unless_done value
39
+ synchronize { @items << value unless @history.include?(value) }
40
+ end
41
+
34
42
  def empty?
35
- @items.synchronize { @items.empty? }
43
+ synchronize { @items.empty? }
36
44
  end
37
45
  end
38
46
 
@@ -1,10 +1,13 @@
1
1
  module SectionSelector
2
+ extend Scrappy::Formats
3
+
2
4
  def self.filter selector, doc
3
5
  selector.rdf::value.map do |pattern|
4
6
  doc[:content].search('h1, h2, h3, h4, h5, h6, h7, h8, h9, h10').select { |n| n.parent.name!='script' and n.text.downcase.strip == pattern }.map do |node|
5
7
  found = false
6
8
  content = node.parent.children[node.parent.children.index(node)+1..-1].select { |n| found ||= (n.name==node.name); !found }
7
- [ { :uri=>doc[:uri], :content=>content, :value=>content.map{|t| t.text.strip}.select{|t| t.to_s.strip!=''}*"\n" } ]
9
+
10
+ [ { :uri=>doc[:uri], :content=>content, :value=>content.map{|t| format(t, selector.sc::format)}.select{|t| t.to_s.strip!=''}*"\n\n" } ]
8
11
  end
9
12
  end.flatten
10
13
  end
@@ -1,4 +1,6 @@
1
1
  module XPathSelector
2
+ extend Scrappy::Formats
3
+
2
4
  def self.filter selector, doc
3
5
  selector.rdf::value.map do |pattern|
4
6
  interval = if selector.sc::index.first
@@ -13,7 +15,7 @@ module XPathSelector
13
15
  selector.sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>result, :value=>result[attribute] } }
14
16
  else
15
17
  # Select node
16
- [ { :uri=>doc[:uri], :content=>result, :value=>result.text } ]
18
+ [ { :uri=>doc[:uri], :content=>result, :value=>format(result, selector.sc::format) } ]
17
19
  end
18
20
  end
19
21
  end.flatten
data/scrappy.gemspec CHANGED
@@ -2,17 +2,17 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = %q{scrappy}
5
- s.version = "0.1.14"
5
+ s.version = "0.1.15"
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = ["Jose Ignacio"]
9
- s.date = %q{2011-01-30}
9
+ s.date = %q{2011-02-04}
10
10
  s.default_executable = %q{scrappy}
11
11
  s.description = %q{RDF web scraper}
12
12
  s.email = %q{joseignacio.fernandez@gmail.com}
13
13
  s.executables = ["scrappy"]
14
- s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb"]
15
- s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb", "test/test_helper.rb", "test/test_scrappy.rb", "scrappy.gemspec"]
14
+ s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/formats.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb"]
15
+ s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/formats.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb", "test/test_helper.rb", "test/test_scrappy.rb", "scrappy.gemspec"]
16
16
  s.homepage = %q{http://github.com/josei/scrappy}
17
17
  s.post_install_message = %q{**(Optional) Remember to install rbwebkitgtk for visual parsing features**}
18
18
  s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Scrappy", "--main", "README.rdoc"]
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 1
8
- - 14
9
- version: 0.1.14
8
+ - 15
9
+ version: 0.1.15
10
10
  platform: ruby
11
11
  authors:
12
12
  - Jose Ignacio
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2011-01-30 00:00:00 +01:00
17
+ date: 2011-02-04 00:00:00 +01:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
@@ -142,6 +142,7 @@ extra_rdoc_files:
142
142
  - lib/scrappy/agent/blind_agent.rb
143
143
  - lib/scrappy/agent/cache.rb
144
144
  - lib/scrappy/agent/dumper.rb
145
+ - lib/scrappy/agent/formats.rb
145
146
  - lib/scrappy/agent/map_reduce.rb
146
147
  - lib/scrappy/agent/extractor.rb
147
148
  - lib/scrappy/agent/visual_agent.rb
@@ -172,6 +173,7 @@ files:
172
173
  - lib/scrappy/agent/blind_agent.rb
173
174
  - lib/scrappy/agent/cache.rb
174
175
  - lib/scrappy/agent/dumper.rb
176
+ - lib/scrappy/agent/formats.rb
175
177
  - lib/scrappy/agent/map_reduce.rb
176
178
  - lib/scrappy/agent/extractor.rb
177
179
  - lib/scrappy/agent/visual_agent.rb