scrappy 0.1.14 → 0.1.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +5 -0
- data/Manifest +1 -0
- data/bin/scrappy +2 -2
- data/lib/scrappy.rb +2 -1
- data/lib/scrappy/agent/agent.rb +17 -19
- data/lib/scrappy/agent/extractor.rb +0 -1
- data/lib/scrappy/agent/formats.rb +27 -0
- data/lib/scrappy/agent/map_reduce.rb +14 -6
- data/lib/scrappy/selectors/section.rb +4 -1
- data/lib/scrappy/selectors/xpath.rb +3 -1
- data/scrappy.gemspec +4 -4
- metadata +5 -3
data/History.txt
CHANGED
data/Manifest
CHANGED
data/bin/scrappy
CHANGED
@@ -31,7 +31,7 @@ module Scrappy
|
|
31
31
|
def initialize
|
32
32
|
Options.port = 3434
|
33
33
|
Agent::Options.workers = 10
|
34
|
-
Agent::Options.depth = 1
|
34
|
+
Agent::Options.depth = -1
|
35
35
|
args = ARGV.map { |arg| arg.split(" ") }.flatten
|
36
36
|
|
37
37
|
OptionParser.new do |opts|
|
@@ -100,7 +100,7 @@ Options
|
|
100
100
|
-g, --get URL Gets requested URL
|
101
101
|
-p, --post URL Posts requested URL
|
102
102
|
-c, --concurrence VALUE Sets number of concurrent connections for crawling (default is 10)
|
103
|
-
-l, --levels VALUE Sets recursion levels for resource crawling (default is 1)
|
103
|
+
-l, --levels VALUE Sets recursion levels for resource crawling (default is infinite crawling)
|
104
104
|
-d, --delay VALUE Sets delay (in ms) between requests (default is 0)
|
105
105
|
-D, --dump Dumps RDF data to disk
|
106
106
|
-u, --debug Shows debugging traces
|
data/lib/scrappy.rb
CHANGED
@@ -15,12 +15,13 @@ require 'scrappy/agent/extractor'
|
|
15
15
|
require 'scrappy/agent/map_reduce'
|
16
16
|
require 'scrappy/agent/cache'
|
17
17
|
require 'scrappy/agent/dumper'
|
18
|
+
require 'scrappy/agent/formats'
|
18
19
|
require 'scrappy/agent/agent'
|
19
20
|
|
20
21
|
Namespace :sc, 'http://lab.gsi.dit.upm.es/scraping.rdf#'
|
21
22
|
|
22
23
|
module Scrappy
|
23
|
-
VERSION = '0.1.14'
|
24
|
+
VERSION = '0.1.15'
|
24
25
|
end
|
25
26
|
|
26
27
|
# Require selectors
|
data/lib/scrappy/agent/agent.rb
CHANGED
@@ -86,26 +86,24 @@ module Scrappy
|
|
86
86
|
end
|
87
87
|
|
88
88
|
# Enqueue subresources
|
89
|
-
|
90
|
-
|
91
|
-
pages = triples.select { |s,p,o| p==Node("rdf:type") and o==Node("sc:Page") }.map{|s,p,o| s}.select{|n| n.is_a?(RDF::Node) and n.id.is_a?(URI)}
|
89
|
+
# Pages are enqueued without reducing depth
|
90
|
+
pages = triples.select { |s,p,o| p==Node("rdf:type") and o==Node("sc:Page") }.map{|s,p,o| s}.select{|n| n.is_a?(RDF::Node) and n.id.is_a?(URI)}
|
92
91
|
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
end
|
92
|
+
# All other URIS are enqueued with depth reduced
|
93
|
+
uris = if depth != 0
|
94
|
+
(triples.map { |s, p, o| [s,o] }.flatten - [Node(self.uri)] - pages).select{|n| n.is_a?(RDF::Node) and n.id.is_a?(URI)}
|
95
|
+
else
|
96
|
+
[]
|
97
|
+
end
|
98
|
+
|
99
|
+
items = (pages.map { |uri| {:uri=>uri.to_s, :depth=>[-1, depth].max} } + uris.map { |uri| {:uri=>uri.to_s, :depth=>[-1, depth-1].max} }).uniq
|
100
|
+
|
101
|
+
items.each { |item| puts "Enqueuing (depth = #{item[:depth]}): #{item[:uri]}" } if options.debug
|
102
|
+
|
103
|
+
if queue.nil?
|
104
|
+
triples += process items
|
105
|
+
else
|
106
|
+
items.each { |item| queue.push_unless_done item }
|
109
107
|
end
|
110
108
|
|
111
109
|
triples unless options.dump
|
@@ -0,0 +1,27 @@
|
|
1
|
+
module Scrappy
|
2
|
+
module Formats
|
3
|
+
|
4
|
+
def format node, formats
|
5
|
+
case formats.first
|
6
|
+
when Node('sc:WikiText') then
|
7
|
+
doc = Nokogiri::XML(node.to_html)
|
8
|
+
doc.search("h1").each {|n| n.replace(Nokogiri::XML::Text.new("= #{n.text.strip} =", n.document)) }
|
9
|
+
doc.search("h2").each {|n| n.replace(Nokogiri::XML::Text.new("== #{n.text.strip} ==", n.document)) }
|
10
|
+
doc.search("h3").each {|n| n.replace(Nokogiri::XML::Text.new("=== #{n.text.strip} ===", n.document)) }
|
11
|
+
doc.search("h4").each {|n| n.replace(Nokogiri::XML::Text.new("==== #{n.text.strip} ====", n.document)) }
|
12
|
+
doc.search("h5").each {|n| n.replace(Nokogiri::XML::Text.new("===== #{n.text.strip} =====", n.document)) }
|
13
|
+
doc.search("b").each {|n| n.replace(Nokogiri::XML::Text.new("'''#{n.text.strip}'''", n.document)) }
|
14
|
+
doc.search("li").each {|n| n.replace(Nokogiri::XML::Text.new("* #{n.text.strip}", n.document)) }
|
15
|
+
doc.search("ul").each {|n| n.replace(Nokogiri::XML::Text.new(n.text.strip, n.document)) }
|
16
|
+
doc.search("pre, code").each {|n| n.replace(Nokogiri::XML::Text.new("<pre>\n#{n.text.strip}\n</pre>", n.document)) }
|
17
|
+
doc.search("p").each {|n| n.replace(Nokogiri::XML::Text.new("#{n.text.strip}\n", n.document)) }
|
18
|
+
doc.text.strip
|
19
|
+
when Node('sc:Html') then
|
20
|
+
node.to_html
|
21
|
+
else
|
22
|
+
node.text
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
end
|
27
|
+
end
|
@@ -4,16 +4,20 @@ require 'monitor'
|
|
4
4
|
module MapReduce
|
5
5
|
|
6
6
|
class Queue
|
7
|
+
include MonitorMixin
|
8
|
+
|
7
9
|
def initialize
|
8
|
-
|
9
|
-
@items
|
10
|
+
super
|
11
|
+
@items = []
|
12
|
+
@history = []
|
10
13
|
end
|
11
14
|
|
12
15
|
def pop
|
13
16
|
yielded = false
|
14
17
|
item = nil
|
15
|
-
|
18
|
+
synchronize do
|
16
19
|
item = @items.shift
|
20
|
+
@history << item
|
17
21
|
if @items.empty?
|
18
22
|
yield item if (block_given? and item)
|
19
23
|
yielded = true
|
@@ -24,15 +28,19 @@ module MapReduce
|
|
24
28
|
end
|
25
29
|
|
26
30
|
def << value
|
27
|
-
|
31
|
+
push value
|
28
32
|
end
|
29
33
|
|
30
34
|
def push value
|
31
|
-
|
35
|
+
synchronize { @items << value }
|
32
36
|
end
|
33
37
|
|
38
|
+
def push_unless_done value
|
39
|
+
synchronize { @items << value unless @history.include?(value) }
|
40
|
+
end
|
41
|
+
|
34
42
|
def empty?
|
35
|
-
|
43
|
+
synchronize { @items.empty? }
|
36
44
|
end
|
37
45
|
end
|
38
46
|
|
@@ -1,10 +1,13 @@
|
|
1
1
|
module SectionSelector
|
2
|
+
extend Scrappy::Formats
|
3
|
+
|
2
4
|
def self.filter selector, doc
|
3
5
|
selector.rdf::value.map do |pattern|
|
4
6
|
doc[:content].search('h1, h2, h3, h4, h5, h6, h7, h8, h9, h10').select { |n| n.parent.name!='script' and n.text.downcase.strip == pattern }.map do |node|
|
5
7
|
found = false
|
6
8
|
content = node.parent.children[node.parent.children.index(node)+1..-1].select { |n| found ||= (n.name==node.name); !found }
|
7
|
-
|
9
|
+
|
10
|
+
[ { :uri=>doc[:uri], :content=>content, :value=>content.map{|t| format(t, selector.sc::format)}.select{|t| t.to_s.strip!=''}*"\n\n" } ]
|
8
11
|
end
|
9
12
|
end.flatten
|
10
13
|
end
|
@@ -1,4 +1,6 @@
|
|
1
1
|
module XPathSelector
|
2
|
+
extend Scrappy::Formats
|
3
|
+
|
2
4
|
def self.filter selector, doc
|
3
5
|
selector.rdf::value.map do |pattern|
|
4
6
|
interval = if selector.sc::index.first
|
@@ -13,7 +15,7 @@ module XPathSelector
|
|
13
15
|
selector.sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>result, :value=>result[attribute] } }
|
14
16
|
else
|
15
17
|
# Select node
|
16
|
-
[ { :uri=>doc[:uri], :content=>result, :value=>result.text } ]
|
18
|
+
[ { :uri=>doc[:uri], :content=>result, :value=>format(result, selector.sc::format) } ]
|
17
19
|
end
|
18
20
|
end
|
19
21
|
end.flatten
|
data/scrappy.gemspec
CHANGED
@@ -2,17 +2,17 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = %q{scrappy}
|
5
|
-
s.version = "0.1.14"
|
5
|
+
s.version = "0.1.15"
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Jose Ignacio"]
|
9
|
-
s.date = %q{2011-
|
9
|
+
s.date = %q{2011-02-04}
|
10
10
|
s.default_executable = %q{scrappy}
|
11
11
|
s.description = %q{RDF web scraper}
|
12
12
|
s.email = %q{joseignacio.fernandez@gmail.com}
|
13
13
|
s.executables = ["scrappy"]
|
14
|
-
s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb"]
|
15
|
-
s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb", "test/test_helper.rb", "test/test_scrappy.rb", "scrappy.gemspec"]
|
14
|
+
s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/formats.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb"]
|
15
|
+
s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/formats.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb", "test/test_helper.rb", "test/test_scrappy.rb", "scrappy.gemspec"]
|
16
16
|
s.homepage = %q{http://github.com/josei/scrappy}
|
17
17
|
s.post_install_message = %q{**(Optional) Remember to install rbwebkitgtk for visual parsing features**}
|
18
18
|
s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Scrappy", "--main", "README.rdoc"]
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 1
|
8
|
-
- 14
|
9
|
-
version: 0.1.14
|
8
|
+
- 15
|
9
|
+
version: 0.1.15
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Jose Ignacio
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-
|
17
|
+
date: 2011-02-04 00:00:00 +01:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
@@ -142,6 +142,7 @@ extra_rdoc_files:
|
|
142
142
|
- lib/scrappy/agent/blind_agent.rb
|
143
143
|
- lib/scrappy/agent/cache.rb
|
144
144
|
- lib/scrappy/agent/dumper.rb
|
145
|
+
- lib/scrappy/agent/formats.rb
|
145
146
|
- lib/scrappy/agent/map_reduce.rb
|
146
147
|
- lib/scrappy/agent/extractor.rb
|
147
148
|
- lib/scrappy/agent/visual_agent.rb
|
@@ -172,6 +173,7 @@ files:
|
|
172
173
|
- lib/scrappy/agent/blind_agent.rb
|
173
174
|
- lib/scrappy/agent/cache.rb
|
174
175
|
- lib/scrappy/agent/dumper.rb
|
176
|
+
- lib/scrappy/agent/formats.rb
|
175
177
|
- lib/scrappy/agent/map_reduce.rb
|
176
178
|
- lib/scrappy/agent/extractor.rb
|
177
179
|
- lib/scrappy/agent/visual_agent.rb
|