scrappy 0.1.14 → 0.1.15
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +5 -0
- data/Manifest +1 -0
- data/bin/scrappy +2 -2
- data/lib/scrappy.rb +2 -1
- data/lib/scrappy/agent/agent.rb +17 -19
- data/lib/scrappy/agent/extractor.rb +0 -1
- data/lib/scrappy/agent/formats.rb +27 -0
- data/lib/scrappy/agent/map_reduce.rb +14 -6
- data/lib/scrappy/selectors/section.rb +4 -1
- data/lib/scrappy/selectors/xpath.rb +3 -1
- data/scrappy.gemspec +4 -4
- metadata +5 -3
data/History.txt
CHANGED
data/Manifest
CHANGED
data/bin/scrappy
CHANGED
@@ -31,7 +31,7 @@ module Scrappy
|
|
31
31
|
def initialize
|
32
32
|
Options.port = 3434
|
33
33
|
Agent::Options.workers = 10
|
34
|
-
Agent::Options.depth = 1
|
34
|
+
Agent::Options.depth = -1
|
35
35
|
args = ARGV.map { |arg| arg.split(" ") }.flatten
|
36
36
|
|
37
37
|
OptionParser.new do |opts|
|
@@ -100,7 +100,7 @@ Options
|
|
100
100
|
-g, --get URL Gets requested URL
|
101
101
|
-p, --post URL Posts requested URL
|
102
102
|
-c, --concurrence VALUE Sets number of concurrent connections for crawling (default is 10)
|
103
|
-
-l, --levels VALUE Sets recursion levels for resource crawling (default is 1)
|
103
|
+
-l, --levels VALUE Sets recursion levels for resource crawling (default is infinite crawling)
|
104
104
|
-d, --delay VALUE Sets delay (in ms) between requests (default is 0)
|
105
105
|
-D, --dump Dumps RDF data to disk
|
106
106
|
-u, --debug Shows debugging traces
|
data/lib/scrappy.rb
CHANGED
@@ -15,12 +15,13 @@ require 'scrappy/agent/extractor'
|
|
15
15
|
require 'scrappy/agent/map_reduce'
|
16
16
|
require 'scrappy/agent/cache'
|
17
17
|
require 'scrappy/agent/dumper'
|
18
|
+
require 'scrappy/agent/formats'
|
18
19
|
require 'scrappy/agent/agent'
|
19
20
|
|
20
21
|
Namespace :sc, 'http://lab.gsi.dit.upm.es/scraping.rdf#'
|
21
22
|
|
22
23
|
module Scrappy
|
23
|
-
VERSION = '0.1.14'
|
24
|
+
VERSION = '0.1.15'
|
24
25
|
end
|
25
26
|
|
26
27
|
# Require selectors
|
data/lib/scrappy/agent/agent.rb
CHANGED
@@ -86,26 +86,24 @@ module Scrappy
|
|
86
86
|
end
|
87
87
|
|
88
88
|
# Enqueue subresources
|
89
|
-
|
90
|
-
|
91
|
-
pages = triples.select { |s,p,o| p==Node("rdf:type") and o==Node("sc:Page") }.map{|s,p,o| s}.select{|n| n.is_a?(RDF::Node) and n.id.is_a?(URI)}
|
89
|
+
# Pages are enqueued without reducing depth
|
90
|
+
pages = triples.select { |s,p,o| p==Node("rdf:type") and o==Node("sc:Page") }.map{|s,p,o| s}.select{|n| n.is_a?(RDF::Node) and n.id.is_a?(URI)}
|
92
91
|
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
end
|
92
|
+
# All other URIS are enqueued with depth reduced
|
93
|
+
uris = if depth != 0
|
94
|
+
(triples.map { |s, p, o| [s,o] }.flatten - [Node(self.uri)] - pages).select{|n| n.is_a?(RDF::Node) and n.id.is_a?(URI)}
|
95
|
+
else
|
96
|
+
[]
|
97
|
+
end
|
98
|
+
|
99
|
+
items = (pages.map { |uri| {:uri=>uri.to_s, :depth=>[-1, depth].max} } + uris.map { |uri| {:uri=>uri.to_s, :depth=>[-1, depth-1].max} }).uniq
|
100
|
+
|
101
|
+
items.each { |item| puts "Enqueuing (depth = #{item[:depth]}): #{item[:uri]}" } if options.debug
|
102
|
+
|
103
|
+
if queue.nil?
|
104
|
+
triples += process items
|
105
|
+
else
|
106
|
+
items.each { |item| queue.push_unless_done item }
|
109
107
|
end
|
110
108
|
|
111
109
|
triples unless options.dump
|
@@ -0,0 +1,27 @@
|
|
1
|
+
module Scrappy
|
2
|
+
module Formats
|
3
|
+
|
4
|
+
def format node, formats
|
5
|
+
case formats.first
|
6
|
+
when Node('sc:WikiText') then
|
7
|
+
doc = Nokogiri::XML(node.to_html)
|
8
|
+
doc.search("h1").each {|n| n.replace(Nokogiri::XML::Text.new("= #{n.text.strip} =", n.document)) }
|
9
|
+
doc.search("h2").each {|n| n.replace(Nokogiri::XML::Text.new("== #{n.text.strip} ==", n.document)) }
|
10
|
+
doc.search("h3").each {|n| n.replace(Nokogiri::XML::Text.new("=== #{n.text.strip} ===", n.document)) }
|
11
|
+
doc.search("h4").each {|n| n.replace(Nokogiri::XML::Text.new("==== #{n.text.strip} ====", n.document)) }
|
12
|
+
doc.search("h5").each {|n| n.replace(Nokogiri::XML::Text.new("===== #{n.text.strip} =====", n.document)) }
|
13
|
+
doc.search("b").each {|n| n.replace(Nokogiri::XML::Text.new("'''#{n.text.strip}'''", n.document)) }
|
14
|
+
doc.search("li").each {|n| n.replace(Nokogiri::XML::Text.new("* #{n.text.strip}", n.document)) }
|
15
|
+
doc.search("ul").each {|n| n.replace(Nokogiri::XML::Text.new(n.text.strip, n.document)) }
|
16
|
+
doc.search("pre, code").each {|n| n.replace(Nokogiri::XML::Text.new("<pre>\n#{n.text.strip}\n</pre>", n.document)) }
|
17
|
+
doc.search("p").each {|n| n.replace(Nokogiri::XML::Text.new("#{n.text.strip}\n", n.document)) }
|
18
|
+
doc.text.strip
|
19
|
+
when Node('sc:Html') then
|
20
|
+
node.to_html
|
21
|
+
else
|
22
|
+
node.text
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
end
|
27
|
+
end
|
@@ -4,16 +4,20 @@ require 'monitor'
|
|
4
4
|
module MapReduce
|
5
5
|
|
6
6
|
class Queue
|
7
|
+
include MonitorMixin
|
8
|
+
|
7
9
|
def initialize
|
8
|
-
|
9
|
-
@items
|
10
|
+
super
|
11
|
+
@items = []
|
12
|
+
@history = []
|
10
13
|
end
|
11
14
|
|
12
15
|
def pop
|
13
16
|
yielded = false
|
14
17
|
item = nil
|
15
|
-
|
18
|
+
synchronize do
|
16
19
|
item = @items.shift
|
20
|
+
@history << item
|
17
21
|
if @items.empty?
|
18
22
|
yield item if (block_given? and item)
|
19
23
|
yielded = true
|
@@ -24,15 +28,19 @@ module MapReduce
|
|
24
28
|
end
|
25
29
|
|
26
30
|
def << value
|
27
|
-
|
31
|
+
push value
|
28
32
|
end
|
29
33
|
|
30
34
|
def push value
|
31
|
-
|
35
|
+
synchronize { @items << value }
|
32
36
|
end
|
33
37
|
|
38
|
+
def push_unless_done value
|
39
|
+
synchronize { @items << value unless @history.include?(value) }
|
40
|
+
end
|
41
|
+
|
34
42
|
def empty?
|
35
|
-
|
43
|
+
synchronize { @items.empty? }
|
36
44
|
end
|
37
45
|
end
|
38
46
|
|
@@ -1,10 +1,13 @@
|
|
1
1
|
module SectionSelector
|
2
|
+
extend Scrappy::Formats
|
3
|
+
|
2
4
|
def self.filter selector, doc
|
3
5
|
selector.rdf::value.map do |pattern|
|
4
6
|
doc[:content].search('h1, h2, h3, h4, h5, h6, h7, h8, h9, h10').select { |n| n.parent.name!='script' and n.text.downcase.strip == pattern }.map do |node|
|
5
7
|
found = false
|
6
8
|
content = node.parent.children[node.parent.children.index(node)+1..-1].select { |n| found ||= (n.name==node.name); !found }
|
7
|
-
|
9
|
+
|
10
|
+
[ { :uri=>doc[:uri], :content=>content, :value=>content.map{|t| format(t, selector.sc::format)}.select{|t| t.to_s.strip!=''}*"\n\n" } ]
|
8
11
|
end
|
9
12
|
end.flatten
|
10
13
|
end
|
@@ -1,4 +1,6 @@
|
|
1
1
|
module XPathSelector
|
2
|
+
extend Scrappy::Formats
|
3
|
+
|
2
4
|
def self.filter selector, doc
|
3
5
|
selector.rdf::value.map do |pattern|
|
4
6
|
interval = if selector.sc::index.first
|
@@ -13,7 +15,7 @@ module XPathSelector
|
|
13
15
|
selector.sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>result, :value=>result[attribute] } }
|
14
16
|
else
|
15
17
|
# Select node
|
16
|
-
[ { :uri=>doc[:uri], :content=>result, :value=>result.text } ]
|
18
|
+
[ { :uri=>doc[:uri], :content=>result, :value=>format(result, selector.sc::format) } ]
|
17
19
|
end
|
18
20
|
end
|
19
21
|
end.flatten
|
data/scrappy.gemspec
CHANGED
@@ -2,17 +2,17 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = %q{scrappy}
|
5
|
-
s.version = "0.1.14"
|
5
|
+
s.version = "0.1.15"
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Jose Ignacio"]
|
9
|
-
s.date = %q{2011-
|
9
|
+
s.date = %q{2011-02-04}
|
10
10
|
s.default_executable = %q{scrappy}
|
11
11
|
s.description = %q{RDF web scraper}
|
12
12
|
s.email = %q{joseignacio.fernandez@gmail.com}
|
13
13
|
s.executables = ["scrappy"]
|
14
|
-
s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb"]
|
15
|
-
s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb", "test/test_helper.rb", "test/test_scrappy.rb", "scrappy.gemspec"]
|
14
|
+
s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/formats.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb"]
|
15
|
+
s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/dumper.rb", "lib/scrappy/agent/formats.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb", "test/test_helper.rb", "test/test_scrappy.rb", "scrappy.gemspec"]
|
16
16
|
s.homepage = %q{http://github.com/josei/scrappy}
|
17
17
|
s.post_install_message = %q{**(Optional) Remember to install rbwebkitgtk for visual parsing features**}
|
18
18
|
s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Scrappy", "--main", "README.rdoc"]
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 1
|
8
|
-
- 14
|
9
|
-
version: 0.1.14
|
8
|
+
- 15
|
9
|
+
version: 0.1.15
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Jose Ignacio
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-
|
17
|
+
date: 2011-02-04 00:00:00 +01:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
@@ -142,6 +142,7 @@ extra_rdoc_files:
|
|
142
142
|
- lib/scrappy/agent/blind_agent.rb
|
143
143
|
- lib/scrappy/agent/cache.rb
|
144
144
|
- lib/scrappy/agent/dumper.rb
|
145
|
+
- lib/scrappy/agent/formats.rb
|
145
146
|
- lib/scrappy/agent/map_reduce.rb
|
146
147
|
- lib/scrappy/agent/extractor.rb
|
147
148
|
- lib/scrappy/agent/visual_agent.rb
|
@@ -172,6 +173,7 @@ files:
|
|
172
173
|
- lib/scrappy/agent/blind_agent.rb
|
173
174
|
- lib/scrappy/agent/cache.rb
|
174
175
|
- lib/scrappy/agent/dumper.rb
|
176
|
+
- lib/scrappy/agent/formats.rb
|
175
177
|
- lib/scrappy/agent/map_reduce.rb
|
176
178
|
- lib/scrappy/agent/extractor.rb
|
177
179
|
- lib/scrappy/agent/visual_agent.rb
|