scrappy 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +5 -1
- data/Manifest +8 -1
- data/README.rdoc +6 -0
- data/bin/scrappy +3 -2
- data/lib/scrappy/agent/extractor.rb +14 -31
- data/lib/scrappy/selectors/base_uri.rb +5 -0
- data/lib/scrappy/selectors/css.rb +6 -0
- data/lib/scrappy/selectors/root.rb +5 -0
- data/lib/scrappy/selectors/slice.rb +8 -0
- data/lib/scrappy/selectors/uri.rb +10 -0
- data/lib/scrappy/selectors/uri_pattern.rb +10 -0
- data/lib/scrappy/selectors/xpath.rb +15 -0
- data/lib/scrappy.rb +5 -1
- data/scrappy.gemspec +4 -4
- metadata +18 -4
data/History.txt
CHANGED
data/Manifest
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
History.txt
|
2
|
+
Manifest
|
2
3
|
README.rdoc
|
3
4
|
Rakefile
|
4
5
|
bin/scrappy
|
@@ -11,10 +12,16 @@ lib/scrappy/agent/cluster.rb
|
|
11
12
|
lib/scrappy/agent/extractor.rb
|
12
13
|
lib/scrappy/agent/visual_agent.rb
|
13
14
|
lib/scrappy/proxy.rb
|
15
|
+
lib/scrappy/selectors/base_uri.rb
|
16
|
+
lib/scrappy/selectors/css.rb
|
17
|
+
lib/scrappy/selectors/root.rb
|
18
|
+
lib/scrappy/selectors/slice.rb
|
19
|
+
lib/scrappy/selectors/uri.rb
|
20
|
+
lib/scrappy/selectors/uri_pattern.rb
|
21
|
+
lib/scrappy/selectors/xpath.rb
|
14
22
|
lib/scrappy/server.rb
|
15
23
|
lib/scrappy/shell.rb
|
16
24
|
lib/scrappy/support.rb
|
17
25
|
lib/scrappy/webkit/webkit.rb
|
18
26
|
test/test_helper.rb
|
19
27
|
test/test_scrappy.rb
|
20
|
-
Manifest
|
data/README.rdoc
CHANGED
@@ -150,6 +150,12 @@ Additionally, some extra libraries are needed for certain features:
|
|
150
150
|
|
151
151
|
* PNG output of RDF graphs requires Graphviz (in Debian systems: sudo aptitude install graphviz).
|
152
152
|
|
153
|
+
== CONTRIBUTORS:
|
154
|
+
|
155
|
+
* José Ignacio Fernández
|
156
|
+
|
157
|
+
* Jacobo Blasco
|
158
|
+
|
153
159
|
== LICENSE:
|
154
160
|
|
155
161
|
(The MIT License)
|
data/bin/scrappy
CHANGED
@@ -30,6 +30,7 @@ module Scrappy
|
|
30
30
|
Options.port = 3434
|
31
31
|
Options.concurrence = 10
|
32
32
|
Agent::Options.depth = 1
|
33
|
+
args = ARGV.map { |arg| arg.split(" ") }.flatten
|
33
34
|
|
34
35
|
OptionParser.new do |opts|
|
35
36
|
opts.on('-V', '--version') { output_version; exit 0 }
|
@@ -48,8 +49,8 @@ module Scrappy
|
|
48
49
|
opts.on('-R', '--reference-all') { Agent::Options.referenceable = :dump }
|
49
50
|
opts.on('-w', '--window') { Agent::Options.window = true }
|
50
51
|
opts.on('-f FORMAT', '--format FORMAT') { |f| Agent::Options.format = f.to_sym }
|
51
|
-
end.parse!(
|
52
|
-
@file =
|
52
|
+
end.parse!(args)
|
53
|
+
@file = args.shift
|
53
54
|
end
|
54
55
|
|
55
56
|
def run
|
@@ -5,10 +5,15 @@ module Scrappy
|
|
5
5
|
def extract uri, html, referenceable=nil
|
6
6
|
triples = []
|
7
7
|
content = Nokogiri::HTML(html, nil, 'utf-8')
|
8
|
-
|
9
|
-
uri_selectors
|
8
|
+
|
9
|
+
uri_selectors = kb.find(nil, Node('rdf:type'), Node('sc:UriSelector')) + kb.find(nil, Node('rdf:type'), Node('sc:UriPatternSelector')).flatten.select do |uri_selector|
|
10
|
+
class_name = uri_selector.rdf::type.first.to_s.split('#').last
|
11
|
+
results = Kernel.const_get(class_name).filter uri_selector, {:content=>content, :uri=>uri}
|
12
|
+
!results.empty?
|
13
|
+
end
|
10
14
|
|
11
15
|
fragments = uri_selectors.map { |uri_selector| kb.find(nil, Node('sc:selector'), uri_selector) }.flatten
|
16
|
+
|
12
17
|
fragments.each do |fragment|
|
13
18
|
extract_fragment fragment, :doc=>{:uri=>uri, :content=>content },
|
14
19
|
:parent=>uri, :triples=>triples, :referenceable=>!referenceable.nil?
|
@@ -69,38 +74,16 @@ module Scrappy
|
|
69
74
|
end
|
70
75
|
|
71
76
|
def filter selector, doc
|
72
|
-
|
73
|
-
|
74
|
-
results = if selector.rdf::type.include?(Node('sc:CssSelector')) or
|
75
|
-
selector.rdf::type.include?(Node('sc:XPathSelector'))
|
76
|
-
selector.rdf::value.map do |pattern|
|
77
|
-
content.search(pattern).map do |result|
|
78
|
-
if selector.sc::attribute.first
|
79
|
-
# Select node's attribute if given
|
80
|
-
selector.sc::attribute.map { |attribute| { :uri=>uri, :content=>result, :value=>result[attribute] } }
|
81
|
-
else
|
82
|
-
# Select node
|
83
|
-
[ { :uri=>uri, :content=>result, :value=>result.text } ]
|
84
|
-
end
|
85
|
-
end
|
86
|
-
end.flatten
|
87
|
-
|
88
|
-
elsif selector.rdf::type.include?(Node('sc:SliceSelector'))
|
89
|
-
text = content.text
|
90
|
-
selector.rdf::value.map do |separator|
|
91
|
-
slices = text.split(separator)
|
92
|
-
selector.sc::index.map { |index| { :uri=>uri, :content=>content, :value=>slices[index.to_i].to_s.strip} }
|
93
|
-
end.flatten
|
77
|
+
# From "BaseUriSelector" to "base_uri"
|
78
|
+
class_name = selector.rdf::type.first.to_s.split('#').last
|
94
79
|
|
95
|
-
|
96
|
-
|
80
|
+
# Process selector
|
81
|
+
results = Kernel.const_get(class_name).filter selector, doc
|
97
82
|
|
98
|
-
|
99
|
-
[ { :uri=>uri, :content=>content, :value=>content.text } ]
|
100
|
-
end
|
101
|
-
|
102
|
-
# Process nested selectors, if any
|
83
|
+
# Return results if no nested selectors
|
103
84
|
return results if selector.sc::selector.empty?
|
85
|
+
|
86
|
+
# Process nested selectors
|
104
87
|
results.map do |result|
|
105
88
|
selector.sc::selector.map { |s| filter s, result }
|
106
89
|
end.flatten
|
@@ -0,0 +1,8 @@
|
|
1
|
+
module SliceSelector
|
2
|
+
def self.filter selector, doc
|
3
|
+
selector.rdf::value.map do |separator|
|
4
|
+
slices = doc[:content].text.split(separator)
|
5
|
+
selector.sc::index.map { |index| { :uri=>doc[:uri], :content=>doc[:content], :value=>slices[index.to_i].to_s.strip} }
|
6
|
+
end.flatten
|
7
|
+
end
|
8
|
+
end
|
@@ -0,0 +1,10 @@
|
|
1
|
+
module UriSelector
|
2
|
+
def self.filter selector, doc
|
3
|
+
# Check if the UriSelector has this URI as value (without params: ?param1=value1&param2=value2)
|
4
|
+
if selector.rdf::value.include?(doc[:uri].match(/\A([^\?]*)(\?.*\Z)?/).captures.first)
|
5
|
+
[ { :uri=>doc[:uri], :content=>doc[:content], :value=>doc[:content].text } ]
|
6
|
+
else
|
7
|
+
[]
|
8
|
+
end
|
9
|
+
end
|
10
|
+
end
|
@@ -0,0 +1,10 @@
|
|
1
|
+
module UriPatternSelector
|
2
|
+
def self.filter selector, doc
|
3
|
+
# Check if the uri fits the pattern
|
4
|
+
if selector.rdf::value.any? { |v| doc[:uri] =~ /\A#{v.gsub('.','\.').gsub('*', '.+')}\Z/ }
|
5
|
+
[ { :uri=>doc[:uri], :content=>doc[:content], :value=>doc[:content].text } ]
|
6
|
+
else
|
7
|
+
[]
|
8
|
+
end
|
9
|
+
end
|
10
|
+
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
module XPathSelector
|
2
|
+
def self.filter selector, doc
|
3
|
+
selector.rdf::value.map do |pattern|
|
4
|
+
doc[:content].search(pattern).map do |result|
|
5
|
+
if selector.sc::attribute.first
|
6
|
+
# Select node's attribute if given
|
7
|
+
selector.sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>result, :value=>result[attribute] } }
|
8
|
+
else
|
9
|
+
# Select node
|
10
|
+
[ { :uri=>doc[:uri], :content=>result, :value=>result.text } ]
|
11
|
+
end
|
12
|
+
end
|
13
|
+
end.flatten
|
14
|
+
end
|
15
|
+
end
|
data/lib/scrappy.rb
CHANGED
@@ -11,6 +11,7 @@ require 'tmpdir'
|
|
11
11
|
require 'lightrdf'
|
12
12
|
|
13
13
|
require 'scrappy/support'
|
14
|
+
|
14
15
|
require 'scrappy/agent/extractor'
|
15
16
|
require 'scrappy/agent/cluster'
|
16
17
|
require 'scrappy/agent/agent'
|
@@ -18,5 +19,8 @@ require 'scrappy/agent/agent'
|
|
18
19
|
Namespace :sc, 'http://lab.gsi.dit.upm.es/scraping.rdf#'
|
19
20
|
|
20
21
|
module Scrappy
|
21
|
-
VERSION = '0.1.1'
|
22
|
+
VERSION = '0.1.2'
|
22
23
|
end
|
24
|
+
|
25
|
+
# Require selectors
|
26
|
+
Dir["#{File.expand_path(File.dirname(__FILE__))}/scrappy/selectors/*.rb"].each { |f| require f }
|
data/scrappy.gemspec
CHANGED
@@ -2,17 +2,17 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = %q{scrappy}
|
5
|
-
s.version = "0.1.1"
|
5
|
+
s.version = "0.1.2"
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Jose Ignacio"]
|
9
|
-
s.date = %q{2010-
|
9
|
+
s.date = %q{2010-11-03}
|
10
10
|
s.default_executable = %q{scrappy}
|
11
11
|
s.description = %q{RDF web scraper}
|
12
12
|
s.email = %q{joseignacio.fernandez@gmail.com}
|
13
13
|
s.executables = ["scrappy"]
|
14
|
-
s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cluster.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb"]
|
15
|
-
s.files = ["History.txt", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cluster.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb", "test/test_helper.rb", "test/test_scrappy.rb", "
|
14
|
+
s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cluster.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb"]
|
15
|
+
s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cluster.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb", "test/test_helper.rb", "test/test_scrappy.rb", "scrappy.gemspec"]
|
16
16
|
s.homepage = %q{http://github.com/josei/scrappy}
|
17
17
|
s.post_install_message = %q{**(Optional) Remember to install rbwebkitgtk for visual parsing features**}
|
18
18
|
s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Scrappy", "--main", "README.rdoc"]
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 1
|
8
|
-
- 1
|
9
|
-
version: 0.1.1
|
8
|
+
- 2
|
9
|
+
version: 0.1.2
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Jose Ignacio
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2010-
|
17
|
+
date: 2010-11-03 00:00:00 +01:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
@@ -130,12 +130,20 @@ extra_rdoc_files:
|
|
130
130
|
- lib/scrappy/agent/extractor.rb
|
131
131
|
- lib/scrappy/agent/visual_agent.rb
|
132
132
|
- lib/scrappy/proxy.rb
|
133
|
+
- lib/scrappy/selectors/base_uri.rb
|
134
|
+
- lib/scrappy/selectors/css.rb
|
135
|
+
- lib/scrappy/selectors/root.rb
|
136
|
+
- lib/scrappy/selectors/slice.rb
|
137
|
+
- lib/scrappy/selectors/uri.rb
|
138
|
+
- lib/scrappy/selectors/uri_pattern.rb
|
139
|
+
- lib/scrappy/selectors/xpath.rb
|
133
140
|
- lib/scrappy/server.rb
|
134
141
|
- lib/scrappy/shell.rb
|
135
142
|
- lib/scrappy/support.rb
|
136
143
|
- lib/scrappy/webkit/webkit.rb
|
137
144
|
files:
|
138
145
|
- History.txt
|
146
|
+
- Manifest
|
139
147
|
- README.rdoc
|
140
148
|
- Rakefile
|
141
149
|
- bin/scrappy
|
@@ -148,13 +156,19 @@ files:
|
|
148
156
|
- lib/scrappy/agent/extractor.rb
|
149
157
|
- lib/scrappy/agent/visual_agent.rb
|
150
158
|
- lib/scrappy/proxy.rb
|
159
|
+
- lib/scrappy/selectors/base_uri.rb
|
160
|
+
- lib/scrappy/selectors/css.rb
|
161
|
+
- lib/scrappy/selectors/root.rb
|
162
|
+
- lib/scrappy/selectors/slice.rb
|
163
|
+
- lib/scrappy/selectors/uri.rb
|
164
|
+
- lib/scrappy/selectors/uri_pattern.rb
|
165
|
+
- lib/scrappy/selectors/xpath.rb
|
151
166
|
- lib/scrappy/server.rb
|
152
167
|
- lib/scrappy/shell.rb
|
153
168
|
- lib/scrappy/support.rb
|
154
169
|
- lib/scrappy/webkit/webkit.rb
|
155
170
|
- test/test_helper.rb
|
156
171
|
- test/test_scrappy.rb
|
157
|
-
- Manifest
|
158
172
|
- scrappy.gemspec
|
159
173
|
has_rdoc: true
|
160
174
|
homepage: http://github.com/josei/scrappy
|