scrappy 0.1.1 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +5 -1
- data/Manifest +8 -1
- data/README.rdoc +6 -0
- data/bin/scrappy +3 -2
- data/lib/scrappy/agent/extractor.rb +14 -31
- data/lib/scrappy/selectors/base_uri.rb +5 -0
- data/lib/scrappy/selectors/css.rb +6 -0
- data/lib/scrappy/selectors/root.rb +5 -0
- data/lib/scrappy/selectors/slice.rb +8 -0
- data/lib/scrappy/selectors/uri.rb +10 -0
- data/lib/scrappy/selectors/uri_pattern.rb +10 -0
- data/lib/scrappy/selectors/xpath.rb +15 -0
- data/lib/scrappy.rb +5 -1
- data/scrappy.gemspec +4 -4
- metadata +18 -4
data/History.txt
CHANGED
data/Manifest
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
History.txt
|
2
|
+
Manifest
|
2
3
|
README.rdoc
|
3
4
|
Rakefile
|
4
5
|
bin/scrappy
|
@@ -11,10 +12,16 @@ lib/scrappy/agent/cluster.rb
|
|
11
12
|
lib/scrappy/agent/extractor.rb
|
12
13
|
lib/scrappy/agent/visual_agent.rb
|
13
14
|
lib/scrappy/proxy.rb
|
15
|
+
lib/scrappy/selectors/base_uri.rb
|
16
|
+
lib/scrappy/selectors/css.rb
|
17
|
+
lib/scrappy/selectors/root.rb
|
18
|
+
lib/scrappy/selectors/slice.rb
|
19
|
+
lib/scrappy/selectors/uri.rb
|
20
|
+
lib/scrappy/selectors/uri_pattern.rb
|
21
|
+
lib/scrappy/selectors/xpath.rb
|
14
22
|
lib/scrappy/server.rb
|
15
23
|
lib/scrappy/shell.rb
|
16
24
|
lib/scrappy/support.rb
|
17
25
|
lib/scrappy/webkit/webkit.rb
|
18
26
|
test/test_helper.rb
|
19
27
|
test/test_scrappy.rb
|
20
|
-
Manifest
|
data/README.rdoc
CHANGED
@@ -150,6 +150,12 @@ Additionally, some extra libraries are needed for certain features:
|
|
150
150
|
|
151
151
|
* PNG output of RDF graphs requires Graphviz (in Debian systems: sudo aptitude install graphviz).
|
152
152
|
|
153
|
+
== CONTRIBUTORS:
|
154
|
+
|
155
|
+
* José Ignacio Fernández
|
156
|
+
|
157
|
+
* Jacobo Blasco
|
158
|
+
|
153
159
|
== LICENSE:
|
154
160
|
|
155
161
|
(The MIT License)
|
data/bin/scrappy
CHANGED
@@ -30,6 +30,7 @@ module Scrappy
|
|
30
30
|
Options.port = 3434
|
31
31
|
Options.concurrence = 10
|
32
32
|
Agent::Options.depth = 1
|
33
|
+
args = ARGV.map { |arg| arg.split(" ") }.flatten
|
33
34
|
|
34
35
|
OptionParser.new do |opts|
|
35
36
|
opts.on('-V', '--version') { output_version; exit 0 }
|
@@ -48,8 +49,8 @@ module Scrappy
|
|
48
49
|
opts.on('-R', '--reference-all') { Agent::Options.referenceable = :dump }
|
49
50
|
opts.on('-w', '--window') { Agent::Options.window = true }
|
50
51
|
opts.on('-f FORMAT', '--format FORMAT') { |f| Agent::Options.format = f.to_sym }
|
51
|
-
end.parse!(
|
52
|
-
@file =
|
52
|
+
end.parse!(args)
|
53
|
+
@file = args.shift
|
53
54
|
end
|
54
55
|
|
55
56
|
def run
|
@@ -5,10 +5,15 @@ module Scrappy
|
|
5
5
|
def extract uri, html, referenceable=nil
|
6
6
|
triples = []
|
7
7
|
content = Nokogiri::HTML(html, nil, 'utf-8')
|
8
|
-
|
9
|
-
uri_selectors
|
8
|
+
|
9
|
+
uri_selectors = kb.find(nil, Node('rdf:type'), Node('sc:UriSelector')) + kb.find(nil, Node('rdf:type'), Node('sc:UriPatternSelector')).flatten.select do |uri_selector|
|
10
|
+
class_name = uri_selector.rdf::type.first.to_s.split('#').last
|
11
|
+
results = Kernel.const_get(class_name).filter uri_selector, {:content=>content, :uri=>uri}
|
12
|
+
!results.empty?
|
13
|
+
end
|
10
14
|
|
11
15
|
fragments = uri_selectors.map { |uri_selector| kb.find(nil, Node('sc:selector'), uri_selector) }.flatten
|
16
|
+
|
12
17
|
fragments.each do |fragment|
|
13
18
|
extract_fragment fragment, :doc=>{:uri=>uri, :content=>content },
|
14
19
|
:parent=>uri, :triples=>triples, :referenceable=>!referenceable.nil?
|
@@ -69,38 +74,16 @@ module Scrappy
|
|
69
74
|
end
|
70
75
|
|
71
76
|
def filter selector, doc
|
72
|
-
|
73
|
-
|
74
|
-
results = if selector.rdf::type.include?(Node('sc:CssSelector')) or
|
75
|
-
selector.rdf::type.include?(Node('sc:XPathSelector'))
|
76
|
-
selector.rdf::value.map do |pattern|
|
77
|
-
content.search(pattern).map do |result|
|
78
|
-
if selector.sc::attribute.first
|
79
|
-
# Select node's attribute if given
|
80
|
-
selector.sc::attribute.map { |attribute| { :uri=>uri, :content=>result, :value=>result[attribute] } }
|
81
|
-
else
|
82
|
-
# Select node
|
83
|
-
[ { :uri=>uri, :content=>result, :value=>result.text } ]
|
84
|
-
end
|
85
|
-
end
|
86
|
-
end.flatten
|
87
|
-
|
88
|
-
elsif selector.rdf::type.include?(Node('sc:SliceSelector'))
|
89
|
-
text = content.text
|
90
|
-
selector.rdf::value.map do |separator|
|
91
|
-
slices = text.split(separator)
|
92
|
-
selector.sc::index.map { |index| { :uri=>uri, :content=>content, :value=>slices[index.to_i].to_s.strip} }
|
93
|
-
end.flatten
|
77
|
+
# From "BaseUriSelector" to "base_uri"
|
78
|
+
class_name = selector.rdf::type.first.to_s.split('#').last
|
94
79
|
|
95
|
-
|
96
|
-
|
80
|
+
# Process selector
|
81
|
+
results = Kernel.const_get(class_name).filter selector, doc
|
97
82
|
|
98
|
-
|
99
|
-
[ { :uri=>uri, :content=>content, :value=>content.text } ]
|
100
|
-
end
|
101
|
-
|
102
|
-
# Process nested selectors, if any
|
83
|
+
# Return results if no nested selectors
|
103
84
|
return results if selector.sc::selector.empty?
|
85
|
+
|
86
|
+
# Process nested selectors
|
104
87
|
results.map do |result|
|
105
88
|
selector.sc::selector.map { |s| filter s, result }
|
106
89
|
end.flatten
|
@@ -0,0 +1,8 @@
|
|
1
|
+
module SliceSelector
|
2
|
+
def self.filter selector, doc
|
3
|
+
selector.rdf::value.map do |separator|
|
4
|
+
slices = doc[:content].text.split(separator)
|
5
|
+
selector.sc::index.map { |index| { :uri=>doc[:uri], :content=>doc[:content], :value=>slices[index.to_i].to_s.strip} }
|
6
|
+
end.flatten
|
7
|
+
end
|
8
|
+
end
|
@@ -0,0 +1,10 @@
|
|
1
|
+
module UriSelector
|
2
|
+
def self.filter selector, doc
|
3
|
+
# Check if the UriSelector has this URI as value (without params: ?param1=value1&param2=value2)
|
4
|
+
if selector.rdf::value.include?(doc[:uri].match(/\A([^\?]*)(\?.*\Z)?/).captures.first)
|
5
|
+
[ { :uri=>doc[:uri], :content=>doc[:content], :value=>doc[:content].text } ]
|
6
|
+
else
|
7
|
+
[]
|
8
|
+
end
|
9
|
+
end
|
10
|
+
end
|
@@ -0,0 +1,10 @@
|
|
1
|
+
module UriPatternSelector
|
2
|
+
def self.filter selector, doc
|
3
|
+
# Check if the uri fits the pattern
|
4
|
+
if selector.rdf::value.any? { |v| doc[:uri] =~ /\A#{v.gsub('.','\.').gsub('*', '.+')}\Z/ }
|
5
|
+
[ { :uri=>doc[:uri], :content=>doc[:content], :value=>doc[:content].text } ]
|
6
|
+
else
|
7
|
+
[]
|
8
|
+
end
|
9
|
+
end
|
10
|
+
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
module XPathSelector
|
2
|
+
def self.filter selector, doc
|
3
|
+
selector.rdf::value.map do |pattern|
|
4
|
+
doc[:content].search(pattern).map do |result|
|
5
|
+
if selector.sc::attribute.first
|
6
|
+
# Select node's attribute if given
|
7
|
+
selector.sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>result, :value=>result[attribute] } }
|
8
|
+
else
|
9
|
+
# Select node
|
10
|
+
[ { :uri=>doc[:uri], :content=>result, :value=>result.text } ]
|
11
|
+
end
|
12
|
+
end
|
13
|
+
end.flatten
|
14
|
+
end
|
15
|
+
end
|
data/lib/scrappy.rb
CHANGED
@@ -11,6 +11,7 @@ require 'tmpdir'
|
|
11
11
|
require 'lightrdf'
|
12
12
|
|
13
13
|
require 'scrappy/support'
|
14
|
+
|
14
15
|
require 'scrappy/agent/extractor'
|
15
16
|
require 'scrappy/agent/cluster'
|
16
17
|
require 'scrappy/agent/agent'
|
@@ -18,5 +19,8 @@ require 'scrappy/agent/agent'
|
|
18
19
|
Namespace :sc, 'http://lab.gsi.dit.upm.es/scraping.rdf#'
|
19
20
|
|
20
21
|
module Scrappy
|
21
|
-
VERSION = '0.1.1'
|
22
|
+
VERSION = '0.1.2'
|
22
23
|
end
|
24
|
+
|
25
|
+
# Require selectors
|
26
|
+
Dir["#{File.expand_path(File.dirname(__FILE__))}/scrappy/selectors/*.rb"].each { |f| require f }
|
data/scrappy.gemspec
CHANGED
@@ -2,17 +2,17 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = %q{scrappy}
|
5
|
-
s.version = "0.1.1"
|
5
|
+
s.version = "0.1.2"
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Jose Ignacio"]
|
9
|
-
s.date = %q{2010-
|
9
|
+
s.date = %q{2010-11-03}
|
10
10
|
s.default_executable = %q{scrappy}
|
11
11
|
s.description = %q{RDF web scraper}
|
12
12
|
s.email = %q{joseignacio.fernandez@gmail.com}
|
13
13
|
s.executables = ["scrappy"]
|
14
|
-
s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cluster.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb"]
|
15
|
-
s.files = ["History.txt", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cluster.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb", "test/test_helper.rb", "test/test_scrappy.rb", "Manifest", "scrappy.gemspec"]
|
14
|
+
s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cluster.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb"]
|
15
|
+
s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cluster.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb", "test/test_helper.rb", "test/test_scrappy.rb", "scrappy.gemspec"]
|
16
16
|
s.homepage = %q{http://github.com/josei/scrappy}
|
17
17
|
s.post_install_message = %q{**(Optional) Remember to install rbwebkitgtk for visual parsing features**}
|
18
18
|
s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Scrappy", "--main", "README.rdoc"]
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 1
|
8
|
-
- 1
|
9
|
-
version: 0.1.1
|
8
|
+
- 2
|
9
|
+
version: 0.1.2
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Jose Ignacio
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2010-
|
17
|
+
date: 2010-11-03 00:00:00 +01:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
@@ -130,12 +130,20 @@ extra_rdoc_files:
|
|
130
130
|
- lib/scrappy/agent/extractor.rb
|
131
131
|
- lib/scrappy/agent/visual_agent.rb
|
132
132
|
- lib/scrappy/proxy.rb
|
133
|
+
- lib/scrappy/selectors/base_uri.rb
|
134
|
+
- lib/scrappy/selectors/css.rb
|
135
|
+
- lib/scrappy/selectors/root.rb
|
136
|
+
- lib/scrappy/selectors/slice.rb
|
137
|
+
- lib/scrappy/selectors/uri.rb
|
138
|
+
- lib/scrappy/selectors/uri_pattern.rb
|
139
|
+
- lib/scrappy/selectors/xpath.rb
|
133
140
|
- lib/scrappy/server.rb
|
134
141
|
- lib/scrappy/shell.rb
|
135
142
|
- lib/scrappy/support.rb
|
136
143
|
- lib/scrappy/webkit/webkit.rb
|
137
144
|
files:
|
138
145
|
- History.txt
|
146
|
+
- Manifest
|
139
147
|
- README.rdoc
|
140
148
|
- Rakefile
|
141
149
|
- bin/scrappy
|
@@ -148,13 +156,19 @@ files:
|
|
148
156
|
- lib/scrappy/agent/extractor.rb
|
149
157
|
- lib/scrappy/agent/visual_agent.rb
|
150
158
|
- lib/scrappy/proxy.rb
|
159
|
+
- lib/scrappy/selectors/base_uri.rb
|
160
|
+
- lib/scrappy/selectors/css.rb
|
161
|
+
- lib/scrappy/selectors/root.rb
|
162
|
+
- lib/scrappy/selectors/slice.rb
|
163
|
+
- lib/scrappy/selectors/uri.rb
|
164
|
+
- lib/scrappy/selectors/uri_pattern.rb
|
165
|
+
- lib/scrappy/selectors/xpath.rb
|
151
166
|
- lib/scrappy/server.rb
|
152
167
|
- lib/scrappy/shell.rb
|
153
168
|
- lib/scrappy/support.rb
|
154
169
|
- lib/scrappy/webkit/webkit.rb
|
155
170
|
- test/test_helper.rb
|
156
171
|
- test/test_scrappy.rb
|
157
|
-
- Manifest
|
158
172
|
- scrappy.gemspec
|
159
173
|
has_rdoc: true
|
160
174
|
homepage: http://github.com/josei/scrappy
|