scrappy 0.1.7 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +10 -0
- data/Manifest +1 -0
- data/lib/scrappy.rb +1 -1
- data/lib/scrappy/agent/agent.rb +1 -1
- data/lib/scrappy/agent/extractor.rb +1 -0
- data/lib/scrappy/selectors/new_uri.rb +15 -0
- data/lib/scrappy/selectors/root.rb +6 -1
- data/lib/scrappy/support.rb +7 -1
- data/scrappy.gemspec +4 -4
- metadata +5 -3
data/History.txt
CHANGED
@@ -1,3 +1,13 @@
|
|
1
|
+
=== 0.1.8 2010-12-14
|
2
|
+
|
3
|
+
* Added sc:sameas
|
4
|
+
* Added sc:attribute support to sc:RootSelector
|
5
|
+
* Added sc:NewUriSelector for constructing new uris
|
6
|
+
|
7
|
+
=== 0.1.7 2010-12-09
|
8
|
+
|
9
|
+
* Added section selector
|
10
|
+
|
1
11
|
=== 0.1.6 2010-12-09
|
2
12
|
|
3
13
|
* Added sc:superclass to sc:Fragments
|
data/Manifest
CHANGED
@@ -15,6 +15,7 @@ lib/scrappy/agent/visual_agent.rb
|
|
15
15
|
lib/scrappy/proxy.rb
|
16
16
|
lib/scrappy/selectors/base_uri.rb
|
17
17
|
lib/scrappy/selectors/css.rb
|
18
|
+
lib/scrappy/selectors/new_uri.rb
|
18
19
|
lib/scrappy/selectors/root.rb
|
19
20
|
lib/scrappy/selectors/section.rb
|
20
21
|
lib/scrappy/selectors/slice.rb
|
data/lib/scrappy.rb
CHANGED
data/lib/scrappy/agent/agent.rb
CHANGED
data/lib/scrappy/agent/extractor.rb
CHANGED
@@ -56,6 +56,7 @@ module Scrappy
|
|
56
56
|
options[:triples] << [node, Node('rdf:type'), fragment.sc::type.first]
|
57
57
|
end
|
58
58
|
fragment.sc::superclass.each { |superclass| options[:triples] << [node, Node('rdfs:subClassOf'), superclass] }
|
59
|
+
fragment.sc::sameas.each { |samenode| options[:triples] << [node, Node('owl:sameAs'), samenode] }
|
59
60
|
node
|
60
61
|
end
|
61
62
|
fragment.sc::relation.each { |relation| options[:triples] << [options[:parent], relation, object] }
|
data/lib/scrappy/selectors/new_uri.rb
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
module NewUriSelector
|
2
|
+
def self.filter selector, doc
|
3
|
+
contents = if selector.sc::attribute.first
|
4
|
+
# Select node's attribute if given
|
5
|
+
selector.sc::attribute.map { |attribute| doc[:content][attribute] }
|
6
|
+
else
|
7
|
+
[ doc[:content].text ]
|
8
|
+
end
|
9
|
+
|
10
|
+
contents.map do |content|
|
11
|
+
new_uri = selector.sc::prefix.to_s + content.wikify
|
12
|
+
{ :uri=>new_uri, :content=>doc[:content], :value=>new_uri }
|
13
|
+
end
|
14
|
+
end
|
15
|
+
end
|
data/lib/scrappy/selectors/root.rb
CHANGED
@@ -1,5 +1,10 @@
|
|
1
1
|
module RootSelector
|
2
2
|
def self.filter selector, doc
|
3
|
-
|
3
|
+
if selector.sc::attribute.first
|
4
|
+
# Select node's attribute if given
|
5
|
+
selector.sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>doc[:content], :value=>doc[:content][attribute] } }
|
6
|
+
else
|
7
|
+
[ { :uri=>doc[:uri], :content=>doc[:content], :value=>doc[:content].text } ]
|
8
|
+
end
|
4
9
|
end
|
5
10
|
end
|
data/lib/scrappy/support.rb
CHANGED
data/scrappy.gemspec
CHANGED
@@ -2,17 +2,17 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = %q{scrappy}
|
5
|
-
s.version = "0.1.7"
|
5
|
+
s.version = "0.1.8"
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Jose Ignacio"]
|
9
|
-
s.date = %q{2010-12-09}
|
9
|
+
s.date = %q{2010-12-14}
|
10
10
|
s.default_executable = %q{scrappy}
|
11
11
|
s.description = %q{RDF web scraper}
|
12
12
|
s.email = %q{joseignacio.fernandez@gmail.com}
|
13
13
|
s.executables = ["scrappy"]
|
14
|
-
s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb"]
|
15
|
-
s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb", "test/test_helper.rb", "test/test_scrappy.rb", "scrappy.gemspec"]
|
14
|
+
s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb"]
|
15
|
+
s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb", "test/test_helper.rb", "test/test_scrappy.rb", "scrappy.gemspec"]
|
16
16
|
s.homepage = %q{http://github.com/josei/scrappy}
|
17
17
|
s.post_install_message = %q{**(Optional) Remember to install rbwebkitgtk for visual parsing features**}
|
18
18
|
s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Scrappy", "--main", "README.rdoc"]
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 1
|
8
|
-
- 7
|
9
|
-
version: 0.1.7
|
8
|
+
- 8
|
9
|
+
version: 0.1.8
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Jose Ignacio
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2010-12-09 00:00:00 +01:00
|
17
|
+
date: 2010-12-14 00:00:00 +01:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
@@ -133,6 +133,7 @@ extra_rdoc_files:
|
|
133
133
|
- lib/scrappy/proxy.rb
|
134
134
|
- lib/scrappy/selectors/base_uri.rb
|
135
135
|
- lib/scrappy/selectors/css.rb
|
136
|
+
- lib/scrappy/selectors/new_uri.rb
|
136
137
|
- lib/scrappy/selectors/root.rb
|
137
138
|
- lib/scrappy/selectors/section.rb
|
138
139
|
- lib/scrappy/selectors/slice.rb
|
@@ -161,6 +162,7 @@ files:
|
|
161
162
|
- lib/scrappy/proxy.rb
|
162
163
|
- lib/scrappy/selectors/base_uri.rb
|
163
164
|
- lib/scrappy/selectors/css.rb
|
165
|
+
- lib/scrappy/selectors/new_uri.rb
|
164
166
|
- lib/scrappy/selectors/root.rb
|
165
167
|
- lib/scrappy/selectors/section.rb
|
166
168
|
- lib/scrappy/selectors/slice.rb
|