scrappy 0.1.7 → 0.1.8

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
data/History.txt CHANGED
@@ -1,3 +1,13 @@
1
+ === 0.1.8 2010-12-14
2
+
3
+ * Added sc:sameas
4
+ * Added sc:attribute support to sc:RootSelector
5
+ * Added sc:NewUriSelector for constructing new uris
6
+
7
+ === 0.1.7 2010-12-09
8
+
9
+ * Added section selector
10
+
1
11
  === 0.1.6 2010-12-09
2
12
 
3
13
  * Added sc:superclass to sc:Fragments
data/Manifest CHANGED
@@ -15,6 +15,7 @@ lib/scrappy/agent/visual_agent.rb
15
15
  lib/scrappy/proxy.rb
16
16
  lib/scrappy/selectors/base_uri.rb
17
17
  lib/scrappy/selectors/css.rb
18
+ lib/scrappy/selectors/new_uri.rb
18
19
  lib/scrappy/selectors/root.rb
19
20
  lib/scrappy/selectors/section.rb
20
21
  lib/scrappy/selectors/slice.rb
data/lib/scrappy.rb CHANGED
@@ -19,7 +19,7 @@ require 'scrappy/agent/agent'
19
19
  Namespace :sc, 'http://lab.gsi.dit.upm.es/scraping.rdf#'
20
20
 
21
21
  module Scrappy
22
- VERSION = '0.1.7'
22
+ VERSION = '0.1.8'
23
23
  end
24
24
 
25
25
  # Require selectors
@@ -94,7 +94,7 @@ module Scrappy
94
94
  end
95
95
 
96
96
  def request args={}
97
- RDF::Graph.new map(args).uniq
97
+ RDF::Graph.new(map(args).uniq.select { |s,p,o| p!=Node('rdf:type') or o!=Node('sc:Index') })
98
98
  end
99
99
 
100
100
  def proxy args={}
@@ -56,6 +56,7 @@ module Scrappy
56
56
  options[:triples] << [node, Node('rdf:type'), fragment.sc::type.first]
57
57
  end
58
58
  fragment.sc::superclass.each { |superclass| options[:triples] << [node, Node('rdfs:subClassOf'), superclass] }
59
+ fragment.sc::sameas.each { |samenode| options[:triples] << [node, Node('owl:sameAs'), samenode] }
59
60
  node
60
61
  end
61
62
  fragment.sc::relation.each { |relation| options[:triples] << [options[:parent], relation, object] }
@@ -0,0 +1,15 @@
1
+ module NewUriSelector
2
+ def self.filter selector, doc
3
+ contents = if selector.sc::attribute.first
4
+ # Select node's attribute if given
5
+ selector.sc::attribute.map { |attribute| doc[:content][attribute] }
6
+ else
7
+ [ doc[:content].text ]
8
+ end
9
+
10
+ contents.map do |content|
11
+ new_uri = selector.sc::prefix.to_s + content.wikify
12
+ { :uri=>new_uri, :content=>doc[:content], :value=>new_uri }
13
+ end
14
+ end
15
+ end
@@ -1,5 +1,10 @@
1
1
  module RootSelector
2
2
  def self.filter selector, doc
3
- [ { :uri=>doc[:uri], :content=>doc[:content], :value=>doc[:content].text } ]
3
+ if selector.sc::attribute.first
4
+ # Select node's attribute if given
5
+ selector.sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>doc[:content], :value=>doc[:content][attribute] } }
6
+ else
7
+ [ { :uri=>doc[:uri], :content=>doc[:content], :value=>doc[:content].text } ]
8
+ end
4
9
  end
5
10
  end
@@ -25,4 +25,10 @@ module Nokogiri
25
25
  end
26
26
  end
27
27
  end
28
- end
28
+ end
29
+
30
+ class String
31
+ def wikify
32
+ gsub(/^[a-z]|\s+[a-z]/) { |a| a.upcase }.gsub(/\s/, '')
33
+ end
34
+ end
data/scrappy.gemspec CHANGED
@@ -2,17 +2,17 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = %q{scrappy}
5
- s.version = "0.1.7"
5
+ s.version = "0.1.8"
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = ["Jose Ignacio"]
9
- s.date = %q{2010-12-09}
9
+ s.date = %q{2010-12-14}
10
10
  s.default_executable = %q{scrappy}
11
11
  s.description = %q{RDF web scraper}
12
12
  s.email = %q{joseignacio.fernandez@gmail.com}
13
13
  s.executables = ["scrappy"]
14
- s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb"]
15
- s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb", "test/test_helper.rb", "test/test_scrappy.rb", "scrappy.gemspec"]
14
+ s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb"]
15
+ s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb", "test/test_helper.rb", "test/test_scrappy.rb", "scrappy.gemspec"]
16
16
  s.homepage = %q{http://github.com/josei/scrappy}
17
17
  s.post_install_message = %q{**(Optional) Remember to install rbwebkitgtk for visual parsing features**}
18
18
  s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Scrappy", "--main", "README.rdoc"]
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 1
8
- - 7
9
- version: 0.1.7
8
+ - 8
9
+ version: 0.1.8
10
10
  platform: ruby
11
11
  authors:
12
12
  - Jose Ignacio
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2010-12-09 00:00:00 +01:00
17
+ date: 2010-12-14 00:00:00 +01:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
@@ -133,6 +133,7 @@ extra_rdoc_files:
133
133
  - lib/scrappy/proxy.rb
134
134
  - lib/scrappy/selectors/base_uri.rb
135
135
  - lib/scrappy/selectors/css.rb
136
+ - lib/scrappy/selectors/new_uri.rb
136
137
  - lib/scrappy/selectors/root.rb
137
138
  - lib/scrappy/selectors/section.rb
138
139
  - lib/scrappy/selectors/slice.rb
@@ -161,6 +162,7 @@ files:
161
162
  - lib/scrappy/proxy.rb
162
163
  - lib/scrappy/selectors/base_uri.rb
163
164
  - lib/scrappy/selectors/css.rb
165
+ - lib/scrappy/selectors/new_uri.rb
164
166
  - lib/scrappy/selectors/root.rb
165
167
  - lib/scrappy/selectors/section.rb
166
168
  - lib/scrappy/selectors/slice.rb