scrappy 0.1.7 → 0.1.8

Sign up to get free protection for your applications and to get access to all the features.
data/History.txt CHANGED
@@ -1,3 +1,13 @@
1
+ === 0.1.8 2010-12-14
2
+
3
+ * Added sc:sameas
4
+ * Added sc:attribute support to sc:RootSelector
5
+ * Added sc:NewUriSelector for constructing new uris
6
+
7
+ === 0.1.7 2010-12-09
8
+
9
+ * Added section selector
10
+
1
11
  === 0.1.6 2010-12-09
2
12
 
3
13
  * Added sc:superclass to sc:Fragments
data/Manifest CHANGED
@@ -15,6 +15,7 @@ lib/scrappy/agent/visual_agent.rb
15
15
  lib/scrappy/proxy.rb
16
16
  lib/scrappy/selectors/base_uri.rb
17
17
  lib/scrappy/selectors/css.rb
18
+ lib/scrappy/selectors/new_uri.rb
18
19
  lib/scrappy/selectors/root.rb
19
20
  lib/scrappy/selectors/section.rb
20
21
  lib/scrappy/selectors/slice.rb
data/lib/scrappy.rb CHANGED
@@ -19,7 +19,7 @@ require 'scrappy/agent/agent'
19
19
  Namespace :sc, 'http://lab.gsi.dit.upm.es/scraping.rdf#'
20
20
 
21
21
  module Scrappy
22
- VERSION = '0.1.7'
22
+ VERSION = '0.1.8'
23
23
  end
24
24
 
25
25
  # Require selectors
@@ -94,7 +94,7 @@ module Scrappy
94
94
  end
95
95
 
96
96
  def request args={}
97
- RDF::Graph.new map(args).uniq
97
+ RDF::Graph.new(map(args).uniq.select { |s,p,o| p!=Node('rdf:type') or o!=Node('sc:Index') })
98
98
  end
99
99
 
100
100
  def proxy args={}
@@ -56,6 +56,7 @@ module Scrappy
56
56
  options[:triples] << [node, Node('rdf:type'), fragment.sc::type.first]
57
57
  end
58
58
  fragment.sc::superclass.each { |superclass| options[:triples] << [node, Node('rdfs:subClassOf'), superclass] }
59
+ fragment.sc::sameas.each { |samenode| options[:triples] << [node, Node('owl:sameAs'), samenode] }
59
60
  node
60
61
  end
61
62
  fragment.sc::relation.each { |relation| options[:triples] << [options[:parent], relation, object] }
@@ -0,0 +1,15 @@
1
+ module NewUriSelector
2
+ def self.filter selector, doc
3
+ contents = if selector.sc::attribute.first
4
+ # Select node's attribute if given
5
+ selector.sc::attribute.map { |attribute| doc[:content][attribute] }
6
+ else
7
+ [ doc[:content].text ]
8
+ end
9
+
10
+ contents.map do |content|
11
+ new_uri = selector.sc::prefix.to_s + content.wikify
12
+ { :uri=>new_uri, :content=>doc[:content], :value=>new_uri }
13
+ end
14
+ end
15
+ end
@@ -1,5 +1,10 @@
1
1
  module RootSelector
2
2
  def self.filter selector, doc
3
- [ { :uri=>doc[:uri], :content=>doc[:content], :value=>doc[:content].text } ]
3
+ if selector.sc::attribute.first
4
+ # Select node's attribute if given
5
+ selector.sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>doc[:content], :value=>doc[:content][attribute] } }
6
+ else
7
+ [ { :uri=>doc[:uri], :content=>doc[:content], :value=>doc[:content].text } ]
8
+ end
4
9
  end
5
10
  end
@@ -25,4 +25,10 @@ module Nokogiri
25
25
  end
26
26
  end
27
27
  end
28
- end
28
+ end
29
+
30
+ class String
31
+ def wikify
32
+ gsub(/^[a-z]|\s+[a-z]/) { |a| a.upcase }.gsub(/\s/, '')
33
+ end
34
+ end
data/scrappy.gemspec CHANGED
@@ -2,17 +2,17 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = %q{scrappy}
5
- s.version = "0.1.7"
5
+ s.version = "0.1.8"
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = ["Jose Ignacio"]
9
- s.date = %q{2010-12-09}
9
+ s.date = %q{2010-12-14}
10
10
  s.default_executable = %q{scrappy}
11
11
  s.description = %q{RDF web scraper}
12
12
  s.email = %q{joseignacio.fernandez@gmail.com}
13
13
  s.executables = ["scrappy"]
14
- s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb"]
15
- s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb", "test/test_helper.rb", "test/test_scrappy.rb", "scrappy.gemspec"]
14
+ s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb"]
15
+ s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/new_uri.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb", "test/test_helper.rb", "test/test_scrappy.rb", "scrappy.gemspec"]
16
16
  s.homepage = %q{http://github.com/josei/scrappy}
17
17
  s.post_install_message = %q{**(Optional) Remember to install rbwebkitgtk for visual parsing features**}
18
18
  s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Scrappy", "--main", "README.rdoc"]
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 1
8
- - 7
9
- version: 0.1.7
8
+ - 8
9
+ version: 0.1.8
10
10
  platform: ruby
11
11
  authors:
12
12
  - Jose Ignacio
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2010-12-09 00:00:00 +01:00
17
+ date: 2010-12-14 00:00:00 +01:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
@@ -133,6 +133,7 @@ extra_rdoc_files:
133
133
  - lib/scrappy/proxy.rb
134
134
  - lib/scrappy/selectors/base_uri.rb
135
135
  - lib/scrappy/selectors/css.rb
136
+ - lib/scrappy/selectors/new_uri.rb
136
137
  - lib/scrappy/selectors/root.rb
137
138
  - lib/scrappy/selectors/section.rb
138
139
  - lib/scrappy/selectors/slice.rb
@@ -161,6 +162,7 @@ files:
161
162
  - lib/scrappy/proxy.rb
162
163
  - lib/scrappy/selectors/base_uri.rb
163
164
  - lib/scrappy/selectors/css.rb
165
+ - lib/scrappy/selectors/new_uri.rb
164
166
  - lib/scrappy/selectors/root.rb
165
167
  - lib/scrappy/selectors/section.rb
166
168
  - lib/scrappy/selectors/slice.rb