scrappy 0.1.6 → 0.1.7

Sign up to get free protection for your applications and to get access to all the features.
data/Manifest CHANGED
@@ -16,6 +16,7 @@ lib/scrappy/proxy.rb
16
16
  lib/scrappy/selectors/base_uri.rb
17
17
  lib/scrappy/selectors/css.rb
18
18
  lib/scrappy/selectors/root.rb
19
+ lib/scrappy/selectors/section.rb
19
20
  lib/scrappy/selectors/slice.rb
20
21
  lib/scrappy/selectors/uri.rb
21
22
  lib/scrappy/selectors/uri_pattern.rb
data/lib/scrappy.rb CHANGED
@@ -19,7 +19,7 @@ require 'scrappy/agent/agent'
19
19
  Namespace :sc, 'http://lab.gsi.dit.upm.es/scraping.rdf#'
20
20
 
21
21
  module Scrappy
22
- VERSION = '0.1.6'
22
+ VERSION = '0.1.7'
23
23
  end
24
24
 
25
25
  # Require selectors
@@ -0,0 +1,11 @@
1
+ module SectionSelector
2
+ def self.filter selector, doc
3
+ selector.rdf::value.map do |pattern|
4
+ doc[:content].search('h1, h2, h3, h4, h5, h6, h7, h8, h9, h10').select { |n| n.parent.name!='script' and n.text.downcase.strip == pattern }.map do |node|
5
+ found = false
6
+ content = node.parent.children[node.parent.children.index(node)+1..-1].select { |n| found ||= (n.name==node.name); !found }
7
+ [ { :uri=>doc[:uri], :content=>content, :value=>content.map{|t| t.text.strip}.select{|t| t.to_s.strip!=''}*"\n" } ]
8
+ end
9
+ end.flatten
10
+ end
11
+ end
data/scrappy.gemspec CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = %q{scrappy}
5
- s.version = "0.1.6"
5
+ s.version = "0.1.7"
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = ["Jose Ignacio"]
@@ -11,8 +11,8 @@ Gem::Specification.new do |s|
11
11
  s.description = %q{RDF web scraper}
12
12
  s.email = %q{joseignacio.fernandez@gmail.com}
13
13
  s.executables = ["scrappy"]
14
- s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb"]
15
- s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb", "test/test_helper.rb", "test/test_scrappy.rb", "scrappy.gemspec"]
14
+ s.extra_rdoc_files = ["README.rdoc", "bin/scrappy", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb"]
15
+ s.files = ["History.txt", "Manifest", "README.rdoc", "Rakefile", "bin/scrappy", "kb/elmundo.yarf", "lib/js/annotator.js", "lib/scrappy.rb", "lib/scrappy/agent/agent.rb", "lib/scrappy/agent/blind_agent.rb", "lib/scrappy/agent/cache.rb", "lib/scrappy/agent/map_reduce.rb", "lib/scrappy/agent/extractor.rb", "lib/scrappy/agent/visual_agent.rb", "lib/scrappy/proxy.rb", "lib/scrappy/selectors/base_uri.rb", "lib/scrappy/selectors/css.rb", "lib/scrappy/selectors/root.rb", "lib/scrappy/selectors/section.rb", "lib/scrappy/selectors/slice.rb", "lib/scrappy/selectors/uri.rb", "lib/scrappy/selectors/uri_pattern.rb", "lib/scrappy/selectors/xpath.rb", "lib/scrappy/server.rb", "lib/scrappy/shell.rb", "lib/scrappy/support.rb", "lib/scrappy/webkit/webkit.rb", "test/test_helper.rb", "test/test_scrappy.rb", "scrappy.gemspec"]
16
16
  s.homepage = %q{http://github.com/josei/scrappy}
17
17
  s.post_install_message = %q{**(Optional) Remember to install rbwebkitgtk for visual parsing features**}
18
18
  s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Scrappy", "--main", "README.rdoc"]
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 1
8
- - 6
9
- version: 0.1.6
8
+ - 7
9
+ version: 0.1.7
10
10
  platform: ruby
11
11
  authors:
12
12
  - Jose Ignacio
@@ -134,6 +134,7 @@ extra_rdoc_files:
134
134
  - lib/scrappy/selectors/base_uri.rb
135
135
  - lib/scrappy/selectors/css.rb
136
136
  - lib/scrappy/selectors/root.rb
137
+ - lib/scrappy/selectors/section.rb
137
138
  - lib/scrappy/selectors/slice.rb
138
139
  - lib/scrappy/selectors/uri.rb
139
140
  - lib/scrappy/selectors/uri_pattern.rb
@@ -161,6 +162,7 @@ files:
161
162
  - lib/scrappy/selectors/base_uri.rb
162
163
  - lib/scrappy/selectors/css.rb
163
164
  - lib/scrappy/selectors/root.rb
165
+ - lib/scrappy/selectors/section.rb
164
166
  - lib/scrappy/selectors/slice.rb
165
167
  - lib/scrappy/selectors/uri.rb
166
168
  - lib/scrappy/selectors/uri_pattern.rb