scrappy 0.4.1 → 0.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,8 @@
1
+ === 0.4.2 2011-07-06
2
+
3
+ * Support for sc:nonempty and sc:boolean in selectors
4
+ * Bugfix in NewUriSelector
5
+
1
6
  === 0.4.1 2011-07-05
2
7
 
3
8
  * Bug correction in NewUriSelector
@@ -24,5 +24,5 @@ require 'scrappy/agent/blind_agent'
24
24
  require 'scrappy/agent/agent'
25
25
 
26
26
  module Scrappy
27
- VERSION = '0.4.1'
27
+ VERSION = '0.4.2'
28
28
  end
@@ -64,7 +64,7 @@ module Sc
64
64
 
65
65
  # Build the object -- it can be a node or a literal
66
66
  object = if sc::type.include?(Node('rdf:Literal'))
67
- value = doc[:value].to_s.strip
67
+ value = doc[:value].to_s.gsub("\302\240"," ").strip
68
68
  if options[:referenceable]
69
69
  node.rdf::value = value
70
70
  node.rdf::type += [Node('rdf:Literal')]
@@ -20,6 +20,27 @@ module Sc
20
20
  # Filter method is defined in each subclass
21
21
  results = filter doc
22
22
 
23
+ if sc::boolean.first=="true"
24
+ results = results.map do |r|
25
+ affirmations = ["yes", "true"]
26
+ negations = ["no", "none", "false", "-", "--"]
27
+ no = negations.include?(r[:value].gsub("\302\240"," ").strip.downcase)
28
+ yes = affirmations.include?(r[:value].gsub("\302\240"," ").strip.downcase)
29
+ if no
30
+ value = "false"
31
+ elsif yes
32
+ value = "true"
33
+ else
34
+ value = :remove
35
+ end
36
+ r.merge :value=>value
37
+ end
38
+ results = results.select{ |r| r[:value] != :remove }
39
+ end
40
+ if sc::nonempty.first=="true"
41
+ results = results.select{ |r| r[:value].gsub("\302\240"," ").strip!=""}
42
+ end
43
+
23
44
  if sc::debug.first=="true" and Scrappy::Agent::Options.debug and
24
45
  (Scrappy::Agent::Options.debug_key.nil? or doc[:value].downcase.include?(Scrappy::Agent::Options.debug_key) )
25
46
 
@@ -10,7 +10,9 @@ module Sc
10
10
 
11
11
  @indexes ||= Hash.new(0)
12
12
  prefix = sc::prefix.first.to_s
13
- prefix = (prefix =~ /\Ahttp\:/ or prefix =~ /\Ahttps\:/) ? URI::parse(doc[:uri]).merge(prefix).to_s : "#{doc[:uri]}#{prefix}"
13
+ if !["http://", "https://"].include?(prefix)
14
+ prefix = (prefix =~ /\Ahttp\:/ or prefix =~ /\Ahttps\:/) ? URI::parse(doc[:uri]).merge(prefix).to_s : "#{doc[:uri]}#{prefix}"
15
+ end
14
16
  suffix = sc::suffix.first.to_s
15
17
 
16
18
  nofollow = (sc::follow.first != "true")
@@ -2,11 +2,11 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = %q{scrappy}
5
- s.version = "0.4.1"
5
+ s.version = "0.4.2"
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = ["Jose Ignacio"]
9
- s.date = %q{2011-07-05}
9
+ s.date = %q{2011-07-06}
10
10
  s.default_executable = %q{scrappy}
11
11
  s.description = %q{RDF web scraper}
12
12
  s.email = %q{joseignacio.fernandez@gmail.com}
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 4
8
- - 1
9
- version: 0.4.1
8
+ - 2
9
+ version: 0.4.2
10
10
  platform: ruby
11
11
  authors:
12
12
  - Jose Ignacio
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2011-07-05 00:00:00 +02:00
17
+ date: 2011-07-06 00:00:00 +02:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency