scrappy 0.4.1 → 0.4.2
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +5 -0
- data/lib/scrappy.rb +1 -1
- data/lib/scrappy/extractor/fragment.rb +1 -1
- data/lib/scrappy/extractor/selector.rb +21 -0
- data/lib/scrappy/extractor/selectors/new_uri.rb +3 -1
- data/scrappy.gemspec +2 -2
- metadata +3 -3
data/History.txt
CHANGED
data/lib/scrappy.rb
CHANGED
@@ -64,7 +64,7 @@ module Sc
|
|
64
64
|
|
65
65
|
# Build the object -- it can be a node or a literal
|
66
66
|
object = if sc::type.include?(Node('rdf:Literal'))
|
67
|
-
value = doc[:value].to_s.strip
|
67
|
+
value = doc[:value].to_s.gsub("\302\240"," ").strip
|
68
68
|
if options[:referenceable]
|
69
69
|
node.rdf::value = value
|
70
70
|
node.rdf::type += [Node('rdf:Literal')]
|
@@ -20,6 +20,27 @@ module Sc
|
|
20
20
|
# Filter method is defined in each subclass
|
21
21
|
results = filter doc
|
22
22
|
|
23
|
+
if sc::boolean.first=="true"
|
24
|
+
results = results.map do |r|
|
25
|
+
affirmations = ["yes", "true"]
|
26
|
+
negations = ["no", "none", "false", "-", "--"]
|
27
|
+
no = negations.include?(r[:value].gsub("\302\240"," ").strip.downcase)
|
28
|
+
yes = affirmations.include?(r[:value].gsub("\302\240"," ").strip.downcase)
|
29
|
+
if no
|
30
|
+
value = "false"
|
31
|
+
elsif yes
|
32
|
+
value = "true"
|
33
|
+
else
|
34
|
+
value = :remove
|
35
|
+
end
|
36
|
+
r.merge :value=>value
|
37
|
+
end
|
38
|
+
results = results.select{ |r| r[:value] != :remove }
|
39
|
+
end
|
40
|
+
if sc::nonempty.first=="true"
|
41
|
+
results = results.select{ |r| r[:value].gsub("\302\240"," ").strip!=""}
|
42
|
+
end
|
43
|
+
|
23
44
|
if sc::debug.first=="true" and Scrappy::Agent::Options.debug and
|
24
45
|
(Scrappy::Agent::Options.debug_key.nil? or doc[:value].downcase.include?(Scrappy::Agent::Options.debug_key) )
|
25
46
|
|
@@ -10,7 +10,9 @@ module Sc
|
|
10
10
|
|
11
11
|
@indexes ||= Hash.new(0)
|
12
12
|
prefix = sc::prefix.first.to_s
|
13
|
-
|
13
|
+
if !["http://", "https://"].include?(prefix)
|
14
|
+
prefix = (prefix =~ /\Ahttp\:/ or prefix =~ /\Ahttps\:/) ? URI::parse(doc[:uri]).merge(prefix).to_s : "#{doc[:uri]}#{prefix}"
|
15
|
+
end
|
14
16
|
suffix = sc::suffix.first.to_s
|
15
17
|
|
16
18
|
nofollow = (sc::follow.first != "true")
|
data/scrappy.gemspec
CHANGED
@@ -2,11 +2,11 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = %q{scrappy}
|
5
|
-
s.version = "0.4.1"
|
5
|
+
s.version = "0.4.2"
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Jose Ignacio"]
|
9
|
-
s.date = %q{2011-07-
|
9
|
+
s.date = %q{2011-07-06}
|
10
10
|
s.default_executable = %q{scrappy}
|
11
11
|
s.description = %q{RDF web scraper}
|
12
12
|
s.email = %q{joseignacio.fernandez@gmail.com}
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 4
|
8
|
-
- 1
|
9
|
-
version: 0.4.1
|
8
|
+
- 2
|
9
|
+
version: 0.4.2
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Jose Ignacio
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2011-07-
|
17
|
+
date: 2011-07-06 00:00:00 +02:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|