scrappy 0.1.3 → 0.1.4
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +6 -0
- data/lib/scrappy/agent/extractor.rb +23 -4
- data/lib/scrappy/selectors/xpath.rb +7 -2
- data/lib/scrappy/support.rb +10 -0
- data/lib/scrappy.rb +1 -1
- data/scrappy.gemspec +2 -2
- metadata +3 -3
data/History.txt
CHANGED
@@ -61,10 +61,12 @@ module Scrappy
|
|
61
61
|
|
62
62
|
# Add referenceable data if requested
|
63
63
|
if options[:referenceable]
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
64
|
+
sources = [doc[:content]].flatten.map { |node| Node(node_hash(doc[:uri], node.path)) }
|
65
|
+
sources.each do |source|
|
66
|
+
options[:triples] << [ object, Node("sc:source"), source ]
|
67
|
+
fragment.sc::type.each { |t| options[:triples] << [ source, Node("sc:type"), t ] }
|
68
|
+
fragment.sc::relation.each { |relation| options[:triples] << [ source, Node("sc:relation"), relation ] }
|
69
|
+
end
|
68
70
|
end
|
69
71
|
|
70
72
|
# Process subfragments
|
@@ -80,6 +82,23 @@ module Scrappy
|
|
80
82
|
# Process selector
|
81
83
|
results = Kernel.const_get(class_name).filter selector, doc
|
82
84
|
|
85
|
+
if !selector.sc::debug.empty?
|
86
|
+
puts '== DEBUG'
|
87
|
+
puts '== Selector:'
|
88
|
+
puts selector.serialize(:yarf, false)
|
89
|
+
puts '== Applied on fragment:'
|
90
|
+
puts "URI: #{doc[:uri]}"
|
91
|
+
puts "Content: #{doc[:content]}"
|
92
|
+
puts "Value: #{doc[:value]}"
|
93
|
+
results.each_with_index do |result, i|
|
94
|
+
puts "== Result ##{i}:"
|
95
|
+
puts "URI: #{result[:uri]}"
|
96
|
+
puts "Content: #{result[:content]}"
|
97
|
+
puts "Value: #{result[:value].inspect}"
|
98
|
+
end
|
99
|
+
puts
|
100
|
+
end
|
101
|
+
|
83
102
|
# Return results if no nested selectors
|
84
103
|
return results if selector.sc::selector.empty?
|
85
104
|
|
@@ -1,7 +1,12 @@
|
|
1
1
|
module XPathSelector
|
2
2
|
def self.filter selector, doc
|
3
3
|
selector.rdf::value.map do |pattern|
|
4
|
-
|
4
|
+
interval = if selector.sc::index.first
|
5
|
+
(selector.sc::index.first.to_i..selector.sc::index.first.to_i)
|
6
|
+
else
|
7
|
+
(0..-1)
|
8
|
+
end
|
9
|
+
(doc[:content].search(pattern)[interval] || []).map do |result|
|
5
10
|
if selector.sc::attribute.first
|
6
11
|
# Select node's attribute if given
|
7
12
|
selector.sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>result, :value=>result[attribute] } }
|
@@ -12,4 +17,4 @@ module XPathSelector
|
|
12
17
|
end
|
13
18
|
end.flatten
|
14
19
|
end
|
15
|
-
end
|
20
|
+
end
|
data/lib/scrappy/support.rb
CHANGED
data/lib/scrappy.rb
CHANGED
data/scrappy.gemspec
CHANGED
@@ -2,11 +2,11 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = %q{scrappy}
|
5
|
-
s.version = "0.1.3"
|
5
|
+
s.version = "0.1.4"
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Jose Ignacio"]
|
9
|
-
s.date = %q{2010-11-
|
9
|
+
s.date = %q{2010-11-24}
|
10
10
|
s.default_executable = %q{scrappy}
|
11
11
|
s.description = %q{RDF web scraper}
|
12
12
|
s.email = %q{joseignacio.fernandez@gmail.com}
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 1
|
8
|
-
- 3
|
9
|
-
version: 0.1.3
|
8
|
+
- 4
|
9
|
+
version: 0.1.4
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Jose Ignacio
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2010-11-
|
17
|
+
date: 2010-11-24 00:00:00 +01:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|