scrappy 0.1.3 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +6 -0
- data/lib/scrappy/agent/extractor.rb +23 -4
- data/lib/scrappy/selectors/xpath.rb +7 -2
- data/lib/scrappy/support.rb +10 -0
- data/lib/scrappy.rb +1 -1
- data/scrappy.gemspec +2 -2
- metadata +3 -3
data/History.txt
CHANGED
data/lib/scrappy/agent/extractor.rb
CHANGED
@@ -61,10 +61,12 @@ module Scrappy
|
|
61
61
|
|
62
62
|
# Add referenceable data if requested
|
63
63
|
if options[:referenceable]
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
64
|
+
sources = [doc[:content]].flatten.map { |node| Node(node_hash(doc[:uri], node.path)) }
|
65
|
+
sources.each do |source|
|
66
|
+
options[:triples] << [ object, Node("sc:source"), source ]
|
67
|
+
fragment.sc::type.each { |t| options[:triples] << [ source, Node("sc:type"), t ] }
|
68
|
+
fragment.sc::relation.each { |relation| options[:triples] << [ source, Node("sc:relation"), relation ] }
|
69
|
+
end
|
68
70
|
end
|
69
71
|
|
70
72
|
# Process subfragments
|
@@ -80,6 +82,23 @@ module Scrappy
|
|
80
82
|
# Process selector
|
81
83
|
results = Kernel.const_get(class_name).filter selector, doc
|
82
84
|
|
85
|
+
if !selector.sc::debug.empty?
|
86
|
+
puts '== DEBUG'
|
87
|
+
puts '== Selector:'
|
88
|
+
puts selector.serialize(:yarf, false)
|
89
|
+
puts '== Applied on fragment:'
|
90
|
+
puts "URI: #{doc[:uri]}"
|
91
|
+
puts "Content: #{doc[:content]}"
|
92
|
+
puts "Value: #{doc[:value]}"
|
93
|
+
results.each_with_index do |result, i|
|
94
|
+
puts "== Result ##{i}:"
|
95
|
+
puts "URI: #{result[:uri]}"
|
96
|
+
puts "Content: #{result[:content]}"
|
97
|
+
puts "Value: #{result[:value].inspect}"
|
98
|
+
end
|
99
|
+
puts
|
100
|
+
end
|
101
|
+
|
83
102
|
# Return results if no nested selectors
|
84
103
|
return results if selector.sc::selector.empty?
|
85
104
|
|
data/lib/scrappy/selectors/xpath.rb
CHANGED
@@ -1,7 +1,12 @@
|
|
1
1
|
module XPathSelector
|
2
2
|
def self.filter selector, doc
|
3
3
|
selector.rdf::value.map do |pattern|
|
4
|
-
|
4
|
+
interval = if selector.sc::index.first
|
5
|
+
(selector.sc::index.first.to_i..selector.sc::index.first.to_i)
|
6
|
+
else
|
7
|
+
(0..-1)
|
8
|
+
end
|
9
|
+
(doc[:content].search(pattern)[interval] || []).map do |result|
|
5
10
|
if selector.sc::attribute.first
|
6
11
|
# Select node's attribute if given
|
7
12
|
selector.sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>result, :value=>result[attribute] } }
|
@@ -12,4 +17,4 @@ module XPathSelector
|
|
12
17
|
end
|
13
18
|
end.flatten
|
14
19
|
end
|
15
|
-
end
|
20
|
+
end
|
data/lib/scrappy/support.rb
CHANGED
data/lib/scrappy.rb
CHANGED
data/scrappy.gemspec
CHANGED
@@ -2,11 +2,11 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = %q{scrappy}
|
5
|
-
s.version = "0.1.3"
|
5
|
+
s.version = "0.1.4"
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Jose Ignacio"]
|
9
|
-
s.date = %q{2010-11-
|
9
|
+
s.date = %q{2010-11-24}
|
10
10
|
s.default_executable = %q{scrappy}
|
11
11
|
s.description = %q{RDF web scraper}
|
12
12
|
s.email = %q{joseignacio.fernandez@gmail.com}
|
metadata
CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
|
|
5
5
|
segments:
|
6
6
|
- 0
|
7
7
|
- 1
|
8
|
-
- 3
|
9
|
-
version: 0.1.3
|
8
|
+
- 4
|
9
|
+
version: 0.1.4
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Jose Ignacio
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2010-11-
|
17
|
+
date: 2010-11-24 00:00:00 +01:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|