scrappy 0.1.3 → 0.1.4

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
data/History.txt CHANGED
@@ -1,3 +1,9 @@
1
+ === 0.1.4 2010-11-24
2
+
3
+ * Support for node sets in extractions
4
+ * Support for index selection in CSS and XPath selectors
5
+ * Debugging mode
6
+
1
7
  === 0.1.3 2010-11-18
2
8
 
3
9
  * RDF node caching
@@ -61,10 +61,12 @@ module Scrappy
61
61
 
62
62
  # Add referenceable data if requested
63
63
  if options[:referenceable]
64
- source = Node(node_hash(doc[:uri], doc[:content].path))
65
- options[:triples] << [ object, Node("sc:source"), source ]
66
- fragment.sc::type.each { |t| options[:triples] << [ source, Node("sc:type"), t ] }
67
- fragment.sc::relation.each { |relation| options[:triples] << [ source, Node("sc:relation"), relation ] }
64
+ sources = [doc[:content]].flatten.map { |node| Node(node_hash(doc[:uri], node.path)) }
65
+ sources.each do |source|
66
+ options[:triples] << [ object, Node("sc:source"), source ]
67
+ fragment.sc::type.each { |t| options[:triples] << [ source, Node("sc:type"), t ] }
68
+ fragment.sc::relation.each { |relation| options[:triples] << [ source, Node("sc:relation"), relation ] }
69
+ end
68
70
  end
69
71
 
70
72
  # Process subfragments
@@ -80,6 +82,23 @@ module Scrappy
80
82
  # Process selector
81
83
  results = Kernel.const_get(class_name).filter selector, doc
82
84
 
85
+ if !selector.sc::debug.empty?
86
+ puts '== DEBUG'
87
+ puts '== Selector:'
88
+ puts selector.serialize(:yarf, false)
89
+ puts '== Applied on fragment:'
90
+ puts "URI: #{doc[:uri]}"
91
+ puts "Content: #{doc[:content]}"
92
+ puts "Value: #{doc[:value]}"
93
+ results.each_with_index do |result, i|
94
+ puts "== Result ##{i}:"
95
+ puts "URI: #{result[:uri]}"
96
+ puts "Content: #{result[:content]}"
97
+ puts "Value: #{result[:value].inspect}"
98
+ end
99
+ puts
100
+ end
101
+
83
102
  # Return results if no nested selectors
84
103
  return results if selector.sc::selector.empty?
85
104
 
@@ -1,7 +1,12 @@
1
1
  module XPathSelector
2
2
  def self.filter selector, doc
3
3
  selector.rdf::value.map do |pattern|
4
- doc[:content].search(pattern).map do |result|
4
+ interval = if selector.sc::index.first
5
+ (selector.sc::index.first.to_i..selector.sc::index.first.to_i)
6
+ else
7
+ (0..-1)
8
+ end
9
+ (doc[:content].search(pattern)[interval] || []).map do |result|
5
10
  if selector.sc::attribute.first
6
11
  # Select node's attribute if given
7
12
  selector.sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>result, :value=>result[attribute] } }
@@ -12,4 +17,4 @@ module XPathSelector
12
17
  end
13
18
  end.flatten
14
19
  end
15
- end
20
+ end
@@ -16,3 +16,13 @@ module Scrappy
16
16
  end
17
17
  end
18
18
  end
19
+
20
+ module Nokogiri
21
+ module XML
22
+ class NodeSet
23
+ def select &block
24
+ NodeSet.new(document, super(&block))
25
+ end
26
+ end
27
+ end
28
+ end
data/lib/scrappy.rb CHANGED
@@ -19,7 +19,7 @@ require 'scrappy/agent/agent'
19
19
  Namespace :sc, 'http://lab.gsi.dit.upm.es/scraping.rdf#'
20
20
 
21
21
  module Scrappy
22
- VERSION = '0.1.3'
22
+ VERSION = '0.1.4'
23
23
  end
24
24
 
25
25
  # Require selectors
data/scrappy.gemspec CHANGED
@@ -2,11 +2,11 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = %q{scrappy}
5
- s.version = "0.1.3"
5
+ s.version = "0.1.4"
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = ["Jose Ignacio"]
9
- s.date = %q{2010-11-18}
9
+ s.date = %q{2010-11-24}
10
10
  s.default_executable = %q{scrappy}
11
11
  s.description = %q{RDF web scraper}
12
12
  s.email = %q{joseignacio.fernandez@gmail.com}
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 1
8
- - 3
9
- version: 0.1.3
8
+ - 4
9
+ version: 0.1.4
10
10
  platform: ruby
11
11
  authors:
12
12
  - Jose Ignacio
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2010-11-18 00:00:00 +01:00
17
+ date: 2010-11-24 00:00:00 +01:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency