scrappy 0.1.3 → 0.1.4

Sign up to get free protection for your applications and access to all the features.
data/History.txt CHANGED
@@ -1,3 +1,9 @@
1
+ === 0.1.4 2010-11-24
2
+
3
+ * Support for node sets in extractions
4
+ * Support for index selection in CSS and XPath selectors
5
+ * Debugging mode
6
+
1
7
  === 0.1.3 2010-11-18
2
8
 
3
9
  * RDF node caching
@@ -61,10 +61,12 @@ module Scrappy
61
61
 
62
62
  # Add referenceable data if requested
63
63
  if options[:referenceable]
64
- source = Node(node_hash(doc[:uri], doc[:content].path))
65
- options[:triples] << [ object, Node("sc:source"), source ]
66
- fragment.sc::type.each { |t| options[:triples] << [ source, Node("sc:type"), t ] }
67
- fragment.sc::relation.each { |relation| options[:triples] << [ source, Node("sc:relation"), relation ] }
64
+ sources = [doc[:content]].flatten.map { |node| Node(node_hash(doc[:uri], node.path)) }
65
+ sources.each do |source|
66
+ options[:triples] << [ object, Node("sc:source"), source ]
67
+ fragment.sc::type.each { |t| options[:triples] << [ source, Node("sc:type"), t ] }
68
+ fragment.sc::relation.each { |relation| options[:triples] << [ source, Node("sc:relation"), relation ] }
69
+ end
68
70
  end
69
71
 
70
72
  # Process subfragments
@@ -80,6 +82,23 @@ module Scrappy
80
82
  # Process selector
81
83
  results = Kernel.const_get(class_name).filter selector, doc
82
84
 
85
+ if !selector.sc::debug.empty?
86
+ puts '== DEBUG'
87
+ puts '== Selector:'
88
+ puts selector.serialize(:yarf, false)
89
+ puts '== Applied on fragment:'
90
+ puts "URI: #{doc[:uri]}"
91
+ puts "Content: #{doc[:content]}"
92
+ puts "Value: #{doc[:value]}"
93
+ results.each_with_index do |result, i|
94
+ puts "== Result ##{i}:"
95
+ puts "URI: #{result[:uri]}"
96
+ puts "Content: #{result[:content]}"
97
+ puts "Value: #{result[:value].inspect}"
98
+ end
99
+ puts
100
+ end
101
+
83
102
  # Return results if no nested selectors
84
103
  return results if selector.sc::selector.empty?
85
104
 
@@ -1,7 +1,12 @@
1
1
  module XPathSelector
2
2
  def self.filter selector, doc
3
3
  selector.rdf::value.map do |pattern|
4
- doc[:content].search(pattern).map do |result|
4
+ interval = if selector.sc::index.first
5
+ (selector.sc::index.first.to_i..selector.sc::index.first.to_i)
6
+ else
7
+ (0..-1)
8
+ end
9
+ (doc[:content].search(pattern)[interval] || []).map do |result|
5
10
  if selector.sc::attribute.first
6
11
  # Select node's attribute if given
7
12
  selector.sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>result, :value=>result[attribute] } }
@@ -12,4 +17,4 @@ module XPathSelector
12
17
  end
13
18
  end.flatten
14
19
  end
15
- end
20
+ end
@@ -16,3 +16,13 @@ module Scrappy
16
16
  end
17
17
  end
18
18
  end
19
+
20
+ module Nokogiri
21
+ module XML
22
+ class NodeSet
23
+ def select &block
24
+ NodeSet.new(document, super(&block))
25
+ end
26
+ end
27
+ end
28
+ end
data/lib/scrappy.rb CHANGED
@@ -19,7 +19,7 @@ require 'scrappy/agent/agent'
19
19
  Namespace :sc, 'http://lab.gsi.dit.upm.es/scraping.rdf#'
20
20
 
21
21
  module Scrappy
22
- VERSION = '0.1.3'
22
+ VERSION = '0.1.4'
23
23
  end
24
24
 
25
25
  # Require selectors
data/scrappy.gemspec CHANGED
@@ -2,11 +2,11 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = %q{scrappy}
5
- s.version = "0.1.3"
5
+ s.version = "0.1.4"
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = ["Jose Ignacio"]
9
- s.date = %q{2010-11-18}
9
+ s.date = %q{2010-11-24}
10
10
  s.default_executable = %q{scrappy}
11
11
  s.description = %q{RDF web scraper}
12
12
  s.email = %q{joseignacio.fernandez@gmail.com}
metadata CHANGED
@@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version
5
5
  segments:
6
6
  - 0
7
7
  - 1
8
- - 3
9
- version: 0.1.3
8
+ - 4
9
+ version: 0.1.4
10
10
  platform: ruby
11
11
  authors:
12
12
  - Jose Ignacio
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2010-11-18 00:00:00 +01:00
17
+ date: 2010-11-24 00:00:00 +01:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency