scrappy 0.1.3 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +6 -0
 - data/lib/scrappy/agent/extractor.rb +23 -4
 - data/lib/scrappy/selectors/xpath.rb +7 -2
 - data/lib/scrappy/support.rb +10 -0
 - data/lib/scrappy.rb +1 -1
 - data/scrappy.gemspec +2 -2
 - metadata +3 -3
 
    
        data/History.txt
    CHANGED
    
    
| 
         @@ -61,10 +61,12 @@ module Scrappy 
     | 
|
| 
       61 
61 
     | 
    
         | 
| 
       62 
62 
     | 
    
         
             
                      # Add referenceable data if requested
         
     | 
| 
       63 
63 
     | 
    
         
             
                      if options[:referenceable]
         
     | 
| 
       64 
     | 
    
         
            -
                         
     | 
| 
       65 
     | 
    
         
            -
                         
     | 
| 
       66 
     | 
    
         
            -
             
     | 
| 
       67 
     | 
    
         
            -
             
     | 
| 
      
 64 
     | 
    
         
            +
                        sources = [doc[:content]].flatten.map { |node| Node(node_hash(doc[:uri], node.path)) }
         
     | 
| 
      
 65 
     | 
    
         
            +
                        sources.each do |source|
         
     | 
| 
      
 66 
     | 
    
         
            +
                          options[:triples] << [ object, Node("sc:source"), source ]
         
     | 
| 
      
 67 
     | 
    
         
            +
                          fragment.sc::type.each { |t| options[:triples] << [ source, Node("sc:type"), t ] }
         
     | 
| 
      
 68 
     | 
    
         
            +
                          fragment.sc::relation.each { |relation| options[:triples] << [ source, Node("sc:relation"), relation ] }
         
     | 
| 
      
 69 
     | 
    
         
            +
                        end
         
     | 
| 
       68 
70 
     | 
    
         
             
                      end
         
     | 
| 
       69 
71 
     | 
    
         | 
| 
       70 
72 
     | 
    
         
             
                      # Process subfragments
         
     | 
| 
         @@ -80,6 +82,23 @@ module Scrappy 
     | 
|
| 
       80 
82 
     | 
    
         
             
                  # Process selector
         
     | 
| 
       81 
83 
     | 
    
         
             
                  results = Kernel.const_get(class_name).filter selector, doc
         
     | 
| 
       82 
84 
     | 
    
         | 
| 
      
 85 
     | 
    
         
            +
                  if !selector.sc::debug.empty?
         
     | 
| 
      
 86 
     | 
    
         
            +
                    puts '== DEBUG'
         
     | 
| 
      
 87 
     | 
    
         
            +
                    puts '== Selector:'
         
     | 
| 
      
 88 
     | 
    
         
            +
                    puts selector.serialize(:yarf, false)
         
     | 
| 
      
 89 
     | 
    
         
            +
                    puts '== Applied on fragment:'
         
     | 
| 
      
 90 
     | 
    
         
            +
                    puts "URI: #{doc[:uri]}"
         
     | 
| 
      
 91 
     | 
    
         
            +
                    puts "Content: #{doc[:content]}"
         
     | 
| 
      
 92 
     | 
    
         
            +
                    puts "Value: #{doc[:value]}"
         
     | 
| 
      
 93 
     | 
    
         
            +
                    results.each_with_index do |result, i|
         
     | 
| 
      
 94 
     | 
    
         
            +
                      puts "== Result ##{i}:"
         
     | 
| 
      
 95 
     | 
    
         
            +
                      puts "URI: #{result[:uri]}"
         
     | 
| 
      
 96 
     | 
    
         
            +
                      puts "Content: #{result[:content]}"
         
     | 
| 
      
 97 
     | 
    
         
            +
                      puts "Value: #{result[:value].inspect}"
         
     | 
| 
      
 98 
     | 
    
         
            +
                    end
         
     | 
| 
      
 99 
     | 
    
         
            +
                    puts
         
     | 
| 
      
 100 
     | 
    
         
            +
                  end
         
     | 
| 
      
 101 
     | 
    
         
            +
                  
         
     | 
| 
       83 
102 
     | 
    
         
             
                  # Return results if no nested selectors
         
     | 
| 
       84 
103 
     | 
    
         
             
                  return results if selector.sc::selector.empty?
         
     | 
| 
       85 
104 
     | 
    
         | 
| 
         @@ -1,7 +1,12 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            module XPathSelector
         
     | 
| 
       2 
2 
     | 
    
         
             
              def self.filter selector, doc
         
     | 
| 
       3 
3 
     | 
    
         
             
                selector.rdf::value.map do |pattern|
         
     | 
| 
       4 
     | 
    
         
            -
                   
     | 
| 
      
 4 
     | 
    
         
            +
                  interval = if selector.sc::index.first
         
     | 
| 
      
 5 
     | 
    
         
            +
                    (selector.sc::index.first.to_i..selector.sc::index.first.to_i)
         
     | 
| 
      
 6 
     | 
    
         
            +
                  else
         
     | 
| 
      
 7 
     | 
    
         
            +
                    (0..-1)
         
     | 
| 
      
 8 
     | 
    
         
            +
                  end
         
     | 
| 
      
 9 
     | 
    
         
            +
                  (doc[:content].search(pattern)[interval] || []).map do |result|
         
     | 
| 
       5 
10 
     | 
    
         
             
                    if selector.sc::attribute.first
         
     | 
| 
       6 
11 
     | 
    
         
             
                      # Select node's attribute if given
         
     | 
| 
       7 
12 
     | 
    
         
             
                      selector.sc::attribute.map { |attribute| { :uri=>doc[:uri], :content=>result, :value=>result[attribute] } }
         
     | 
| 
         @@ -12,4 +17,4 @@ module XPathSelector 
     | 
|
| 
       12 
17 
     | 
    
         
             
                  end
         
     | 
| 
       13 
18 
     | 
    
         
             
                end.flatten
         
     | 
| 
       14 
19 
     | 
    
         
             
              end
         
     | 
| 
       15 
     | 
    
         
            -
            end
         
     | 
| 
      
 20 
     | 
    
         
            +
            end
         
     | 
    
        data/lib/scrappy/support.rb
    CHANGED
    
    
    
        data/lib/scrappy.rb
    CHANGED
    
    
    
        data/scrappy.gemspec
    CHANGED
    
    | 
         @@ -2,11 +2,11 @@ 
     | 
|
| 
       2 
2 
     | 
    
         | 
| 
       3 
3 
     | 
    
         
             
            Gem::Specification.new do |s|
         
     | 
| 
       4 
4 
     | 
    
         
             
              s.name = %q{scrappy}
         
     | 
| 
       5 
     | 
    
         
            -
              s.version = "0.1.3"
     | 
| 
      
 5 
     | 
    
         
            +
              s.version = "0.1.4"
         
     | 
| 
       6 
6 
     | 
    
         | 
| 
       7 
7 
     | 
    
         
             
              s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
         
     | 
| 
       8 
8 
     | 
    
         
             
              s.authors = ["Jose Ignacio"]
         
     | 
| 
       9 
     | 
    
         
            -
              s.date = %q{2010-11- 
     | 
| 
      
 9 
     | 
    
         
            +
              s.date = %q{2010-11-24}
         
     | 
| 
       10 
10 
     | 
    
         
             
              s.default_executable = %q{scrappy}
         
     | 
| 
       11 
11 
     | 
    
         
             
              s.description = %q{RDF web scraper}
         
     | 
| 
       12 
12 
     | 
    
         
             
              s.email = %q{joseignacio.fernandez@gmail.com}
         
     | 
    
        metadata
    CHANGED
    
    | 
         @@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version 
     | 
|
| 
       5 
5 
     | 
    
         
             
              segments: 
         
     | 
| 
       6 
6 
     | 
    
         
             
              - 0
         
     | 
| 
       7 
7 
     | 
    
         
             
              - 1
         
     | 
| 
       8 
     | 
    
         
            -
              - 3
     | 
| 
       9 
     | 
    
         
            -
              version: 0.1.3
     | 
| 
      
 8 
     | 
    
         
            +
              - 4
         
     | 
| 
      
 9 
     | 
    
         
            +
              version: 0.1.4
         
     | 
| 
       10 
10 
     | 
    
         
             
            platform: ruby
         
     | 
| 
       11 
11 
     | 
    
         
             
            authors: 
         
     | 
| 
       12 
12 
     | 
    
         
             
            - Jose Ignacio
         
     | 
| 
         @@ -14,7 +14,7 @@ autorequire: 
     | 
|
| 
       14 
14 
     | 
    
         
             
            bindir: bin
         
     | 
| 
       15 
15 
     | 
    
         
             
            cert_chain: []
         
     | 
| 
       16 
16 
     | 
    
         | 
| 
       17 
     | 
    
         
            -
            date: 2010-11- 
     | 
| 
      
 17 
     | 
    
         
            +
            date: 2010-11-24 00:00:00 +01:00
         
     | 
| 
       18 
18 
     | 
    
         
             
            default_executable: 
         
     | 
| 
       19 
19 
     | 
    
         
             
            dependencies: 
         
     | 
| 
       20 
20 
     | 
    
         
             
            - !ruby/object:Gem::Dependency 
         
     |