scrappy 0.4.1 → 0.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +5 -0
- data/lib/scrappy.rb +1 -1
- data/lib/scrappy/extractor/fragment.rb +1 -1
- data/lib/scrappy/extractor/selector.rb +21 -0
- data/lib/scrappy/extractor/selectors/new_uri.rb +3 -1
- data/scrappy.gemspec +2 -2
- metadata +3 -3
    
        data/History.txt
    CHANGED
    
    
    
        data/lib/scrappy.rb
    CHANGED
    
    
| @@ -64,7 +64,7 @@ module Sc | |
| 64 64 |  | 
| 65 65 | 
             
                    # Build the object -- it can be a node or a literal
         | 
| 66 66 | 
             
                    object = if sc::type.include?(Node('rdf:Literal'))
         | 
| 67 | 
            -
                      value = doc[:value].to_s.strip
         | 
| 67 | 
            +
                      value = doc[:value].to_s.gsub("\302\240"," ").strip
         | 
| 68 68 | 
             
                      if options[:referenceable]
         | 
| 69 69 | 
             
                        node.rdf::value = value
         | 
| 70 70 | 
             
                        node.rdf::type += [Node('rdf:Literal')]
         | 
| @@ -20,6 +20,27 @@ module Sc | |
| 20 20 | 
             
                  # Filter method is defined in each subclass
         | 
| 21 21 | 
             
                  results = filter doc
         | 
| 22 22 |  | 
| 23 | 
            +
                  if sc::boolean.first=="true"
         | 
| 24 | 
            +
                    results = results.map do |r|
         | 
| 25 | 
            +
                      affirmations = ["yes", "true"]
         | 
| 26 | 
            +
                      negations = ["no", "none", "false", "-", "--"]
         | 
| 27 | 
            +
                      no  = negations.include?(r[:value].gsub("\302\240"," ").strip.downcase)
         | 
| 28 | 
            +
                      yes = affirmations.include?(r[:value].gsub("\302\240"," ").strip.downcase)
         | 
| 29 | 
            +
                      if no
         | 
| 30 | 
            +
                        value = "false" 
         | 
| 31 | 
            +
                      elsif yes
         | 
| 32 | 
            +
                        value = "true"
         | 
| 33 | 
            +
                      else
         | 
| 34 | 
            +
                        value = :remove
         | 
| 35 | 
            +
                      end
         | 
| 36 | 
            +
                      r.merge :value=>value
         | 
| 37 | 
            +
                    end
         | 
| 38 | 
            +
                    results = results.select{ |r| r[:value] != :remove }
         | 
| 39 | 
            +
                  end
         | 
| 40 | 
            +
                  if sc::nonempty.first=="true"
         | 
| 41 | 
            +
                    results = results.select{ |r| r[:value].gsub("\302\240"," ").strip!=""}
         | 
| 42 | 
            +
                  end
         | 
| 43 | 
            +
                  
         | 
| 23 44 | 
             
                  if sc::debug.first=="true" and Scrappy::Agent::Options.debug and
         | 
| 24 45 | 
             
                    (Scrappy::Agent::Options.debug_key.nil? or doc[:value].downcase.include?(Scrappy::Agent::Options.debug_key) )
         | 
| 25 46 |  | 
| @@ -10,7 +10,9 @@ module Sc | |
| 10 10 |  | 
| 11 11 | 
             
                  @indexes ||= Hash.new(0)
         | 
| 12 12 | 
             
                  prefix = sc::prefix.first.to_s
         | 
| 13 | 
            -
                   | 
| 13 | 
            +
                  if !["http://", "https://"].include?(prefix)
         | 
| 14 | 
            +
                    prefix = (prefix =~ /\Ahttp\:/ or prefix =~ /\Ahttps\:/) ? URI::parse(doc[:uri]).merge(prefix).to_s : "#{doc[:uri]}#{prefix}"
         | 
| 15 | 
            +
                  end
         | 
| 14 16 | 
             
                  suffix = sc::suffix.first.to_s
         | 
| 15 17 |  | 
| 16 18 | 
             
                  nofollow = (sc::follow.first != "true")
         | 
    
        data/scrappy.gemspec
    CHANGED
    
    | @@ -2,11 +2,11 @@ | |
| 2 2 |  | 
| 3 3 | 
             
            Gem::Specification.new do |s|
         | 
| 4 4 | 
             
              s.name = %q{scrappy}
         | 
| 5 | 
            -
              s.version = "0.4.1"
         | 
| 5 | 
            +
              s.version = "0.4.2"
         | 
| 6 6 |  | 
| 7 7 | 
             
              s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
         | 
| 8 8 | 
             
              s.authors = ["Jose Ignacio"]
         | 
| 9 | 
            -
              s.date = %q{2011-07- | 
| 9 | 
            +
              s.date = %q{2011-07-06}
         | 
| 10 10 | 
             
              s.default_executable = %q{scrappy}
         | 
| 11 11 | 
             
              s.description = %q{RDF web scraper}
         | 
| 12 12 | 
             
              s.email = %q{joseignacio.fernandez@gmail.com}
         | 
    
        metadata
    CHANGED
    
    | @@ -5,8 +5,8 @@ version: !ruby/object:Gem::Version | |
| 5 5 | 
             
              segments: 
         | 
| 6 6 | 
             
              - 0
         | 
| 7 7 | 
             
              - 4
         | 
| 8 | 
            -
              - 1
         | 
| 9 | 
            -
              version: 0.4.1
         | 
| 8 | 
            +
              - 2
         | 
| 9 | 
            +
              version: 0.4.2
         | 
| 10 10 | 
             
            platform: ruby
         | 
| 11 11 | 
             
            authors: 
         | 
| 12 12 | 
             
            - Jose Ignacio
         | 
| @@ -14,7 +14,7 @@ autorequire: | |
| 14 14 | 
             
            bindir: bin
         | 
| 15 15 | 
             
            cert_chain: []
         | 
| 16 16 |  | 
| 17 | 
            -
            date: 2011-07- | 
| 17 | 
            +
            date: 2011-07-06 00:00:00 +02:00
         | 
| 18 18 | 
             
            default_executable: 
         | 
| 19 19 | 
             
            dependencies: 
         | 
| 20 20 | 
             
            - !ruby/object:Gem::Dependency 
         |