nibbler 1.2.1 → 1.3.0
- data/README.md +69 -27
- data/Rakefile +8 -4
- data/examples/delicious.rb +16 -18
- data/examples/tweetburner.rb +18 -9
- data/examples/twitter.rb +2 -2
- data/lib/nibbler.rb +97 -70
- data/lib/nibbler/json.rb +174 -13
- metadata +23 -43
data/README.md
CHANGED
@@ -1,32 +1,71 @@
 Nibbler
 =======
 
-*Nibbler* is a
-[old README lines 5-29 are not preserved in this diff view]
+*Nibbler* is a small little tool (~100 LOC) that helps you map data structures to objects that you define.
+
+It can be used for HTML screen scraping:
+
+~~~ ruby
+require 'nibbler'
+require 'open-uri'
+
+class BlogScraper < Nibbler
+  element :title
+
+  elements 'div.hentry' => :articles do
+    element 'h2' => :title
+    element 'a/@href' => :url
+  end
+end
+
+blog = BlogScraper.parse open('http://example.com')
+
+blog.title
+#=> "My blog title"
+
+blog.articles.first.title
+#=> "First article title"
+
+blog.articles.first.url
+#=> "http://example.com/article"
+~~~
+
+For mapping XML API payloads:
+
+~~~ ruby
+class Movie < Nibbler
+  element './title/@regular' => :name
+  element './box_art/@large' => :poster_large
+  element 'release_year' => :year, :with => lambda { |node| node.text.to_i }
+  element './/link[@title="web page"]/@href' => :url
+end
+
+response = Net::HTTP.get_response URI('http://example.com/movie.xml')
+movie = Movie.parse response.body
+
+movie.name  #=> "Toy Story 3"
+movie.year  #=> 2010
+~~~
+
+Or even for JSON:
+
+~~~ ruby
+require 'json'
+require 'nibbler/json'
+
+class Movie < NibblerJSON
+  element :title
+  element :year
+  elements :genres
+  # JSONPath selectors:
+  element '.links.alternate' => :url
+  element '.ratings.critics_score' => :critics_score
+end
+
+movie = Movie.parse json_string
+~~~
+
+There are sample scripts in the "examples/" directory:
 
     ruby -Ilib -rubygems examples/delicious.rb
    ruby -Ilib -rubygems examples/tweetburner.rb > output.csv
@@ -36,7 +75,10 @@ There are sample scripts in the "examples/" directory; run them with:
 Requirements
 ------------
 
-*None*. Well, [Nokogiri][] is a requirement if you pass in HTML
+*None*. Well, [Nokogiri][] is a requirement if you pass in an HTML string for parsing, like in the example above. Otherwise you can initialize the scraper with an
+Hpricot document or anything else that implements `at(selector)` and `search(selector)` methods.
+
+NibblerJSON needs a JSON parser if string content is passed, so "json" library should be installed on Ruby 1.8.
 
 
 [wiki]: http://wiki.github.com/mislav/nibbler
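The new Requirements paragraph makes a concrete promise: Nokogiri is only pulled in when you hand `Nibbler` a raw string, because anything that responds to `at(selector)` and `search(selector)` is accepted as a document. A minimal sketch of that contract; the `FakeDoc` class and its canned data are illustrative, not part of the gem:

~~~ ruby
require 'nibbler'

# Hypothetical stand-in document: responds to at/search like Nokogiri would.
class FakeDoc
  NODES = { 'h1' => 'Hello' }

  def at(selector) NODES[selector] end                         # first match, or nil
  def search(selector) NODES.values_at(selector).compact end   # all matches, as an array
end

class Page < Nibbler
  element 'h1' => :heading
end

Page.parse(FakeDoc.new).heading  #=> "Hello"
~~~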
data/Rakefile
CHANGED
@@ -2,7 +2,11 @@ task :default => [:loc, :spec]
 
 desc %(Run specs)
 task :spec do
-
+  tests = []
+  tests << %(ruby -Ilib -rubygems lib/nibbler.rb --color)
+  tests << %(ruby -Ilib -rubygems lib/nibbler/json.rb)
+
+  exit(1) if tests.any? {|cmd| !sh(cmd) }
 end
 
 desc %(Count lines of code in implementation)
@@ -12,13 +16,13 @@ task :loc do
 
     file.each_line do |line|
       case line
-      when /^class\b/
+      when /^(class|module)\b/ then counting = true
       when /^\s*(#|$)/ then next
-      when /^end\b/
+      when /^end\b/ then break
       end
       loc += 1 if counting
     end
 
     puts "#{loc} lines of code"
   end
-end
+end
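The reworked `:spec` task works because both library files are self-testing: each ends in an `if __FILE__ == $0` guard that runs its specs only when the file is executed directly, as seen at the bottom of the lib/nibbler.rb and lib/nibbler/json.rb diffs below. A minimal sketch of the pattern; the `Widget` file is hypothetical:

~~~ ruby
# lib/widget.rb -- hypothetical self-testing library file
class Widget
  def frob() 'frobbed' end
end

if __FILE__ == $0        # true only when run as `ruby lib/widget.rb`
  require 'minitest/spec'
  require 'minitest/autorun'

  describe Widget do
    it 'frobs' do
      Widget.new.frob.must_equal 'frobbed'
    end
  end
end
~~~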
data/examples/delicious.rb
CHANGED
@@ -5,33 +5,31 @@
 
 require 'nibbler'
 require 'open-uri'
-require 'date'
 
 # extracts data from a single bookmark
 class Bookmark < Nibbler
-  element '
-  element '.
-
-
-
-
+  element '.body .title' => :title
+  element '.note' => :description
+
+  element '.sub span' => :url
+
   # tags are plural
-  elements '
-
-  #
-  element '
-    Date.strptime(span.inner_text.strip, '%d %b %y')
-  }
+  elements '.tag .name' => :tags
+
+  # extract timestamp from HTML attribute
+  element './@date' => :date, :with => lambda { |timestamp| Time.at timestamp.text.to_i }
 end
 
 # finds all bookmarks on the page
 class Delicious < Nibbler
-  elements '
+  elements '.content .linkList .link' => :bookmarks, :with => Bookmark
 end
 
 mislav = Delicious.parse open('http://delicious.com/mislav/ruby')
-bookmark = mislav.bookmarks.first
 
-
-
-
+mislav.bookmarks[0,3].each do |bookmark|
+  puts bookmark.title   #=> "Some title"
+  p bookmark.tags       #=> ['foo', 'bar', ...]
+  puts bookmark.date    #=> <Date>
+  puts
+end
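The rewritten example shows both delegate forms side by side: a lambda for one-off value conversion (`Time.at` on the `date` attribute) and a whole scraper class (`Bookmark`) for nested structures. Because every scraper also gains `to_hash` (see the lib/nibbler.rb diff below), the parsed page can be dumped wholesale; a rough sketch, with placeholder values:

~~~ ruby
mislav = Delicious.parse open('http://delicious.com/mislav/ruby')

# nested Bookmark scrapers serialize recursively via to_hash
mislav.to_hash
#=> { :bookmarks => [ { :title => "...", :description => "...", :url => "...",
#      :tags => ["...", "..."], :date => <Time> }, ... ] }
~~~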
data/examples/tweetburner.rb
CHANGED
@@ -1,3 +1,5 @@
+# encoding: utf-8
+#
 ## Tweetburner.com archive dump
 #
 # I needed to dump my Tweetburner archive to CSV
@@ -14,8 +16,15 @@ module Tweetburner
   SITE = URI('http://tweetburner.com')
 
   class Scraper < ::Nibbler
-
-
+    def initialize url
+      doc = get_document url
+      super doc
+    end
+
+    private
+
+    # open web pages with UTF-8 encoding
+    def get_document(url)
       URI === url ? Nokogiri::HTML::Document.parse(open(url), url.to_s, 'UTF-8') : url
     rescue OpenURI::HTTPError
       $stderr.puts "ERROR opening #{url}"
@@ -31,7 +40,7 @@ module Tweetburner
     element '.col-tweet-text' => :text, :with => lambda { |node|
       node.text.sub(/\s+– .+?$/, '')
     }
-    element '.col-clicks' => :clicks
+    element '.col-clicks' => :clicks, :with => lambda { |node| node.inner_text.to_i }
     element '.col-created-at' => :created_at, :with => lambda { |node| DateTime.parse node.text }
 
     def stats
@@ -58,18 +67,18 @@ module Tweetburner
     def parse
       super
       if next_page_url
-        @doc =
+        @doc = get_document(URI(next_page_url))
         self.parse
-      else
-        self
       end
+      self
     end
 
     def to_csv(io = STDOUT)
       io.sync = true if io == STDOUT
-
-
-
+      CSV(io) do |csv|
+        links.each do |link|
+          csv << [link.text, link.clicks, link.created_at, link.stats.destination]
+        end
       end
     end
   end
data/examples/twitter.rb
CHANGED
@@ -11,11 +11,11 @@ require 'time'
 
 # now here's the real deal
 class Twitter < NibblerJSON
-  elements :tweets
+  elements :tweets do
     element :created_at, :with => lambda { |time| Time.parse(time) }
     element :text
     element :id
-    element 'user' => :author
+    element 'user' => :author do
       element 'name' => :full_name
       element 'screen_name' => :username
     end
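Both edits switch flat rules to the block form: a block passed to `element`/`elements` is evaluated in the context of an anonymous scraper subclass (see `parse_rule_declaration` in the lib/nibbler.rb diff below), so nested JSON maps onto nested objects. A rough sketch of the result, assuming a trimmed Twitter-style payload; the sample data is made up:

~~~ ruby
# hypothetical payload containing only the declared fields
json = '{"tweets": [{"created_at": "2012-01-17 12:00:00 UTC", "text": "hi",
         "id": 1, "user": {"name": "Mislav", "screen_name": "mislav"}}]}'

tweet = Twitter.parse(json).tweets.first
tweet.text              #=> "hi"
tweet.created_at.class  #=> Time
tweet.author.username   #=> "mislav"
~~~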
data/lib/nibbler.rb
CHANGED
@@ -1,110 +1,137 @@
-#
-
-
-
+# DSL for defining data extraction rules from an abstract document object
+module NibblerMethods
+  def self.extended(base)
+    base.send(:include, InstanceMethods) if base.is_a? Class
+  end
+
   # Declare a singular scraping rule
-  def
+  def element(*args, &block)
     selector, name, delegate = parse_rule_declaration(*args, &block)
     rules[name] = [selector, delegate]
     attr_accessor name
     name
   end
-
+
   # Declare a plural scraping rule
-  def
+  def elements(*args, &block)
     name = element(*args, &block)
     rules[name] << true
   end
-
-  # Process data by creating a new scraper
-  def self.parse(data) new(data).parse end
-
-  # Initialize the parser with raw data or a document
-  def initialize(data)
-    @doc = self.class.convert_document(data)
-    # initialize plural properties
-    self.class.rules.each { |name, (s, k, plural)| send("#{name}=", []) if plural }
-  end
-
-  # Parse the document and save values returned by selectors
-  def parse
-    self.class.rules.each do |target, (selector, delegate, plural)|
-      if plural
-        send(target).concat @doc.search(selector).map { |i| parse_result(i, delegate) }
-      else
-        send("#{target}=", parse_result(@doc.at(selector), delegate))
-      end
-    end
-    self
-  end
-
-  # Dump the extracted data into a hash with symbolized keys
-  def to_hash
-    converter = lambda { |obj| obj.respond_to?(:to_hash) ? obj.to_hash : obj }
-    self.class.rules.keys.inject({}) do |hash, name|
-      value = send(name)
-      hash[name.to_sym] = Array === value ? value.map(&converter) : converter[value]
-      hash
-    end
-  end
-
-  protected
-
-  # `delegate` is optional, but should respond to `call` or `parse`
-  def parse_result(node, delegate)
-    if delegate
-      delegate.respond_to?(:call) ? delegate.call(node) : delegate.parse(node)
-    elsif node.respond_to? :inner_text
-      node.inner_text
-    else
-      node
-    end unless node.nil?
-  end
-
-  private
-
+
   # Parsing rules declared with `element` or `elements`
-  def
+  def rules
     @rules ||= {}
   end
-
+
+  # Process data by creating a new instance
+  def parse(doc) new(doc).parse end
+
+  private
+
   # Make subclasses inherit the parsing rules
-  def
+  def inherited(subclass)
+    super
     subclass.rules.update self.rules
   end
-
+
   # Rule declaration forms:
-  #
+  #
   #   { 'selector' => :property, :with => delegate }
   #   #=> ['selector', :property, delegate]
-  #
+  #
   #   :title
   #   #=> ['title', :title, nil]
-  def
+  def parse_rule_declaration(*args, &block)
     options, name = Hash === args.last ? args.pop : {}, args.first
     delegate = options.delete(:with)
     selector, property = name ? [name.to_s, name.to_sym] : options.to_a.flatten
     raise ArgumentError, "invalid rule declaration: #{args.inspect}" unless property
     # eval block in context of a new scraper subclass
-    delegate = Class.new(delegate ||
+    delegate = Class.new(delegate || base_parser_class, &block) if block_given?
     return selector, property, delegate
   end
-
+
+  def base_parser_class
+    klass = self
+    klass = klass.superclass until klass.superclass == Object
+    klass
+  end
+
+  module InstanceMethods
+    attr_reader :doc
+
+    # Initialize the parser with a document
+    def initialize(doc)
+      @doc = doc
+      # initialize plural properties
+      self.class.rules.each { |name, (s, k, plural)| send("#{name}=", []) if plural }
+    end
+
+    # Parse the document and save values returned by selectors
+    def parse
+      self.class.rules.each do |target, (selector, delegate, plural)|
+        if plural
+          send(target).concat @doc.search(selector).map { |i| parse_result(i, delegate) }
+        else
+          send("#{target}=", parse_result(@doc.at(selector), delegate))
+        end
+      end
+      self
+    end
+
+    # Dump the extracted data into a hash with symbolized keys
+    def to_hash
+      converter = lambda { |obj| obj.respond_to?(:to_hash) ? obj.to_hash : obj }
+      self.class.rules.keys.inject({}) do |hash, name|
+        value = send(name)
+        hash[name.to_sym] = Array === value ? value.map(&converter) : converter[value]
+        hash
+      end
+    end
+
+    protected
+
+    # `delegate` is optional, but should respond to `call` or `parse`
+    def parse_result(node, delegate)
+      if delegate
+        method = delegate.is_a?(Proc) ? delegate : delegate.method(delegate.respond_to?(:call) ? :call : :parse)
+        method.arity == 1 ? method[node] : method[node, self]
+      else
+        node
+      end unless node.nil?
+    end
+  end
+end
+
+# An HTML/XML scraper
+class Nibbler
+  extend NibblerMethods
+
   # Parse data with Nokogiri unless it's already an acceptable document
-  def
-
-  else
+  def initialize(doc)
+    unless doc.respond_to?(:at) and doc.respond_to?(:search)
       require 'nokogiri' unless defined? ::Nokogiri
-      Nokogiri doc
+      doc = Nokogiri doc
     end
+    super(doc)
   end
-end
 
+  protected
+
+  def parse_result(node, delegate)
+    if !delegate and node.respond_to? :inner_text
+      node.inner_text
+    else
+      super
+    end
+  end
+end
 
 ## specs
 
 if __FILE__ == $0
-  require '
+  require 'date'
+  require 'rspec/autorun'
   HTML = DATA.read
 
   class Article < Nibbler
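The heart of this refactoring is that the DSL moved out of `class Nibbler` into the `NibblerMethods` module, so any class can `extend` it and bring its own document semantics, which is exactly how `NibblerJSON` is rebuilt in the next file. Two smaller additions: delegate lambdas with arity 2 now receive the parser instance alongside the node, and `base_parser_class` makes rule blocks inherit from the nearest top-level scraper. A sketch of extending the module directly; `HashDoc` and `HashScraper` are hypothetical:

~~~ ruby
require 'nibbler'

# A scraper over plain Ruby hashes: the "document" only needs at/search.
class HashDoc
  def initialize(hash) @hash = hash end
  def at(key) @hash[key] end
  def search(key) Array(@hash[key]) end
end

class HashScraper
  extend NibblerMethods  # pulls in InstanceMethods via self.extended

  element 'name' => :name
  # arity-2 delegate: receives the node and the parser instance
  element 'age' => :age, :with => lambda { |node, parser| Integer(node) }
end

HashScraper.parse(HashDoc.new('name' => 'Ana', 'age' => '7')).age  #=> 7
~~~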
data/lib/nibbler/json.rb
CHANGED
@@ -1,29 +1,190 @@
 require 'nibbler'
+require 'strscan'
 
 # a wrapper for JSON data that provides `at` and `search`
 class Nibbler::JsonDocument
   attr_reader :data
-
-  def initialize(obj)
-    @data =
+
+  def initialize(obj, root = nil)
+    @data = obj.respond_to?(:to_str) ? JSON.parse(obj) : obj
+    @root = root
   end
-
-  def
-
+
+  def root
+    @root || data
   end
-
+
   def search(selector)
-
+    if selector !~ /[^\w-]/
+      found = Array === data ? data : data[selector]
+      found = [] if found.nil?
+      found = [found] unless Array === found
+    else
+      found = scan_selector selector
+    end
+    found
   end
-
+
   def at(selector)
-
+    search(selector).first
+  end
+
+  private
+
+  # stupid implementation of http://goessner.net/articles/JsonPath/
+  def scan_selector(selector)
+    s = StringScanner.new selector
+    found = s.scan(/\$/) ? root : data
+    found = [found] unless Array === found
+
+    while prop = s.scan(/\.\.?[\w-]+/)
+      prop.sub!(/\.\.?/, '')
+      found = if $&.size == 2
+        search_recursive(prop, found).compact
+      else
+        found.flatten.map {|i| i[prop] if Hash === i and i.key? prop }.compact
+      end
+
+      if s.scan(/\[/)
+        if range = s.scan(/[\d:]+/)
+          start, till, = range.split(':', 2)
+          start = start.to_i
+          idx = !till ? start : till.empty?? start..-1 : start...(till.to_i)
+          found.map! {|i| i[idx] if Array === i }
+          found.compact!
+        elsif s.scan(/\?/)
+          expr = s.scan_until(/\)/) or raise
+          expr.gsub!('@', 'self')
+          found.flatten!
+          found.reject! {|i| !(i.instance_eval expr rescue nil) }
+          found.compact!
+        end
+        s.scan(/\]/) or raise
+      end
+      break if found.empty?
+    end
+
+    found.flatten!
+    found
+  end
+
+  def search_recursive(prop, items, found = [])
+    items.map { |item|
+      case item
+      when Hash
+        found << item[prop] if item.key? prop
+        search_recursive(prop, item.values, found)
+      when Array
+        search_recursive(prop, item, found)
+      end
+    }
+    found
   end
 end
 
 # a scraper that works with JsonDocument
-class NibblerJSON
-
-
+class NibblerJSON
+  extend NibblerMethods
+
+  def self.parse(data, parent = nil)
+    new(data, parent).parse
+  end
+
+  def initialize(doc, parent = nil)
+    doc = Nibbler::JsonDocument.new(doc, parent && parent.doc.root) unless doc.respond_to? :search
+    super(doc)
   end
 end
+
+if __FILE__ == $0
+  require 'json'
+  require 'forwardable'
+  require 'minitest/spec'
+  require 'minitest/autorun'
+
+  describe Nibbler::JsonDocument do
+    DOC = Nibbler::JsonDocument.new DATA.read
+
+    extend Forwardable
+    def_delegators :DOC, :at, :search
+
+    it "fetches unknown key" do
+      at('doesnotexist').must_be_nil
+    end
+
+    it "fetches existing key" do
+      at('title').must_equal "Toy Story 3"
+    end
+
+    it "fetches selector" do
+      at('.year').must_equal 2010
+    end
+
+    it "fetches deep selector" do
+      at('.release_dates.dvd').must_equal "2010-11-02"
+    end
+
+    it "fetches first item of array" do
+      at('.genres').must_equal "Animation"
+    end
+
+    it "fetches array" do
+      search('.genres').must_equal [ "Animation", "Kids & Family", "Comedy" ]
+    end
+
+    it "extracts subset of array" do
+      search('.genres[:2]').must_equal [ "Animation", "Kids & Family" ]
+      search('.genres[1:3]').must_equal [ "Kids & Family", "Comedy" ]
+      search('.genres[2:]').must_equal [ "Comedy" ]
+    end
+
+    it "searches recursively" do
+      search('..characters').must_equal ["Woody", "Moody", "Buzz Lightyear"]
+    end
+
+    it "respects array index" do
+      search('..characters[0]').must_equal ["Woody", "Buzz Lightyear"]
+    end
+
+    it "respects conditions" do
+      search('.abridged_cast[?(@["name"] =~ /tom/i)].characters').must_equal ["Woody", "Moody"]
+    end
+  end
+end
+
+__END__
+{
+  "title": "Toy Story 3",
+  "year": 2010,
+  "genres": [ "Animation", "Kids & Family", "Comedy" ],
+  "runtime": 103,
+  "release_dates": {
+    "theater": "2010-06-18",
+    "dvd": "2010-11-02"
+  },
+  "ratings": {
+    "critics_rating": "Certified Fresh",
+    "critics_score": 99,
+    "audience_rating": "Upright",
+    "audience_score": 91
+  },
+  "posters": {
+    "thumbnail": "http://content6.flixster.com/movie/11/13/43/11134356_mob.jpg",
+    "profile": "http://content6.flixster.com/movie/11/13/43/11134356_pro.jpg",
+    "detailed": "http://content6.flixster.com/movie/11/13/43/11134356_det.jpg",
+    "original": "http://content6.flixster.com/movie/11/13/43/11134356_ori.jpg"
+  },
+  "abridged_cast": [
+    { "name": "Tom Hanks",
+      "characters": [ "Woody", "Moody" ] },
+    { "name": "Tim Allen",
+      "characters": [ "Buzz Lightyear" ] }
+  ],
+  "abridged_directors": [ {"name": "Lee Unkrich"} ],
+  "studio": "Walt Disney Pictures",
+  "alternate_ids": { "imdb": "0435761" },
+  "links": {
+    "self": "http://api.rottentomatoes.com/api/public/v1.0/movies/770672122.json",
+    "alternate": "http://www.rottentomatoes.com/m/toy_story_3/"
+  }
+}
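The bulk of the new code is `scan_selector`, a deliberately small JSONPath subset: `.key` walks down one level, `..key` searches recursively, `[n]`/`[a:b]` index or slice arrays, and `[?(expr)]` filters by evaluating the expression against each item. The specs above pin the behavior; as a quick reference, against the bundled Toy Story 3 document (`json_string` standing in for the payload after `__END__`):

~~~ ruby
doc = Nibbler::JsonDocument.new(json_string)

doc.at('.ratings.critics_score')  #=> 99
doc.search('.genres[1:3]')        #=> ["Kids & Family", "Comedy"]
doc.search('..characters[0]')     #=> ["Woody", "Buzz Lightyear"]
doc.search('.abridged_cast[?(@["name"] =~ /tom/i)].characters')
                                  #=> ["Woody", "Moody"]
~~~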
metadata
CHANGED
@@ -1,33 +1,23 @@
---- !ruby/object:Gem::Specification
+--- !ruby/object:Gem::Specification
 name: nibbler
-version: !ruby/object:Gem::Version
-
+version: !ruby/object:Gem::Version
+  version: 1.3.0
   prerelease:
-  segments:
-  - 1
-  - 2
-  - 1
-  version: 1.2.1
 platform: ruby
-authors:
--
+authors:
+- Mislav Marohnić
 autorequire:
 bindir: bin
 cert_chain: []
-
-date: 2011-01-15 00:00:00 +01:00
-default_executable:
+date: 2012-01-17 00:00:00.000000000 Z
 dependencies: []
-
-
+description: Nibbler is a super simple and powerful declarative generic scraper written
+  in under 70 lines of code.
 email: mislav.marohnic@gmail.com
 executables: []
-
 extensions: []
-
 extra_rdoc_files: []
-
-files:
+files:
 - Rakefile
 - lib/nibbler/json.rb
 - lib/nibbler.rb
@@ -36,39 +26,29 @@ files:
 - examples/twitter.rb
 - README.md
 - LICENSE
-
-homepage: http://github.com/mislav/nibbler
+homepage: https://github.com/mislav/nibbler
 licenses: []
-
 post_install_message:
 rdoc_options: []
-
-require_paths:
+require_paths:
 - lib
-required_ruby_version: !ruby/object:Gem::Requirement
+required_ruby_version: !ruby/object:Gem::Requirement
   none: false
-  requirements:
-  - -
-    - !ruby/object:Gem::Version
-
-
-      - 0
-      version: "0"
-required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
   none: false
-  requirements:
-  - -
-    - !ruby/object:Gem::Version
-
-    segments:
-    - 0
-    version: "0"
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
 requirements: []
-
 rubyforge_project:
-rubygems_version: 1.
+rubygems_version: 1.8.12
 signing_key:
 specification_version: 3
 summary: A cute HTML scraper / data extraction tool
 test_files: []
-
+has_rdoc: