nibbler 1.0

data/LICENSE ADDED
@@ -0,0 +1,18 @@
+ Copyright (c) 2009 Mislav Marohnić
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy of
+ this software and associated documentation files (the "Software"), to deal in
+ the Software without restriction, including without limitation the rights to
+ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+ the Software, and to permit persons to whom the Software is furnished to do so,
+ subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+ FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+ COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+ IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,43 @@
+ Nibbler
+ =======
+
+ *Nibbler* is a cute HTML screen-scraping tool.
+
+     require 'nibbler'
+     require 'open-uri'
+
+     class BlogScraper < Nibbler
+       element :title
+
+       elements 'div.hentry' => :articles do
+         element 'h2' => :title
+         element 'a/@href' => :url
+       end
+     end
+
+     blog = BlogScraper.parse open('http://example.com')
+
+     blog.title
+     #=> "My blog title"
+
+     blog.articles.first.title
+     #=> "First article title"
+
+     blog.articles.first.url
+     #=> "http://example.com/article"
+
+ There are sample scripts in the "examples/" directory; run them with:
+
+     ruby -Ilib -rubygems examples/delicious.rb
+     ruby -Ilib -rubygems examples/tweetburner.rb > output.csv
+
+ [See the wiki][wiki] for more on how to use *Nibbler*.
+
+ Requirements
+ ------------
+
+ *None*. Well, [Nokogiri][] is a requirement if you pass in HTML content that needs to be parsed, like in the example above. Otherwise you can initialize the scraper with an Hpricot document or anything else that implements `at(selector)` and `search(selector)` methods.
+
+
+ [wiki]: http://wiki.github.com/mislav/nibbler
+ [nokogiri]: http://nokogiri.rubyforge.org/nokogiri/
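
The "Requirements" note in the README relies on duck typing: any object that responds to `at(selector)` and `search(selector)` can stand in for a parsed Nokogiri document (the specs in lib/nibbler.rb below exercise exactly this with a `FakeHtmlParser`). A minimal sketch of that contract; the `DummyDocument` and `PageScraper` classes here are made up for illustration and are not part of the gem:

    require 'nibbler'

    # Stand-in document: because it responds to `at` and `search`,
    # Nibbler uses it as-is instead of handing it to Nokogiri.
    class DummyDocument
      def at(selector)
        'dummy title'
      end

      def search(selector)
        []
      end
    end

    class PageScraper < Nibbler
      element 'h1' => :title
      elements 'div.hentry' => :articles
    end

    page = PageScraper.parse(DummyDocument.new)
    page.title     #=> "dummy title"
    page.articles  #=> []
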
data/Rakefile ADDED
@@ -0,0 +1,24 @@
+ task :default => :spec
+
+ desc %(Run specs)
+ task :spec do
+   exec %(ruby -Ilib -rubygems lib/nibbler.rb --color)
+ end
+
+ desc %(Count lines of code in implementation)
+ task :loc do
+   File.open('lib/nibbler.rb') do |file|
+     loc, counting = 1, false
+
+     file.each_line do |line|
+       case line
+       when /^class\b/ then counting = true
+       when /^\s*(#|\Z)/ then next
+       when /^end\b/ then break
+       end
+       loc += 1 if counting
+     end
+
+     puts loc
+   end
+ end
data/examples/delicious.rb ADDED
@@ -0,0 +1,37 @@
+ ## Delicious bookmarks fetching
+ #
+ # Let's pretend that delicious.com doesn't have an API.
+ # This is a demonstration of the most common use-case.
+
+ require 'nibbler'
+ require 'open-uri'
+ require 'date'
+
+ # extracts data from a single bookmark
+ class Bookmark < Nibbler
+   element 'h4 a' => :title
+   element '.description' => :description
+
+   # extract attribute with xpath
+   element './/h4/a/@href' => :url
+
+   # tags are plural
+   elements 'ul.tag-chain .tagItem' => :tags
+
+   # dates are in form "22 OCT 09"
+   element '.dateGroup span' => :date, :with => lambda { |span|
+     Date.strptime(span.inner_text.strip, '%d %b %y')
+   }
+ end
+
+ # finds all bookmarks on the page
+ class Delicious < Nibbler
+   elements '#bookmarklist div.bookmark' => :bookmarks, :with => Bookmark
+ end
+
+ mislav = Delicious.parse open('http://delicious.com/mislav/ruby')
+ bookmark = mislav.bookmarks.first
+
+ puts bookmark.title  #=> "Some title"
+ p bookmark.tags      #=> ['foo', 'bar', ...]
+ puts bookmark.date   #=> <Date>
data/examples/tweetburner.rb ADDED
@@ -0,0 +1,78 @@
+ ## Tweetburner.com archive dump
+ #
+ # I needed to dump my Tweetburner archive to CSV
+ # http://tweetburner.com/users/mislav/archive
+
+ require 'nibbler'
+ require 'uri'
+ require 'open-uri'
+ require 'date'
+ require 'nokogiri'
+ require 'csv'
+
+ module Tweetburner
+   SITE = URI('http://tweetburner.com')
+
+   class Scraper < ::Nibbler
+     # add our behavior to convert_document; open web pages with UTF-8 encoding
+     def self.convert_document(url)
+       URI === url ? Nokogiri::HTML::Document.parse(open(url), url.to_s, 'UTF-8') : url
+     rescue OpenURI::HTTPError
+       $stderr.puts "ERROR opening #{url}"
+       Nokogiri('')
+     end
+   end
+
+   # a single link (table row on the archive page)
+   class Link < ::Nibbler
+     element './/a[starts-with(@href, "/links/")]/@href' => :stats_url, :with => lambda { |href|
+       SITE + href.text
+     }
+     element '.col-tweet-text' => :text, :with => lambda { |node|
+       node.text.sub(/\s+– .+?$/, '')
+     }
+     element '.col-clicks' => :clicks
+     element '.col-created-at' => :created_at, :with => lambda { |node| DateTime.parse node.text }
+
+     def stats
+       @stats ||= Stats.parse(stats_url)
+     end
+   end
+
+   # single link stats page parser
+   class Stats < Scraper
+     element '//*[@id="main-content"]/p/a/@href' => :destination
+   end
+
+   # parser for the paginated archive
+   class Archive < Scraper
+     def self.parse(username)
+       path = '/users/%s/archive' % username
+       super SITE + path
+     end
+
+     elements '//table//tr[position() > 1]' => :links, :with => Link
+     element '//*[@class="page-navigation"]//a[starts-with(text(), "Older")]/@href' => :next_page_url
+
+     # augment to recursively parse other pages
+     def parse
+       super
+       if next_page_url
+         @doc = self.class.convert_document(URI(next_page_url))
+         self.parse
+       else
+         self
+       end
+     end
+
+     def to_csv(io = STDOUT)
+       io.sync = true if io == STDOUT
+       csv = CSV::Writer.create io
+       links.each do |link|
+         csv << [link.text, link.clicks, link.created_at, link.stats.destination]
+       end
+     end
+   end
+ end
+
+ Tweetburner::Archive.parse('mislav').to_csv
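
Note how `Tweetburner::Scraper` overrides the `convert_document` class method, the hook that `Nibbler#initialize` runs on every input; that is what lets it accept plain `URI` objects. The same hook can adapt other input types. A hedged sketch, not part of the gem: the `FileScraper` class below is hypothetical and simply teaches a scraper to accept a path to a local HTML file.

    require 'nibbler'
    require 'nokogiri'

    # Hypothetical scraper that also accepts a local file path
    class FileScraper < Nibbler
      def self.convert_document(doc)
        if String === doc && File.exist?(doc)
          Nokogiri::HTML(File.read(doc))
        else
          super  # fall back to Nibbler's default handling
        end
      end
    end

    # FileScraper.parse('page.html') would then parse the file's markup.
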
data/examples/twitter.rb ADDED
@@ -0,0 +1,71 @@
+ ## JSON data extraction example
+ #
+ # This is an example of how we're not limited to Nokogiri and HTML screen-scraping.
+ # Here we use Nibbler to extract tweets from a Twitter API JSON response.
+ #
+ # Requirements: a JSON library (tested with "json" gem)
+
+ require 'nibbler/json'
+ require 'json'
+ require 'time'
+
+ # now here's the real deal
+ class Twitter < NibblerJSON
+   elements :tweets, :with => NibblerJSON do
+     element :created_at, :with => lambda { |time| Time.parse(time) }
+     element :text
+     element :id
+     element 'user' => :author, :with => NibblerJSON do
+       element 'name' => :full_name
+       element 'screen_name' => :username
+     end
+   end
+ end
+
+ twitter = Twitter.parse(DATA.read)
+
+ twitter.tweets.each do |tweet|
+   puts "@%s: %s [%s]" % [tweet.author.username, tweet.text, tweet.created_at]
+   puts
+ end
+
+
+ __END__
+ [{"created_at": "Thu Oct 22 23:50:02 +0000 2009",
+   "text":
+     "\"It is OK being wrong.\" \"I don't have any experience in that field.\"",
+   "id": 5083117521,
+   "user":
+     {"name": "Ryan Bigg",
+      "created_at": "Thu Apr 24 03:23:53 +0000 2008",
+      "location": "iPhone: -27.471957,152.999225",
+      "profile_image_url":
+        "http://a1.twimg.com/profile_images/287965508/Photo_47_normal.jpg",
+      "url": "http://www.frozenplague.net",
+      "id": 14506011,
+      "followers_count": 432,
+      "description": "I work at Mocra and code Ruby on Rails",
+      "statuses_count": 7659,
+      "friends_count": 211,
+      "screen_name": "ryanbigg"},
+   "source": "<a href=\"http://www.atebits.com/\" rel=\"nofollow\">Tweetie</a>"},
+  {"created_at": "Mon Oct 19 23:43:50 +0000 2009",
+   "text":
+     "Programming is the art of forcing the exceptions of the real world into the absolutes of a computer.",
+   "id": 5004137490,
+   "user":
+     {"name": "Ryan Bates",
+      "created_at": "Fri Mar 28 19:10:25 +0000 2008",
+      "location": "Southern Oregon",
+      "profile_image_url":
+        "http://a1.twimg.com/profile_images/52189024/ryan_bates_cropped_normal.jpg",
+      "url": "http://railscasts.com",
+      "id": 14246143,
+      "followers_count": 3225,
+      "description": "Producer of Railscasts - Free Ruby on Rails Screencasts",
+      "profile_background_image_url":
+        "http://s.twimg.com/a/1255724203/images/themes/theme2/bg.gif",
+      "statuses_count": 2066,
+      "friends_count": 225,
+      "screen_name": "rbates"}
+   }]
data/lib/nibbler.rb ADDED
@@ -0,0 +1,291 @@
+ ## A minimalistic, declarative HTML scraper
+
+ class Nibbler
+   attr_reader :doc
+
+   # Accepts string, open file, or Nokogiri-like document
+   def initialize(doc)
+     @doc = self.class.convert_document(doc)
+     initialize_plural_accessors
+   end
+
+   # Initialize a new scraper and process data
+   def self.parse(html)
+     new(html).parse
+   end
+
+   # Specify a new singular scraping rule
+   def self.element(*args, &block)
+     selector, name, delegate = parse_rule_declaration(*args, &block)
+     rules[name] = [selector, delegate]
+     attr_accessor name
+     name
+   end
+
+   # Specify a new plural scraping rule
+   def self.elements(*args, &block)
+     name = element(*args, &block)
+     rules[name] << true
+   end
+
+   # Let it do its thing!
+   def parse
+     self.class.rules.each do |target, (selector, delegate, plural)|
+       if plural
+         @doc.search(selector).each do |node|
+           send(target) << parse_result(node, delegate)
+         end
+       else
+         send("#{target}=", parse_result(@doc.at(selector), delegate))
+       end
+     end
+     self
+   end
+
+   protected
+
+   # `delegate` is optional, but should respond to `call` or `parse`
+   def parse_result(node, delegate)
+     if delegate
+       delegate.respond_to?(:call) ? delegate.call(node) : delegate.parse(node)
+     elsif node.respond_to? :inner_text
+       node.inner_text
+     else
+       node.to_s
+     end unless node.nil?
+   end
+
+   private
+
+   def self.rules
+     @rules ||= {}
+   end
+
+   def self.inherited(subclass)
+     subclass.rules.update self.rules
+   end
+
+   # Rule declaration is in Hash or single argument form:
+   #
+   #   { '//some/selector' => :name, :with => delegate }
+   #   #=> ['//some/selector', :name, delegate]
+   #
+   #   :title
+   #   #=> ['title', :title, nil]
+   def self.parse_rule_declaration(*args, &block)
+     options, name = Hash === args.last ? args.pop : {}, args.first
+     delegate = options.delete(:with)
+     selector, property = name ? [name.to_s, name.to_sym] : options.to_a.flatten
+     raise ArgumentError, "invalid rule declaration: #{args.inspect}" unless property
+     # eval block in context of a new scraper subclass
+     delegate = Class.new(delegate || Nibbler, &block) if block_given?
+     return selector, property, delegate
+   end
+
+   def initialize_plural_accessors
+     self.class.rules.each do |name, (s, k, plural)|
+       send("#{name}=", []) if plural
+     end
+   end
+
+   def self.convert_document(doc)
+     unless doc.respond_to?(:at) && doc.respond_to?(:search)
+       require 'nokogiri' unless defined? ::Nokogiri
+       Nokogiri doc
+     else
+       doc
+     end
+   end
+ end
+
+
+ ## specs
+
+ if __FILE__ == $0
+   require 'spec/autorun'
+   HTML = DATA.read
+
+   class Article < Nibbler
+     element 'h1' => :title
+     element 'a/@href' => :link
+   end
+
+   class TimestampedArticle < Article
+     element 'p.pubdate' => :published, :with => lambda { |node|
+       node.inner_text.sub('Published on ', '')
+     }
+
+     def published_date
+       @date ||= Date.parse published
+     end
+   end
+
+   class SpecialArticle < Article
+     element 'span'
+   end
+
+   class BlogScraper < Nibbler
+     element :title
+     elements '#nav li' => :navigation_items
+   end
+
+   class OverrideBlogScraper < BlogScraper
+     elements :title
+     element '#nav li' => :navigation_items
+   end
+
+   class BlogWithArticles < BlogScraper
+     elements 'div.hentry' => :articles, :with => Article
+   end
+
+   class BlogWithTimestampedArticles < BlogScraper
+     elements 'div.hentry' => :articles, :with => TimestampedArticle
+   end
+
+   class BlogWithArticlesBlock < BlogScraper
+     elements 'div.hentry' => :articles do
+       element 'h1' => :title
+     end
+   end
+
+   class FakeHtmlParser
+     def initialize(name)
+       @name = name
+     end
+
+     def at(selector)
+       "fake #{@name}"
+     end
+
+     def search(selector)
+       (1..3).map { |n| self.class.new(@name + n.to_s) }
+     end
+   end
+
+   describe BlogWithTimestampedArticles do
+     before(:all) do
+       @blog = described_class.parse(HTML)
+     end
+
+     it "should have title" do
+       @blog.title.should == 'Maximum awesome'
+     end
+
+     it "should have articles" do
+       @blog.should have(2).articles
+     end
+
+     it "should have navigation items" do
+       @blog.should have(3).navigation_items
+       @blog.navigation_items.should == %w[Home About Help]
+     end
+
+     it "should have title, pubdate for first article" do
+       article = @blog.articles[0]
+       article.title.should == 'First article'
+       article.published.should == 'Oct 1'
+       article.published_date.month.should == 10
+       article.published_date.day.should == 1
+       article.link.should be_nil
+     end
+
+     it "should have title, link for second article" do
+       article = @blog.articles[1]
+       article.title.should == 'Second article'
+       article.published.should == 'Sep 5'
+       article.link.should == 'http://mislav.uniqpath.com'
+     end
+
+     it "should override singular properties when re-parsing" do
+       blog = @blog.dup
+       blog.instance_variable_set('@doc', Nokogiri::HTML(''))
+       blog.parse
+       blog.title.should be_nil
+       blog.should have(2).articles
+     end
+   end
+
+   describe SpecialArticle do
+     before(:all) do
+       doc = Nokogiri::HTML(HTML).at('//div[2]')
+       @article = described_class.parse(doc)
+       @parent_article = described_class.superclass.parse(doc)
+     end
+
+     it "should inherit title parsing from parent" do
+       @article.title.should == 'Second article'
+     end
+
+     it "should have additional 'span' rule" do
+       @article.span.should == 'My blog'
+     end
+
+     it "should not let superclass inherit rules" do
+       @parent_article.should_not respond_to(:span)
+     end
+   end
+
+   describe BlogWithArticles, 'with fake HTML parser' do
+     before(:all) do
+       doc = FakeHtmlParser.new('test')
+       @blog = described_class.parse(doc)
+     end
+
+     it "should have fake title" do
+       @blog.title.should == 'fake test'
+     end
+
+     it "should have fake articles" do
+       titles = @blog.articles.map { |a| a.title }
+       titles.should == ['fake test1', 'fake test2', 'fake test3']
+     end
+   end
+
+   describe OverrideBlogScraper do
+     before(:all) do
+       @blog = described_class.parse(HTML)
+     end
+
+     it "should have plural titles" do
+       @blog.title.should == ['Maximum awesome']
+     end
+
+     it "should have singular navigation item" do
+       @blog.navigation_items.should == 'Home'
+     end
+   end
+
+   describe BlogWithArticlesBlock do
+     before(:all) do
+       @blog = described_class.parse(HTML)
+     end
+
+     it "should have article objects" do
+       titles = @blog.articles.map { |article| article.title }
+       titles.should == ['First article', 'Second article']
+     end
+   end
+ end
+
+ __END__
+ <!doctype html>
+ <title>Maximum awesome</title>
+
+ <body>
+   <ol id="nav">
+     <li>Home</li>
+     <li>About</li>
+     <li>Help</li>
+   </ol>
+
+   <div class="hentry">
+     <h1>First article</h1>
+     <p class="pubdate">Published on Oct 1</p>
+   </div>
+
+   <div class="hentry">
+     <h1>Second article</h1>
+     <p class="pubdate">Published on Sep 5</p>
+     <span><a href="http://mislav.uniqpath.com">My blog</a></span>
+   </div>
+ </body>
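
As the comment on `parse_result` notes, a `:with` delegate only has to respond to `call` or to `parse`, so a lambda and a nested scraper class are interchangeable ways of post-processing a matched node (examples/delicious.rb above uses both). A short sketch restating that contract; the `Product` and `Price` classes and their selectors are invented for illustration:

    require 'nibbler'

    # delegate that responds to `parse`: any Nibbler subclass
    class Price < Nibbler
      element '.amount' => :amount, :with => lambda { |node| node.inner_text.to_f }
      element '.currency' => :currency
    end

    class Product < Nibbler
      # delegate that responds to `call`
      element 'h1' => :name, :with => lambda { |node| node.inner_text.strip }
      # delegate that responds to `parse`
      element 'div.price' => :price, :with => Price
    end

    html = '<h1> Widget </h1>' \
           '<div class="price"><span class="amount">9.99</span>' \
           '<span class="currency">USD</span></div>'

    product = Product.parse(html)
    product.name            #=> "Widget"
    product.price.amount    #=> 9.99
    product.price.currency  #=> "USD"
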
data/lib/nibbler/json.rb ADDED
@@ -0,0 +1,27 @@
+ require 'nibbler'
+
+ # a wrapper for JSON data that provides `at` and `search`
+ class Nibbler::JsonDocument
+   def initialize(obj)
+     @data = String === obj ? JSON.parse(obj) : obj
+   end
+
+   def self.[](obj)
+     self === obj ? obj : new(obj)
+   end
+
+   def search(selector)
+     @data.to_a
+   end
+
+   def at(selector)
+     @data[selector]
+   end
+ end
+
+ # a scraper that works with JsonDocument
+ class NibblerJSON < Nibbler
+   def self.convert_document(doc)
+     Nibbler::JsonDocument[doc]
+   end
+ end
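
Since `JsonDocument#at` is a plain hash lookup and `#search` returns the top-level collection, `NibblerJSON` accepts either a JSON string or an already-parsed Ruby hash or array, as examples/twitter.rb shows. A minimal sketch modeled on that example; the `Commit` scraper and its sample data are invented for illustration:

    require 'nibbler/json'
    require 'json'

    class Commit < NibblerJSON
      element 'sha' => :sha
      element 'message' => :message
      element 'author' => :author, :with => NibblerJSON do
        element 'name' => :name
      end
    end

    json = '{"sha": "deadbeef", "message": "first commit", "author": {"name": "Mislav"}}'
    commit = Commit.parse(json)
    commit.sha          #=> "deadbeef"
    commit.message      #=> "first commit"
    commit.author.name  #=> "Mislav"
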
metadata ADDED
@@ -0,0 +1,121 @@
+ --- !ruby/object:Gem::Specification
+ name: nibbler
+ version: !ruby/object:Gem::Version
+   hash: 15
+   prerelease: false
+   segments:
+   - 1
+   - 0
+   version: "1.0"
+ platform: ruby
+ authors:
+ - "Mislav Marohni\xC4\x87"
+ autorequire:
+ bindir: bin
+ cert_chain: []
+
+ date: 2010-08-16 00:00:00 +02:00
+ default_executable:
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: yajl-ruby
+   prerelease: false
+   requirement: &id001 !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         hash: 9
+         segments:
+         - 0
+         - 7
+         - 5
+         version: 0.7.5
+   type: :development
+   version_requirements: *id001
+ - !ruby/object:Gem::Dependency
+   name: nokogiri
+   prerelease: false
+   requirement: &id002 !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         hash: 113
+         segments:
+         - 1
+         - 4
+         - 3
+         - 1
+         version: 1.4.3.1
+   type: :development
+   version_requirements: *id002
+ - !ruby/object:Gem::Dependency
+   name: rspec
+   prerelease: false
+   requirement: &id003 !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         hash: 27
+         segments:
+         - 1
+         - 3
+         - 0
+         version: 1.3.0
+   type: :development
+   version_requirements: *id003
+ description: Nibbler is a super simple and powerful declarative generic scraper written in under 70 lines of code.
+ email: mislav.marohnic@gmail.com
+ executables: []
+
+ extensions: []
+
+ extra_rdoc_files: []
+
+ files:
+ - Rakefile
+ - lib/nibbler/json.rb
+ - lib/nibbler.rb
+ - examples/delicious.rb
+ - examples/tweetburner.rb
+ - examples/twitter.rb
+ - README.md
+ - LICENSE
+ has_rdoc: false
+ homepage: http://github.com/mislav/nibbler
+ licenses: []
+
+ post_install_message:
+ rdoc_options: []
+
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       hash: 3
+       segments:
+       - 0
+       version: "0"
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       hash: 3
+       segments:
+       - 0
+       version: "0"
+ requirements: []
+
+ rubyforge_project:
+ rubygems_version: 1.3.7
+ signing_key:
+ specification_version: 3
+ summary: A cute HTML scraper / data extraction tool
+ test_files: []
+