RubyGems - pismo - Versions diffs - 0.7.0 → 0.7.1 - Mend

pismo 0.7.0 → 0.7.1

Files changed (13) hide show

data/.gitignore +5 -0
data/Gemfile +4 -0
data/README.markdown +13 -2
data/Rakefile +2 -28
data/bin/pismo +1 -1
data/lib/pismo.rb +7 -3
data/lib/pismo/internal_attributes.rb +20 -21
data/lib/pismo/stopwords.txt +40 -3
data/lib/pismo/version.rb +3 -0
data/pismo.gemspec +24 -94
data/test/corpus/metadata_expected.yaml +8 -8
metadata +81 -45
data/VERSION +0 -1

data/.gitignore CHANGED

@@ -1,3 +1,8 @@
+pkg/*
+*.gem
+.bundle
+Gemfile.lock
 ## MAC OS
 .DS_Store

data/Gemfile ADDED

@@ -0,0 +1,4 @@
+source "http://rubygems.org"
+# Specify your gem's dependencies in pismo.gemspec
+gemspec

data/README.markdown CHANGED

@@ -6,7 +6,11 @@ Pismo extracts machine-usable metadata from unstructured (or poorly structured)
 Data that Pismo can extract include titles, feed URLs, ledes, body text, image URLs, date, and keywords.
 Pismo is used heavily in production on http://coder.io/ to extract data from Web pages.
-All tests pass on Ruby 1.8.7 (MRI) and Ruby 1.9.1-p378 (MRI).
+All tests pass on Ruby 1.8.7, Ruby 1.9.2 (both MRI) and JRuby 1.5.6.
+## NEWS:
+December 19, 2010: Version 0.7.1 has been released - it includes a patch from Darcy Laycock to fix keyword extraction problems on some pages, has switched from Jeweler to Bundler for management of the gem, and adds support for JRuby 1.5.6 by skipping stemming on that platform.
 ## USAGE:
@@ -46,12 +50,19 @@ The current metadata methods are:
 These methods are not fully documented here yet - you'll just need to try them out. The plural methods like #titles, #authors, and #feeds will return multiple matches in an array, if present. This is so you can use your own techniques to choose a "best" result in ambiguous cases.
 The html_body and body methods will be of particular interest. They return the "body" of the page as determined by Pismo's "Reader" (like Arc90's Readability or Safari Reader) algorithm. #body returns it as plain-text, #html_body maintains some basic HTML styling.
+New! The keywords method accepts optional arguments. These are the current defaults:
+    :stem_at => 20, :word_length_limit => 15, :limit => 20, :remove_stopwords => true, :minimum_score => 2
+You can also pass an array to keywords with :hints => arr if you want only words of your choosing to be found.
 ## CAVEATS AND SHORTCOMINGS:
 There are some shortcomings or problems that I'm aware of and am going to pursue:
-* I do not know how Pismo fares on Rubinius or other versions of 1.9 (e.g. 1.9.2) yet
+* I do not know how Pismo fares on Rubinius
+* pismo requires Bundler - get it :-)
 * pismo does not install on JRuby due to a problem in the fast-stemmer dependency
 * Some users have had issues with using Pismo from irb. This appears to be related to Nokogiri use causing a segfault
 * The "Reader" content extraction algorithm is not perfect. It can sometimes return crap and can barf on certain types of characters for sentence extraction

data/Rakefile CHANGED

@@ -1,29 +1,5 @@
-require 'rubygems'
-require 'rake'
-begin
-  require 'jeweler'
-  Jeweler::Tasks.new do |gem|
-    gem.name = "pismo"
-    gem.summary = %Q{Extracts or retrieves content-related metadata from HTML pages}
-    gem.description = %Q{Pismo extracts and retrieves content-related metadata from HTML pages - you can use the resulting data in an organized way, such as a summary/first paragraph, body text, keywords, RSS feed URL, favicon, etc.}
-    gem.email = "git@peterc.org"
-    gem.homepage = "http://github.com/peterc/pismo"
-    gem.authors = ["Peter Cooper"]
-    gem.executables = "pismo"
-    gem.default_executable = "pismo"
-    gem.add_development_dependency "shoulda", ">= 0"
-    gem.add_development_dependency "awesome_print"
-    gem.add_dependency "jeweler"
-    gem.add_dependency "nokogiri"
-    gem.add_dependency "sanitize"
-    gem.add_dependency "fast-stemmer"
-    gem.add_dependency "chronic"
-  end
-  Jeweler::GemcutterTasks.new
-rescue LoadError
-  puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
-end
+require 'bundler'
+Bundler::GemHelper.install_tasks
 require 'rake/testtask'
 Rake::TestTask.new(:test) do |test|
@@ -45,8 +21,6 @@ rescue LoadError
   end
 end
-task :test => :check_dependencies
 task :default => :test
 require 'rake/rdoctask'

data/bin/pismo CHANGED

@@ -32,7 +32,7 @@ if ARGV.empty?
   P = doc
   @p = doc
   puts "Pismo has loaded #{url} into @p and P"
-  puts "Note: There have been several reports of Nokogiri segfaulting while using Pismo from irb. If this happens, try the same code as a standalone Ruby app."
+  #puts "Note: There have been several reports of Nokogiri segfaulting while using Pismo from irb. If this happens, try the same code as a standalone Ruby app."
   IRB.start
 else
   output = { :url => doc.url }

data/lib/pismo.rb CHANGED

@@ -2,7 +2,6 @@
 require 'open-uri'
 require 'nokogiri'
-require 'fast_stemmer'
 require 'chronic'
 require 'sanitize'
 require 'tempfile'
@@ -11,6 +10,12 @@ $: << File.dirname(__FILE__)
 require 'pismo/document'
 require 'pismo/reader'
+if RUBY_PLATFORM == "java"
+  class String; def stem; self; end; end
+else
+  require 'fast_stemmer'
+end
 module Pismo
   # Sugar methods to make creating document objects nicer
   def self.document(handle, url = nil)
@@ -59,8 +64,7 @@ class Nokogiri::HTML::Document
       end
       if result
-      #  result.gsub!(/\342\200\231/, '\'')
-      #  result.gsub!(/\342\200\224/, '-')
+        # TODO: Sort out sanitization in a more centralized way
         result.gsub!('’', '\'')
         result.gsub!('—', '-')
         if all

data/lib/pismo/internal_attributes.rb CHANGED

@@ -15,6 +15,7 @@ module Pismo
                             '.post-header h1',
                             '.entry-title',
                             '.post-title',
+                            '.post h1',
                             '.post h3 a',
                             'a.datitle',          # Slashdot style
                             '.posttitle',
@@ -93,9 +94,7 @@ module Pismo
       datetime = 10
       regexen.each do |r|
-        datetime = @doc.to_html[r]
-        # p datetime
-        break if datetime
+        break if datetime = @doc.to_html[r]
       end
       return unless datetime && datetime.length > 4
@@ -111,10 +110,6 @@ module Pismo
       Chronic.parse(datetime) || datetime
     end
-    # TODO: Attempts to work out what type of site or page the page is from the provided URL
-    # def site_type
-    # end
     # Returns the author of the page/content
     def author(all = false)
       author = @doc.match([
@@ -189,13 +184,15 @@ module Pismo
                   '.post-text p',
                   '#blogpost p',
                   '.story-teaser',
-                  '//div[@class="entrytext"]//p[string-length()>10]',                      # Ruby Inside / Kubrick style
+                  '.article .body p',
+                  '//div[@class="entrytext"]//p[string-length()>40]',                      # Ruby Inside / Kubrick style
                   'section p',
                   '.entry .text p',
+                  '.hentry .content p',
                   '.entry-content p',
                   '#wikicontent p',                                                        # Google Code style
                   '.wikistyle p',                                                          # GitHub style
-                  '//td[@class="storybody"]/p[string-length()>10]',                        # BBC News style
+                  '//td[@class="storybody"]/p[string-length()>40]',                        # BBC News style
                   '//div[@class="entry"]//p[string-length()>100]',
                   # The below is a horrible, horrible way to pluck out lead paras from crappy Blogspot blogs that
                   # don't use <p> tags..
@@ -212,16 +209,16 @@ module Pismo
       # TODO: Improve sentence extraction - this is dire even if it "works for now"
       if lede && String === lede
-        return (lede[/^(.*?[\.\!\?]\s){2}/m] || lede).to_s.strip
+        return (lede[/^(.*?[\.\!\?]\s){1,3}/m] || lede).to_s.strip
       elsif lede && Array === lede
-        return lede.map { |l| l.to_s[/^(.*?[\.\!\?]\s){2}/m].strip || l }.uniq
+        return lede.map { |l| l.to_s[/^(.*?[\.\!\?]\s){1,3}/m].strip || l }.uniq
       else
-        return reader_doc && !reader_doc.sentences(3).empty? ? reader_doc.sentences(3).join(' ') : nil
+        return reader_doc && !reader_doc.sentences(4).empty? ? reader_doc.sentences(4).join(' ') : nil
       end
     end
     def ledes
-      lede(true)
+      lede(true) rescue []
     end
     # Returns a string containing the first [limit] sentences as determined by the Reader algorithm
@@ -236,29 +233,31 @@ module Pismo
     # Returns the "keywords" in the document (not the meta keywords - they're next to useless now)
     def keywords(options = {})
-      options = { :stem_at => 20, :word_length_limit => 15, :limit => 20 }.merge(options)
+      options = { :stem_at => 20, :word_length_limit => 15, :limit => 20, :remove_stopwords => true, :minimum_score => 2 }.merge(options)
       words = {}
       # Convert doc to lowercase, scrub out most HTML tags, then keep track of words
-      cached_title = title
+      cached_title = title.to_s
       content_to_use = body.to_s.downcase + " " + description.to_s.downcase
       # old regex for safe keeping -- \b[a-z][a-z\+\.\'\+\#\-]*\b
-      content_to_use.downcase.gsub(/\<[^\>]{1,100}\>/, '').gsub(/\.+\s+/, ' ').gsub(/\&\w+\;/, '').scan(/(\b|\s|\A)([a-z0-9][a-z0-9\+\.\'\+\#\-\/\\]*)(\b|\s|\Z)/i).map{ |ta1| ta1[1] }.each do |word|
+      content_to_use.downcase.gsub(/\<[^\>]{1,100}\>/, '').gsub(/\.+\s+/, ' ').gsub(/\&\w+\;/, '').scan(/(\b|\s|\A)([a-z0-9][a-z0-9\+\.\'\+\#\-\\]*)(\b|\s|\Z)/i).map{ |ta1| ta1[1] }.compact.each do |word|
         next if word.length > options[:word_length_limit]
-        word.gsub!(/\'\w+/, '')
+        word.gsub!(/^[\']/, '')
+        word.gsub!(/[\.\-\']$/, '')
+        next if options[:hints] && !options[:hints].include?(word)
         words[word] ||= 0
-        words[word] += (cached_title.downcase.include?(word) ? 5 : 1)
+        words[word] += (cached_title.downcase =~ /\b#{word}\b/ ? 5 : 1)
       end
       # Stem the words and stop words if necessary
       d = words.keys.uniq.map { |a| a.length > options[:stem_at] ? a.stem : a }
       s = Pismo.stopwords.map { |a| a.length > options[:stem_at] ? a.stem : a }
-      w = words.delete_if { |k1, v1| s.include?(k1) || (v1 < 2 && words.size > 80) }.sort_by { |k2, v2| v2 }.reverse.first(options[:limit])
-      return w
+      words.delete_if { |k1, v1| v1 < options[:minimum_score] }
+      words.delete_if { |k1, v1| s.include?(k1) } if options[:remove_stopwords]
+      words.sort_by { |k2, v2| v2 }.reverse.first(options[:limit])
     end
     def reader_doc

data/lib/pismo/stopwords.txt CHANGED

@@ -1,3 +1,4 @@
+a
 a's
 Aaliyah
 Aaron
@@ -70,6 +71,8 @@ apart
 appear
 appreciate
 appropriate
+approximate
+approximately
 apr
 april
 are
@@ -138,6 +141,7 @@ Brooklyn
 Bryan
 Bryce
 but
+by
 c'mon
 c's
 Caden
@@ -238,6 +242,7 @@ driven
 drove
 during
 Dylan
+e
 each
 easier
 edu
@@ -282,6 +287,7 @@ existing
 extensive
 extra
 extremely
+f
 Faith
 false
 fame
@@ -310,6 +316,7 @@ fuck
 full
 further
 furthermore
+g
 Gabriel
 Gabriella
 Gabrielle
@@ -334,6 +341,7 @@ gotten
 Grace
 great
 greetings
+h
 had
 hadn't
 Hailey
@@ -376,12 +384,14 @@ howbeit
 however
 huge
 Hunter
+i
 i'd
 i'll
 i'm
 i've
 Ian
 ie
+if
 ignored
 imagine
 immediate
@@ -418,6 +428,7 @@ it's
 its
 itself
 Ivan
+j
 Jack
 Jackson
 Jacob
@@ -440,6 +451,7 @@ Jessica
 Jesus
 jim
 jimmy
+jnr
 Jocelyn
 Joel
 John
@@ -450,6 +462,7 @@ Jose
 Joseph
 Joshua
 Josiah
+jr
 Juan
 jul
 Julia
@@ -459,6 +472,7 @@ jun
 june
 just
 Justin
+k
 Kaden
 Kaitlyn
 Kaleb
@@ -479,6 +493,7 @@ known
 knows
 Kyle
 Kylie
+l
 la
 Landon
 last
@@ -518,6 +533,7 @@ ltd
 Lucas
 Luis
 Luke
+m
 Mackenzie
 Madeline
 Madison
@@ -564,6 +580,7 @@ much
 must
 my
 myself
+n
 name
 namely
 Natalie
@@ -602,6 +619,7 @@ novel
 november
 now
 nowhere
+o
 Obie
 obviously
 oct
@@ -637,6 +655,7 @@ out
 overall
 Owen
 own
+p
 Paige
 par
 Parker
@@ -666,9 +685,11 @@ proud
 provide
 provides
 put
+q
 que
 quite
 qv
+r
 Rachel
 rather
 rd
@@ -694,6 +715,7 @@ Riley
 Robert
 run
 Ryan
+s
 safest
 said
 Samantha
@@ -764,6 +786,7 @@ specify
 specifying
 spoke
 spread
+sr
 stand
 started
 step
@@ -780,6 +803,7 @@ sup
 sur
 sure
 Sydney
+t
 t's
 take
 taken
@@ -859,6 +883,7 @@ twice
 two
 Tyler
 typically
+u
 ultra
 un
 unfortunately
@@ -876,6 +901,7 @@ uses
 using
 usually
 uucp
+v
 value
 Vanessa
 various
@@ -886,6 +912,7 @@ Victoria
 Vincent
 viz
 vs
+w
 walks
 want
 wants
@@ -927,8 +954,6 @@ who's
 whoever
 whole
 whom
-approximate
-approximately
 whose
 why
 will
@@ -948,6 +973,7 @@ wouldn't
 wrapped
 Wyatt
 Xavier
+y
 yeah
 yes
 yet
@@ -960,6 +986,17 @@ your
 yours
 yourself
 yourselves
+z
 Zachary
 zero
-Zoe
+Zoe
+0
+1
+2
+3
+4
+5
+6
+7
+8
+9

data/lib/pismo/version.rb ADDED

@@ -0,0 +1,3 @@
+module Pismo
+  VERSION = "0.7.1"
+end

data/pismo.gemspec CHANGED

@@ -1,101 +1,31 @@
-# Generated by jeweler
-# DO NOT EDIT THIS FILE DIRECTLY
-# Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
 # -*- encoding: utf-8 -*-
+$:.push File.expand_path("../lib", __FILE__)
+require "pismo/version"
 Gem::Specification.new do |s|
-  s.name = %q{pismo}
-  s.version = "0.7.0"
-  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
-  s.authors = ["Peter Cooper"]
-  s.date = %q{2010-07-27}
-  s.default_executable = %q{pismo}
+  s.name        = "pismo"
+  s.version     = Pismo::VERSION
+  s.platform    = Gem::Platform::RUBY
+  s.authors     = ["Peter Cooper"]
+  s.email       = ["git@peterc.org"]
+  s.homepage    = "http://github.com/peterc/pismo"
+  s.summary     = %q{TODO: Write a gem summary}
   s.description = %q{Pismo extracts and retrieves content-related metadata from HTML pages - you can use the resulting data in an organized way, such as a summary/first paragraph, body text, keywords, RSS feed URL, favicon, etc.}
-  s.email = %q{git@peterc.org}
-  s.executables = ["pismo"]
-  s.extra_rdoc_files = [
-    "LICENSE",
-     "README.markdown"
-  ]
-  s.files = [
-    ".document",
-     ".gitignore",
-     "LICENSE",
-     "NOTICE",
-     "README.markdown",
-     "Rakefile",
-     "VERSION",
-     "bin/pismo",
-     "lib/pismo.rb",
-     "lib/pismo/document.rb",
-     "lib/pismo/external_attributes.rb",
-     "lib/pismo/internal_attributes.rb",
-     "lib/pismo/reader.rb",
-     "lib/pismo/stopwords.txt",
-     "pismo.gemspec",
-     "test/corpus/bbcnews.html",
-     "test/corpus/bbcnews2.html",
-     "test/corpus/briancray.html",
-     "test/corpus/cant_read.html",
-     "test/corpus/factor.html",
-     "test/corpus/gmane.html",
-     "test/corpus/huffington.html",
-     "test/corpus/metadata_expected.yaml",
-     "test/corpus/metadata_expected.yaml.old",
-     "test/corpus/queness.html",
-     "test/corpus/reader_expected.yaml",
-     "test/corpus/rubyinside.html",
-     "test/corpus/rww.html",
-     "test/corpus/spolsky.html",
-     "test/corpus/techcrunch.html",
-     "test/corpus/tweet.html",
-     "test/corpus/youtube.html",
-     "test/corpus/zefrank.html",
-     "test/helper.rb",
-     "test/test_corpus.rb",
-     "test/test_pismo_document.rb"
-  ]
-  s.homepage = %q{http://github.com/peterc/pismo}
-  s.rdoc_options = ["--charset=UTF-8"]
-  s.require_paths = ["lib"]
-  s.rubygems_version = %q{1.3.5}
-  s.summary = %q{Extracts or retrieves content-related metadata from HTML pages}
-  s.test_files = [
-    "test/helper.rb",
-     "test/test_corpus.rb",
-     "test/test_pismo_document.rb"
-  ]
+  s.summary     = %q{Extracts or retrieves content-related metadata from HTML pages}
+  s.date        = %q{2010-07-27}
+  s.default_executable = %q{pismo}
-  if s.respond_to? :specification_version then
-    current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
-    s.specification_version = 3
+  s.rubyforge_project = "pismo"
-    if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
-      s.add_development_dependency(%q<shoulda>, [">= 0"])
-      s.add_development_dependency(%q<awesome_print>, [">= 0"])
-      s.add_runtime_dependency(%q<jeweler>, [">= 0"])
-      s.add_runtime_dependency(%q<nokogiri>, [">= 0"])
-      s.add_runtime_dependency(%q<sanitize>, [">= 0"])
-      s.add_runtime_dependency(%q<fast-stemmer>, [">= 0"])
-      s.add_runtime_dependency(%q<chronic>, [">= 0"])
-    else
-      s.add_dependency(%q<shoulda>, [">= 0"])
-      s.add_dependency(%q<awesome_print>, [">= 0"])
-      s.add_dependency(%q<jeweler>, [">= 0"])
-      s.add_dependency(%q<nokogiri>, [">= 0"])
-      s.add_dependency(%q<sanitize>, [">= 0"])
-      s.add_dependency(%q<fast-stemmer>, [">= 0"])
-      s.add_dependency(%q<chronic>, [">= 0"])
-    end
-  else
-    s.add_dependency(%q<shoulda>, [">= 0"])
-    s.add_dependency(%q<awesome_print>, [">= 0"])
-    s.add_dependency(%q<jeweler>, [">= 0"])
-    s.add_dependency(%q<nokogiri>, [">= 0"])
-    s.add_dependency(%q<sanitize>, [">= 0"])
-    s.add_dependency(%q<fast-stemmer>, [">= 0"])
-    s.add_dependency(%q<chronic>, [">= 0"])
-  end
+  s.files         = `git ls-files`.split("\n")
+  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
+  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
+  s.require_paths = ["lib"]
+  s.add_dependency(%q<shoulda>, [">= 0"])
+  s.add_dependency(%q<awesome_print>, [">= 0"])
+  s.add_dependency(%q<nokogiri>, [">= 0"])
+  s.add_dependency(%q<sanitize>, [">= 0"])
+  s.add_dependency(%q<fast-stemmer>, [">= 0"])
+  s.add_dependency(%q<chronic>, [">= 0"])
 end

data/test/corpus/metadata_expected.yaml CHANGED

@@ -2,14 +2,14 @@
 :rww:
   :title: "Cartoon: Apple Tablet: Now With Barometer and Bird Call Generator"
   :feed: http://www.readwriteweb.com/rss.xml
-  :lede: I'm just aching to know if the new Apple tablet (insert caveats, weasel words and qualifiers here) is a potential Cintiq competitor. I don't think it will be, but you never know. It may also have a built in barometer and bird call generator.
+  :lede: I'm just aching to know if the new Apple tablet (insert caveats, weasel words and qualifiers here) is a potential Cintiq competitor. I don't think it will be, but you never know. It may also have a built in barometer and bird call generator. I'm never sure if Apple does themselves more good than harm with the secrecy and anticipation that surrounds the run-up to these announcements.
   :feeds:
     - http://www.readwriteweb.com/rss.xml
     - http://www.readwriteweb.com/archives/2010/01/cartoon_apple_tablet_now_with_barometer_and_bird_c.xml
 :briancray:
   :title: 5 great examples of popular blog posts that you should know
   :feed: http://feeds.feedburner.com/briancray/blog
-  :lede: "This is a mock post. While there is a place for all of these posts, I'm trying to make a point that original blogs are being shut out by formulaic blogs."
+  :lede: "This is a mock post."
 :huffington:
   :title: Afghans Losing Hope After 8 Years Of War
   :author: TODD PITMAN
@@ -31,9 +31,9 @@
   :feed: http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/england/rss.xml
 :factor:
   :title: Factor's bootstrap process explained
-  :lede: "Separation of concerns between Factor VM and library codeThe Factor VM implements an abstract machine consisting of a data heap of objects, a code heap of machine code blocks, and a set of stacks. The VM loads an image file on startup, which becomes the data and code heap."
+  :lede: "Separation of concerns between Factor VM and library codeThe Factor VM implements an abstract machine consisting of a data heap of objects, a code heap of machine code blocks, and a set of stacks. The VM loads an image file on startup, which becomes the data and code heap. It then begins executing code in the image, by calling a special startup quotation.When new source files are loaded into a running Factor instance by the developer, they are parsed and compiled into a collection of objects -- words, quotations, and other literals, along with executable machine code."
   :ledes:
-    - "Separation of concerns between Factor VM and library codeThe Factor VM implements an abstract machine consisting of a data heap of objects, a code heap of machine code blocks, and a set of stacks. The VM loads an image file on startup, which becomes the data and code heap."
+    - "Separation of concerns between Factor VM and library codeThe Factor VM implements an abstract machine consisting of a data heap of objects, a code heap of machine code blocks, and a set of stacks. The VM loads an image file on startup, which becomes the data and code heap. It then begins executing code in the image, by calling a special startup quotation.When new source files are loaded into a running Factor instance by the developer, they are parsed and compiled into a collection of objects -- words, quotations, and other literals, along with executable machine code."
 :youtube:
   :title: YMO - Rydeen (Official Video)
   :author: ymo1965
@@ -42,7 +42,7 @@
 :spolsky:
   :title: The Absolute Minimum Every Software Developer Absolutely, Positively Must Know About Unicode and Character Sets (No Excuses!) - Joel on Software
   :description: Haven't mastered the basics of Unicode and character sets? Please don't write another line of code until you've read this article.
-  :lede: Ever wonder about that mysterious Content-Type tag? You know, the one you're supposed to put in HTML and you never quite know what it should be? Did you ever get an email from your friends in Bulgaria with the subject line "????
+  :lede: "Ever wonder about that mysterious Content-Type tag? You know, the one you're supposed to put in HTML and you never quite know what it should be? Did you ever get an email from your friends in Bulgaria with the subject line \"???? "
   :author: Joel Spolsky
   :favicon: /favicon.ico
   :feed: http://www.joelonsoftware.com/rss.xml
@@ -52,14 +52,14 @@
 :rubyinside:
   :title: "CoffeeScript: A New Language With A Pure Ruby Compiler"
   :author: Peter Cooper
-  :lede: CoffeeScript (GitHub repo) is a new programming language with a pure Ruby compiler. Creator Jeremy Ashkenas calls it "JavaScript's less ostentatious kid brother" - mostly because it compiles into JavaScript and shares most of the same constructs, but with a different, tighter syntax.
+  :lede: CoffeeScript (GitHub repo) is a new programming language with a pure Ruby compiler.
   :feed: http://www.rubyinside.com/feed/
 :zefrank:
   :sentences: If there's anyone who knows how to marshal an online audience, it's Ze Frank. Ze is best-known for his 2006 program "The Show," in which he made a new 2-3 minute video every day for 1 year. Topics ranged from "fingers in food" to the mysteries of airport signage to a tour de force summary of creatives' addiction to un-executed ideas, aka brain crack.
   :title: "Ze Frank on Imaginary Audiences :: Articles :: The 99 Percent"
   :description: We chat with the Internet's most notorious mass-collaboration instigator Ze Frank about idea execution and how to build armies of sportsracers.
 :tweet:
-  :lede: Gobsmacked that TeX/LaTeX (document formatting tools) for OS X is a 1.3GB (yes, GIGAbytes) download OS X. Wow..!
+  :lede: Gobsmacked that TeX/LaTeX (document formatting tools) for OS X is a 1.3GB (yes, GIGAbytes) download OS X.
   :sentences: Gobsmacked that TeX/LaTeX (document formatting tools) for OS X is a 1.3GB (yes, GIGAbytes) download OS Wow..!
 :cant_read:
   :sentences: "For those of us who grew up as weird kids in the 1980s, the work of Berkeley Breathed was as important as those twin eternal pillars of weird-kid-dom: Monty Python and Mad magazine. In a word: seminal. In two words: fucking seminal."
@@ -67,6 +67,6 @@
   :sentences: I am pleased to report that the GCC Steering Committee and the FSF have approved the use of C++ in GCC itself. Of course, there's no reason for us to use C++ features just because we can. The goal is a better compiler for users, not a C++ code base for its own sake.
 :queness:
   :title: 18 Incredible CSS3 Effects You Have Never Seen Before
-  :lede: "CSS3 is hot these days and will soon be available in most modern browser. Just recently, I started to become aware to the present of CSS3 around the web. I can see some of the websites such as twitter and designer portfolios websites are using it."
+  :lede: "CSS3 is hot these days and will soon be available in most modern browser. Just recently, I started to become aware to the present of CSS3 around the web. I can see some of the websites such as twitter and designer portfolios websites are using it. Also, I have started to implement it to my own project as well and I really love it!"
   :sentences: CSS3 is hot these days and will soon be available in most modern browser. Just recently, I started to become aware to the present of CSS3 around the web. I can see some of the websites such as twitter and designer portfolios websites are using it.
   :datetime: 2010-06-02 12:00:00 +01:00

metadata CHANGED

@@ -1,7 +1,12 @@
 --- !ruby/object:Gem::Specification
 name: pismo
 version: !ruby/object:Gem::Version
-  version: 0.7.0
+  prerelease: false
+  segments:
+  - 0
+  - 7
+  - 1
+  version: 0.7.1
 platform: ruby
 authors:
 - Peter Cooper
@@ -14,91 +19,99 @@ default_executable: pismo
 dependencies:
 - !ruby/object:Gem::Dependency
   name: shoulda
-  type: :development
-  version_requirement:
-  version_requirements: !ruby/object:Gem::Requirement
+  prerelease: false
+  requirement: &id001 !ruby/object:Gem::Requirement
+    none: false
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
+        segments:
+        - 0
         version: "0"
-    version:
+  type: :runtime
+  version_requirements: *id001
 - !ruby/object:Gem::Dependency
   name: awesome_print
-  type: :development
-  version_requirement:
-  version_requirements: !ruby/object:Gem::Requirement
+  prerelease: false
+  requirement: &id002 !ruby/object:Gem::Requirement
+    none: false
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
+        segments:
+        - 0
         version: "0"
-    version:
-- !ruby/object:Gem::Dependency
-  name: jeweler
   type: :runtime
-  version_requirement:
-  version_requirements: !ruby/object:Gem::Requirement
-    requirements:
-    - - ">="
-      - !ruby/object:Gem::Version
-        version: "0"
-    version:
+  version_requirements: *id002
 - !ruby/object:Gem::Dependency
   name: nokogiri
-  type: :runtime
-  version_requirement:
-  version_requirements: !ruby/object:Gem::Requirement
+  prerelease: false
+  requirement: &id003 !ruby/object:Gem::Requirement
+    none: false
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
+        segments:
+        - 0
         version: "0"
-    version:
+  type: :runtime
+  version_requirements: *id003
 - !ruby/object:Gem::Dependency
   name: sanitize
-  type: :runtime
-  version_requirement:
-  version_requirements: !ruby/object:Gem::Requirement
+  prerelease: false
+  requirement: &id004 !ruby/object:Gem::Requirement
+    none: false
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
+        segments:
+        - 0
         version: "0"
-    version:
+  type: :runtime
+  version_requirements: *id004
 - !ruby/object:Gem::Dependency
   name: fast-stemmer
-  type: :runtime
-  version_requirement:
-  version_requirements: !ruby/object:Gem::Requirement
+  prerelease: false
+  requirement: &id005 !ruby/object:Gem::Requirement
+    none: false
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
+        segments:
+        - 0
         version: "0"
-    version:
+  type: :runtime
+  version_requirements: *id005
 - !ruby/object:Gem::Dependency
   name: chronic
-  type: :runtime
-  version_requirement:
-  version_requirements: !ruby/object:Gem::Requirement
+  prerelease: false
+  requirement: &id006 !ruby/object:Gem::Requirement
+    none: false
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
+        segments:
+        - 0
         version: "0"
-    version:
+  type: :runtime
+  version_requirements: *id006
 description: Pismo extracts and retrieves content-related metadata from HTML pages - you can use the resulting data in an organized way, such as a summary/first paragraph, body text, keywords, RSS feed URL, favicon, etc.
-email: git@peterc.org
+email:
+- git@peterc.org
 executables:
 - pismo
 extensions: []
-extra_rdoc_files:
-- LICENSE
-- README.markdown
+extra_rdoc_files: []
 files:
 - .document
 - .gitignore
+- Gemfile
 - LICENSE
 - NOTICE
 - README.markdown
 - Rakefile
-- VERSION
 - bin/pismo
 - lib/pismo.rb
 - lib/pismo/document.rb
@@ -106,6 +119,7 @@ files:
 - lib/pismo/internal_attributes.rb
 - lib/pismo/reader.rb
 - lib/pismo/stopwords.txt
+- lib/pismo/version.rb
 - pismo.gemspec
 - test/corpus/bbcnews.html
 - test/corpus/bbcnews2.html
@@ -133,30 +147,52 @@ homepage: http://github.com/peterc/pismo
 licenses: []
 post_install_message:
-rdoc_options:
-- --charset=UTF-8
+rdoc_options: []
 require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
+      segments:
+      - 0
       version: "0"
-  version:
 required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
+      segments:
+      - 0
       version: "0"
-  version:
 requirements: []
-rubyforge_project:
-rubygems_version: 1.3.5
+rubyforge_project: pismo
+rubygems_version: 1.3.7
 signing_key:
 specification_version: 3
 summary: Extracts or retrieves content-related metadata from HTML pages
 test_files:
+- test/corpus/bbcnews.html
+- test/corpus/bbcnews2.html
+- test/corpus/briancray.html
+- test/corpus/cant_read.html
+- test/corpus/factor.html
+- test/corpus/gmane.html
+- test/corpus/huffington.html
+- test/corpus/metadata_expected.yaml
+- test/corpus/metadata_expected.yaml.old
+- test/corpus/queness.html
+- test/corpus/reader_expected.yaml
+- test/corpus/rubyinside.html
+- test/corpus/rww.html
+- test/corpus/spolsky.html
+- test/corpus/techcrunch.html
+- test/corpus/tweet.html
+- test/corpus/youtube.html
+- test/corpus/zefrank.html
 - test/helper.rb
 - test/test_corpus.rb
 - test/test_pismo_document.rb

data/VERSION DELETED

@@ -1 +0,0 @@
-0.7.0