RubyGems - auto_excerpt - Versions diffs - 0.6.3 → 0.7.0 - Mend

auto_excerpt 0.6.3 → 0.7.0

Files changed (7) hide show

data/CHANGELOG CHANGED Viewed

@@ -1,3 +1,8 @@
+== 0.7.0 (2010-01-31)
+  * Changed AutoExcerpt from a Class to a Module in order to return a String object when used
+  * Removed String#clean
+  * Limiting by :characters does not break off in the middle of words
 == 0.6.3 (2010-01-26)
   * Removed limit by :characters for the time being
   * Improved limit by characters to be more accurate

data/README.textile CHANGED Viewed

@@ -2,7 +2,7 @@ h1. AutoExcerpt
 pre. [sudo] gem install auto_excerpt
-Creates Automatic excerpts of html formatted text.
+Creates excerpts of html formatted text.
 pre. AutoExcerpt.new("<span>This is <strong>some</strong> fancy html formatted text homie</span>", {:words => 5})
     #   => "<span>This is <strong>some</strong> fancy html...</span>"
@@ -11,16 +11,17 @@ h3. Features
   * There are 4 different ways to limit the length of an excerpt: *characters*, *words*, *sentences*, *paragraphs*
   * If the excerpt would be shorter than the limit that is set, the entire text will be shown.
-  * HTML can be stripped
+  * If limiting by *characters* the gem will ensure that the excerpt does not cut off in the middle of a word.
+  * HTML can be stripped. You can also set specific tags that you don't want stripped.
   * HTML tags are automatically closed.
 h3. Options
 @:characters@
 The number of characters to display from the text.
-Default: 150 (does not need to be reset to 0 if you choose another option)
+Default: 150 (does not need to be set to 0 if you choose another option)
-If you need to be 100% accurate in your character count, then remove the @:ending@
+_If you need to be 100% accurate in your character count, then remove the @:ending@_
 pre. AutoExcerpt.new("<h1>Hello World!</h1>", {:characters => 5, :ending => nil})
      # => <h1>Hello</h1>
@@ -39,6 +40,13 @@ pre. AutoExcerpt.new("This is cool stuff man!", :ending => ". Srsly!", :words =>
 Strips HTML tags from the excerpt that is displayed.
 Default: false
+@:allowed_tags@
+If using @:strip_html@ then this setting will allow the listed tags to be shown.
+Default: []
+pre. AutoExcerpt.new("<p>This <em>is</em> some <strong>formatted</strong> html</p>", {:strip_html => true, :allowed_tags => %w(p em)})
+    # => "<p>This <em>is</em> some formatted html</p>"
 @:strip_paragraphs@
 Strip all paragraph tags from the html.
 Default: false

data/VERSION CHANGED Viewed

@@ -1 +1 @@
-0.6.3
+0.7.0

data/lib/auto_excerpt/parser.rb ADDED Viewed

@@ -0,0 +1,177 @@
+module AutoExcerpt
+  # TODO allow for default options to be set.
+  class Parser
+    DEFAULTS = {
+       :characters => 0,
+       :words => 0,
+       :sentences => 0,
+       :paragraphs => 0,
+       # :skip_characters => 0,
+       :skip_words => 0,
+       :skip_sentences => 0,
+       :skip_paragraphs => 0,
+       :ending => '...',
+       :strip_html => false, :allowed_tags => [],
+       :strip_breaks_tabs => false,
+       :strip_paragraphs => false
+    }
+    # TODO add and allowwed tags option
+    PUNCTUATION_MARKS = /\!\s|\.\s|\?\s/
+    NO_CLOSE = %w( br hr img input ) # tags that do not have opposite closing tags
+    OPENING_TAG = /<([a-z0-9]{1,})\b[^>]*>/im
+    CLOSING_TAG = /<\/([a-z0-9]{1,})>/im
+    # @param [String] text The text to be excerpted
+    # @param [Hash] settings The settings for creating the excerpt
+    # @option settings [Integer] :characters (0) The number of characters to limit the html by
+    # @option settings [Integer] :words (0) The number of words to limit the html by
+    # @option settings [Integer] :sentences (0) The number of sentences to limit the html by
+    # @option settings [Integer] :paragraphs (0) The number of paragraphs to limit the html by
+    # @option settings [Integer] :skip_characters (0) The number of characters to skip from the start of the html
+    # @option settings [Integer] :skip_words (0) The number of words to skip from the start of the html
+    # @option settings [Integer] :skip_sentences (0) The number of sentences to skip from the start of the html
+    # @option settings [Integer] :skip_paragraphs (0) The number of paragraphs to skip from the start of the html
+    # @option settings [String] :ending ('...') A string added to the end of the excerpt
+    # @option settings [Boolean] :strip_html (false) Strip all HTML from the text before creating the excerpt
+    # @option settings [Boolean] :strip_paragraphs (false) Strip all <p> tags from the HTML before creating the excerpt
+    def initialize(text, settings = {})
+      @settings = Marshal.load(Marshal.dump(DEFAULTS)).merge(settings)
+      # make our copy
+      @body = text.dup.strip
+      @excerpt = ""
+      if @settings[:strip_html]
+        (@settings[:allowed_tags] << "p") if @settings[:paragraphs] > 0 # don't stip P tags if that is the limiter
+        @body = strip_html(@body)
+      end
+      @body = clean(@body) if @settings[:strip_breaks_tabs]
+      # TODO replace this with better regex
+      @body.replace(@body.gsub(/<(\/|)p>/,'')) if @settings[:strip_paragraphs]
+      @charcount = strip_html(@body).length
+      @wordcount = strip_html(@body).scan(/\w+/).size
+      @sencount  = @body.split(PUNCTUATION_MARKS).size
+      @pghcount  = @body.split("</p>").size
+      @settings[:characters] = 150 if @settings.values_at(:characters, :words, :sentences, :paragraphs).all?{|val| val.zero? || val.nil?  }
+    end
+    def create_excerpt
+      return characters unless @settings[:characters].zero?
+      return words      unless @settings[:words].zero?
+      return sentences  unless @settings[:sentences].zero?
+      return paragraphs unless @settings[:paragraphs].zero?
+    end
+    alias_method :parse, :create_excerpt
+    protected
+    attr_reader :charcount, :wordcount, :sencount, :pghcount
+    attr_accessor :settings, :body, :excerpt
+   # close html tags
+   # TODO make this work with new strip_html method. Improve regex
+    def close_tags(text)
+      # Don't bother closing tags if html is stripped since there are no tags.
+      if @settings[:strip_html] && @settings[:allowed_tags].empty?
+        tagstoclose = nil
+      else
+        tagstoclose = ""
+        tags = []
+        opentags = text.scan(OPENING_TAG).transpose[0] || []
+        opentags.reverse!
+        closedtags = text.scan(CLOSING_TAG).transpose[0] || []
+        opentags.each do |ot|
+          if closedtags.include?(ot)
+            closedtags.delete_at(closedtags.index(ot))
+          else
+            tags << ot
+          end
+        end
+        tags.each do |tag|
+          tagstoclose << "</#{tag.strip.downcase}>" unless NO_CLOSE.include?(tag)
+        end
+      end
+      @excerpt = [text, @settings[:ending], tagstoclose].compact.join
+    end
+    def non_excerpted_text
+      @settings[:ending] = nil
+      close_tags(@body)
+    end
+    # limit by characters
+    # @todo make this work with skip characters
+    def characters
+      return non_excerpted_text if @charcount < @settings[:characters]
+      html_count = char_count = 0
+      tags_entities = /#{Regexp.union(/(<[a-z0-9]{1,}\b[^>]*>)/,/(<\/[a-z0-9]{1,}>)/,/(&[^\s]*;)/)}/io
+      @body.split(tags_entities).each do |piece|
+        if piece =~ tags_entities
+          html_count += piece.length
+        else
+          chars_left = @settings[:characters] - char_count
+          # NOTE Do I want to count spaces or not?
+          piece.split(/\b/).each{|p|
+            break if (char_count >= @settings[:characters])
+            char_count += p.length
+          }
+        end
+        break if (char_count >= @settings[:characters])
+      end
+      text = clean(@body[0...(html_count+char_count)])
+      close_tags(text)
+    end
+    # limit by words
+    def words
+      return non_excerpted_text if @wordcount < @settings[:words]
+       text = @body.split(" ").slice(@settings[:skip_words], @settings[:words]).join(" ")
+       close_tags(text)
+    end
+    # limit by sentences
+    def sentences
+      return non_excerpted_text if @sencount < @settings[:sentences]
+      # TODO don't change punctuation
+      text = @body.split(PUNCTUATION_MARKS).slice(@settings[:skip_sentences], @settings[:sentences]).join(". ")
+      close_tags(text)
+    end
+    # limit by paragraphs
+    def paragraphs
+      return non_excerpted_text if @pghcount < @settings[:paragraphs]
+      text = @body.split("</p>").slice(@settings[:skip_paragraphs], @settings[:paragraphs])
+      @settings[:ending] = nil
+      text = text.join("</p>")
+      close_tags(text)
+    end
+    # remove all double-spaces, tabs, and new lines from string
+    def clean(str)
+      str.strip.gsub(/\s{2,}|[\n\r\t]/, ' ')
+    end
+    # Removes HTML tags from a string. Allows you to specify some tags to be kept.
+    # @see http://codesnippets.joyent.com/posts/show/1354#comment-293
+    def strip_html(html)
+      return @stripped_html if @stripped_html
+      allowed = @settings[:allowed_tags]
+      reg = if allowed.any?
+        Regexp.new(
+          %(<(?!(\\s|\\/)*(#{
+            allowed.map {|tag| Regexp.escape( tag )}.join( "|" )
+          })( |>|\\/|'|"|<|\\s*\\z))[^>]*(>+|\\s*\\z)),
+          Regexp::IGNORECASE | Regexp::MULTILINE, 'u'
+        )
+      else
+        /<[^>]*(>+|\s*\z)/m
+      end
+       @stripped_html = html.gsub(reg,'')
+    end
+  end
+end

data/lib/auto_excerpt.rb CHANGED Viewed

@@ -1,184 +1,8 @@
-class String
-  def clean # remove all double-spaces, tabs, and new lines from string
-    strip.gsub(/\s{2,}|[\n\r\t]/, ' ')
-  end
-  def clean! # ditto, but replaces the original string
-    replace(clean)
-  end
-end
-# TODO allow for default options to be set.
-class AutoExcerpt < String
-  DEFAULTS = {
-     :characters => 0,
-     :words => 0,
-     :sentences => 0,
-     :paragraphs => 0,
-     # :skip_characters => 0,
-     :skip_words => 0,
-     :skip_sentences => 0,
-     :skip_paragraphs => 0,
-     :ending => '...',
-     :strip_html => false, :allowed_tags => [],
-     :strip_breaks_tabs => false,
-     :strip_paragraphs => false
-  }
-  # TODO add and allowwed tags option
-  PUNCTUATION_MARKS = /\!\s|\.\s|\?\s/
-  NO_CLOSE = %w( br hr img input ) # tags that do not have opposite closing tags
-  OPENING_TAG = /<([a-z0-9]{1,})\b[^>]*>/im
-  CLOSING_TAG = /<\/([a-z0-9]{1,})>/im
-  # @param [String] text The text to be excerpted
-  # @param [Hash] settings The settings for creating the excerpt
-  # @option settings [Integer] :characters (0) The number of characters to limit the html by
-  # @option settings [Integer] :words (0) The number of words to limit the html by
-  # @option settings [Integer] :sentences (0) The number of sentences to limit the html by
-  # @option settings [Integer] :paragraphs (0) The number of paragraphs to limit the html by
-  # @option settings [Integer] :skip_characters (0) The number of characters to skip from the start of the html
-  # @option settings [Integer] :skip_words (0) The number of words to skip from the start of the html
-  # @option settings [Integer] :skip_sentences (0) The number of sentences to skip from the start of the html
-  # @option settings [Integer] :skip_paragraphs (0) The number of paragraphs to skip from the start of the html
-  # @option settings [String] :ending ('...') A string added to the end of the excerpt
-  # @option settings [Boolean] :strip_html (false) Strip all HTML from the text before creating the excerpt
-  # @option settings [Boolean] :strip_paragraphs (false) Strip all <p> tags from the HTML before creating the excerpt
-  def initialize(text, settings = {})
-    @settings = Marshal.load(Marshal.dump(DEFAULTS)).merge(settings)
-    # make our copy
-    @body = text.dup.strip
-    @excerpt = ""
-    if @settings[:strip_html]
-      (@settings[:allowed_tags] << "p") if @settings[:paragraphs] > 0 # don't stip P tags if that is the limiter
-      @body = strip_html(@body)
-    end
-    @body = @body.clean if @settings[:strip_breaks_tabs]
-    # TODO replace this with better regex
-    @body.replace(@body.gsub(/<(\/|)p>/,'')) if @settings[:strip_paragraphs]
-    @charcount = strip_html(@body).length
-    @wordcount = strip_html(@body).scan(/\w+/).size
-    @sencount  = @body.split(PUNCTUATION_MARKS).size
-    @pghcount  = @body.split("</p>").size
-    @settings[:characters] = 150 if @settings.values_at(:characters, :words, :sentences, :paragraphs).all?{|val| val.zero? || val.nil?  }
-    create_excerpt
-    super(@excerpt)
-  end
-  protected
-  attr_reader :charcount, :wordcount, :sencount, :pghcount
-  attr_accessor :settings, :body, :excerpt
- # close html tags
- # TODO make this work with new strip_html method. Improve regex
-  def close_tags(text)
-    # Don't bother closing tags if html is stripped since there are no tags.
-    if @settings[:strip_html] && @settings[:allowed_tags].empty?
-      tagstoclose = nil
-    else
-      tagstoclose = ""
-      tags = []
-      # /<(([A-Z]|[a-z]).*?)(( )|(>))/is
-      # /<\/(([A-Z]|[a-z]).*?)(( )|(>))/is
-      opentags = text.scan(OPENING_TAG).transpose[0] || []
-      opentags.reverse!
-      closedtags = text.scan(CLOSING_TAG).transpose[0] || []
-      opentags.each do |ot|
-        if closedtags.include?(ot)
-          closedtags.delete_at(closedtags.index(ot))
-        else
-          tags << ot
-        end
-      end
-      tags.each do |tag|
-        tagstoclose << "</#{tag.strip.downcase}>" unless NO_CLOSE.include?(tag)
-      end
-    end
-    @excerpt = [text, @settings[:ending], tagstoclose].compact.join
-  end
-  def create_excerpt
-    return characters unless @settings[:characters].zero?
-    return words      unless @settings[:words].zero?
-    return sentences  unless @settings[:sentences].zero?
-    return paragraphs unless @settings[:paragraphs].zero?
-  end
-  def non_excerpted_text
-    @settings[:ending] = nil
-    close_tags(@body)
-  end
-  # limit by characters
-  # @todo make this work with skip characters
-  def characters
-    return non_excerpted_text if @charcount < @settings[:characters]
-    text = ""
-    html_count = char_count = 0
-    start_end_tags = /#{Regexp.union(/(<[a-z0-9]{1,}\b[^>]*>)/,/(<\/[a-z0-9]{1,}>)/)}/io
-    @body.split(start_end_tags).each do |piece|
-      if piece =~ start_end_tags
-        html_count += piece.length
-      else
-        chars_left = @settings[:characters] - char_count
-        # TODO don't clip the middle of a word
-        # unless piece[0...(chars_left+1)] =~ /(\s|\W)$/
-        #   chars_left += 1 until piece[0...chars_left] =~ /(\s|\W)$/
-        # end
-        char_count += piece[0...chars_left].length
-      end
-      break if (char_count >= @settings[:characters])
-    end
-    text = @body[0...(html_count+char_count)]
-    close_tags(text)
-  end
-  # limit by words
-  def words
-    return non_excerpted_text if @wordcount < @settings[:words]
-     text = @body.split(" ").slice(@settings[:skip_words], @settings[:words]).join(" ")
-     close_tags(text)
-  end
+require File.join(File.dirname(__FILE__), *%w[auto_excerpt parser])
-  # limit by sentences
-  def sentences
-    return non_excerpted_text if @sencount < @settings[:sentences]
-    # TODO don't change punctuation
-    text = @body.split(PUNCTUATION_MARKS).slice(@settings[:skip_sentences], @settings[:sentences]).join(". ")
-    close_tags(text)
-  end
-  # limit by paragraphs
-  def paragraphs
-    return non_excerpted_text if @pghcount < @settings[:paragraphs]
-    text = @body.split("</p>").slice(@settings[:skip_paragraphs], @settings[:paragraphs])
-    @settings[:ending] = nil
-    text = text.join("</p>")
-    close_tags(text)
-  end
-  # Removes HTML tags from a string. Allows you to specify some tags to be kept.
-  # @see http://codesnippets.joyent.com/posts/show/1354#comment-293
-  def strip_html(html)
-    allowed = @settings[:allowed_tags]
-    reg = if allowed.any?
-      Regexp.new(
-        %(<(?!(\\s|\\/)*(#{
-          allowed.map {|tag| Regexp.escape( tag )}.join( "|" )
-        })( |>|\\/|'|"|<|\\s*\\z))[^>]*(>+|\\s*\\z)),
-        Regexp::IGNORECASE | Regexp::MULTILINE, 'u'
-      )
-    else
-      /<[^>]*(>+|\s*\z)/m
-    end
-     @stripped_html ||= html.gsub(reg,'')
+module AutoExcerpt
+  def self.new(text, options = {})
+    parser = Parser.new(text, options)
+    parser.parse
   end
 end

data/spec/auto_excerpt_spec.rb CHANGED Viewed

@@ -8,23 +8,23 @@ describe AutoExcerpt do
    text = html_excerpt({:characters => 5, :ending => nil})
    stripped_text(text).length.should eql(5)
-   text = heavy_excerpt({:characters => 5, :ending => nil})
-   stripped_text(text).length.should eql(5)
+   text = heavy_excerpt({:characters => 7, :ending => nil})
+   stripped_text(text).length.should eql(7)
   end
   it "should default to 150 characters" do
    text = html_excerpt(:ending => nil)
-   stripped_text(text).length.should eql(150)
+   stripped_text(text).length.should be_close(150, 7)
   end
-  it "does not include html tags in character count" do
+  it "does not include html tags or entities in character count" do
     AutoExcerpt.new("<h1>Hello World!</h1>", {:characters => 5, :ending => nil}).should == "<h1>Hello</h1>"
+    AutoExcerpt.new("<h1>Copyright &copy; 2010</h1>", {:characters => 11, :ending => nil}).should == "<h1>Copyright &copy;</h1>"
   end
   it "should not cutoff in the middle of a word" do
-    pending("this does not work yet") do
-      AutoExcerpt.new("<h1>Hello World!</h1>", {:characters => 4, :ending => nil}).should == "<h1>Hello</h1>"
-    end
+    AutoExcerpt.new("<h1>Hello World!</h1>", {:characters => 4, :ending => nil}).should == "<h1>Hello</h1>"
+    AutoExcerpt.new("<h1>Hello World!</h1>", {:characters => 7, :ending => nil}).should == "<h1>Hello World</h1>"
   end
   it "should limit words" do
@@ -40,8 +40,8 @@ describe AutoExcerpt do
    text = html_excerpt({:sentences => 3})
    text.should == %{<p>Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur...</p>}
-   # text = heavy_excerpt({:sentences => 3})
-   # text.should == %{<p>Alright&hellip;ok&hellip;that title is a bold faced lie. I don&rsquo;t give a damn about <acronym title="Cascading Style Sheets">CSS</acronym> validation! Being a designer for a living, you have to know when to ditch some of these &lsquo;web 2.0&rsquo; type fads...</p>}
+   text = heavy_excerpt({:sentences => 3})
+   text.should == %{<p>Alright&hellip;ok&hellip;that title is a bold faced lie. I don&rsquo;t give a damn about <acronym title="Cascading Style Sheets">CSS</acronym> validation. Being a designer for a living, you have to know when to ditch some of these &lsquo;web 2.0&rsquo; type fads...</p>}
   end
   it "should limit paragraphs" do

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: auto_excerpt
 version: !ruby/object:Gem::Version
-  version: 0.6.3
+  version: 0.7.0
 platform: ruby
 authors:
 - Kabari Hendrick
@@ -9,7 +9,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2010-01-26 00:00:00 -06:00
+date: 2010-01-31 00:00:00 -06:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -50,6 +50,7 @@ files:
 - VERSION
 - browser_test/browser_test.rb
 - lib/auto_excerpt.rb
+- lib/auto_excerpt/parser.rb
 - spec/auto_excerpt_spec.rb
 - spec/shared/strip_html_spec.rb
 - spec/spec.opts