auto_excerpt 0.6.3 → 0.7.0

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGELOG CHANGED
@@ -1,3 +1,8 @@
1
+ == 0.7.0 (2010-01-31)
2
+ * Changed AutoExcerpt from a Class to a Module in order to return a String object when used
3
+ * Removed String#clean
4
+ * Limiting by :characters does not break off in the middle of words
5
+
1
6
  == 0.6.3 (2010-01-26)
2
7
  * Removed limit by :characters for the time being
3
8
  * Improved limit by characters to be more accurate
data/README.textile CHANGED
@@ -2,7 +2,7 @@ h1. AutoExcerpt
2
2
 
3
3
  pre. [sudo] gem install auto_excerpt
4
4
 
5
- Creates Automatic excerpts of html formatted text.
5
+ Creates excerpts of html formatted text.
6
6
 
7
7
  pre. AutoExcerpt.new("<span>This is <strong>some</strong> fancy html formatted text homie</span>", {:words => 5})
8
8
  # => "<span>This is <strong>some</strong> fancy html...</span>"
@@ -11,16 +11,17 @@ h3. Features
11
11
 
12
12
  * There are 4 different ways to limit the length of an excerpt: *characters*, *words*, *sentences*, *paragraphs*
13
13
  * If the excerpt would be shorter than the limit that is set, the entire text will be shown.
14
- * HTML can be stripped
14
+ * If limiting by *characters* the gem will ensure that the excerpt does not cut off in the middle of a word.
15
+ * HTML can be stripped. You can also set specific tags that you don't want stripped.
15
16
  * HTML tags are automatically closed.
16
17
 
17
18
  h3. Options
18
19
 
19
20
  @:characters@
20
21
  The number of characters to display from the text.
21
- Default: 150 (does not need to be reset to 0 if you choose another option)
22
+ Default: 150 (does not need to be set to 0 if you choose another option)
22
23
 
23
- If you need to be 100% accurate in your character count, then remove the @:ending@
24
+ _If you need to be 100% accurate in your character count, then remove the @:ending@_
24
25
 
25
26
  pre. AutoExcerpt.new("<h1>Hello World!</h1>", {:characters => 5, :ending => nil})
26
27
  # => <h1>Hello</h1>
@@ -39,6 +40,13 @@ pre. AutoExcerpt.new("This is cool stuff man!", :ending => ". Srsly!", :words =>
39
40
  Strips HTML tags from the excerpt that is displayed.
40
41
  Default: false
41
42
 
43
+ @:allowed_tags@
44
+ If using @:strip_html@ then this setting will allow the listed tags to be shown.
45
+ Default: []
46
+
47
+ pre. AutoExcerpt.new("<p>This <em>is</em> some <strong>formatted</strong> html</p>", {:strip_html => true, :allowed_tags => %w(p em)})
48
+ # => "<p>This <em>is</em> some formatted html</p>"
49
+
42
50
  @:strip_paragraphs@
43
51
  Strip all paragraph tags from the html.
44
52
  Default: false
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.6.3
1
+ 0.7.0
@@ -0,0 +1,177 @@
1
+ module AutoExcerpt
2
+ # TODO allow for default options to be set.
3
+ class Parser
4
+ DEFAULTS = {
5
+ :characters => 0,
6
+ :words => 0,
7
+ :sentences => 0,
8
+ :paragraphs => 0,
9
+ # :skip_characters => 0,
10
+ :skip_words => 0,
11
+ :skip_sentences => 0,
12
+ :skip_paragraphs => 0,
13
+ :ending => '...',
14
+ :strip_html => false, :allowed_tags => [],
15
+ :strip_breaks_tabs => false,
16
+ :strip_paragraphs => false
17
+ }
18
+
19
+ # TODO add an allowed tags option
20
+ PUNCTUATION_MARKS = /\!\s|\.\s|\?\s/
21
+ NO_CLOSE = %w( br hr img input ) # tags that do not have opposite closing tags
22
+ OPENING_TAG = /<([a-z0-9]{1,})\b[^>]*>/im
23
+ CLOSING_TAG = /<\/([a-z0-9]{1,})>/im
24
+
25
+ # @param [String] text The text to be excerpted
26
+ # @param [Hash] settings The settings for creating the excerpt
27
+ # @option settings [Integer] :characters (0) The number of characters to limit the html by
28
+ # @option settings [Integer] :words (0) The number of words to limit the html by
29
+ # @option settings [Integer] :sentences (0) The number of sentences to limit the html by
30
+ # @option settings [Integer] :paragraphs (0) The number of paragraphs to limit the html by
31
+ # @option settings [Integer] :skip_characters (0) The number of characters to skip from the start of the html
32
+ # @option settings [Integer] :skip_words (0) The number of words to skip from the start of the html
33
+ # @option settings [Integer] :skip_sentences (0) The number of sentences to skip from the start of the html
34
+ # @option settings [Integer] :skip_paragraphs (0) The number of paragraphs to skip from the start of the html
35
+ # @option settings [String] :ending ('...') A string added to the end of the excerpt
36
+ # @option settings [Boolean] :strip_html (false) Strip all HTML from the text before creating the excerpt
37
+ # @option settings [Boolean] :strip_paragraphs (false) Strip all <p> tags from the HTML before creating the excerpt
38
+ def initialize(text, settings = {})
39
+ @settings = Marshal.load(Marshal.dump(DEFAULTS)).merge(settings)
40
+
41
+ # make our copy
42
+ @body = text.dup.strip
43
+ @excerpt = ""
44
+
45
+ if @settings[:strip_html]
46
+ (@settings[:allowed_tags] << "p") if @settings[:paragraphs] > 0 # don't strip P tags if that is the limiter
47
+ @body = strip_html(@body)
48
+ end
49
+ @body = clean(@body) if @settings[:strip_breaks_tabs]
50
+ # TODO replace this with better regex
51
+ @body.replace(@body.gsub(/<(\/|)p>/,'')) if @settings[:strip_paragraphs]
52
+ @charcount = strip_html(@body).length
53
+ @wordcount = strip_html(@body).scan(/\w+/).size
54
+ @sencount = @body.split(PUNCTUATION_MARKS).size
55
+ @pghcount = @body.split("</p>").size
56
+ @settings[:characters] = 150 if @settings.values_at(:characters, :words, :sentences, :paragraphs).all?{|val| val.zero? || val.nil? }
57
+ end
58
+
59
+ def create_excerpt
60
+ return characters unless @settings[:characters].zero?
61
+ return words unless @settings[:words].zero?
62
+ return sentences unless @settings[:sentences].zero?
63
+ return paragraphs unless @settings[:paragraphs].zero?
64
+ end
65
+
66
+ alias_method :parse, :create_excerpt
67
+
68
+ protected
69
+
70
+ attr_reader :charcount, :wordcount, :sencount, :pghcount
71
+ attr_accessor :settings, :body, :excerpt
72
+
73
+ # close html tags
74
+ # TODO make this work with new strip_html method. Improve regex
75
+ def close_tags(text)
76
+ # Don't bother closing tags if html is stripped since there are no tags.
77
+ if @settings[:strip_html] && @settings[:allowed_tags].empty?
78
+ tagstoclose = nil
79
+ else
80
+ tagstoclose = ""
81
+ tags = []
82
+ opentags = text.scan(OPENING_TAG).transpose[0] || []
83
+ opentags.reverse!
84
+ closedtags = text.scan(CLOSING_TAG).transpose[0] || []
85
+
86
+ opentags.each do |ot|
87
+ if closedtags.include?(ot)
88
+ closedtags.delete_at(closedtags.index(ot))
89
+ else
90
+ tags << ot
91
+ end
92
+ end
93
+
94
+ tags.each do |tag|
95
+ tagstoclose << "</#{tag.strip.downcase}>" unless NO_CLOSE.include?(tag)
96
+ end
97
+ end
98
+
99
+ @excerpt = [text, @settings[:ending], tagstoclose].compact.join
100
+ end
101
+
102
+ def non_excerpted_text
103
+ @settings[:ending] = nil
104
+ close_tags(@body)
105
+ end
106
+
107
+ # limit by characters
108
+ # @todo make this work with skip characters
109
+ def characters
110
+ return non_excerpted_text if @charcount < @settings[:characters]
111
+ html_count = char_count = 0
112
+ tags_entities = /#{Regexp.union(/(<[a-z0-9]{1,}\b[^>]*>)/,/(<\/[a-z0-9]{1,}>)/,/(&[^\s]*;)/)}/io
113
+ @body.split(tags_entities).each do |piece|
114
+ if piece =~ tags_entities
115
+ html_count += piece.length
116
+ else
117
+ chars_left = @settings[:characters] - char_count
118
+ # NOTE Do I want to count spaces or not?
119
+ piece.split(/\b/).each{|p|
120
+ break if (char_count >= @settings[:characters])
121
+ char_count += p.length
122
+ }
123
+ end
124
+ break if (char_count >= @settings[:characters])
125
+ end
126
+ text = clean(@body[0...(html_count+char_count)])
127
+ close_tags(text)
128
+ end
129
+
130
+ # limit by words
131
+ def words
132
+ return non_excerpted_text if @wordcount < @settings[:words]
133
+ text = @body.split(" ").slice(@settings[:skip_words], @settings[:words]).join(" ")
134
+ close_tags(text)
135
+ end
136
+
137
+ # limit by sentences
138
+ def sentences
139
+ return non_excerpted_text if @sencount < @settings[:sentences]
140
+ # TODO don't change punctuation
141
+ text = @body.split(PUNCTUATION_MARKS).slice(@settings[:skip_sentences], @settings[:sentences]).join(". ")
142
+ close_tags(text)
143
+ end
144
+
145
+ # limit by paragraphs
146
+ def paragraphs
147
+ return non_excerpted_text if @pghcount < @settings[:paragraphs]
148
+ text = @body.split("</p>").slice(@settings[:skip_paragraphs], @settings[:paragraphs])
149
+ @settings[:ending] = nil
150
+ text = text.join("</p>")
151
+ close_tags(text)
152
+ end
153
+
154
+ # remove all double-spaces, tabs, and new lines from string
155
+ def clean(str)
156
+ str.strip.gsub(/\s{2,}|[\n\r\t]/, ' ')
157
+ end
158
+
159
+ # Removes HTML tags from a string. Allows you to specify some tags to be kept.
160
+ # @see http://codesnippets.joyent.com/posts/show/1354#comment-293
161
+ def strip_html(html)
162
+ return @stripped_html if @stripped_html
163
+ allowed = @settings[:allowed_tags]
164
+ reg = if allowed.any?
165
+ Regexp.new(
166
+ %(<(?!(\\s|\\/)*(#{
167
+ allowed.map {|tag| Regexp.escape( tag )}.join( "|" )
168
+ })( |>|\\/|'|"|<|\\s*\\z))[^>]*(>+|\\s*\\z)),
169
+ Regexp::IGNORECASE | Regexp::MULTILINE, 'u'
170
+ )
171
+ else
172
+ /<[^>]*(>+|\s*\z)/m
173
+ end
174
+ @stripped_html = html.gsub(reg,'')
175
+ end
176
+ end
177
+ end
data/lib/auto_excerpt.rb CHANGED
@@ -1,184 +1,8 @@
1
- class String
2
- def clean # remove all double-spaces, tabs, and new lines from string
3
- strip.gsub(/\s{2,}|[\n\r\t]/, ' ')
4
- end
5
-
6
- def clean! # ditto, but replaces the original string
7
- replace(clean)
8
- end
9
- end
10
-
11
- # TODO allow for default options to be set.
12
- class AutoExcerpt < String
13
- DEFAULTS = {
14
- :characters => 0,
15
- :words => 0,
16
- :sentences => 0,
17
- :paragraphs => 0,
18
- # :skip_characters => 0,
19
- :skip_words => 0,
20
- :skip_sentences => 0,
21
- :skip_paragraphs => 0,
22
- :ending => '...',
23
- :strip_html => false, :allowed_tags => [],
24
- :strip_breaks_tabs => false,
25
- :strip_paragraphs => false
26
- }
27
-
28
- # TODO add and allowwed tags option
29
- PUNCTUATION_MARKS = /\!\s|\.\s|\?\s/
30
- NO_CLOSE = %w( br hr img input ) # tags that do not have opposite closing tags
31
- OPENING_TAG = /<([a-z0-9]{1,})\b[^>]*>/im
32
- CLOSING_TAG = /<\/([a-z0-9]{1,})>/im
33
-
34
- # @param [String] text The text to be excerpted
35
- # @param [Hash] settings The settings for creating the excerpt
36
- # @option settings [Integer] :characters (0) The number of characters to limit the html by
37
- # @option settings [Integer] :words (0) The number of words to limit the html by
38
- # @option settings [Integer] :sentences (0) The number of sentences to limit the html by
39
- # @option settings [Integer] :paragraphs (0) The number of paragraphs to limit the html by
40
- # @option settings [Integer] :skip_characters (0) The number of characters to skip from the start of the html
41
- # @option settings [Integer] :skip_words (0) The number of words to skip from the start of the html
42
- # @option settings [Integer] :skip_sentences (0) The number of sentences to skip from the start of the html
43
- # @option settings [Integer] :skip_paragraphs (0) The number of paragraphs to skip from the start of the html
44
- # @option settings [String] :ending ('...') A string added to the end of the excerpt
45
- # @option settings [Boolean] :strip_html (false) Strip all HTML from the text before creating the excerpt
46
- # @option settings [Boolean] :strip_paragraphs (false) Strip all <p> tags from the HTML before creating the excerpt
47
- def initialize(text, settings = {})
48
- @settings = Marshal.load(Marshal.dump(DEFAULTS)).merge(settings)
49
-
50
- # make our copy
51
- @body = text.dup.strip
52
- @excerpt = ""
53
-
54
- if @settings[:strip_html]
55
- (@settings[:allowed_tags] << "p") if @settings[:paragraphs] > 0 # don't stip P tags if that is the limiter
56
- @body = strip_html(@body)
57
- end
58
- @body = @body.clean if @settings[:strip_breaks_tabs]
59
- # TODO replace this with better regex
60
- @body.replace(@body.gsub(/<(\/|)p>/,'')) if @settings[:strip_paragraphs]
61
- @charcount = strip_html(@body).length
62
- @wordcount = strip_html(@body).scan(/\w+/).size
63
- @sencount = @body.split(PUNCTUATION_MARKS).size
64
- @pghcount = @body.split("</p>").size
65
- @settings[:characters] = 150 if @settings.values_at(:characters, :words, :sentences, :paragraphs).all?{|val| val.zero? || val.nil? }
66
-
67
- create_excerpt
68
- super(@excerpt)
69
- end
70
-
71
-
72
- protected
73
-
74
- attr_reader :charcount, :wordcount, :sencount, :pghcount
75
- attr_accessor :settings, :body, :excerpt
76
-
77
- # close html tags
78
- # TODO make this work with new strip_html method. Improve regex
79
- def close_tags(text)
80
- # Don't bother closing tags if html is stripped since there are no tags.
81
- if @settings[:strip_html] && @settings[:allowed_tags].empty?
82
- tagstoclose = nil
83
- else
84
- tagstoclose = ""
85
- tags = []
86
- # /<(([A-Z]|[a-z]).*?)(( )|(>))/is
87
- # /<\/(([A-Z]|[a-z]).*?)(( )|(>))/is
88
- opentags = text.scan(OPENING_TAG).transpose[0] || []
89
- opentags.reverse!
90
- closedtags = text.scan(CLOSING_TAG).transpose[0] || []
91
-
92
- opentags.each do |ot|
93
- if closedtags.include?(ot)
94
- closedtags.delete_at(closedtags.index(ot))
95
- else
96
- tags << ot
97
- end
98
- end
99
-
100
- tags.each do |tag|
101
- tagstoclose << "</#{tag.strip.downcase}>" unless NO_CLOSE.include?(tag)
102
- end
103
- end
104
-
105
- @excerpt = [text, @settings[:ending], tagstoclose].compact.join
106
- end
107
-
108
- def create_excerpt
109
- return characters unless @settings[:characters].zero?
110
- return words unless @settings[:words].zero?
111
- return sentences unless @settings[:sentences].zero?
112
- return paragraphs unless @settings[:paragraphs].zero?
113
- end
114
-
115
- def non_excerpted_text
116
- @settings[:ending] = nil
117
- close_tags(@body)
118
- end
119
-
120
- # limit by characters
121
- # @todo make this work with skip characters
122
- def characters
123
- return non_excerpted_text if @charcount < @settings[:characters]
124
- text = ""
125
- html_count = char_count = 0
126
- start_end_tags = /#{Regexp.union(/(<[a-z0-9]{1,}\b[^>]*>)/,/(<\/[a-z0-9]{1,}>)/)}/io
127
- @body.split(start_end_tags).each do |piece|
128
- if piece =~ start_end_tags
129
- html_count += piece.length
130
- else
131
- chars_left = @settings[:characters] - char_count
132
- # TODO don't clip the middle of a word
133
- # unless piece[0...(chars_left+1)] =~ /(\s|\W)$/
134
- # chars_left += 1 until piece[0...chars_left] =~ /(\s|\W)$/
135
- # end
136
- char_count += piece[0...chars_left].length
137
- end
138
- break if (char_count >= @settings[:characters])
139
- end
140
- text = @body[0...(html_count+char_count)]
141
- close_tags(text)
142
- end
143
-
144
- # limit by words
145
- def words
146
- return non_excerpted_text if @wordcount < @settings[:words]
147
- text = @body.split(" ").slice(@settings[:skip_words], @settings[:words]).join(" ")
148
- close_tags(text)
149
- end
1
+ require File.join(File.dirname(__FILE__), *%w[auto_excerpt parser])
150
2
 
151
- # limit by sentences
152
- def sentences
153
- return non_excerpted_text if @sencount < @settings[:sentences]
154
- # TODO don't change punctuation
155
- text = @body.split(PUNCTUATION_MARKS).slice(@settings[:skip_sentences], @settings[:sentences]).join(". ")
156
- close_tags(text)
157
- end
158
-
159
- # limit by paragraphs
160
- def paragraphs
161
- return non_excerpted_text if @pghcount < @settings[:paragraphs]
162
- text = @body.split("</p>").slice(@settings[:skip_paragraphs], @settings[:paragraphs])
163
- @settings[:ending] = nil
164
- text = text.join("</p>")
165
- close_tags(text)
166
- end
167
-
168
- # Removes HTML tags from a string. Allows you to specify some tags to be kept.
169
- # @see http://codesnippets.joyent.com/posts/show/1354#comment-293
170
- def strip_html(html)
171
- allowed = @settings[:allowed_tags]
172
- reg = if allowed.any?
173
- Regexp.new(
174
- %(<(?!(\\s|\\/)*(#{
175
- allowed.map {|tag| Regexp.escape( tag )}.join( "|" )
176
- })( |>|\\/|'|"|<|\\s*\\z))[^>]*(>+|\\s*\\z)),
177
- Regexp::IGNORECASE | Regexp::MULTILINE, 'u'
178
- )
179
- else
180
- /<[^>]*(>+|\s*\z)/m
181
- end
182
- @stripped_html ||= html.gsub(reg,'')
3
+ module AutoExcerpt
4
+ def self.new(text, options = {})
5
+ parser = Parser.new(text, options)
6
+ parser.parse
183
7
  end
184
8
  end
@@ -8,23 +8,23 @@ describe AutoExcerpt do
8
8
  text = html_excerpt({:characters => 5, :ending => nil})
9
9
  stripped_text(text).length.should eql(5)
10
10
 
11
- text = heavy_excerpt({:characters => 5, :ending => nil})
12
- stripped_text(text).length.should eql(5)
11
+ text = heavy_excerpt({:characters => 7, :ending => nil})
12
+ stripped_text(text).length.should eql(7)
13
13
  end
14
-
14
+
15
15
  it "should default to 150 characters" do
16
16
  text = html_excerpt(:ending => nil)
17
- stripped_text(text).length.should eql(150)
17
+ stripped_text(text).length.should be_close(150, 7)
18
18
  end
19
19
 
20
- it "does not include html tags in character count" do
20
+ it "does not include html tags or entities in character count" do
21
21
  AutoExcerpt.new("<h1>Hello World!</h1>", {:characters => 5, :ending => nil}).should == "<h1>Hello</h1>"
22
+ AutoExcerpt.new("<h1>Copyright &copy; 2010</h1>", {:characters => 11, :ending => nil}).should == "<h1>Copyright &copy;</h1>"
22
23
  end
23
24
 
24
25
  it "should not cutoff in the middle of a word" do
25
- pending("this does not work yet") do
26
- AutoExcerpt.new("<h1>Hello World!</h1>", {:characters => 4, :ending => nil}).should == "<h1>Hello</h1>"
27
- end
26
+ AutoExcerpt.new("<h1>Hello World!</h1>", {:characters => 4, :ending => nil}).should == "<h1>Hello</h1>"
27
+ AutoExcerpt.new("<h1>Hello World!</h1>", {:characters => 7, :ending => nil}).should == "<h1>Hello World</h1>"
28
28
  end
29
29
 
30
30
  it "should limit words" do
@@ -40,8 +40,8 @@ describe AutoExcerpt do
40
40
  text = html_excerpt({:sentences => 3})
41
41
  text.should == %{<p>Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur...</p>}
42
42
 
43
- # text = heavy_excerpt({:sentences => 3})
44
- # text.should == %{<p>Alright&hellip;ok&hellip;that title is a bold faced lie. I don&rsquo;t give a damn about <acronym title="Cascading Style Sheets">CSS</acronym> validation! Being a designer for a living, you have to know when to ditch some of these &lsquo;web 2.0&rsquo; type fads...</p>}
43
+ text = heavy_excerpt({:sentences => 3})
44
+ text.should == %{<p>Alright&hellip;ok&hellip;that title is a bold faced lie. I don&rsquo;t give a damn about <acronym title="Cascading Style Sheets">CSS</acronym> validation. Being a designer for a living, you have to know when to ditch some of these &lsquo;web 2.0&rsquo; type fads...</p>}
45
45
  end
46
46
 
47
47
  it "should limit paragraphs" do
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: auto_excerpt
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.3
4
+ version: 0.7.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kabari Hendrick
@@ -9,7 +9,7 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2010-01-26 00:00:00 -06:00
12
+ date: 2010-01-31 00:00:00 -06:00
13
13
  default_executable:
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
@@ -50,6 +50,7 @@ files:
50
50
  - VERSION
51
51
  - browser_test/browser_test.rb
52
52
  - lib/auto_excerpt.rb
53
+ - lib/auto_excerpt/parser.rb
53
54
  - spec/auto_excerpt_spec.rb
54
55
  - spec/shared/strip_html_spec.rb
55
56
  - spec/spec.opts