auto_excerpt 0.7.0 → 0.7.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +1 -5
- data/CHANGELOG +2 -0
- data/README.textile +5 -0
- data/Rakefile +4 -1
- data/VERSION +1 -1
- data/bin/auto_excerpt +59 -0
- data/lib/auto_excerpt.rb +6 -2
- data/lib/auto_excerpt/parser.rb +28 -24
- data/spec/auto_excerpt_spec.rb +21 -13
- data/spec/spec_helper.rb +6 -7
- metadata +6 -5
data/.gitignore
CHANGED
data/CHANGELOG
CHANGED
data/README.textile
CHANGED
@@ -55,6 +55,11 @@ Default: false
|
|
55
55
|
The number of [which] to skip at the beginning of the html when returned.
|
56
56
|
Default: 0
|
57
57
|
|
58
|
+
h3. Command Line
|
59
|
+
|
60
|
+
*new: v0.7.1* Run @auto_excerpt@ from the command line with all of the options shown above!
|
61
|
+
@$ auto_excerpt --words 10 "<p>Some html string</p>"@
|
62
|
+
|
58
63
|
h2. Help out on Github!
|
59
64
|
|
60
65
|
* Fork the project.
|
data/Rakefile
CHANGED
@@ -30,7 +30,10 @@ end
|
|
30
30
|
|
31
31
|
begin
|
32
32
|
require 'yard'
|
33
|
-
YARD::Rake::YardocTask.new
|
33
|
+
YARD::Rake::YardocTask.new do |t|
|
34
|
+
t.files = ['lib/**/*.rb', 'README.textile', 'CHANGELOG', 'LICENSE']
|
35
|
+
t.options = ['--any', '--extra', '--opts']
|
36
|
+
end
|
34
37
|
rescue LoadError
|
35
38
|
task :yard do
|
36
39
|
abort "YARD is not available. In order to run yardoc, you must: sudo gem install yard"
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.7.0
|
1
|
+
0.7.1
|
data/bin/auto_excerpt
ADDED
@@ -0,0 +1,59 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require "rubygems"
|
4
|
+
require "auto_excerpt"
|
5
|
+
require "optparse"
|
6
|
+
|
7
|
+
def perr(msg)
|
8
|
+
$stderr.puts(msg)
|
9
|
+
end
|
10
|
+
|
11
|
+
def dashed(key)
|
12
|
+
key.to_s.gsub('_','-')
|
13
|
+
end
|
14
|
+
|
15
|
+
@options = {}
|
16
|
+
|
17
|
+
options_parser = OptionParser.new do |o|
|
18
|
+
o.banner = "Usage: auto_excerpt [options] STRING|FILE"
|
19
|
+
o.separator ""
|
20
|
+
|
21
|
+
[
|
22
|
+
:characters,
|
23
|
+
:words,
|
24
|
+
:sentences,
|
25
|
+
:paragraphs,
|
26
|
+
:skip_words,
|
27
|
+
:skip_sentences,
|
28
|
+
:skip_paragraphs
|
29
|
+
].each do |key|
|
30
|
+
o.on("--#{dashed(key)} N", Integer){ |n| @options[key] = n }
|
31
|
+
end
|
32
|
+
|
33
|
+
[
|
34
|
+
:strip_html ,
|
35
|
+
:strip_breaks_tabs ,
|
36
|
+
:strip_paragraphs
|
37
|
+
].each do |key|
|
38
|
+
o.on("--#{dashed(key)}"){ |b| @options[key] = b }
|
39
|
+
end
|
40
|
+
|
41
|
+
o.on("--allowed_tags a,b,c", Array){|allowed| @options[:allowed_tags] = allowed }
|
42
|
+
|
43
|
+
o.on('--[no-]ending [STRING]'){ |s| @options[:ending] = s || nil }
|
44
|
+
o.on_tail('-h','--help'){ puts o; exit }
|
45
|
+
end
|
46
|
+
|
47
|
+
begin
|
48
|
+
options_parser.parse!(ARGV)
|
49
|
+
string_or_file = ARGV.last
|
50
|
+
raise(ArgumentError, "Please provide a STRING or FILE to parse.") unless string_or_file
|
51
|
+
string_or_file = File.read(string_or_file) if File.exist?(string_or_file)
|
52
|
+
puts AutoExcerpt.new(string_or_file, @options)
|
53
|
+
rescue => e
|
54
|
+
perr("Error: #{e.message}\n")
|
55
|
+
perr(e.backtrace)
|
56
|
+
exit(1)
|
57
|
+
end
|
58
|
+
|
59
|
+
|
data/lib/auto_excerpt.rb
CHANGED
@@ -1,8 +1,12 @@
|
|
1
1
|
require File.join(File.dirname(__FILE__), *%w[auto_excerpt parser])
|
2
2
|
|
3
3
|
module AutoExcerpt
|
4
|
-
|
5
|
-
|
4
|
+
# @param [String] html A string of html.
|
5
|
+
# @param [Hash] options A hash of options
|
6
|
+
# @return [String]
|
7
|
+
# @see Parser#initialize List of options
|
8
|
+
def self.new(html, options = {})
|
9
|
+
parser = Parser.new(html, options)
|
6
10
|
parser.parse
|
7
11
|
end
|
8
12
|
end
|
data/lib/auto_excerpt/parser.rb
CHANGED
@@ -2,25 +2,26 @@ module AutoExcerpt
|
|
2
2
|
# TODO allow for default options to be set.
|
3
3
|
class Parser
|
4
4
|
DEFAULTS = {
|
5
|
-
:characters
|
6
|
-
:words
|
7
|
-
:sentences
|
8
|
-
:paragraphs
|
5
|
+
:characters => 0,
|
6
|
+
:words => 0,
|
7
|
+
:sentences => 0,
|
8
|
+
:paragraphs => 0,
|
9
9
|
# :skip_characters => 0,
|
10
|
-
:skip_words
|
11
|
-
:skip_sentences
|
12
|
-
:skip_paragraphs
|
13
|
-
:ending
|
14
|
-
:strip_html
|
10
|
+
:skip_words => 0,
|
11
|
+
:skip_sentences => 0,
|
12
|
+
:skip_paragraphs => 0,
|
13
|
+
:ending => '...',
|
14
|
+
:strip_html => false,
|
15
|
+
:allowed_tags => [],
|
15
16
|
:strip_breaks_tabs => false,
|
16
|
-
:strip_paragraphs
|
17
|
+
:strip_paragraphs => false
|
17
18
|
}
|
18
19
|
|
19
|
-
# TODO add and allowwed tags option
|
20
20
|
PUNCTUATION_MARKS = /\!\s|\.\s|\?\s/
|
21
|
-
|
22
|
-
|
23
|
-
|
21
|
+
# tags that do not have opposite closing tags
|
22
|
+
NO_CLOSE = %w( br hr img input )
|
23
|
+
OPENING_TAG = /<([a-z0-9]{1,})\b[^>]*>/im
|
24
|
+
CLOSING_TAG = /<\/([a-z0-9]{1,})>/im
|
24
25
|
|
25
26
|
# @param [String] text The text to be excerpted
|
26
27
|
# @param [Hash] settings The settings for creating the excerpt
|
@@ -36,10 +37,11 @@ module AutoExcerpt
|
|
36
37
|
# @option settings [Boolean] :strip_html (false) Strip all HTML from the text before creating the excerpt
|
37
38
|
# @option settings [Boolean] :strip_paragraphs (false) Strip all <p> tags from the HTML before creating the excerpt
|
38
39
|
def initialize(text, settings = {})
|
40
|
+
# undo this and change how settings are stored
|
39
41
|
@settings = Marshal.load(Marshal.dump(DEFAULTS)).merge(settings)
|
40
42
|
|
41
43
|
# make our copy
|
42
|
-
@body
|
44
|
+
@body = text.dup.strip
|
43
45
|
@excerpt = ""
|
44
46
|
|
45
47
|
if @settings[:strip_html]
|
@@ -49,10 +51,10 @@ module AutoExcerpt
|
|
49
51
|
@body = clean(@body) if @settings[:strip_breaks_tabs]
|
50
52
|
# TODO replace this with better regex
|
51
53
|
@body.replace(@body.gsub(/<(\/|)p>/,'')) if @settings[:strip_paragraphs]
|
52
|
-
@charcount
|
53
|
-
@wordcount
|
54
|
-
@sencount
|
55
|
-
@pghcount
|
54
|
+
@charcount = strip_html(@body).length
|
55
|
+
@wordcount = strip_html(@body).scan(/\w+/).size
|
56
|
+
@sencount = @body.split(PUNCTUATION_MARKS).size
|
57
|
+
@pghcount = @body.split("</p>").size
|
56
58
|
@settings[:characters] = 150 if @settings.values_at(:characters, :words, :sentences, :paragraphs).all?{|val| val.zero? || val.nil? }
|
57
59
|
end
|
58
60
|
|
@@ -66,12 +68,14 @@ module AutoExcerpt
|
|
66
68
|
alias_method :parse, :create_excerpt
|
67
69
|
|
68
70
|
protected
|
69
|
-
|
71
|
+
|
72
|
+
# @api private
|
70
73
|
attr_reader :charcount, :wordcount, :sencount, :pghcount
|
74
|
+
|
75
|
+
# @api private
|
71
76
|
attr_accessor :settings, :body, :excerpt
|
72
77
|
|
73
|
-
|
74
|
-
# TODO make this work with new strip_html method. Improve regex
|
78
|
+
# close html tags
|
75
79
|
def close_tags(text)
|
76
80
|
# Don't bother closing tags if html is stripped since there are no tags.
|
77
81
|
if @settings[:strip_html] && @settings[:allowed_tags].empty?
|
@@ -96,7 +100,7 @@ module AutoExcerpt
|
|
96
100
|
end
|
97
101
|
end
|
98
102
|
|
99
|
-
@excerpt = [text, @settings[:ending], tagstoclose].
|
103
|
+
@excerpt = [text, @settings[:ending], tagstoclose].join
|
100
104
|
end
|
101
105
|
|
102
106
|
def non_excerpted_text
|
@@ -174,4 +178,4 @@ module AutoExcerpt
|
|
174
178
|
@stripped_html = html.gsub(reg,'')
|
175
179
|
end
|
176
180
|
end
|
177
|
-
end
|
181
|
+
end
|
data/spec/auto_excerpt_spec.rb
CHANGED
@@ -4,6 +4,14 @@ require File.join(File.dirname(__FILE__), *%w[shared strip_html_spec])
|
|
4
4
|
# I definitely need more tests
|
5
5
|
describe AutoExcerpt do
|
6
6
|
|
7
|
+
it { should respond_to(:new) }
|
8
|
+
|
9
|
+
it "should return a string" do
|
10
|
+
AutoExcerpt.new("foo bar").should be_instance_of(String)
|
11
|
+
end
|
12
|
+
end
|
13
|
+
|
14
|
+
describe AutoExcerpt::Parser do
|
7
15
|
it "should limit characters" do
|
8
16
|
text = html_excerpt({:characters => 5, :ending => nil})
|
9
17
|
stripped_text(text).length.should eql(5)
|
@@ -18,13 +26,13 @@ describe AutoExcerpt do
|
|
18
26
|
end
|
19
27
|
|
20
28
|
it "does not include html tags or entities in character count" do
|
21
|
-
AutoExcerpt.new("<h1>Hello World!</h1>", {:characters => 5, :ending => nil}).should == "<h1>Hello</h1>"
|
22
|
-
AutoExcerpt.new("<h1>Copyright © 2010</h1>", {:characters => 11, :ending => nil}).should == "<h1>Copyright ©</h1>"
|
29
|
+
AutoExcerpt::Parser.new("<h1>Hello World!</h1>", {:characters => 5, :ending => nil}).parse.should == "<h1>Hello</h1>"
|
30
|
+
AutoExcerpt::Parser.new("<h1>Copyright © 2010</h1>", {:characters => 11, :ending => nil}).parse.should == "<h1>Copyright ©</h1>"
|
23
31
|
end
|
24
32
|
|
25
33
|
it "should not cutoff in the middle of a word" do
|
26
|
-
AutoExcerpt.new("<h1>Hello World!</h1>", {:characters => 4, :ending => nil}).should == "<h1>Hello</h1>"
|
27
|
-
AutoExcerpt.new("<h1>Hello World!</h1>", {:characters => 7, :ending => nil}).should == "<h1>Hello World</h1>"
|
34
|
+
AutoExcerpt::Parser.new("<h1>Hello World!</h1>", {:characters => 4, :ending => nil}).parse.should == "<h1>Hello</h1>"
|
35
|
+
AutoExcerpt::Parser.new("<h1>Hello World!</h1>", {:characters => 7, :ending => nil}).parse.should == "<h1>Hello World</h1>"
|
28
36
|
end
|
29
37
|
|
30
38
|
it "should limit words" do
|
@@ -64,17 +72,17 @@ describe AutoExcerpt do
|
|
64
72
|
<br />crap<b>dddd
|
65
73
|
<a href="/activity/read_and_frwd/1251?type=comment">(Open)</a>
|
66
74
|
}
|
67
|
-
text = AutoExcerpt.new(t,{:characters => 270})
|
75
|
+
text = AutoExcerpt::Parser.new(t,{:characters => 270}).parse
|
68
76
|
text.match(/(<(\/|)b>)/).captures.length.should eql(2)
|
69
77
|
end
|
70
|
-
|
71
|
-
end
|
72
78
|
|
73
|
-
describe
|
79
|
+
describe "when stripping HTML" do
|
74
80
|
|
75
|
-
|
81
|
+
it_should_behave_like "an HTML stripper"
|
76
82
|
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
end
|
83
|
+
it "should not strip P tags if :paragraphs option is set" do
|
84
|
+
AutoExcerpt::Parser.new("<p>this is a paragraph.</p><p>this is also a paragraph.</p>",{:paragraphs => 1, :strip_html => true}).parse.should eql("<p>this is a paragraph.</p>")
|
85
|
+
end
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
data/spec/spec_helper.rb
CHANGED
@@ -5,7 +5,8 @@ require "webrick/htmlutils"
|
|
5
5
|
|
6
6
|
Object.class_eval do
|
7
7
|
alias_method :old_pp, :pp
|
8
|
-
|
8
|
+
|
9
|
+
# so I can inspect the html in TextMate when things get hard to interpret
|
9
10
|
def pp(str)
|
10
11
|
str = WEBrick::HTMLUtils.escape(str) if str.is_a?(String)
|
11
12
|
old_pp(str)
|
@@ -18,23 +19,21 @@ module AutoExcerptHelpers
|
|
18
19
|
|
19
20
|
|
20
21
|
def html_excerpt(opts = {})
|
21
|
-
AutoExcerpt.new(HTML_BLOCK, opts)
|
22
|
+
AutoExcerpt::Parser.new(HTML_BLOCK, opts).parse
|
22
23
|
end
|
23
24
|
|
24
25
|
def normal_excerpt(opts = {})
|
25
|
-
AutoExcerpt.new(NORMAL_TEXT, opts)
|
26
|
+
AutoExcerpt::Parser.new(NORMAL_TEXT, opts).parse
|
26
27
|
end
|
27
28
|
|
28
29
|
def heavy_excerpt(opts = {})
|
29
|
-
AutoExcerpt.new(HEAVY_HTML_BLOCK, opts)
|
30
|
+
AutoExcerpt::Parser.new(HEAVY_HTML_BLOCK, opts).parse
|
30
31
|
end
|
31
32
|
|
32
33
|
def stripped_text(t)
|
33
34
|
t.gsub(/<[^>]*(>+|\s*\z)/m, "")
|
34
35
|
end
|
35
|
-
|
36
|
-
CRAP_HTML = ""
|
37
|
-
|
36
|
+
|
38
37
|
NORMAL_TEXT = %{Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
|
39
38
|
|
40
39
|
Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: auto_excerpt
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.7.0
|
4
|
+
version: 0.7.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kabari Hendrick
|
@@ -9,8 +9,8 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2010-
|
13
|
-
default_executable:
|
12
|
+
date: 2010-02-23 00:00:00 -06:00
|
13
|
+
default_executable: auto_excerpt
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
16
16
|
name: rspec
|
@@ -34,8 +34,8 @@ dependencies:
|
|
34
34
|
version:
|
35
35
|
description: Create excerpts from html formatted text. HTML tags are automatically closed.
|
36
36
|
email: kabari@gmail.com
|
37
|
-
executables:
|
38
|
-
|
37
|
+
executables:
|
38
|
+
- auto_excerpt
|
39
39
|
extensions: []
|
40
40
|
|
41
41
|
extra_rdoc_files:
|
@@ -48,6 +48,7 @@ files:
|
|
48
48
|
- README.textile
|
49
49
|
- Rakefile
|
50
50
|
- VERSION
|
51
|
+
- bin/auto_excerpt
|
51
52
|
- browser_test/browser_test.rb
|
52
53
|
- lib/auto_excerpt.rb
|
53
54
|
- lib/auto_excerpt/parser.rb
|