auto_excerpt 0.7.0 → 0.7.1

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore CHANGED
@@ -1,5 +1 @@
1
- *.php
2
- *.zip
3
- extra
4
- *.gemspec
5
- pkg
1
+ /extra/
data/CHANGELOG CHANGED
@@ -1,3 +1,5 @@
1
+ == 0.7.1
2
+ * Added auto_excerpt command-line executable
1
3
  == 0.7.0 (2010-01-31)
2
4
  * Changed AutoExcerpt from a Class to a Module in order to return a String object when used
3
5
  * Removed String#clean
@@ -55,6 +55,11 @@ Default: false
55
55
  The number of [which] to skip at the beginning of the html when returned.
56
56
  Default: 0
57
57
 
58
+ h3. Command Line
59
+
60
+ *new: v0.7.1* Run @auto_excerpt@ from the command line with all of the options shown above!
61
+ @$ auto_excerpt --words 10 "<p>Some html string</p>"@
62
+
58
63
  h2. Help out on Github!
59
64
 
60
65
  * Fork the project.
data/Rakefile CHANGED
@@ -30,7 +30,10 @@ end
30
30
 
31
31
  begin
32
32
  require 'yard'
33
- YARD::Rake::YardocTask.new
33
+ YARD::Rake::YardocTask.new do |t|
34
+ t.files = ['lib/**/*.rb', 'README.textile', 'CHANGELOG', 'LICENSE']
35
+ t.options = ['--any', '--extra', '--opts']
36
+ end
34
37
  rescue LoadError
35
38
  task :yard do
36
39
  abort "YARD is not available. In order to run yardoc, you must: sudo gem install yard"
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.7.0
1
+ 0.7.1
@@ -0,0 +1,59 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "rubygems"
4
+ require "auto_excerpt"
5
+ require "optparse"
6
+
7
+ def perr(msg)
8
+ $stderr.puts(msg)
9
+ end
10
+
11
+ def dashed(key)
12
+ key.to_s.gsub('_','-')
13
+ end
14
+
15
+ @options = {}
16
+
17
+ options_parser = OptionParser.new do |o|
18
+ o.banner = "Usage: auto_excerpt [options] STRING|FILE"
19
+ o.separator ""
20
+
21
+ [
22
+ :characters,
23
+ :words,
24
+ :sentences,
25
+ :paragraphs,
26
+ :skip_words,
27
+ :skip_sentences,
28
+ :skip_paragraphs
29
+ ].each do |key|
30
+ o.on("--#{dashed(key)} N", Integer){ |n| @options[key] = n }
31
+ end
32
+
33
+ [
34
+ :strip_html ,
35
+ :strip_breaks_tabs ,
36
+ :strip_paragraphs
37
+ ].each do |key|
38
+ o.on("--#{dashed(key)}"){ |b| @options[key] = b }
39
+ end
40
+
41
+ o.on("--allowed_tags a,b,c", Array){|allowed| @options[:allowed_tags] = allowed }
42
+
43
+ o.on('--[no-]ending [STRING]'){ |s| @options[:ending] = s || nil }
44
+ o.on_tail('-h','--help'){ puts o; exit }
45
+ end
46
+
47
+ begin
48
+ options_parser.parse!(ARGV)
49
+ string_or_file = ARGV.last
50
+ raise(ArgumentError, "Please provide a STRING or FILE to parse.") unless string_or_file
51
+ string_or_file = File.read(string_or_file) if File.exist?(string_or_file)
52
+ puts AutoExcerpt.new(string_or_file, @options)
53
+ rescue => e
54
+ perr("Error: #{e.message}\n")
55
+ perr(e.backtrace)
56
+ exit(1)
57
+ end
58
+
59
+
@@ -1,8 +1,12 @@
1
1
  require File.join(File.dirname(__FILE__), *%w[auto_excerpt parser])
2
2
 
3
3
  module AutoExcerpt
4
- def self.new(text, options = {})
5
- parser = Parser.new(text, options)
4
+ # @param [String] html A string of html.
5
+ # @param [Hash] options A hash of options
6
+ # @return [String]
7
+ # @see Parser#initialize List of options
8
+ def self.new(html, options = {})
9
+ parser = Parser.new(html, options)
6
10
  parser.parse
7
11
  end
8
12
  end
@@ -2,25 +2,26 @@ module AutoExcerpt
2
2
  # TODO allow for default options to be set.
3
3
  class Parser
4
4
  DEFAULTS = {
5
- :characters => 0,
6
- :words => 0,
7
- :sentences => 0,
8
- :paragraphs => 0,
5
+ :characters => 0,
6
+ :words => 0,
7
+ :sentences => 0,
8
+ :paragraphs => 0,
9
9
  # :skip_characters => 0,
10
- :skip_words => 0,
11
- :skip_sentences => 0,
12
- :skip_paragraphs => 0,
13
- :ending => '...',
14
- :strip_html => false, :allowed_tags => [],
10
+ :skip_words => 0,
11
+ :skip_sentences => 0,
12
+ :skip_paragraphs => 0,
13
+ :ending => '...',
14
+ :strip_html => false,
15
+ :allowed_tags => [],
15
16
  :strip_breaks_tabs => false,
16
- :strip_paragraphs => false
17
+ :strip_paragraphs => false
17
18
  }
18
19
 
19
- # TODO add and allowwed tags option
20
20
  PUNCTUATION_MARKS = /\!\s|\.\s|\?\s/
21
- NO_CLOSE = %w( br hr img input ) # tags that do not have opposite closing tags
22
- OPENING_TAG = /<([a-z0-9]{1,})\b[^>]*>/im
23
- CLOSING_TAG = /<\/([a-z0-9]{1,})>/im
21
+ # tags that do not have opposite closing tags
22
+ NO_CLOSE = %w( br hr img input )
23
+ OPENING_TAG = /<([a-z0-9]{1,})\b[^>]*>/im
24
+ CLOSING_TAG = /<\/([a-z0-9]{1,})>/im
24
25
 
25
26
  # @param [String] text The text to be excerpted
26
27
  # @param [Hash] settings The settings for creating the excerpt
@@ -36,10 +37,11 @@ module AutoExcerpt
36
37
  # @option settings [Boolean] :strip_html (false) Strip all HTML from the text before creating the excerpt
37
38
  # @option settings [Boolean] :strip_paragraphs (false) Strip all <p> tags from the HTML before creating the excerpt
38
39
  def initialize(text, settings = {})
40
+ # undo this and change how settings are stored
39
41
  @settings = Marshal.load(Marshal.dump(DEFAULTS)).merge(settings)
40
42
 
41
43
  # make our copy
42
- @body = text.dup.strip
44
+ @body = text.dup.strip
43
45
  @excerpt = ""
44
46
 
45
47
  if @settings[:strip_html]
@@ -49,10 +51,10 @@ module AutoExcerpt
49
51
  @body = clean(@body) if @settings[:strip_breaks_tabs]
50
52
  # TODO replace this with better regex
51
53
  @body.replace(@body.gsub(/<(\/|)p>/,'')) if @settings[:strip_paragraphs]
52
- @charcount = strip_html(@body).length
53
- @wordcount = strip_html(@body).scan(/\w+/).size
54
- @sencount = @body.split(PUNCTUATION_MARKS).size
55
- @pghcount = @body.split("</p>").size
54
+ @charcount = strip_html(@body).length
55
+ @wordcount = strip_html(@body).scan(/\w+/).size
56
+ @sencount = @body.split(PUNCTUATION_MARKS).size
57
+ @pghcount = @body.split("</p>").size
56
58
  @settings[:characters] = 150 if @settings.values_at(:characters, :words, :sentences, :paragraphs).all?{|val| val.zero? || val.nil? }
57
59
  end
58
60
 
@@ -66,12 +68,14 @@ module AutoExcerpt
66
68
  alias_method :parse, :create_excerpt
67
69
 
68
70
  protected
69
-
71
+
72
+ # @api private
70
73
  attr_reader :charcount, :wordcount, :sencount, :pghcount
74
+
75
+ # @api private
71
76
  attr_accessor :settings, :body, :excerpt
72
77
 
73
- # close html tags
74
- # TODO make this work with new strip_html method. Improve regex
78
+ # close html tags
75
79
  def close_tags(text)
76
80
  # Don't bother closing tags if html is stripped since there are no tags.
77
81
  if @settings[:strip_html] && @settings[:allowed_tags].empty?
@@ -96,7 +100,7 @@ module AutoExcerpt
96
100
  end
97
101
  end
98
102
 
99
- @excerpt = [text, @settings[:ending], tagstoclose].compact.join
103
+ @excerpt = [text, @settings[:ending], tagstoclose].join
100
104
  end
101
105
 
102
106
  def non_excerpted_text
@@ -174,4 +178,4 @@ module AutoExcerpt
174
178
  @stripped_html = html.gsub(reg,'')
175
179
  end
176
180
  end
177
- end
181
+ end
@@ -4,6 +4,14 @@ require File.join(File.dirname(__FILE__), *%w[shared strip_html_spec])
4
4
  # I definitely need more tests
5
5
  describe AutoExcerpt do
6
6
 
7
+ it { should respond_to(:new) }
8
+
9
+ it "should return a string" do
10
+ AutoExcerpt.new("foo bar").should be_instance_of(String)
11
+ end
12
+ end
13
+
14
+ describe AutoExcerpt::Parser do
7
15
  it "should limit characters" do
8
16
  text = html_excerpt({:characters => 5, :ending => nil})
9
17
  stripped_text(text).length.should eql(5)
@@ -18,13 +26,13 @@ describe AutoExcerpt do
18
26
  end
19
27
 
20
28
  it "does not include html tags or entities in character count" do
21
- AutoExcerpt.new("<h1>Hello World!</h1>", {:characters => 5, :ending => nil}).should == "<h1>Hello</h1>"
22
- AutoExcerpt.new("<h1>Copyright &copy; 2010</h1>", {:characters => 11, :ending => nil}).should == "<h1>Copyright &copy;</h1>"
29
+ AutoExcerpt::Parser.new("<h1>Hello World!</h1>", {:characters => 5, :ending => nil}).parse.should == "<h1>Hello</h1>"
30
+ AutoExcerpt::Parser.new("<h1>Copyright &copy; 2010</h1>", {:characters => 11, :ending => nil}).parse.should == "<h1>Copyright &copy;</h1>"
23
31
  end
24
32
 
25
33
  it "should not cutoff in the middle of a word" do
26
- AutoExcerpt.new("<h1>Hello World!</h1>", {:characters => 4, :ending => nil}).should == "<h1>Hello</h1>"
27
- AutoExcerpt.new("<h1>Hello World!</h1>", {:characters => 7, :ending => nil}).should == "<h1>Hello World</h1>"
34
+ AutoExcerpt::Parser.new("<h1>Hello World!</h1>", {:characters => 4, :ending => nil}).parse.should == "<h1>Hello</h1>"
35
+ AutoExcerpt::Parser.new("<h1>Hello World!</h1>", {:characters => 7, :ending => nil}).parse.should == "<h1>Hello World</h1>"
28
36
  end
29
37
 
30
38
  it "should limit words" do
@@ -64,17 +72,17 @@ describe AutoExcerpt do
64
72
  <br />crap<b>dddd
65
73
  <a href="/activity/read_and_frwd/1251?type=comment">(Open)</a>
66
74
  }
67
- text = AutoExcerpt.new(t,{:characters => 270})
75
+ text = AutoExcerpt::Parser.new(t,{:characters => 270}).parse
68
76
  text.match(/(<(\/|)b>)/).captures.length.should eql(2)
69
77
  end
70
-
71
- end
72
78
 
73
- describe AutoExcerpt, "when stripping HTML" do
79
+ describe "when stripping HTML" do
74
80
 
75
- it_should_behave_like "an HTML stripper"
81
+ it_should_behave_like "an HTML stripper"
76
82
 
77
- it "should not strip P tags if :paragraphs option is set" do
78
- AutoExcerpt.new("<p>this is a paragraph.</p><p>this is also a paragraph.</p>",{:paragraphs => 1, :strip_html => true}).should eql("<p>this is a paragraph.</p>")
79
- end
80
- end
83
+ it "should not strip P tags if :paragraphs option is set" do
84
+ AutoExcerpt::Parser.new("<p>this is a paragraph.</p><p>this is also a paragraph.</p>",{:paragraphs => 1, :strip_html => true}).parse.should eql("<p>this is a paragraph.</p>")
85
+ end
86
+ end
87
+ end
88
+
@@ -5,7 +5,8 @@ require "webrick/htmlutils"
5
5
 
6
6
  Object.class_eval do
7
7
  alias_method :old_pp, :pp
8
-
8
+
9
+ # so I can inspect the html in TextMate when things get hard to interpret
9
10
  def pp(str)
10
11
  str = WEBrick::HTMLUtils.escape(str) if str.is_a?(String)
11
12
  old_pp(str)
@@ -18,23 +19,21 @@ module AutoExcerptHelpers
18
19
 
19
20
 
20
21
  def html_excerpt(opts = {})
21
- AutoExcerpt.new(HTML_BLOCK, opts)
22
+ AutoExcerpt::Parser.new(HTML_BLOCK, opts).parse
22
23
  end
23
24
 
24
25
  def normal_excerpt(opts = {})
25
- AutoExcerpt.new(NORMAL_TEXT, opts)
26
+ AutoExcerpt::Parser.new(NORMAL_TEXT, opts).parse
26
27
  end
27
28
 
28
29
  def heavy_excerpt(opts = {})
29
- AutoExcerpt.new(HEAVY_HTML_BLOCK, opts)
30
+ AutoExcerpt::Parser.new(HEAVY_HTML_BLOCK, opts).parse
30
31
  end
31
32
 
32
33
  def stripped_text(t)
33
34
  t.gsub(/<[^>]*(>+|\s*\z)/m, "")
34
35
  end
35
-
36
- CRAP_HTML = ""
37
-
36
+
38
37
  NORMAL_TEXT = %{Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
39
38
 
40
39
  Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: auto_excerpt
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.0
4
+ version: 0.7.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kabari Hendrick
@@ -9,8 +9,8 @@ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
11
 
12
- date: 2010-01-31 00:00:00 -06:00
13
- default_executable:
12
+ date: 2010-02-23 00:00:00 -06:00
13
+ default_executable: auto_excerpt
14
14
  dependencies:
15
15
  - !ruby/object:Gem::Dependency
16
16
  name: rspec
@@ -34,8 +34,8 @@ dependencies:
34
34
  version:
35
35
  description: Create excerpts from html formatted text. HTML tags are automatically closed.
36
36
  email: kabari@gmail.com
37
- executables: []
38
-
37
+ executables:
38
+ - auto_excerpt
39
39
  extensions: []
40
40
 
41
41
  extra_rdoc_files:
@@ -48,6 +48,7 @@ files:
48
48
  - README.textile
49
49
  - Rakefile
50
50
  - VERSION
51
+ - bin/auto_excerpt
51
52
  - browser_test/browser_test.rb
52
53
  - lib/auto_excerpt.rb
53
54
  - lib/auto_excerpt/parser.rb