RubyGems - lingua - Versions diffs - 0.5.2 → 0.6.0 - Mend

lingua 0.5.2 → 0.6.0

Files changed (11) hide show

data/README.rdoc +2 -0
data/VERSION +1 -1
data/lib/lingua/en/paragraph.rb +10 -0
data/lib/lingua/en/readability.rb +142 -141
data/lib/lingua/en/sentence.rb +49 -24
data/lib/lingua/en/syllable/guess.rb +32 -26
data/lib/lingua/en/syllable.rb +10 -10
data/spec/lingua/en/paragraph_spec.rb +29 -0
data/spec/lingua/en/readability_spec.rb +29 -0
data/spec/lingua/en/sentence_spec.rb +136 -0
metadata +11 -4

data/README.rdoc CHANGED Viewed

@@ -4,6 +4,8 @@ This library is originally from http://pressure.to/ruby, by Alex Fenton <alex@pr
 It is currently maintained by David Balatero <dbalatero@gmail.com>.
+Slowly but surely, specs are being added (the original codebase was not tested), along with better functionality.
 == Note on Patches/Pull Requests
 * Fork the project.

data/VERSION CHANGED Viewed

	@@ -1 +1 @@
1	- 0.5.2
1	+ 0.6.0

data/lib/lingua/en/paragraph.rb ADDED Viewed

@@ -0,0 +1,10 @@
+module Lingua
+  module EN
+    module Paragraph
+      # Splits text into an array of paragraphs.
+      def self.paragraphs(text)
+        text.strip.split(/(?:\n[\r\t ]*)+/).collect { |p| p.strip }
+      end
+    end
+  end
+end

data/lib/lingua/en/readability.rb CHANGED Viewed

@@ -1,143 +1,144 @@
 module Lingua
-module EN
-# The class Lingua::EN::Readability takes English text and analyses formal
-# characteristic
-class Readability
-	require 'lingua/en/syllable'
-	require 'lingua/en/sentence'
-	attr_reader :text, :paragraphs, :sentences, :words,  :frequencies
-	# The constructor accepts the text to be analysed, and returns a report
-	# object which gives access to the
-	def initialize(text)
-		@text                = text.dup
-		@paragraphs          = text.split(/\n\s*\n\s*/)
-		@sentences           = Lingua::EN::Sentence.sentences(@text)
-		@words               = []
-		@frequencies         = {}
-		@frequencies.default = 0
-		@syllables           = 0
-		@complex_words       = 0
-		count_words
-	end
-	# The number of paragraphs in the sample. A paragraph is defined as a
-	# newline followed by one or more empty or whitespace-only lines.
-	def num_paragraphs
-		@paragraphs.length
-	end
-	# The number of sentences in the sample. The meaning of a "sentence" is
-	# defined by Lingua::EN::Sentence.
-	def num_sentences
-		@sentences.length
-	end
-	# The number of characters in the sample.
-	def num_chars
-		@text.length
-	end
-	alias :num_characters :num_chars
-	# The total number of words used in the sample. Numbers as digits are not
-	# counted.
-	def num_words
-		@words.length
-	end
-	# The total number of syllables in the text sample. Just for completeness.
-	def num_syllables
-		@syllables
-	end
-	# The number of different unique words used in the text sample.
-	def num_unique_words
-		@frequencies.keys.length
-	end
-	# An array containing each unique word used in the text sample.
-	def unique_words
-		@frequencies.keys
-	end
-	# The number of occurences of the word +word+ in the text sample.
-	def occurrences(word)
-		@frequencies[word]
-	end
-	# The average number of words per sentence.
-	def words_per_sentence
-		@words.length.to_f / @sentences.length.to_f
-	end
-	# The average number of syllables per word. The syllable count is performed
-	# by Lingua::EN::Syllable, and so may not be completely accurate, especially
-	# if the Carnegie-Mellon Pronouncing Dictionary is not installed.
-	def syllables_per_word
-		@syllables.to_f / @words.length.to_f
-	end
-	# Flesch-Kincaid level of the text sample. This measure scores text based
-	# on the American school grade system; a score of 7.0 would indicate that
-	# the text is readable by a seventh grader. A score of 7.0 to 8.0 is
-	# regarded as optimal for ordinary text.
-	def kincaid
-		(11.8 * syllables_per_word) +  (0.39 * words_per_sentence) - 15.59
-	end
-	# Flesch reading ease of the text sample. A higher score indicates text that
-	# is easier to read. The score is on a 100-point scale, and a score of 60-70
-	# is regarded as optimal for ordinary text.
-	def flesch
-		206.835 - (1.015 * words_per_sentence) - (84.6 * syllables_per_word)
-	end
-	# The Gunning Fog Index of the text sample. The index indicates the number
-	# of years of formal education that a reader of average intelligence would
-	# need to comprehend the text. A higher score indicates harder text; a value
-	# of around 12 is indicated as ideal for ordinary text.
-	def fog
-	  ( words_per_sentence +  percent_fog_complex_words ) * 0.4
-	end
-	# The percentage of words that are defined as "complex" for the purpose of
-	# the Fog Index. This is non-hyphenated words of three or more syllabes.
-	def percent_fog_complex_words
-		( @complex_words.to_f / @words.length.to_f ) * 100
-	end
-	# Return a nicely formatted report on the sample, showing most the useful
-	# statistics about the text sample.
-	def report
-		sprintf "Number of paragraphs           %d \n" <<
-				"Number of sentences            %d \n" <<
-				"Number of words                %d \n" <<
-				"Number of characters           %d \n\n" <<
-				"Average words per sentence     %.2f \n" <<
-				"Average syllables per word     %.2f \n\n" <<
-				"Flesch score                   %2.2f \n" <<
-				"Flesh-Kincaid grade level      %2.2f \n" <<
-				"Fog Index                      %2.2f \n",
-				num_paragraphs, num_sentences, num_words, num_characters,
-				words_per_sentence, syllables_per_word,
-				flesch, kincaid, fog
-	end
-	private
-	def count_words
-		for match in @text.scan /\b([a-z][a-z\-']*)\b/i
-			word = match[0]
-			@words.push word
-			@frequencies[word] += 1
-			syllables = Lingua::EN::Syllable.syllables(word)
-			@syllables += syllables
-			if syllables > 2 && word !~ /-/
-				@complex_words += 1 # for Fog Index
-			end
-		end
-	end
-end
-end
+  module EN
+    # The class Lingua::EN::Readability takes English text and analyses its
+    # formal characteristics, such as word, sentence and syllable counts.
+    class Readability
+      attr_reader :text, :paragraphs, :sentences, :words, :frequencies
+      # The constructor accepts the text to be analysed, and returns a report
+      # object which gives access to the statistics and scores computed from it.
+      def initialize(text)
+        @text                = text.dup
+        @paragraphs          = Lingua::EN::Paragraph.paragraphs(self.text)
+        @sentences           = Lingua::EN::Sentence.sentences(self.text)
+        @words               = []
+        @frequencies         = {}
+        @frequencies.default = 0
+        @syllables           = 0
+        @complex_words       = 0
+        count_words
+      end
+      # The number of paragraphs in the sample. Paragraphs are found by
+      # Lingua::EN::Paragraph, which splits the text on runs of line breaks.
+      def num_paragraphs
+        paragraphs.length
+      end
+      # The number of sentences in the sample. The meaning of a "sentence" is
+      # defined by Lingua::EN::Sentence.
+      def num_sentences
+        sentences.length
+      end
+      # The number of characters in the sample.
+      def num_chars
+        text.length
+      end
+      alias :num_characters :num_chars
+      # The total number of words used in the sample. Numbers as digits are not
+      # counted.
+      def num_words
+        words.length
+      end
+      # The total number of syllables in the text sample. Just for completeness.
+      def num_syllables
+        @syllables
+      end
+      # The number of different unique words used in the text sample.
+      def num_unique_words
+        @frequencies.keys.length
+      end
+      # An array containing each unique word used in the text sample.
+      def unique_words
+        @frequencies.keys
+      end
+      # The number of occurrences of the word +word+ in the text sample.
+      def occurrences(word)
+        @frequencies[word]
+      end
+      # The average number of words per sentence.
+      def words_per_sentence
+        words.length.to_f / sentences.length.to_f
+      end
+      # The average number of syllables per word. The syllable count is
+      # performed by Lingua::EN::Syllable, which guesses from spelling
+      # patterns (the Carnegie-Mellon Pronouncing Dictionary lookup has been
+      # removed), and so may not be completely accurate.
+      def syllables_per_word
+        @syllables.to_f / words.length.to_f
+      end
+      # Flesch-Kincaid level of the text sample. This measure scores text based
+      # on the American school grade system; a score of 7.0 would indicate that
+      # the text is readable by a seventh grader. A score of 7.0 to 8.0 is
+      # regarded as optimal for ordinary text.
+      def kincaid
+        (11.8 * syllables_per_word) +  (0.39 * words_per_sentence) - 15.59
+      end
+      # Flesch reading ease of the text sample. A higher score indicates text
+      # that is easier to read. The score is on a 100-point scale, and a score
+      # of 60-70 is regarded as optimal for ordinary text.
+      def flesch
+        206.835 - (1.015 * words_per_sentence) - (84.6 * syllables_per_word)
+      end
+      # The Gunning Fog Index of the text sample. The index indicates the number
+      # of years of formal education that a reader of average intelligence would
+      # need to comprehend the text. A higher score indicates harder text; a
+      # value of around 12 is indicated as ideal for ordinary text.
+      def fog
+        ( words_per_sentence +  percent_fog_complex_words ) * 0.4
+      end
+      # The percentage of words that are defined as "complex" for the purpose of
+      # the Fog Index. These are non-hyphenated words of three or more syllables.
+      def percent_fog_complex_words
+        ( @complex_words.to_f / words.length.to_f ) * 100
+      end
+      # Return a nicely formatted report on the sample, showing most of the
+      # useful statistics about the text sample.
+      def report
+        sprintf "Number of paragraphs           %d \n" <<
+        "Number of sentences            %d \n" <<
+        "Number of words                %d \n" <<
+        "Number of characters           %d \n\n" <<
+        "Average words per sentence     %.2f \n" <<
+        "Average syllables per word     %.2f \n\n" <<
+        "Flesch score                   %2.2f \n" <<
+        "Flesh-Kincaid grade level      %2.2f \n" <<
+        "Fog Index                      %2.2f \n",
+          num_paragraphs, num_sentences, num_words, num_characters,
+          words_per_sentence, syllables_per_word,
+          flesch, kincaid, fog
+      end
+      private
+      def count_words
+        @text.scan(/\b([a-z][a-z\-']*)\b/i).each do |match|
+          word = match[0]
+          @words << word
+          # up frequency counts
+          @frequencies[word] += 1
+          # syllable counts
+          syllables = Lingua::EN::Syllable.syllables(word)
+          @syllables += syllables
+          if syllables > 2 && !word.include?('-')
+            @complex_words += 1 # for Fog Index
+          end
+        end
+      end
+    end
+  end
 end

data/lib/lingua/en/sentence.rb CHANGED Viewed

@@ -1,62 +1,87 @@
 module Lingua
   module EN
-    # The module Lingua::EN::Sentence takes English text, and attempts to
+    # The class Lingua::EN::Sentence takes English text, and attempts to
     # split it up into sentences, respecting abbreviations.
-    module Sentence
+    class Sentence
+      class << self
+        attr_reader :abbreviations
+        attr_reader :abbr_regex
+      end
       EOS = "\001" unless defined?(EOS) # temporary end of sentence marker
       Titles   = [ 'jr', 'mr', 'mrs', 'ms', 'dr', 'prof', 'sr', 'sen', 'rep',
         'rev', 'gov', 'atty', 'supt', 'det', 'rev', 'col','gen', 'lt',
         'cmdr', 'adm', 'capt', 'sgt', 'cpl', 'maj' ] unless defined?(Titles)
       Entities = [ 'dept', 'univ', 'uni', 'assn', 'bros', 'inc', 'ltd', 'co',
         'corp', 'plc' ] unless defined?(Entities)
       Months   = [ 'jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
         'aug', 'sep', 'oct', 'nov', 'dec', 'sept' ] unless defined?(Months)
       Days     = [ 'mon', 'tue', 'wed', 'thu',
                    'fri', 'sat', 'sun' ] unless defined?(Days)
       Misc     = [ 'vs', 'etc', 'no', 'esp', 'cf' ] unless defined?(Misc)
       Streets  = [ 'ave', 'bld', 'blvd', 'cl', 'ct',
                    'cres', 'dr', 'rd', 'st' ] unless defined?(Streets)
-      @@abbreviations = Titles + Entities + Months + Days + Streets + Misc
+      # Finds abbreviations, like e.g., i.e., U.S., u.S., U.S.S.R.
+      ABBR_DETECT = /(?:\s(?:(?:(?:\w\.){2,}\w?)|(?:\w\.\w)))/ unless defined?(ABBR_DETECT)
+      # Finds punctuation that ends sentences: ".", "?" or "!", optionally
+      # followed by a closing quote or bracket, and then whitespace.
+      PUNCTUATION_DETECT = /([\.?!](?:\"|\'|\)|\]|\})?)(\s+)/ unless defined?(PUNCTUATION_DETECT)
+      CORRECT_ABBR = /(#{ABBR_DETECT})#{EOS}(\s+[a-z0-9])/
       # Split the passed text into individual sentences, trim these and return
       # as an array. A sentence is marked by one of the punctuation marks ".", "?"
       # or "!" followed by whitespace. Sequences of full stops (such as an
       # ellipsis marker "...") and stops after a known abbreviation are ignored.
       def self.sentences(text)
+        # Make sure we work with a duplicate, as we are modifying the
+        # text with #gsub!
         text = text.dup
-        # initial split after punctuation - have to preserve trailing whitespace
-        # for the ellipsis correction next
-        # would be nicer to use look-behind and look-ahead assertions to skip
-        # ellipsis marks, but Ruby doesn't support look-behind
-        text.gsub!( /([\.?!](?:\"|\'|\)|\]|\})?)(\s+)/ ) { $1 << EOS << $2 }
+        # Mark end of sentences with EOS marker.
+        # We preserve the trailing whitespace ($2) so that we can
+        # fix ellipses (...)!
+        text.gsub!(PUNCTUATION_DETECT) { $1 << EOS << $2 }
-        # correct ellipsis marks and rows of stops
-        text.gsub!( /(\.\.\.*)#{EOS}/ ) { $1 }
+        # Correct ellipsis marks.
+        text.gsub!(/(\.\.\.*)#{EOS}/) { $1 }
-        # correct abbreviations
-        # TODO - precompile this regex?
-        text.gsub!( /(#{@@abbreviations.join("|")})\.#{EOS}/i ) { $1 << '.' }
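+        # Undo breaks after dotted abbreviations (e.g., i.e., U.S.) when the
+        # sentence continues with a lowercase letter or a digit.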
+        text.gsub!(CORRECT_ABBR, "\\1\\2")
-        # split on EOS marker, strip gets rid of trailing whitespace
-        text.split(EOS).map { | sentence | sentence.strip }
+        # Correct abbreviations
+        text.gsub!(@abbr_regex) { $1 << '.' }
+        # Split on EOS marker, get rid of trailing whitespace.
+        # Remove empty sentences.
+        text.split(EOS).
+          map { |sentence| sentence.strip }.
+          delete_if { |sentence| sentence.nil? || sentence.empty? }
       end
-      # add a list of abbreviations to the list that's used to detect false
+      # Adds a list of abbreviations to the list that's used to detect false
       # sentence ends. Returns the current list of abbreviations in use.
       def self.abbreviation(*abbreviations)
-        @@abbreviations += abbreviations
-        @@abbreviations
+        @abbreviations += abbreviations
+        @abbreviations.uniq!
+        set_abbr_regex!
+        @abbreviations
+      end
+      def self.initialize_abbreviations!
+        @abbreviations = Titles + Entities + Months + Days + Streets + Misc
+        set_abbr_regex!
+      end
+      def self.set_abbr_regex!
+        @abbr_regex = /(#{abbreviations.join("|")})\.#{EOS}/i
       end
+      initialize_abbreviations!
     end
   end
 end

data/lib/lingua/en/syllable/guess.rb CHANGED Viewed

@@ -1,33 +1,39 @@
 module Lingua
   module EN
     module Syllable
-      # Uses english word patterns to guess the number of syllables. A single module
-      # method is made available, +syllables+, which, when passed an english word,
-      # will return the number of syllables it estimates are in the word.
-      # English orthography (the representation of spoken sounds as written signs) is
-      # not regular. The same spoken sound can be represented in multiple different
-      # ways in written English (e.g. rough/cuff), and the same written letters
-      # can be pronounced in different ways in different words (e.g. rough/bough).
-      # As the same series of letters can be pronounced in different ways, it is not
-      # possible to write an algorithm which can always guess the number of syllables
-      # in an english word correctly. However, it is possible to use frequently
-      # recurring patterns in english (such as "a final -e is usually silent") to
-      # guess with a level of accuracy that is acceptable for applications like
-      # syllable counting for readability scoring. This module implements such an
-      # algorithm.
-      # This module is inspired by the Perl Lingua::EN::Syllable module. However, it
-      # uses a different (though not larger) set of patterns to compensate for the
-      # 'special cases' which arise out of English's irregular orthography. A number
-      # of extra patterns (particularly for derived word forms) means that this module
-      # is somewhat more accurate than the Perl original. It also omits a number of
-      # patterns found in the original which seem to me to apply to such a small number
-      # of cases, or to be of dubious value. Testing the guesses against the Carnegie
-      # Mellon Pronouncing Dictionary, this module guesses right around 90% of the
+      # Uses English word patterns to guess the number of syllables. A single
+      # module method is made available, +syllables+, which, when passed an
+      # English word, will return the number of syllables it estimates are in
+      # the word.
+      #
+      # English orthography (the representation of spoken sounds as written
+      # signs) is not regular. The same spoken sound can be represented in
+      # multiple different ways in written English (e.g. rough/cuff), and the
+      # same written letters can be pronounced in different ways in different
+      # words (e.g. rough/bough).
+      #
+      # As the same series of letters can be pronounced in different ways, it is
+      # not possible to write an algorithm which can always guess the number of
+      # syllables in an English word correctly. However, it is possible to use
+      # frequently recurring patterns in English (such as "a final -e is usually
+      # silent") to guess with a level of accuracy that is acceptable for
+      # applications like syllable counting for readability scoring. This module
+      # implements such an algorithm.
+      #
+      # This module is inspired by the Perl Lingua::EN::Syllable module.
+      # However, it uses a different (though not larger) set of patterns to
+      # compensate for the 'special cases' which arise out of English's
+      # irregular orthography. A number of extra patterns (particularly for
+      # derived word forms) means that this module is somewhat more accurate
+      # than the Perl original. It also omits a number of patterns found in the
+      # original which seem to me to apply to such a small number of cases, or
+      # to be of dubious value. Testing the guesses against the Carnegie Mellon
+      # Pronouncing Dictionary, this module guesses right around 90% of the
       # time, as against about 85% of the time for the Perl module. However, the
-      # dictionary contains a large number of foreign loan words and proper names, and
-      # so when the algorithm is tested against 'real world' english, its accuracy
-      # is a good deal better. Testing against a range of samples, it guesses right
-      # about 95-97% of the time.
+      # dictionary contains a large number of foreign loan words and proper
+      # names, and so when the algorithm is tested against 'real world' English,
+      # its accuracy is a good deal better. Testing against a range of samples,
+      # it guesses right about 95-97% of the time.
       module Guess
         # special cases - 1 syllable less than expected
         SubSyl = [

data/lib/lingua/en/syllable.rb CHANGED Viewed

@@ -2,18 +2,18 @@ require 'lingua/en/syllable/guess'
 module Lingua
   module EN
-    # The module Lingua::EN::Syllable contains a single class method, +syllable+,
-    # which will use the most accurate technique available to determine the number
-    # syllables in a string containing a word passed to it.
+    # The module Lingua::EN::Syllable contains a single class method,
+    # +syllables+, which will use the most accurate technique available to
+    # determine the number of syllables in a string containing a word passed
+    # to it.
+    #
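+    # NOTE (dbalatero): the dictionary lookup described below has been REMOVED;
+    # +syllables+ now always delegates to Lingua::EN::Syllable::Guess.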
     # The exact definition of the function depends on the availability of the
-    # Carnegie Mellon Pronouncing Dictionary on the system. If it is available,
-    # the number of syllables as determined by the dictionary will be returned. If
-    # the dictionary is not available, or if a word not contained in the dictionary
-    # is passed, it will return the number of syllables as determined by the
-    # module Lingua::EN::Syllable::Guess. For more details, see there and
+    # Carnegie Mellon Pronouncing Dictionary on the system. If it is available,
+    # the number of syllables as determined by the dictionary will be returned.
+    # If the dictionary is not available, or if a word not contained in the
+    # dictionary is passed, it will return the number of syllables as determined
+    # by the module Lingua::EN::Syllable::Guess. For more details, see there and
     # Lingua::EN::Syllable::Dictionary.
-    #
-    # dbalatero: removed dictionary.
     module Syllable
       def self.syllables(word)
         Guess::syllables word

data/spec/lingua/en/paragraph_spec.rb ADDED Viewed

@@ -0,0 +1,29 @@
+require File.dirname(__FILE__) + "/../../spec_helper"
+describe Lingua::EN::Paragraph do
+  describe "#paragraphs" do
+    it "should return paragraphs with extra whitespace in the line breaks" do
+      text = "Ok.\n    \nTest."
+      result = Lingua::EN::Paragraph.paragraphs(text)
+      result.should have(2).things
+      result[0].should == "Ok."
+      result[1].should == "Test."
+    end
+    it "should break up paragraphs with > 2 line breaks" do
+      text = "Ok.\n\n\nTest."
+      result = Lingua::EN::Paragraph.paragraphs(text)
+      result.should have(2).things
+      result[0].should == "Ok."
+      result[1].should == "Test."
+    end
+    it "should ignore trailing newline chars" do
+      text = "Ok.\n  \n\nTest.\n  \r\n  \n\n"
+      result = Lingua::EN::Paragraph.paragraphs(text)
+      result.should have(2).things
+      result[0].should == "Ok."
+      result[1].should == "Test."
+    end
+  end
+end

data/spec/lingua/en/readability_spec.rb ADDED Viewed

@@ -0,0 +1,29 @@
+require File.dirname(__FILE__) + "/../../spec_helper"
+describe Lingua::EN::Readability do
+  before(:each) do
+    @text = <<-EOF
+    After marriage, the next big event in the couples lives will be their honeymoon. It is a time when the newly weds can get away from relatives and friends to spend some significant time getting to know one another. This time alone together that the couple shares is called the honeymoon. A great gift idea for the married couple would be to give them a surprise tour package. Most women would like to go on a honeymoon.
+    The week or two before the ceremonies would be the best time to schedule a tour because then the budget for this event could be considered. In winter there are more opportunities for the couple to get close to one another because of the cold weather. It is easier to snuggle when the weather is not favorable to outdoor activities. This would afford the couple ample time to know more about themselves during the honeymoon.
+    Honeymoon plans should be discussed with the wife to ensure that the shock is pleasant and not a negative experience to her. It is also a good idea in this case, to ask her probing questions as to where she would like to go. Perhaps you could get a friend or family member to ask her what would be her favorite travel location. That would ensure that you know just what she is looking for.
+    Make sure that the trip is exactly what she wants. Then on the wedding night tell her about the adventure so that the needed accommodations can be made.
+    EOF
+    @report = Lingua::EN::Readability.new(@text)
+  end
+  describe "#num_paragraphs" do
+    it "should return the correct count of paragraphs" do
+      @report.num_paragraphs.should == 4
+    end
+  end
+  describe "#num_sentences" do
+    it "should be the correct count of sentences" do
+      @report.num_sentences.should == 15
+    end
+  end
+end

data/spec/lingua/en/sentence_spec.rb ADDED Viewed

@@ -0,0 +1,136 @@
+require File.dirname(__FILE__) + "/../../spec_helper"
+describe Lingua::EN::Sentence do
+  klass = Lingua::EN::Sentence
+  describe "#sentences" do
+    describe "multi-paragraph text" do
+      before(:each) do
+        text = "As Milton Bradley once said, \"board games are the shit.\" And I'm inclined to agree. \"Why can't we be friends?\"\n\n"
+        text << "Visit http://www.google.com and check out my site. Thanks very much!"
+        @sentences = klass.sentences(text)
+      end
+      it "should get the correct number of sentences" do
+        @sentences.should have(5).things
+      end
+      it "should get the correct sentences" do
+        @sentences[0].should == "As Milton Bradley once said, \"board games are the shit.\""
+        @sentences[1].should == "And I'm inclined to agree."
+        @sentences[2].should == "\"Why can't we be friends?\""
+        @sentences[3].should == "Visit http://www.google.com and check out my site."
+        @sentences[4].should == "Thanks very much!"
+      end
+    end
+    describe "quoted sentences" do
+      before(:each) do
+        text = "As Milton Bradley once said, \"board games are the shit.\" And I'm inclined to agree. \"Why can't we be friends?\""
+        @sentences = klass.sentences(text)
+      end
+      it "should get the correct number of sentences" do
+        @sentences.should have(3).things
+      end
+      it "should get the correct sentences" do
+        @sentences[0].should == "As Milton Bradley once said, \"board games are the shit.\""
+        @sentences[1].should == "And I'm inclined to agree."
+        @sentences[2].should == "\"Why can't we be friends?\""
+      end
+    end
+    describe "ellipses correction" do
+      before(:each) do
+        text = "Well... why would you do that? Let's not fight."
+        @sentences = klass.sentences(text)
+      end
+      it "should get the correct number of sentences" do
+        @sentences.should have(2).things
+      end
+      it "should get the right sentences" do
+        @sentences[0].should == "Well... why would you do that?"
+        @sentences[1].should == "Let's not fight."
+      end
+    end
+    describe "simple URL matching" do
+      before(:each) do
+        text = "Hello, visit http://www.google.com/index.php?ok=ok for more info. Ok?"
+        @sentences = klass.sentences(text)
+      end
+      it "should get the correct number of sentences" do
+        @sentences.should have(2).things
+      end
+      it "should get the right sentences" do
+        @sentences[0].should == "Hello, visit http://www.google.com/index.php?ok=ok for more info."
+        @sentences[1].should == "Ok?"
+      end
+    end
+    describe "ending a sentence with an abbreviation" do
+      before(:each) do
+        text = "I was born in the U.S.S.R. My parents were from the U.S. This is not weird."
+        @sentences = klass.sentences(text)
+      end
+      it "should get the correct number of sentences" do
+        @sentences.should have(3).things
+      end
+      it "should get the correct sentences" do
+        @sentences[0].should == "I was born in the U.S.S.R."
+        @sentences[1].should == "My parents were from the U.S."
+        @sentences[2].should == "This is not weird."
+      end
+    end
+    describe "basic sentences" do
+      before(:each) do
+        text = "Hello, my name is David. What is your name?"
+        @sentences = klass.sentences(text)
+      end
+      it "should get the correct number of sentences" do
+        @sentences.should have(2).things
+      end
+    end
+    describe "sentences with URLs and abbreviation" do
+      before(:each) do
+        text = "Many of these leading names now have their own website, e.g.  http://www.kaptest.com/. Hello, e.g. you don't know what you mean. I'm so angry about what you said about the U.S.A. or the u.S. or the U.S.S.R. ok."
+        @sentences = klass.sentences(text)
+      end
+      it "should get the correct number of sentences" do
+        @sentences[0].should == "Many of these leading names now have their own website, e.g.  http://www.kaptest.com/."
+        @sentences[1].should == "Hello, e.g. you don't know what you mean."
+        @sentences[2].should == "I'm so angry about what you said about the U.S.A. or the u.S. or the U.S.S.R. ok."
+        @sentences.should have(3).things
+      end
+    end
+  end
+  describe "#abbreviation" do
+    it "should change the abbreviations list" do
+      klass.abbreviation('monkey', 'pig')
+      klass.abbreviations.should include('monkey')
+      klass.abbreviations.should include('pig')
+    end
+    it "should change the regex for abbreviations" do
+      lambda {
+        klass.abbreviation('monkey')
+      }.should change(klass, :abbr_regex)
+    end
+    after(:each) do
+      klass.initialize_abbreviations!
+    end
+  end
+end

metadata CHANGED Viewed

@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
   prerelease: false
   segments:
   - 0
-  - 5
-  - 2
-  version: 0.5.2
+  - 6
+  - 0
+  version: 0.6.0
 platform: ruby
 authors:
 - David Balatero
@@ -14,7 +14,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2010-04-11 00:00:00 -07:00
+date: 2010-04-20 00:00:00 -07:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
@@ -48,10 +48,14 @@ files:
 - Rakefile
 - VERSION
 - lib/lingua.rb
+- lib/lingua/en/paragraph.rb
 - lib/lingua/en/readability.rb
 - lib/lingua/en/sentence.rb
 - lib/lingua/en/syllable.rb
 - lib/lingua/en/syllable/guess.rb
+- spec/lingua/en/paragraph_spec.rb
+- spec/lingua/en/readability_spec.rb
+- spec/lingua/en/sentence_spec.rb
 - spec/spec.opts
 - spec/spec_helper.rb
 has_rdoc: true
@@ -85,4 +89,7 @@ signing_key:
 specification_version: 3
 summary: This is a maintained version of Ruby's Lingua port.
 test_files:
+- spec/lingua/en/paragraph_spec.rb
+- spec/lingua/en/readability_spec.rb
+- spec/lingua/en/sentence_spec.rb
 - spec/spec_helper.rb