lingua 0.5.2 → 0.6.0

Sign up to get free protection for your applications and to get access to all the features.
data/README.rdoc CHANGED
@@ -4,6 +4,8 @@ This library is originally from http://pressure.to/ruby, by Alex Fenton <alex@pr
4
4
 
5
5
  It is currently maintained by David Balatero <dbalatero@gmail.com>.
6
6
 
7
+ Slowly but surely, specs are being added (original codebase was not tested), as well as better functionality.
8
+
7
9
  == Note on Patches/Pull Requests
8
10
 
9
11
  * Fork the project.
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.5.2
1
+ 0.6.0
@@ -0,0 +1,10 @@
1
+ module Lingua
2
+ module EN
3
+ module Paragraph
4
+ # Splits text into an array of paragraphs.
5
+ def self.paragraphs(text)
6
+ text.strip.split(/(?:\n[\r\t ]*)+/).collect { |p| p.strip }
7
+ end
8
+ end
9
+ end
10
+ end
@@ -1,143 +1,144 @@
1
1
  module Lingua
2
- module EN
3
-
4
- # The class Lingua::EN::Readability takes English text and analyses formal
5
- # characteristic
6
- class Readability
7
- require 'lingua/en/syllable'
8
- require 'lingua/en/sentence'
9
-
10
- attr_reader :text, :paragraphs, :sentences, :words, :frequencies
11
-
12
- # The constructor accepts the text to be analysed, and returns a report
13
- # object which gives access to the
14
- def initialize(text)
15
- @text = text.dup
16
- @paragraphs = text.split(/\n\s*\n\s*/)
17
- @sentences = Lingua::EN::Sentence.sentences(@text)
18
- @words = []
19
- @frequencies = {}
20
- @frequencies.default = 0
21
- @syllables = 0
22
- @complex_words = 0
23
- count_words
24
- end
25
-
26
- # The number of paragraphs in the sample. A paragraph is defined as a
27
- # newline followed by one or more empty or whitespace-only lines.
28
- def num_paragraphs
29
- @paragraphs.length
30
- end
31
-
32
- # The number of sentences in the sample. The meaning of a "sentence" is
33
- # defined by Lingua::EN::Sentence.
34
- def num_sentences
35
- @sentences.length
36
- end
37
-
38
- # The number of characters in the sample.
39
- def num_chars
40
- @text.length
41
- end
42
- alias :num_characters :num_chars
43
-
44
- # The total number of words used in the sample. Numbers as digits are not
45
- # counted.
46
- def num_words
47
- @words.length
48
- end
49
-
50
- # The total number of syllables in the text sample. Just for completeness.
51
- def num_syllables
52
- @syllables
53
- end
54
-
55
- # The number of different unique words used in the text sample.
56
- def num_unique_words
57
- @frequencies.keys.length
58
- end
59
-
60
- # An array containing each unique word used in the text sample.
61
- def unique_words
62
- @frequencies.keys
63
- end
64
-
65
- # The number of occurences of the word +word+ in the text sample.
66
- def occurrences(word)
67
- @frequencies[word]
68
- end
69
-
70
- # The average number of words per sentence.
71
- def words_per_sentence
72
- @words.length.to_f / @sentences.length.to_f
73
- end
74
-
75
- # The average number of syllables per word. The syllable count is performed
76
- # by Lingua::EN::Syllable, and so may not be completely accurate, especially
77
- # if the Carnegie-Mellon Pronouncing Dictionary is not installed.
78
- def syllables_per_word
79
- @syllables.to_f / @words.length.to_f
80
- end
81
-
82
- # Flesch-Kincaid level of the text sample. This measure scores text based
83
- # on the American school grade system; a score of 7.0 would indicate that
84
- # the text is readable by a seventh grader. A score of 7.0 to 8.0 is
85
- # regarded as optimal for ordinary text.
86
- def kincaid
87
- (11.8 * syllables_per_word) + (0.39 * words_per_sentence) - 15.59
88
- end
89
-
90
- # Flesch reading ease of the text sample. A higher score indicates text that
91
- # is easier to read. The score is on a 100-point scale, and a score of 60-70
92
- # is regarded as optimal for ordinary text.
93
- def flesch
94
- 206.835 - (1.015 * words_per_sentence) - (84.6 * syllables_per_word)
95
- end
96
-
97
- # The Gunning Fog Index of the text sample. The index indicates the number
98
- # of years of formal education that a reader of average intelligence would
99
- # need to comprehend the text. A higher score indicates harder text; a value
100
- # of around 12 is indicated as ideal for ordinary text.
101
- def fog
102
- ( words_per_sentence + percent_fog_complex_words ) * 0.4
103
- end
104
-
105
- # The percentage of words that are defined as "complex" for the purpose of
106
- # the Fog Index. This is non-hyphenated words of three or more syllabes.
107
- def percent_fog_complex_words
108
- ( @complex_words.to_f / @words.length.to_f ) * 100
109
- end
110
-
111
- # Return a nicely formatted report on the sample, showing most the useful
112
- # statistics about the text sample.
113
- def report
114
- sprintf "Number of paragraphs %d \n" <<
115
- "Number of sentences %d \n" <<
116
- "Number of words %d \n" <<
117
- "Number of characters %d \n\n" <<
118
- "Average words per sentence %.2f \n" <<
119
- "Average syllables per word %.2f \n\n" <<
120
- "Flesch score %2.2f \n" <<
121
- "Flesh-Kincaid grade level %2.2f \n" <<
122
- "Fog Index %2.2f \n",
123
- num_paragraphs, num_sentences, num_words, num_characters,
124
- words_per_sentence, syllables_per_word,
125
- flesch, kincaid, fog
126
- end
127
-
128
- private
129
- def count_words
130
- for match in @text.scan /\b([a-z][a-z\-']*)\b/i
131
- word = match[0]
132
- @words.push word
133
- @frequencies[word] += 1
134
- syllables = Lingua::EN::Syllable.syllables(word)
135
- @syllables += syllables
136
- if syllables > 2 && word !~ /-/
137
- @complex_words += 1 # for Fog Index
138
- end
139
- end
140
- end
141
- end
142
- end
2
+ module EN
3
+ # The class Lingua::EN::Readability takes English text and analyses formal
4
+ # characteristic
5
+ class Readability
6
+ attr_reader :text, :paragraphs, :sentences, :words, :frequencies
7
+
8
+ # The constructor accepts the text to be analysed, and returns a report
9
+ # object which gives access to the readability statistics.
10
+ def initialize(text)
11
+ @text = text.dup
12
+ @paragraphs = Lingua::EN::Paragraph.paragraphs(self.text)
13
+ @sentences = Lingua::EN::Sentence.sentences(self.text)
14
+ @words = []
15
+ @frequencies = {}
16
+ @frequencies.default = 0
17
+ @syllables = 0
18
+ @complex_words = 0
19
+ count_words
20
+ end
21
+
22
+ # The number of paragraphs in the sample. Paragraphs are separated by one
23
+ # or more newlines, each optionally followed by whitespace.
24
+ def num_paragraphs
25
+ paragraphs.length
26
+ end
27
+
28
+ # The number of sentences in the sample. The meaning of a "sentence" is
29
+ # defined by Lingua::EN::Sentence.
30
+ def num_sentences
31
+ sentences.length
32
+ end
33
+
34
+ # The number of characters in the sample.
35
+ def num_chars
36
+ text.length
37
+ end
38
+ alias :num_characters :num_chars
39
+
40
+ # The total number of words used in the sample. Numbers as digits are not
41
+ # counted.
42
+ def num_words
43
+ words.length
44
+ end
45
+
46
+ # The total number of syllables in the text sample. Just for completeness.
47
+ def num_syllables
48
+ @syllables
49
+ end
50
+
51
+ # The number of different unique words used in the text sample.
52
+ def num_unique_words
53
+ @frequencies.keys.length
54
+ end
55
+
56
+ # An array containing each unique word used in the text sample.
57
+ def unique_words
58
+ @frequencies.keys
59
+ end
60
+
61
+ # The number of occurrences of the word +word+ in the text sample.
62
+ def occurrences(word)
63
+ @frequencies[word]
64
+ end
65
+
66
+ # The average number of words per sentence.
67
+ def words_per_sentence
68
+ words.length.to_f / sentences.length.to_f
69
+ end
70
+
71
+ # The average number of syllables per word. The syllable count is
72
+ # performed by Lingua::EN::Syllable, and so may not be completely
73
+ # accurate, especially if the Carnegie-Mellon Pronouncing Dictionary
74
+ # is not installed.
75
+ def syllables_per_word
76
+ @syllables.to_f / words.length.to_f
77
+ end
78
+
79
+ # Flesch-Kincaid level of the text sample. This measure scores text based
80
+ # on the American school grade system; a score of 7.0 would indicate that
81
+ # the text is readable by a seventh grader. A score of 7.0 to 8.0 is
82
+ # regarded as optimal for ordinary text.
83
+ def kincaid
84
+ (11.8 * syllables_per_word) + (0.39 * words_per_sentence) - 15.59
85
+ end
86
+
87
+ # Flesch reading ease of the text sample. A higher score indicates text
88
+ # that is easier to read. The score is on a 100-point scale, and a score
89
+ # of 60-70 is regarded as optimal for ordinary text.
90
+ def flesch
91
+ 206.835 - (1.015 * words_per_sentence) - (84.6 * syllables_per_word)
92
+ end
93
+
94
+ # The Gunning Fog Index of the text sample. The index indicates the number
95
+ # of years of formal education that a reader of average intelligence would
96
+ # need to comprehend the text. A higher score indicates harder text; a
97
+ # value of around 12 is indicated as ideal for ordinary text.
98
+ def fog
99
+ ( words_per_sentence + percent_fog_complex_words ) * 0.4
100
+ end
101
+
102
+ # The percentage of words that are defined as "complex" for the purpose of
103
+ # the Fog Index. This is non-hyphenated words of three or more syllables.
104
+ def percent_fog_complex_words
105
+ ( @complex_words.to_f / words.length.to_f ) * 100
106
+ end
107
+
108
+ # Return a nicely formatted report on the sample, showing most of the useful
109
+ # statistics about the text sample.
110
+ def report
111
+ sprintf "Number of paragraphs %d \n" <<
112
+ "Number of sentences %d \n" <<
113
+ "Number of words %d \n" <<
114
+ "Number of characters %d \n\n" <<
115
+ "Average words per sentence %.2f \n" <<
116
+ "Average syllables per word %.2f \n\n" <<
117
+ "Flesch score %2.2f \n" <<
118
+ "Flesh-Kincaid grade level %2.2f \n" <<
119
+ "Fog Index %2.2f \n",
120
+ num_paragraphs, num_sentences, num_words, num_characters,
121
+ words_per_sentence, syllables_per_word,
122
+ flesch, kincaid, fog
123
+ end
124
+
125
+ private
126
+ def count_words
127
+ @text.scan(/\b([a-z][a-z\-']*)\b/i).each do |match|
128
+ word = match[0]
129
+ @words << word
130
+
131
+ # up frequency counts
132
+ @frequencies[word] += 1
133
+
134
+ # syllable counts
135
+ syllables = Lingua::EN::Syllable.syllables(word)
136
+ @syllables += syllables
137
+ if syllables > 2 && !word.include?('-')
138
+ @complex_words += 1 # for Fog Index
139
+ end
140
+ end
141
+ end
142
+ end
143
+ end
143
144
  end
@@ -1,62 +1,87 @@
1
1
  module Lingua
2
2
  module EN
3
- # The module Lingua::EN::Sentence takes English text, and attempts to
3
+ # The class Lingua::EN::Sentence takes English text, and attempts to
4
4
  # split it up into sentences, respecting abbreviations.
5
5
 
6
- module Sentence
6
+ class Sentence
7
+ class << self
8
+ attr_reader :abbreviations
9
+ attr_reader :abbr_regex
10
+ end
11
+
7
12
  EOS = "\001" unless defined?(EOS) # temporary end of sentence marker
8
13
 
9
14
  Titles = [ 'jr', 'mr', 'mrs', 'ms', 'dr', 'prof', 'sr', 'sen', 'rep',
10
15
  'rev', 'gov', 'atty', 'supt', 'det', 'rev', 'col','gen', 'lt',
11
16
  'cmdr', 'adm', 'capt', 'sgt', 'cpl', 'maj' ] unless defined?(Titles)
12
-
13
17
  Entities = [ 'dept', 'univ', 'uni', 'assn', 'bros', 'inc', 'ltd', 'co',
14
18
  'corp', 'plc' ] unless defined?(Entities)
15
-
16
19
  Months = [ 'jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
17
20
  'aug', 'sep', 'oct', 'nov', 'dec', 'sept' ] unless defined?(Months)
18
-
19
21
  Days = [ 'mon', 'tue', 'wed', 'thu',
20
22
  'fri', 'sat', 'sun' ] unless defined?(Days)
21
-
22
23
  Misc = [ 'vs', 'etc', 'no', 'esp', 'cf' ] unless defined?(Misc)
23
-
24
24
  Streets = [ 'ave', 'bld', 'blvd', 'cl', 'ct',
25
25
  'cres', 'dr', 'rd', 'st' ] unless defined?(Streets)
26
26
 
27
- @@abbreviations = Titles + Entities + Months + Days + Streets + Misc
27
+
28
+ # Finds abbreviations, like e.g., i.e., U.S., u.S., U.S.S.R.
29
+ ABBR_DETECT = /(?:\s(?:(?:(?:\w\.){2,}\w?)|(?:\w\.\w)))/ unless defined?(ABBR_DETECT)
30
+
31
+ # Finds punctuation that ends sentences.
32
+ PUNCTUATION_DETECT = /([\.?!](?:\"|\'|\)|\]|\})?)(\s+)/ unless defined?(PUNCTUATION_DETECT)
33
+
34
+ CORRECT_ABBR = /(#{ABBR_DETECT})#{EOS}(\s+[a-z0-9])/
28
35
 
29
36
  # Split the passed text into individual sentences, trim these and return
30
37
  # as an array. A sentence is marked by one of the punctuation marks ".", "?"
31
38
  # or "!" followed by whitespace. Sequences of full stops (such as an
32
39
  # ellipsis marker "..." and stops after a known abbreviation are ignored.
33
40
  def self.sentences(text)
34
-
41
+ # Make sure we work with a duplicate, as we are modifying the
42
+ # text with #gsub!
35
43
  text = text.dup
36
44
 
37
- # initial split after punctuation - have to preserve trailing whitespace
38
- # for the ellipsis correction next
39
- # would be nicer to use look-behind and look-ahead assertions to skip
40
- # ellipsis marks, but Ruby doesn't support look-behind
41
- text.gsub!( /([\.?!](?:\"|\'|\)|\]|\})?)(\s+)/ ) { $1 << EOS << $2 }
45
+ # Mark end of sentences with EOS marker.
46
+ # We preserve the trailing whitespace ($2) so that we can
47
+ # fix ellipses (...)!
48
+ text.gsub!(PUNCTUATION_DETECT) { $1 << EOS << $2 }
42
49
 
43
- # correct ellipsis marks and rows of stops
44
- text.gsub!( /(\.\.\.*)#{EOS}/ ) { $1 }
50
+ # Correct ellipsis marks.
51
+ text.gsub!(/(\.\.\.*)#{EOS}/) { $1 }
45
52
 
46
- # correct abbreviations
47
- # TODO - precompile this regex?
48
- text.gsub!( /(#{@@abbreviations.join("|")})\.#{EOS}/i ) { $1 << '.' }
53
+ # Correct e.g., i.e. marks.
54
+ text.gsub!(CORRECT_ABBR, "\\1\\2")
49
55
 
50
- # split on EOS marker, strip gets rid of trailing whitespace
51
- text.split(EOS).map { | sentence | sentence.strip }
56
+ # Correct abbreviations
57
+ text.gsub!(@abbr_regex) { $1 << '.' }
58
+
59
+ # Split on EOS marker, get rid of trailing whitespace.
60
+ # Remove empty sentences.
61
+ text.split(EOS).
62
+ map { |sentence| sentence.strip }.
63
+ delete_if { |sentence| sentence.nil? || sentence.empty? }
52
64
  end
53
65
 
54
- # add a list of abbreviations to the list that's used to detect false
66
+ # Adds a list of abbreviations to the list that's used to detect false
55
67
  # sentence ends. Return the current list of abbreviations in use.
56
68
  def self.abbreviation(*abbreviations)
57
- @@abbreviations += abbreviations
58
- @@abbreviations
69
+ @abbreviations += abbreviations
70
+ @abbreviations.uniq!
71
+ set_abbr_regex!
72
+ @abbreviations
73
+ end
74
+
75
+ def self.initialize_abbreviations!
76
+ @abbreviations = Titles + Entities + Months + Days + Streets + Misc
77
+ set_abbr_regex!
78
+ end
79
+
80
+ def self.set_abbr_regex!
81
+ @abbr_regex = /(#{abbreviations.join("|")})\.#{EOS}/i
59
82
  end
83
+
84
+ initialize_abbreviations!
60
85
  end
61
86
  end
62
87
  end
@@ -1,33 +1,39 @@
1
1
  module Lingua
2
2
  module EN
3
3
  module Syllable
4
- # Uses english word patterns to guess the number of syllables. A single module
5
- # method is made available, +syllables+, which, when passed an english word,
6
- # will return the number of syllables it estimates are in the word.
7
- # English orthography (the representation of spoken sounds as written signs) is
8
- # not regular. The same spoken sound can be represented in multiple different
9
- # ways in written English (e.g. rough/cuff), and the same written letters
10
- # can be pronounced in different ways in different words (e.g. rough/bough).
11
- # As the same series of letters can be pronounced in different ways, it is not
12
- # possible to write an algorithm which can always guess the number of syllables
13
- # in an english word correctly. However, it is possible to use frequently
14
- # recurring patterns in english (such as "a final -e is usually silent") to
15
- # guess with a level of accuracy that is acceptable for applications like
16
- # syllable counting for readability scoring. This module implements such an
17
- # algorithm.
18
- # This module is inspired by the Perl Lingua::EN::Syllable module. However, it
19
- # uses a different (though not larger) set of patterns to compensate for the
20
- # 'special cases' which arise out of English's irregular orthography. A number
21
- # of extra patterns (particularly for derived word forms) means that this module
22
- # is somewhat more accurate than the Perl original. It also omits a number of
23
- # patterns found in the original which seem to me to apply to such a small number
24
- # of cases, or to be of dubious value. Testing the guesses against the Carnegie
25
- # Mellon Pronouncing Dictionary, this module guesses right around 90% of the
4
+ # Uses English word patterns to guess the number of syllables. A single
5
+ # module method is made available, +syllables+, which, when passed an
6
+ # English word, will return the number of syllables it estimates are in
7
+ # the word.
8
+ #
9
+ # English orthography (the representation of spoken sounds as written
10
+ # signs) is not regular. The same spoken sound can be represented in
11
+ # multiple different ways in written English (e.g. rough/cuff), and the
12
+ # same written letters can be pronounced in different ways in different
13
+ # words (e.g. rough/bough).
14
+ #
15
+ # As the same series of letters can be pronounced in different ways, it is
16
+ # not possible to write an algorithm which can always guess the number of
17
+ # syllables in an english word correctly. However, it is possible to use
18
+ # frequently recurring patterns in english (such as "a final -e is usually
19
+ # silent") to guess with a level of accuracy that is acceptable for
20
+ # applications like syllable counting for readability scoring. This module
21
+ # implements such an algorithm.
22
+ #
23
+ # This module is inspired by the Perl Lingua::EN::Syllable module.
24
+ # However, it uses a different (though not larger) set of patterns to
25
+ # compensate for the 'special cases' which arise out of English's
26
+ # irregular orthography. A number of extra patterns (particularly for
27
+ # derived word forms) means that this module is somewhat more accurate
28
+ # than the Perl original. It also omits a number of patterns found in the
29
+ # original which seem to me to apply to such a small number of cases, or
30
+ # to be of dubious value. Testing the guesses against the Carnegie Mellon
31
+ # Pronouncing Dictionary, this module guesses right around 90% of the
26
32
  # time, as against about 85% of the time for the Perl module. However, the
27
- # dictionary contains a large number of foreign loan words and proper names, and
28
- # so when the algorithm is tested against 'real world' english, its accuracy
29
- # is a good deal better. Testing against a range of samples, it guesses right
30
- # about 95-97% of the time.
33
+ # dictionary contains a large number of foreign loan words and proper
34
+ # names, and so when the algorithm is tested against 'real world' english,
35
+ # its accuracy is a good deal better. Testing against a range of samples,
36
+ # it guesses right about 95-97% of the time.
31
37
  module Guess
32
38
  # special cases - 1 syllable less than expected
33
39
  SubSyl = [
@@ -2,18 +2,18 @@ require 'lingua/en/syllable/guess'
2
2
 
3
3
  module Lingua
4
4
  module EN
5
- # The module Lingua::EN::Syllable contains a single class method, +syllable+,
6
- # which will use the most accurate technique available to determine the number
7
- # syllables in a string containing a word passed to it.
5
+ # The module Lingua::EN::Syllable contains a single class method,
6
+ # +syllable+, which will use the most accurate technique available to
7
+ # determine the number of syllables in a string containing a word passed to it.
8
+ #
9
+ ########## REMOVED BY dbalatero:
8
10
  # The exact definition of the function depends on the availability of the
9
- # Carnegie Mellon Pronouncing Dictionary on the system. If it is available,
10
- # the number of syllables as determined by the dictionary will be returned. If
11
- # the dictionary is not available, or if a word not contained in the dictionary
12
- # is passed, it will return the number of syllables as determined by the
13
- # module Lingua::EN::Syllable::Guess. For more details, see there and
11
+ # Carnegie Mellon Pronouncing Dictionary on the system. If it is available,
12
+ # the number of syllables as determined by the dictionary will be returned.
13
+ # If the dictionary is not available, or if a word not contained in the
14
+ # dictionary is passed, it will return the number of syllables as determined
15
+ # by the module Lingua::EN::Syllable::Guess. For more details, see there and
14
16
  # Lingua::EN::Syllable::Dictionary.
15
- #
16
- # dbalatero: removed dictionary.
17
17
  module Syllable
18
18
  def self.syllables(word)
19
19
  Guess::syllables word
@@ -0,0 +1,29 @@
1
+ require File.dirname(__FILE__) + "/../../spec_helper"
2
+
3
+ describe Lingua::EN::Paragraph do
4
+ describe "#paragraphs" do
5
+ it "should return paragraphs with extra whitespace in the line breaks" do
6
+ text = "Ok.\n \nTest."
7
+ result = Lingua::EN::Paragraph.paragraphs(text)
8
+ result.should have(2).things
9
+ result[0].should == "Ok."
10
+ result[1].should == "Test."
11
+ end
12
+
13
+ it "should break up paragraphs with > 2 line breaks" do
14
+ text = "Ok.\n\n\nTest."
15
+ result = Lingua::EN::Paragraph.paragraphs(text)
16
+ result.should have(2).things
17
+ result[0].should == "Ok."
18
+ result[1].should == "Test."
19
+ end
20
+
21
+ it "should ignore trailing newline chars" do
22
+ text = "Ok.\n \n\nTest.\n \r\n \n\n"
23
+ result = Lingua::EN::Paragraph.paragraphs(text)
24
+ result.should have(2).things
25
+ result[0].should == "Ok."
26
+ result[1].should == "Test."
27
+ end
28
+ end
29
+ end
@@ -0,0 +1,29 @@
1
+ require File.dirname(__FILE__) + "/../../spec_helper"
2
+
3
+ describe Lingua::EN::Readability do
4
+ before(:each) do
5
+ @text = <<-EOF
6
+ After marriage, the next big event in the couples lives will be their honeymoon. It is a time when the newly weds can get away from relatives and friends to spend some significant time getting to know one another. This time alone together that the couple shares is called the honeymoon. A great gift idea for the married couple would be to give them a surprise tour package. Most women would like to go on a honeymoon.
7
+
8
+ The week or two before the ceremonies would be the best time to schedule a tour because then the budget for this event could be considered. In winter there are more opportunities for the couple to get close to one another because of the cold weather. It is easier to snuggle when the weather is not favorable to outdoor activities. This would afford the couple ample time to know more about themselves during the honeymoon.
9
+
10
+ Honeymoon plans should be discussed with the wife to ensure that the shock is pleasant and not a negative experience to her. It is also a good idea in this case, to ask her probing questions as to where she would like to go. Perhaps you could get a friend or family member to ask her what would be her favorite travel location. That would ensure that you know just what she is looking for.
11
+
12
+ Make sure that the trip is exactly what she wants. Then on the wedding night tell her about the adventure so that the needed accommodations can be made.
13
+ EOF
14
+
15
+ @report = Lingua::EN::Readability.new(@text)
16
+ end
17
+
18
+ describe "#num_paragraphs" do
19
+ it "should return the correct count of paragraphs" do
20
+ @report.num_paragraphs.should == 4
21
+ end
22
+ end
23
+
24
+ describe "#num_sentences" do
25
+ it "should be the correct count of sentences" do
26
+ @report.num_sentences.should == 15
27
+ end
28
+ end
29
+ end
@@ -0,0 +1,136 @@
1
+ require File.dirname(__FILE__) + "/../../spec_helper"
2
+
3
+ describe Lingua::EN::Sentence do
4
+ klass = Lingua::EN::Sentence
5
+
6
+ describe "#sentences" do
7
+ describe "multi-paragraph text" do
8
+ before(:each) do
9
+ text = "As Milton Bradley once said, \"board games are the shit.\" And I'm inclined to agree. \"Why can't we be friends?\"\n\n"
10
+ text << "Visit http://www.google.com and check out my site. Thanks very much!"
11
+ @sentences = klass.sentences(text)
12
+ end
13
+
14
+ it "should get the correct number of sentences" do
15
+ @sentences.should have(5).things
16
+ end
17
+
18
+ it "should get the correct sentences" do
19
+ @sentences[0].should == "As Milton Bradley once said, \"board games are the shit.\""
20
+ @sentences[1].should == "And I'm inclined to agree."
21
+ @sentences[2].should == "\"Why can't we be friends?\""
22
+ @sentences[3].should == "Visit http://www.google.com and check out my site."
23
+ @sentences[4].should == "Thanks very much!"
24
+ end
25
+ end
26
+
27
+ describe "quoted sentences" do
28
+ before(:each) do
29
+ text = "As Milton Bradley once said, \"board games are the shit.\" And I'm inclined to agree. \"Why can't we be friends?\""
30
+ @sentences = klass.sentences(text)
31
+ end
32
+
33
+ it "should get the correct number of sentences" do
34
+ @sentences.should have(3).things
35
+ end
36
+
37
+ it "should get the correct sentences" do
38
+ @sentences[0].should == "As Milton Bradley once said, \"board games are the shit.\""
39
+ @sentences[1].should == "And I'm inclined to agree."
40
+ @sentences[2].should == "\"Why can't we be friends?\""
41
+ end
42
+ end
43
+
44
+ describe "ellipses correction" do
45
+ before(:each) do
46
+ text = "Well... why would you do that? Let's not fight."
47
+ @sentences = klass.sentences(text)
48
+ end
49
+
50
+ it "should get the correct number of sentences" do
51
+ @sentences.should have(2).things
52
+ end
53
+
54
+ it "should get the right sentences" do
55
+ @sentences[0].should == "Well... why would you do that?"
56
+ @sentences[1].should == "Let's not fight."
57
+ end
58
+ end
59
+
60
+ describe "simple URL matching" do
61
+ before(:each) do
62
+ text = "Hello, visit http://www.google.com/index.php?ok=ok for more info. Ok?"
63
+ @sentences = klass.sentences(text)
64
+ end
65
+
66
+ it "should get the correct number of sentences" do
67
+ @sentences.should have(2).things
68
+ end
69
+
70
+ it "should get the right sentences" do
71
+ @sentences[0].should == "Hello, visit http://www.google.com/index.php?ok=ok for more info."
72
+ @sentences[1].should == "Ok?"
73
+ end
74
+ end
75
+
76
+ describe "ending a sentence with an abbreviation" do
77
+ before(:each) do
78
+ text = "I was born in the U.S.S.R. My parents were from the U.S. This is not weird."
79
+ @sentences = klass.sentences(text)
80
+ end
81
+
82
+ it "should get the correct number of sentences" do
83
+ @sentences.should have(3).things
84
+ end
85
+
86
+ it "should get the correct sentences" do
87
+ @sentences[0].should == "I was born in the U.S.S.R."
88
+ @sentences[1].should == "My parents were from the U.S."
89
+ @sentences[2].should == "This is not weird."
90
+ end
91
+ end
92
+
93
+ describe "basic sentences" do
94
+ before(:each) do
95
+ text = "Hello, my name is David. What is your name?"
96
+ @sentences = klass.sentences(text)
97
+ end
98
+
99
+ it "should get the correct number of sentences" do
100
+ @sentences.should have(2).things
101
+ end
102
+ end
103
+
104
+ describe "sentences with URLs and abbreviation" do
105
+ before(:each) do
106
+ text = "Many of these leading names now have their own website, e.g. http://www.kaptest.com/. Hello, e.g. you don't know what you mean. I'm so angry about what you said about the U.S.A. or the u.S. or the U.S.S.R. ok."
107
+ @sentences = klass.sentences(text)
108
+ end
109
+
110
+ it "should get the correct number of sentences" do
111
+ @sentences[0].should == "Many of these leading names now have their own website, e.g. http://www.kaptest.com/."
112
+ @sentences[1].should == "Hello, e.g. you don't know what you mean."
113
+ @sentences[2].should == "I'm so angry about what you said about the U.S.A. or the u.S. or the U.S.S.R. ok."
114
+ @sentences.should have(3).things
115
+ end
116
+ end
117
+ end
118
+
119
+ describe "#abbreviation" do
120
+ it "should change the abbreviations list" do
121
+ klass.abbreviation('monkey', 'pig')
122
+ klass.abbreviations.should include('monkey')
123
+ klass.abbreviations.should include('pig')
124
+ end
125
+
126
+ it "should change the regex for abbreviations" do
127
+ lambda {
128
+ klass.abbreviation('monkey')
129
+ }.should change(klass, :abbr_regex)
130
+ end
131
+
132
+ after(:each) do
133
+ klass.initialize_abbreviations!
134
+ end
135
+ end
136
+ end
metadata CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
4
4
  prerelease: false
5
5
  segments:
6
6
  - 0
7
- - 5
8
- - 2
9
- version: 0.5.2
7
+ - 6
8
+ - 0
9
+ version: 0.6.0
10
10
  platform: ruby
11
11
  authors:
12
12
  - David Balatero
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2010-04-11 00:00:00 -07:00
17
+ date: 2010-04-20 00:00:00 -07:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
@@ -48,10 +48,14 @@ files:
48
48
  - Rakefile
49
49
  - VERSION
50
50
  - lib/lingua.rb
51
+ - lib/lingua/en/paragraph.rb
51
52
  - lib/lingua/en/readability.rb
52
53
  - lib/lingua/en/sentence.rb
53
54
  - lib/lingua/en/syllable.rb
54
55
  - lib/lingua/en/syllable/guess.rb
56
+ - spec/lingua/en/paragraph_spec.rb
57
+ - spec/lingua/en/readability_spec.rb
58
+ - spec/lingua/en/sentence_spec.rb
55
59
  - spec/spec.opts
56
60
  - spec/spec_helper.rb
57
61
  has_rdoc: true
@@ -85,4 +89,7 @@ signing_key:
85
89
  specification_version: 3
86
90
  summary: This is a maintained version of Ruby's Lingua port.
87
91
  test_files:
92
+ - spec/lingua/en/paragraph_spec.rb
93
+ - spec/lingua/en/readability_spec.rb
94
+ - spec/lingua/en/sentence_spec.rb
88
95
  - spec/spec_helper.rb