lingua 0.5.2 → 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +2 -0
- data/VERSION +1 -1
- data/lib/lingua/en/paragraph.rb +10 -0
- data/lib/lingua/en/readability.rb +142 -141
- data/lib/lingua/en/sentence.rb +49 -24
- data/lib/lingua/en/syllable/guess.rb +32 -26
- data/lib/lingua/en/syllable.rb +10 -10
- data/spec/lingua/en/paragraph_spec.rb +29 -0
- data/spec/lingua/en/readability_spec.rb +29 -0
- data/spec/lingua/en/sentence_spec.rb +136 -0
- metadata +11 -4
data/README.rdoc
CHANGED
@@ -4,6 +4,8 @@ This library is originally from http://pressure.to/ruby, by Alex Fenton <alex@pr
|
|
4
4
|
|
5
5
|
It is currently maintained by David Balatero <dbalatero@gmail.com>.
|
6
6
|
|
7
|
+
Slowly but surely, specs are being added (original codebase was not tested), as well as better functionality.
|
8
|
+
|
7
9
|
== Note on Patches/Pull Requests
|
8
10
|
|
9
11
|
* Fork the project.
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.
|
1
|
+
0.6.0
|
@@ -1,143 +1,144 @@
|
|
1
1
|
module Lingua
|
2
|
-
module EN
|
3
|
-
|
4
|
-
#
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
end
|
142
|
-
end
|
2
|
+
module EN
|
3
|
+
# The class Lingua::EN::Readability takes English text and analyses formal
|
4
|
+
# characteristic
|
5
|
+
class Readability
|
6
|
+
attr_reader :text, :paragraphs, :sentences, :words, :frequencies
|
7
|
+
|
8
|
+
# The constructor accepts the text to be analysed, and returns a report
|
9
|
+
# object which gives access to the
|
10
|
+
def initialize(text)
|
11
|
+
@text = text.dup
|
12
|
+
@paragraphs = Lingua::EN::Paragraph.paragraphs(self.text)
|
13
|
+
@sentences = Lingua::EN::Sentence.sentences(self.text)
|
14
|
+
@words = []
|
15
|
+
@frequencies = {}
|
16
|
+
@frequencies.default = 0
|
17
|
+
@syllables = 0
|
18
|
+
@complex_words = 0
|
19
|
+
count_words
|
20
|
+
end
|
21
|
+
|
22
|
+
# The number of paragraphs in the sample. A paragraph is defined as a
|
23
|
+
# newline followed by one or more empty or whitespace-only lines.
|
24
|
+
def num_paragraphs
|
25
|
+
paragraphs.length
|
26
|
+
end
|
27
|
+
|
28
|
+
# The number of sentences in the sample. The meaning of a "sentence" is
|
29
|
+
# defined by Lingua::EN::Sentence.
|
30
|
+
def num_sentences
|
31
|
+
sentences.length
|
32
|
+
end
|
33
|
+
|
34
|
+
# The number of characters in the sample.
|
35
|
+
def num_chars
|
36
|
+
text.length
|
37
|
+
end
|
38
|
+
alias :num_characters :num_chars
|
39
|
+
|
40
|
+
# The total number of words used in the sample. Numbers as digits are not
|
41
|
+
# counted.
|
42
|
+
def num_words
|
43
|
+
words.length
|
44
|
+
end
|
45
|
+
|
46
|
+
# The total number of syllables in the text sample. Just for completeness.
|
47
|
+
def num_syllables
|
48
|
+
@syllables
|
49
|
+
end
|
50
|
+
|
51
|
+
# The number of different unique words used in the text sample.
|
52
|
+
def num_unique_words
|
53
|
+
@frequencies.keys.length
|
54
|
+
end
|
55
|
+
|
56
|
+
# An array containing each unique word used in the text sample.
|
57
|
+
def unique_words
|
58
|
+
@frequencies.keys
|
59
|
+
end
|
60
|
+
|
61
|
+
# The number of occurrences of the word +word+ in the text sample.
|
62
|
+
def occurrences(word)
|
63
|
+
@frequencies[word]
|
64
|
+
end
|
65
|
+
|
66
|
+
# The average number of words per sentence.
|
67
|
+
def words_per_sentence
|
68
|
+
words.length.to_f / sentences.length.to_f
|
69
|
+
end
|
70
|
+
|
71
|
+
# The average number of syllables per word. The syllable count is
|
72
|
+
# performed by Lingua::EN::Syllable, and so may not be completely
|
73
|
+
# accurate, especially if the Carnegie-Mellon Pronouncing Dictionary
|
74
|
+
# is not installed.
|
75
|
+
def syllables_per_word
|
76
|
+
@syllables.to_f / words.length.to_f
|
77
|
+
end
|
78
|
+
|
79
|
+
# Flesch-Kincaid level of the text sample. This measure scores text based
|
80
|
+
# on the American school grade system; a score of 7.0 would indicate that
|
81
|
+
# the text is readable by a seventh grader. A score of 7.0 to 8.0 is
|
82
|
+
# regarded as optimal for ordinary text.
|
83
|
+
def kincaid
|
84
|
+
(11.8 * syllables_per_word) + (0.39 * words_per_sentence) - 15.59
|
85
|
+
end
|
86
|
+
|
87
|
+
# Flesch reading ease of the text sample. A higher score indicates text
|
88
|
+
# that is easier to read. The score is on a 100-point scale, and a score
|
89
|
+
# of 60-70 is regarded as optimal for ordinary text.
|
90
|
+
def flesch
|
91
|
+
206.835 - (1.015 * words_per_sentence) - (84.6 * syllables_per_word)
|
92
|
+
end
|
93
|
+
|
94
|
+
# The Gunning Fog Index of the text sample. The index indicates the number
|
95
|
+
# of years of formal education that a reader of average intelligence would
|
96
|
+
# need to comprehend the text. A higher score indicates harder text; a
|
97
|
+
# value of around 12 is indicated as ideal for ordinary text.
|
98
|
+
def fog
|
99
|
+
( words_per_sentence + percent_fog_complex_words ) * 0.4
|
100
|
+
end
|
101
|
+
|
102
|
+
# The percentage of words that are defined as "complex" for the purpose of
|
103
|
+
# the Fog Index. This is non-hyphenated words of three or more syllables.
|
104
|
+
def percent_fog_complex_words
|
105
|
+
( @complex_words.to_f / words.length.to_f ) * 100
|
106
|
+
end
|
107
|
+
|
108
|
+
# Return a nicely formatted report on the sample, showing most of the useful
|
109
|
+
# statistics about the text sample.
|
110
|
+
def report
|
111
|
+
sprintf "Number of paragraphs %d \n" <<
|
112
|
+
"Number of sentences %d \n" <<
|
113
|
+
"Number of words %d \n" <<
|
114
|
+
"Number of characters %d \n\n" <<
|
115
|
+
"Average words per sentence %.2f \n" <<
|
116
|
+
"Average syllables per word %.2f \n\n" <<
|
117
|
+
"Flesch score %2.2f \n" <<
|
118
|
+
"Flesh-Kincaid grade level %2.2f \n" <<
|
119
|
+
"Fog Index %2.2f \n",
|
120
|
+
num_paragraphs, num_sentences, num_words, num_characters,
|
121
|
+
words_per_sentence, syllables_per_word,
|
122
|
+
flesch, kincaid, fog
|
123
|
+
end
|
124
|
+
|
125
|
+
private
|
126
|
+
def count_words
|
127
|
+
@text.scan(/\b([a-z][a-z\-']*)\b/i).each do |match|
|
128
|
+
word = match[0]
|
129
|
+
@words << word
|
130
|
+
|
131
|
+
# up frequency counts
|
132
|
+
@frequencies[word] += 1
|
133
|
+
|
134
|
+
# syllable counts
|
135
|
+
syllables = Lingua::EN::Syllable.syllables(word)
|
136
|
+
@syllables += syllables
|
137
|
+
if syllables > 2 && !word.include?('-')
|
138
|
+
@complex_words += 1 # for Fog Index
|
139
|
+
end
|
140
|
+
end
|
141
|
+
end
|
142
|
+
end
|
143
|
+
end
|
143
144
|
end
|
data/lib/lingua/en/sentence.rb
CHANGED
@@ -1,62 +1,87 @@
|
|
1
1
|
module Lingua
|
2
2
|
module EN
|
3
|
-
# The
|
3
|
+
# The class Lingua::EN::Sentence takes English text, and attempts to
|
4
4
|
# split it up into sentences, respecting abbreviations.
|
5
5
|
|
6
|
-
|
6
|
+
class Sentence
|
7
|
+
class << self
|
8
|
+
attr_reader :abbreviations
|
9
|
+
attr_reader :abbr_regex
|
10
|
+
end
|
11
|
+
|
7
12
|
EOS = "\001" unless defined?(EOS) # temporary end of sentence marker
|
8
13
|
|
9
14
|
Titles = [ 'jr', 'mr', 'mrs', 'ms', 'dr', 'prof', 'sr', 'sen', 'rep',
|
10
15
|
'rev', 'gov', 'atty', 'supt', 'det', 'rev', 'col','gen', 'lt',
|
11
16
|
'cmdr', 'adm', 'capt', 'sgt', 'cpl', 'maj' ] unless defined?(Titles)
|
12
|
-
|
13
17
|
Entities = [ 'dept', 'univ', 'uni', 'assn', 'bros', 'inc', 'ltd', 'co',
|
14
18
|
'corp', 'plc' ] unless defined?(Entities)
|
15
|
-
|
16
19
|
Months = [ 'jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
|
17
20
|
'aug', 'sep', 'oct', 'nov', 'dec', 'sept' ] unless defined?(Months)
|
18
|
-
|
19
21
|
Days = [ 'mon', 'tue', 'wed', 'thu',
|
20
22
|
'fri', 'sat', 'sun' ] unless defined?(Days)
|
21
|
-
|
22
23
|
Misc = [ 'vs', 'etc', 'no', 'esp', 'cf' ] unless defined?(Misc)
|
23
|
-
|
24
24
|
Streets = [ 'ave', 'bld', 'blvd', 'cl', 'ct',
|
25
25
|
'cres', 'dr', 'rd', 'st' ] unless defined?(Streets)
|
26
26
|
|
27
|
-
|
27
|
+
|
28
|
+
# Finds abbreviations, like e.g., i.e., U.S., u.S., U.S.S.R.
|
29
|
+
ABBR_DETECT = /(?:\s(?:(?:(?:\w\.){2,}\w?)|(?:\w\.\w)))/ unless defined?(ABBR_DETECT)
|
30
|
+
|
31
|
+
# Finds punctuation that ends sentences.
|
32
|
+
PUNCTUATION_DETECT = /([\.?!](?:\"|\'|\)|\]|\})?)(\s+)/ unless defined?(PUNCTUATION_DETECT)
|
33
|
+
|
34
|
+
CORRECT_ABBR = /(#{ABBR_DETECT})#{EOS}(\s+[a-z0-9])/
|
28
35
|
|
29
36
|
# Split the passed text into individual sentences, trim these and return
|
30
37
|
# as an array. A sentence is marked by one of the punctuation marks ".", "?"
|
31
38
|
# or "!" followed by whitespace. Sequences of full stops (such as an
|
32
39
|
# ellipsis marker "...") and stops after a known abbreviation are ignored.
|
33
40
|
def self.sentences(text)
|
34
|
-
|
41
|
+
# Make sure we work with a duplicate, as we are modifying the
|
42
|
+
# text with #gsub!
|
35
43
|
text = text.dup
|
36
44
|
|
37
|
-
#
|
38
|
-
#
|
39
|
-
#
|
40
|
-
|
41
|
-
text.gsub!( /([\.?!](?:\"|\'|\)|\]|\})?)(\s+)/ ) { $1 << EOS << $2 }
|
45
|
+
# Mark end of sentences with EOS marker.
|
46
|
+
# We preserve the trailing whitespace ($2) so that we can
|
47
|
+
# fix ellipses (...)!
|
48
|
+
text.gsub!(PUNCTUATION_DETECT) { $1 << EOS << $2 }
|
42
49
|
|
43
|
-
#
|
44
|
-
text.gsub!(
|
50
|
+
# Correct ellipsis marks.
|
51
|
+
text.gsub!(/(\.\.\.*)#{EOS}/) { $1 }
|
45
52
|
|
46
|
-
#
|
47
|
-
|
48
|
-
text.gsub!( /(#{@@abbreviations.join("|")})\.#{EOS}/i ) { $1 << '.' }
|
53
|
+
# Correct e.g., i.e. marks.
|
54
|
+
text.gsub!(CORRECT_ABBR, "\\1\\2")
|
49
55
|
|
50
|
-
#
|
51
|
-
text.
|
56
|
+
# Correct abbreviations
|
57
|
+
text.gsub!(@abbr_regex) { $1 << '.' }
|
58
|
+
|
59
|
+
# Split on EOS marker, get rid of trailing whitespace.
|
60
|
+
# Remove empty sentences.
|
61
|
+
text.split(EOS).
|
62
|
+
map { |sentence| sentence.strip }.
|
63
|
+
delete_if { |sentence| sentence.nil? || sentence.empty? }
|
52
64
|
end
|
53
65
|
|
54
|
-
#
|
66
|
+
# Adds a list of abbreviations to the list that's used to detect false
|
55
67
|
# sentence ends. Return the current list of abbreviations in use.
|
56
68
|
def self.abbreviation(*abbreviations)
|
57
|
-
|
58
|
-
|
69
|
+
@abbreviations += abbreviations
|
70
|
+
@abbreviations.uniq!
|
71
|
+
set_abbr_regex!
|
72
|
+
@abbreviations
|
73
|
+
end
|
74
|
+
|
75
|
+
def self.initialize_abbreviations!
|
76
|
+
@abbreviations = Titles + Entities + Months + Days + Streets + Misc
|
77
|
+
set_abbr_regex!
|
78
|
+
end
|
79
|
+
|
80
|
+
def self.set_abbr_regex!
|
81
|
+
@abbr_regex = /(#{abbreviations.join("|")})\.#{EOS}/i
|
59
82
|
end
|
83
|
+
|
84
|
+
initialize_abbreviations!
|
60
85
|
end
|
61
86
|
end
|
62
87
|
end
|
@@ -1,33 +1,39 @@
|
|
1
1
|
module Lingua
|
2
2
|
module EN
|
3
3
|
module Syllable
|
4
|
-
# Uses
|
5
|
-
# method is made available, +syllables+, which, when passed an
|
6
|
-
# will return the number of syllables it estimates are in
|
7
|
-
#
|
8
|
-
#
|
9
|
-
#
|
10
|
-
#
|
11
|
-
#
|
12
|
-
#
|
13
|
-
#
|
14
|
-
#
|
15
|
-
#
|
16
|
-
#
|
17
|
-
#
|
18
|
-
#
|
19
|
-
#
|
20
|
-
#
|
21
|
-
#
|
22
|
-
#
|
23
|
-
#
|
24
|
-
#
|
25
|
-
#
|
4
|
+
# Uses English word patterns to guess the number of syllables. A single
|
5
|
+
# module method is made available, +syllables+, which, when passed an
|
6
|
+
# English word, will return the number of syllables it estimates are in
|
7
|
+
# the word.
|
8
|
+
#
|
9
|
+
# English orthography (the representation of spoken sounds as written
|
10
|
+
# signs) is not regular. The same spoken sound can be represented in
|
11
|
+
# multiple different ways in written English (e.g. rough/cuff), and the
|
12
|
+
# same written letters can be pronounced in different ways in different
|
13
|
+
# words (e.g. rough/bough).
|
14
|
+
#
|
15
|
+
# As the same series of letters can be pronounced in different ways, it is
|
16
|
+
# not possible to write an algorithm which can always guess the number of
|
17
|
+
# syllables in an english word correctly. However, it is possible to use
|
18
|
+
# frequently recurring patterns in english (such as "a final -e is usually
|
19
|
+
# silent") to guess with a level of accuracy that is acceptable for
|
20
|
+
# applications like syllable counting for readability scoring. This module
|
21
|
+
# implements such an algorithm.
|
22
|
+
#
|
23
|
+
# This module is inspired by the Perl Lingua::EN::Syllable module.
|
24
|
+
# However, it uses a different (though not larger) set of patterns to
|
25
|
+
# compensate for the 'special cases' which arise out of English's
|
26
|
+
# irregular orthography. A number of extra patterns (particularly for
|
27
|
+
# derived word forms) means that this module is somewhat more accurate
|
28
|
+
# than the Perl original. It also omits a number of patterns found in the
|
29
|
+
# original which seem to me to apply to such a small number of cases, or
|
30
|
+
# to be of dubious value. Testing the guesses against the Carnegie Mellon
|
31
|
+
# Pronouncing Dictionary, this module guesses right around 90% of the
|
26
32
|
# time, as against about 85% of the time for the Perl module. However, the
|
27
|
-
# dictionary contains a large number of foreign loan words and proper
|
28
|
-
# so when the algorithm is tested against 'real world' english,
|
29
|
-
# is a good deal better. Testing against a range of samples,
|
30
|
-
# about 95-97% of the time.
|
33
|
+
# dictionary contains a large number of foreign loan words and proper
|
34
|
+
# names, and so when the algorithm is tested against 'real world' english,
|
35
|
+
# its accuracy is a good deal better. Testing against a range of samples,
|
36
|
+
# it guesses right about 95-97% of the time.
|
31
37
|
module Guess
|
32
38
|
# special cases - 1 syllable less than expected
|
33
39
|
SubSyl = [
|
data/lib/lingua/en/syllable.rb
CHANGED
@@ -2,18 +2,18 @@ require 'lingua/en/syllable/guess'
|
|
2
2
|
|
3
3
|
module Lingua
|
4
4
|
module EN
|
5
|
-
# The module Lingua::EN::Syllable contains a single class method,
|
6
|
-
# which will use the most accurate technique available to
|
7
|
-
# syllables in a string containing a word passed to it.
|
5
|
+
# The module Lingua::EN::Syllable contains a single class method,
|
6
|
+
# +syllables+, which will use the most accurate technique available to
|
7
|
+
# determine the number of syllables in a string containing a word passed to it.
|
8
|
+
#
|
9
|
+
########## REMOVED BY dbalatero:
|
8
10
|
# The exact definition of the function depends on the availability of the
|
9
|
-
# Carnegie Mellon Pronouncing Dictionary on the system. If it is available,
|
10
|
-
# the number of syllables as determined by the dictionary will be returned.
|
11
|
-
# the dictionary is not available, or if a word not contained in the
|
12
|
-
# is passed, it will return the number of syllables as determined
|
13
|
-
# module Lingua::EN::Syllable::Guess. For more details, see there and
|
11
|
+
# Carnegie Mellon Pronouncing Dictionary on the system. If it is available,
|
12
|
+
# the number of syllables as determined by the dictionary will be returned.
|
13
|
+
# If the dictionary is not available, or if a word not contained in the
|
14
|
+
# dictionary is passed, it will return the number of syllables as determined
|
15
|
+
# by the module Lingua::EN::Syllable::Guess. For more details, see there and
|
14
16
|
# Lingua::EN::Syllable::Dictionary.
|
15
|
-
#
|
16
|
-
# dbalatero: removed dictionary.
|
17
17
|
module Syllable
|
18
18
|
def self.syllables(word)
|
19
19
|
Guess::syllables word
|
@@ -0,0 +1,29 @@
|
|
1
|
+
require File.dirname(__FILE__) + "/../../spec_helper"
|
2
|
+
|
3
|
+
describe Lingua::EN::Paragraph do
|
4
|
+
describe "#paragraphs" do
|
5
|
+
it "should return paragraphs with extra whitespace in the line breaks" do
|
6
|
+
text = "Ok.\n \nTest."
|
7
|
+
result = Lingua::EN::Paragraph.paragraphs(text)
|
8
|
+
result.should have(2).things
|
9
|
+
result[0].should == "Ok."
|
10
|
+
result[1].should == "Test."
|
11
|
+
end
|
12
|
+
|
13
|
+
it "should break up paragraphs with > 2 line breaks" do
|
14
|
+
text = "Ok.\n\n\nTest."
|
15
|
+
result = Lingua::EN::Paragraph.paragraphs(text)
|
16
|
+
result.should have(2).things
|
17
|
+
result[0].should == "Ok."
|
18
|
+
result[1].should == "Test."
|
19
|
+
end
|
20
|
+
|
21
|
+
it "should ignore trailing newline chars" do
|
22
|
+
text = "Ok.\n \n\nTest.\n \r\n \n\n"
|
23
|
+
result = Lingua::EN::Paragraph.paragraphs(text)
|
24
|
+
result.should have(2).things
|
25
|
+
result[0].should == "Ok."
|
26
|
+
result[1].should == "Test."
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
require File.dirname(__FILE__) + "/../../spec_helper"
|
2
|
+
|
3
|
+
describe Lingua::EN::Readability do
|
4
|
+
before(:each) do
|
5
|
+
@text = <<-EOF
|
6
|
+
After marriage, the next big event in the couples lives will be their honeymoon. It is a time when the newly weds can get away from relatives and friends to spend some significant time getting to know one another. This time alone together that the couple shares is called the honeymoon. A great gift idea for the married couple would be to give them a surprise tour package. Most women would like to go on a honeymoon.
|
7
|
+
|
8
|
+
The week or two before the ceremonies would be the best time to schedule a tour because then the budget for this event could be considered. In winter there are more opportunities for the couple to get close to one another because of the cold weather. It is easier to snuggle when the weather is not favorable to outdoor activities. This would afford the couple ample time to know more about themselves during the honeymoon.
|
9
|
+
|
10
|
+
Honeymoon plans should be discussed with the wife to ensure that the shock is pleasant and not a negative experience to her. It is also a good idea in this case, to ask her probing questions as to where she would like to go. Perhaps you could get a friend or family member to ask her what would be her favorite travel location. That would ensure that you know just what she is looking for.
|
11
|
+
|
12
|
+
Make sure that the trip is exactly what she wants. Then on the wedding night tell her about the adventure so that the needed accommodations can be made.
|
13
|
+
EOF
|
14
|
+
|
15
|
+
@report = Lingua::EN::Readability.new(@text)
|
16
|
+
end
|
17
|
+
|
18
|
+
describe "#num_paragraphs" do
|
19
|
+
it "should return the correct count of paragraphs" do
|
20
|
+
@report.num_paragraphs.should == 4
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
describe "#num_sentences" do
|
25
|
+
it "should be the correct count of sentences" do
|
26
|
+
@report.num_sentences.should == 15
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
@@ -0,0 +1,136 @@
|
|
1
|
+
require File.dirname(__FILE__) + "/../../spec_helper"
|
2
|
+
|
3
|
+
describe Lingua::EN::Sentence do
|
4
|
+
klass = Lingua::EN::Sentence
|
5
|
+
|
6
|
+
describe "#sentences" do
|
7
|
+
describe "multi-paragraph text" do
|
8
|
+
before(:each) do
|
9
|
+
text = "As Milton Bradley once said, \"board games are the shit.\" And I'm inclined to agree. \"Why can't we be friends?\"\n\n"
|
10
|
+
text << "Visit http://www.google.com and check out my site. Thanks very much!"
|
11
|
+
@sentences = klass.sentences(text)
|
12
|
+
end
|
13
|
+
|
14
|
+
it "should get the correct number of sentences" do
|
15
|
+
@sentences.should have(5).things
|
16
|
+
end
|
17
|
+
|
18
|
+
it "should get the correct sentences" do
|
19
|
+
@sentences[0].should == "As Milton Bradley once said, \"board games are the shit.\""
|
20
|
+
@sentences[1].should == "And I'm inclined to agree."
|
21
|
+
@sentences[2].should == "\"Why can't we be friends?\""
|
22
|
+
@sentences[3].should == "Visit http://www.google.com and check out my site."
|
23
|
+
@sentences[4].should == "Thanks very much!"
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
describe "quoted sentences" do
|
28
|
+
before(:each) do
|
29
|
+
text = "As Milton Bradley once said, \"board games are the shit.\" And I'm inclined to agree. \"Why can't we be friends?\""
|
30
|
+
@sentences = klass.sentences(text)
|
31
|
+
end
|
32
|
+
|
33
|
+
it "should get the correct number of sentences" do
|
34
|
+
@sentences.should have(3).things
|
35
|
+
end
|
36
|
+
|
37
|
+
it "should get the correct sentences" do
|
38
|
+
@sentences[0].should == "As Milton Bradley once said, \"board games are the shit.\""
|
39
|
+
@sentences[1].should == "And I'm inclined to agree."
|
40
|
+
@sentences[2].should == "\"Why can't we be friends?\""
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
44
|
+
describe "ellipses correction" do
|
45
|
+
before(:each) do
|
46
|
+
text = "Well... why would you do that? Let's not fight."
|
47
|
+
@sentences = klass.sentences(text)
|
48
|
+
end
|
49
|
+
|
50
|
+
it "should get the correct number of sentences" do
|
51
|
+
@sentences.should have(2).things
|
52
|
+
end
|
53
|
+
|
54
|
+
it "should get the right sentences" do
|
55
|
+
@sentences[0].should == "Well... why would you do that?"
|
56
|
+
@sentences[1].should == "Let's not fight."
|
57
|
+
end
|
58
|
+
end
|
59
|
+
|
60
|
+
describe "simple URL matching" do
|
61
|
+
before(:each) do
|
62
|
+
text = "Hello, visit http://www.google.com/index.php?ok=ok for more info. Ok?"
|
63
|
+
@sentences = klass.sentences(text)
|
64
|
+
end
|
65
|
+
|
66
|
+
it "should get the correct number of sentences" do
|
67
|
+
@sentences.should have(2).things
|
68
|
+
end
|
69
|
+
|
70
|
+
it "should get the right sentences" do
|
71
|
+
@sentences[0].should == "Hello, visit http://www.google.com/index.php?ok=ok for more info."
|
72
|
+
@sentences[1].should == "Ok?"
|
73
|
+
end
|
74
|
+
end
|
75
|
+
|
76
|
+
describe "ending a sentence with an abbreviation" do
|
77
|
+
before(:each) do
|
78
|
+
text = "I was born in the U.S.S.R. My parents were from the U.S. This is not weird."
|
79
|
+
@sentences = klass.sentences(text)
|
80
|
+
end
|
81
|
+
|
82
|
+
it "should get the correct number of sentences" do
|
83
|
+
@sentences.should have(3).things
|
84
|
+
end
|
85
|
+
|
86
|
+
it "should get the correct sentences" do
|
87
|
+
@sentences[0].should == "I was born in the U.S.S.R."
|
88
|
+
@sentences[1].should == "My parents were from the U.S."
|
89
|
+
@sentences[2].should == "This is not weird."
|
90
|
+
end
|
91
|
+
end
|
92
|
+
|
93
|
+
describe "basic sentences" do
|
94
|
+
before(:each) do
|
95
|
+
text = "Hello, my name is David. What is your name?"
|
96
|
+
@sentences = klass.sentences(text)
|
97
|
+
end
|
98
|
+
|
99
|
+
it "should get the correct number of sentences" do
|
100
|
+
@sentences.should have(2).things
|
101
|
+
end
|
102
|
+
end
|
103
|
+
|
104
|
+
describe "sentences with URLs and abbreviation" do
|
105
|
+
before(:each) do
|
106
|
+
text = "Many of these leading names now have their own website, e.g. http://www.kaptest.com/. Hello, e.g. you don't know what you mean. I'm so angry about what you said about the U.S.A. or the u.S. or the U.S.S.R. ok."
|
107
|
+
@sentences = klass.sentences(text)
|
108
|
+
end
|
109
|
+
|
110
|
+
it "should get the correct number of sentences" do
|
111
|
+
@sentences[0].should == "Many of these leading names now have their own website, e.g. http://www.kaptest.com/."
|
112
|
+
@sentences[1].should == "Hello, e.g. you don't know what you mean."
|
113
|
+
@sentences[2].should == "I'm so angry about what you said about the U.S.A. or the u.S. or the U.S.S.R. ok."
|
114
|
+
@sentences.should have(3).things
|
115
|
+
end
|
116
|
+
end
|
117
|
+
end
|
118
|
+
|
119
|
+
describe "#abbreviation" do
|
120
|
+
it "should change the abbreviations list" do
|
121
|
+
klass.abbreviation('monkey', 'pig')
|
122
|
+
klass.abbreviations.should include('monkey')
|
123
|
+
klass.abbreviations.should include('pig')
|
124
|
+
end
|
125
|
+
|
126
|
+
it "should change the regex for abbreviations" do
|
127
|
+
lambda {
|
128
|
+
klass.abbreviation('monkey')
|
129
|
+
}.should change(klass, :abbr_regex)
|
130
|
+
end
|
131
|
+
|
132
|
+
after(:each) do
|
133
|
+
klass.initialize_abbreviations!
|
134
|
+
end
|
135
|
+
end
|
136
|
+
end
|
metadata
CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
|
|
4
4
|
prerelease: false
|
5
5
|
segments:
|
6
6
|
- 0
|
7
|
-
-
|
8
|
-
-
|
9
|
-
version: 0.
|
7
|
+
- 6
|
8
|
+
- 0
|
9
|
+
version: 0.6.0
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- David Balatero
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2010-04-
|
17
|
+
date: 2010-04-20 00:00:00 -07:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
@@ -48,10 +48,14 @@ files:
|
|
48
48
|
- Rakefile
|
49
49
|
- VERSION
|
50
50
|
- lib/lingua.rb
|
51
|
+
- lib/lingua/en/paragraph.rb
|
51
52
|
- lib/lingua/en/readability.rb
|
52
53
|
- lib/lingua/en/sentence.rb
|
53
54
|
- lib/lingua/en/syllable.rb
|
54
55
|
- lib/lingua/en/syllable/guess.rb
|
56
|
+
- spec/lingua/en/paragraph_spec.rb
|
57
|
+
- spec/lingua/en/readability_spec.rb
|
58
|
+
- spec/lingua/en/sentence_spec.rb
|
55
59
|
- spec/spec.opts
|
56
60
|
- spec/spec_helper.rb
|
57
61
|
has_rdoc: true
|
@@ -85,4 +89,7 @@ signing_key:
|
|
85
89
|
specification_version: 3
|
86
90
|
summary: This is a maintained version of Ruby's Lingua port.
|
87
91
|
test_files:
|
92
|
+
- spec/lingua/en/paragraph_spec.rb
|
93
|
+
- spec/lingua/en/readability_spec.rb
|
94
|
+
- spec/lingua/en/sentence_spec.rb
|
88
95
|
- spec/spec_helper.rb
|