lingua 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/.gitignore +23 -0
- data/LICENSE +20 -0
- data/README.rdoc +17 -0
- data/Rakefile +45 -0
- data/VERSION +1 -0
- data/lib/lingua/en/readability.rb +143 -0
- data/lib/lingua/en/sentence.rb +62 -0
- data/lib/lingua/en/syllable/dict +0 -0
- data/lib/lingua/en/syllable/dictionary.rb +107 -0
- data/lib/lingua/en/syllable/guess.rb +92 -0
- data/lib/lingua/en/syllable.rb +39 -0
- data/lib/lingua.rb +6 -0
- data/spec/spec.opts +1 -0
- data/spec/spec_helper.rb +9 -0
- metadata +90 -0
data/.document
ADDED
data/.gitignore
ADDED
data/LICENSE
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
Copyright (c) 2009 David Balatero
|
|
2
|
+
|
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
|
4
|
+
a copy of this software and associated documentation files (the
|
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
|
9
|
+
the following conditions:
|
|
10
|
+
|
|
11
|
+
The above copyright notice and this permission notice shall be
|
|
12
|
+
included in all copies or substantial portions of the Software.
|
|
13
|
+
|
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.rdoc
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
= lingua
|
|
2
|
+
|
|
3
|
+
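A maintained version of Ruby's Lingua port. Provides sentence splitting, syllable counting, and text-quality (readability) algorithms for English text.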
|
|
4
|
+
|
|
5
|
+
== Note on Patches/Pull Requests
|
|
6
|
+
|
|
7
|
+
* Fork the project.
|
|
8
|
+
* Make your feature addition or bug fix.
|
|
9
|
+
* Add tests for it. This is important so I don't break it in a
|
|
10
|
+
future version unintentionally.
|
|
11
|
+
* Commit, do not mess with rakefile, version, or history.
|
|
12
|
+
(If you want to have your own version, that is fine, but bump the version in a commit by itself so that I can ignore it when I pull.)
|
|
13
|
+
* Send me a pull request. Bonus points for topic branches.
|
|
14
|
+
|
|
15
|
+
== Copyright
|
|
16
|
+
|
|
17
|
+
Copyright (c) 2010 David Balatero. See LICENSE for details.
|
data/Rakefile
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
# Build, test and documentation tasks for the lingua gem.
require 'rubygems'
require 'rake'

# Gem packaging tasks are provided by Jeweler when it is installed; without
# it the remaining tasks are still defined.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "lingua"
    gem.summary = %Q{This is a maintained version of Ruby's Lingua port.}
    gem.description = %Q{Provides sentence splitting, syllable, and text-quality algorithms.}
    gem.email = "dbalatero@gmail.com"
    gem.homepage = "http://github.com/dbalatero/lingua"
    gem.authors = ["David Balatero"]
    gem.add_development_dependency "rspec", ">= 1.2.9"
    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
end

# RSpec 1.x task definitions.
require 'spec/rake/spectask'
Spec::Rake::SpecTask.new(:spec) do |spec|
  spec.libs << 'lib' << 'spec'
  spec.spec_files = FileList['spec/**/*_spec.rb']
end

Spec::Rake::SpecTask.new(:rcov) do |spec|
  spec.libs << 'lib' << 'spec'
  spec.pattern = 'spec/**/*_spec.rb'
  spec.rcov = true
end

task :spec => :check_dependencies

task :default => :spec

# Newer Rake releases no longer ship rake/rdoctask; RDoc provides the
# replacement as rdoc/task. Prefer the new location and fall back to the old
# one so the Rakefile loads with either toolchain.
begin
  require 'rdoc/task'
  rdoc_task_class = RDoc::Task
rescue LoadError
  require 'rake/rdoctask'
  rdoc_task_class = Rake::RDocTask
end

rdoc_task_class.new do |rdoc|
  # strip the trailing newline of the VERSION file so it does not end up in
  # the generated title
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "lingua #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
|
data/VERSION
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
0.5.0
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
module Lingua
  module EN

    # The class Lingua::EN::Readability takes English text and analyses formal
    # characteristics of it: paragraph, sentence, word and syllable counts,
    # word frequencies, and the Flesch, Flesch-Kincaid and Gunning Fog
    # readability scores derived from them.
    class Readability
      require 'lingua/en/syllable'
      require 'lingua/en/sentence'

      attr_reader :text, :paragraphs, :sentences, :words, :frequencies

      # The constructor accepts the text to be analysed, and returns a report
      # object which gives access to the statistics computed from it. All
      # counting is done once, here; the accessor methods only read the
      # results.
      def initialize(text)
        @text          = text.dup
        @paragraphs    = text.split(/\n\s*\n\s*/)
        @sentences     = Lingua::EN::Sentence.sentences(@text)
        @words         = []
        @frequencies   = Hash.new(0) # unseen words report zero occurrences
        @syllables     = 0
        @complex_words = 0
        count_words
      end

      # The number of paragraphs in the sample. A paragraph is defined as a
      # newline followed by one or more empty or whitespace-only lines.
      def num_paragraphs
        @paragraphs.length
      end

      # The number of sentences in the sample. The meaning of a "sentence" is
      # defined by Lingua::EN::Sentence.
      def num_sentences
        @sentences.length
      end

      # The number of characters in the sample.
      def num_chars
        @text.length
      end
      alias num_characters num_chars

      # The total number of words used in the sample. Numbers as digits are not
      # counted.
      def num_words
        @words.length
      end

      # The total number of syllables in the text sample. Just for completeness.
      def num_syllables
        @syllables
      end

      # The number of different unique words used in the text sample. Words
      # are compared exactly as they appear, so differently-cased spellings
      # count as different words.
      def num_unique_words
        @frequencies.size
      end

      # An array containing each unique word used in the text sample.
      def unique_words
        @frequencies.keys
      end

      # The number of occurrences of the word +word+ in the text sample.
      def occurrences(word)
        @frequencies[word]
      end

      # The average number of words per sentence. The division is done in
      # floating point, so a sample without sentences yields a non-finite
      # value (NaN or Infinity) rather than raising.
      def words_per_sentence
        @words.length.to_f / @sentences.length.to_f
      end

      # The average number of syllables per word. The syllable count is performed
      # by Lingua::EN::Syllable, and so may not be completely accurate, especially
      # if the Carnegie-Mellon Pronouncing Dictionary is not installed.
      # A sample without words yields NaN (floating point division).
      def syllables_per_word
        @syllables.to_f / @words.length.to_f
      end

      # Flesch-Kincaid level of the text sample. This measure scores text based
      # on the American school grade system; a score of 7.0 would indicate that
      # the text is readable by a seventh grader. A score of 7.0 to 8.0 is
      # regarded as optimal for ordinary text.
      def kincaid
        (11.8 * syllables_per_word) + (0.39 * words_per_sentence) - 15.59
      end

      # Flesch reading ease of the text sample. A higher score indicates text that
      # is easier to read. The score is on a 100-point scale, and a score of 60-70
      # is regarded as optimal for ordinary text.
      def flesch
        206.835 - (1.015 * words_per_sentence) - (84.6 * syllables_per_word)
      end

      # The Gunning Fog Index of the text sample. The index indicates the number
      # of years of formal education that a reader of average intelligence would
      # need to comprehend the text. A higher score indicates harder text; a value
      # of around 12 is indicated as ideal for ordinary text.
      def fog
        (words_per_sentence + percent_fog_complex_words) * 0.4
      end

      # The percentage of words that are defined as "complex" for the purpose of
      # the Fog Index. This is non-hyphenated words of three or more syllables.
      def percent_fog_complex_words
        (@complex_words.to_f / @words.length.to_f) * 100
      end

      # Return a nicely formatted report on the sample, showing most of the
      # useful statistics about the text sample.
      def report
        # Adjacent literals are joined at parse time, so no string literal is
        # mutated while building the format.
        format(
          "Number of paragraphs %d \n" \
          "Number of sentences %d \n" \
          "Number of words %d \n" \
          "Number of characters %d \n\n" \
          "Average words per sentence %.2f \n" \
          "Average syllables per word %.2f \n\n" \
          "Flesch score %2.2f \n" \
          "Flesch-Kincaid grade level %2.2f \n" \
          "Fog Index %2.2f \n",
          num_paragraphs, num_sentences, num_words, num_characters,
          words_per_sentence, syllables_per_word,
          flesch, kincaid, fog
        )
      end

      private

      # Walk over every word of the text once, filling @words and
      # @frequencies and accumulating the syllable and complex-word totals.
      # A word starts with a letter and may continue with letters, hyphens
      # and apostrophes.
      def count_words
        @text.scan(/\b([a-z][a-z\-']*)\b/i).each do |match|
          word = match.first
          @words.push(word)
          @frequencies[word] += 1

          syllables = Lingua::EN::Syllable.syllables(word)
          @syllables += syllables

          # for Fog Index: three or more syllables and not hyphenated
          @complex_words += 1 if syllables > 2 && !word.include?('-')
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
module Lingua
  module EN
    # The module Lingua::EN::Sentence takes English text, and attempts to
    # split it up into sentences, respecting abbreviations.

    module Sentence
      EOS = "\001" unless defined?(EOS) # temporary end of sentence marker

      Titles = [ 'jr', 'mr', 'mrs', 'ms', 'dr', 'prof', 'sr', 'sen', 'rep',
                 'rev', 'gov', 'atty', 'supt', 'det', 'rev', 'col', 'gen', 'lt',
                 'cmdr', 'adm', 'capt', 'sgt', 'cpl', 'maj' ] unless defined?(Titles)

      Entities = [ 'dept', 'univ', 'uni', 'assn', 'bros', 'inc', 'ltd', 'co',
                   'corp', 'plc' ] unless defined?(Entities)

      Months = [ 'jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
                 'aug', 'sep', 'oct', 'nov', 'dec', 'sept' ] unless defined?(Months)

      Days = [ 'mon', 'tue', 'wed', 'thu',
               'fri', 'sat', 'sun' ] unless defined?(Days)

      Misc = [ 'vs', 'etc', 'no', 'esp', 'cf' ] unless defined?(Misc)

      Streets = [ 'ave', 'bld', 'blvd', 'cl', 'ct',
                  'cres', 'dr', 'rd', 'st' ] unless defined?(Streets)

      # Every abbreviation after which a full stop does not end a sentence.
      # Extended at run time through Sentence.abbreviation.
      @@abbreviations = Titles + Entities + Months + Days + Streets + Misc

      # Split the passed text into individual sentences, trim these and return
      # as an array. A sentence is marked by one of the punctuation marks ".", "?"
      # or "!" followed by whitespace. Sequences of full stops (such as an
      # ellipsis marker "...") and stops after a known abbreviation are ignored.
      #
      # The argument is not modified. Fragments that consist only of
      # whitespace (for instance the newline after the last full stop) are
      # dropped, so an empty or blank text yields an empty array.
      def self.sentences(text)
        text = text.dup

        # initial split after punctuation - have to preserve trailing whitespace
        # for the ellipsis correction next. The split is done by inserting a
        # marker and correcting it in later passes rather than with
        # look-around assertions.
        text.gsub!(/([\.?!](?:\"|\'|\)|\]|\})?)(\s+)/) { "#{$1}#{EOS}#{$2}" }

        # correct ellipsis marks and rows of stops
        text.gsub!(/(\.\.\.*)#{EOS}/) { $1 }

        # correct abbreviations. The leading \b makes sure only whole words
        # are treated as abbreviations; without it any word that merely ends
        # in one ("piano." ends in "no.") would swallow the sentence break.
        # TODO - precompile this regex?
        text.gsub!(/\b(#{@@abbreviations.join('|')})\.#{EOS}/i) { "#{$1}." }

        # split on EOS marker, strip gets rid of surrounding whitespace, and
        # whitespace-only leftovers are not sentences
        text.split(EOS).map { |sentence| sentence.strip }.reject { |sentence| sentence.empty? }
      end

      # add a list of abbreviations to the list that's used to detect false
      # sentence ends. Return the current list of abbreviations in use.
      # The entries are interpolated into a regular expression as given.
      def self.abbreviation(*abbreviations)
        @@abbreviations += abbreviations
        @@abbreviations
      end
    end
  end
end
|
|
Binary file
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
module Lingua
  module EN
    module Syllable

      # Syllable counting backed by a dbm file that maps upper-cased words to
      # their pronunciation, stored as phonemes joined by "-". Loading this
      # file raises LoadError when neither the gdbm nor the dbm library is
      # available.
      module Dictionary
        # Raised when a word is not present in the dictionary.
        class LookUpError < IndexError
        end

        @@dictionary = nil
        @@dbmclass = nil
        @@dbmext = nil

        # use an available dbm-style hash. LoadError is not a StandardError,
        # so it has to be named explicitly for the loop to move on to the
        # next candidate when a library is missing.
        ['gdbm', 'dbm'].each do |dbm|
          begin
            require dbm
            @@dbmclass = Object.const_get(dbm.upcase)
          rescue LoadError, NameError
            next
          end
          break
        end

        if @@dbmclass.nil?
          raise LoadError,
            "no dbm class available for Lingua::EN::Syllable::Dictionary"
        end

        # Look up word in the dbm dictionary and return its number of
        # syllables. The lookup is case-insensitive; when the word contains
        # apostrophes and is not found, it is retried once with the
        # apostrophes removed. Raises LookUpError when the word is unknown.
        def Dictionary.syllables(word)
          load_dictionary if @@dictionary.nil?

          word = word.upcase
          begin
            pronounce = @@dictionary.fetch(word)
          rescue IndexError
            if word =~ /'/
              # bounded: the second attempt no longer contains an apostrophe
              word = word.delete("'")
              retry
            end
            raise LookUpError, "word #{word} not in dictionary"
          end

          # one syllable per phoneme that starts with a vowel letter
          pronounce.split(/-/).grep(/^[AEIUO]/).length
        end

        # Return the dbm object holding the dictionary, opening it on first
        # use.
        def Dictionary.dictionary
          load_dictionary if @@dictionary.nil?
          @@dictionary
        end

        # convert a text file dictionary into dbm files. Returns the file names
        # of the created dbms (joined with +output_dir+).
        #
        # Everything already present in +output_dir+ is deleted first. If
        # reading the source fails, the partially written files are removed
        # and the error is re-raised.
        # NOTE(review): assumes lines of the form "WORD PHONEME PHONEME ..."
        # separated by one or more spaces - confirm against the source file.
        def Dictionary.make_dictionary(source_file, output_dir)
          Dir.mkdir(output_dir) unless File.directory?(output_dir)

          # clean old dictionary dbms
          remove_entries(output_dir)

          dbm = @@dbmclass.new(File.join(output_dir, 'dict'))

          begin
            File.foreach(source_file) do |line|
              next if line !~ /^[A-Z]/
              line.chomp!
              word, *phonemes = line.split(/ +/)
              next if word =~ /\(\d\) ?$/ # ignore alternative pronunciations
              dbm.store(word, phonemes.join("-"))
            end
          rescue
            # close and clean up, then let the caller see the error
            dbm.close
            remove_entries(output_dir)
            raise
          end

          dbm.close

          Dir.entries(output_dir).reject { |x| x =~ /^\.\.?$/ }.
            map { |x| File.join(output_dir, x) }
        end

        # Delete every entry of +dir+ except "." and "..".
        def Dictionary.remove_entries(dir)
          Dir.foreach(dir) do |x|
            next if x =~ /^\.\.?$/
            File.unlink(File.join(dir, x))
          end
        end
        private_class_method :remove_entries

        # Open the dbm file named "dict" that sits next to this source file.
        # Raises LoadError when it contains no entries. Kept public: a bare
        # +private+ has no effect on methods defined as Dictionary.name.
        def Dictionary.load_dictionary
          @@dictionary = @@dbmclass.new(File.join(File.dirname(__FILE__), 'dict'))
          if @@dictionary.empty?
            raise LoadError, "dictionary file not found"
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
module Lingua
  module EN
    module Syllable
      # Uses english word patterns to guess the number of syllables. A single module
      # method is made available, +syllables+, which, when passed an english word,
      # will return the number of syllables it estimates are in the word.
      #
      # English orthography (the representation of spoken sounds as written signs) is
      # not regular. The same spoken sound can be represented in multiple different
      # ways in written English (e.g. rough/cuff), and the same written letters
      # can be pronounced in different ways in different words (e.g. rough/bough).
      # As the same series of letters can be pronounced in different ways, it is not
      # possible to write an algorithm which can always guess the number of syllables
      # in an english word correctly. However, it is possible to use frequently
      # recurring patterns in english (such as "a final -e is usually silent") to
      # guess with a level of accuracy that is acceptable for applications like
      # syllable counting for readability scoring. This module implements such an
      # algorithm.
      #
      # This module is inspired by the Perl Lingua::EN::Syllable module. However, it
      # uses a different (though not larger) set of patterns to compensate for the
      # 'special cases' which arise out of English's irregular orthography. A number
      # of extra patterns (particularly for derived word forms) means that this module
      # is somewhat more accurate than the Perl original. It also omits a number of
      # patterns found in the original which seem to me to apply to such a small number
      # of cases, or to be of dubious value. Testing the guesses against the Carnegie
      # Mellon Pronouncing Dictionary, this module guesses right around 90% of the
      # time, as against about 85% of the time for the Perl module. However, the
      # dictionary contains a large number of foreign loan words and proper names, and
      # so when the algorithm is tested against 'real world' english, its accuracy
      # is a good deal better. Testing against a range of samples, it guesses right
      # about 95-97% of the time.
      module Guess
        # special cases - 1 syllable less than expected
        SubSyl = [
          /[^aeiou]e$/, # give, love, bone, done, ride ...
          /[aeiou](?:([cfghklmnprsvwz])\1?|ck|sh|[rt]ch)e[ds]$/,
          # (passive) past participles and 3rd person sing present verbs:
          # bared, liked, called, tricked, bashed, matched

          /.e(?:ly|less(?:ly)?|ness?|ful(?:ly)?|ments?)$/,
          # nominal, adjectival and adverbial derivatives from -e$ roots:
          # absolutely, nicely, likeness, basement, hopeless
          # hopeful, tastefully, wasteful

          /ion/, # action, diction, fiction
          /[ct]ia[nl]/, # special(ly), initial, physician, christian
          /[^cx]iou/, # illustrious, NOT spacious, gracious, anxious, noxious
          /sia$/, # amnesia, polynesia
          /.gue$/ # dialogue, intrigue, colleague
        ] unless defined?(SubSyl)

        # special cases - 1 syllable more than expected
        AddSyl = [
          /i[aiou]/, # alias, science, phobia
          /[dls]ien/, # salient, gradient, transient
          /[aeiouym]ble$/, # -Vble, plus -mble
          /[aeiou]{3}/, # agreeable
          /^mc/, # mcwhatever
          /ism$/, # sexism, racism
          /(?:([^aeiouy])\1|ck|mp|ng)le$/, # bubble, cattle, cackle, sample, angle
          /dnt$/, # couldn't (apostrophes are removed before matching)
          /[aeiou]y[aeiou]/ # annoying, layer
        ] unless defined?(AddSyl)

        # special cases not actually used - these seem to me to be either very
        # marginal or actually break more stuff than they fix
        NotUsed = [
          /^coa[dglx]./, # +1 coagulate, coaxial, coalition, coalesce - marginal
          /[^gq]ua[^auieo]/, # +1 'du-al' - only for some speakers, and breaks
          /riet/, # variety, parietal, notoriety - marginal?
        ] unless defined?(NotUsed)

        # Estimate the number of syllables in +word+.
        #
        # The estimate starts from the number of vowel groups (runs of
        # a/e/i/o/u/y) and is then corrected by one for every SubSyl pattern
        # (downwards) and every AddSyl pattern (upwards) that matches the
        # lower-cased word with apostrophes removed. The result is never less
        # than 1; a one-character word is always 1.
        def self.syllables(word)
          return 1 if word.length == 1
          word = word.downcase.delete("'")

          syllables = word.scan(/[aeiouy]+/).length

          # special cases
          syllables -= SubSyl.count { |pattern| word =~ pattern }
          syllables += AddSyl.count { |pattern| word =~ pattern }

          syllables < 1 ? 1 : syllables # no vowels?
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
module Lingua
  module EN
    # The module Lingua::EN::Syllable contains a single class method, +syllables+,
    # which will use the most accurate technique available to determine the number
    # of syllables in a string containing a word passed to it.
    # The exact definition of the function depends on the availability of the
    # Carnegie Mellon Pronouncing Dictionary on the system. If it is available,
    # the number of syllables as determined by the dictionary will be returned. If
    # the dictionary is not available, or if a word not contained in the dictionary
    # is passed, it will return the number of syllables as determined by the
    # module Lingua::EN::Syllable::Guess. For more details, see there and
    # Lingua::EN::Syllable::Dictionary.
    module Syllable
      # use dictionary if possible
      begin
        require 'lingua/en/syllable/dictionary.rb'
        require 'lingua/en/syllable/guess.rb'

        # Dictionary lookup with a fall back to guessing. LoadError is
        # rescued as well because the dictionary file is only opened on the
        # first lookup, which raises LoadError when the file is missing.
        def Syllable.syllables(word)
          Dictionary.syllables(word)
        rescue Dictionary::LookUpError, LoadError
          Guess.syllables(word)
        end
      rescue LoadError # dictionary not available?
        require 'lingua/en/syllable/guess.rb'

        # Guess-only variant used when the dictionary code cannot be loaded.
        def Syllable.syllables(word)
          Guess.syllables(word)
        end
      end
    end
  end
end

# When run as a script, print the syllable count of every argument.
if __FILE__ == $0
  ARGV.each do |word|
    puts "'#{word}' : #{Lingua::EN::Syllable.syllables(word)}"
  end
end
|
data/lib/lingua.rb
ADDED
data/spec/spec.opts
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
--color
|
data/spec/spec_helper.rb
ADDED
metadata
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: lingua
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
prerelease: false
|
|
5
|
+
segments:
|
|
6
|
+
- 0
|
|
7
|
+
- 5
|
|
8
|
+
- 0
|
|
9
|
+
version: 0.5.0
|
|
10
|
+
platform: ruby
|
|
11
|
+
authors:
|
|
12
|
+
- David Balatero
|
|
13
|
+
autorequire:
|
|
14
|
+
bindir: bin
|
|
15
|
+
cert_chain: []
|
|
16
|
+
|
|
17
|
+
date: 2010-04-11 00:00:00 -07:00
|
|
18
|
+
default_executable:
|
|
19
|
+
dependencies:
|
|
20
|
+
- !ruby/object:Gem::Dependency
|
|
21
|
+
name: rspec
|
|
22
|
+
prerelease: false
|
|
23
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
|
24
|
+
requirements:
|
|
25
|
+
- - ">="
|
|
26
|
+
- !ruby/object:Gem::Version
|
|
27
|
+
segments:
|
|
28
|
+
- 1
|
|
29
|
+
- 2
|
|
30
|
+
- 9
|
|
31
|
+
version: 1.2.9
|
|
32
|
+
type: :development
|
|
33
|
+
version_requirements: *id001
|
|
34
|
+
description: Provides sentence splitting, syllable, and text-quality algorithms.
|
|
35
|
+
email: dbalatero@gmail.com
|
|
36
|
+
executables: []
|
|
37
|
+
|
|
38
|
+
extensions: []
|
|
39
|
+
|
|
40
|
+
extra_rdoc_files:
|
|
41
|
+
- LICENSE
|
|
42
|
+
- README.rdoc
|
|
43
|
+
files:
|
|
44
|
+
- .document
|
|
45
|
+
- .gitignore
|
|
46
|
+
- LICENSE
|
|
47
|
+
- README.rdoc
|
|
48
|
+
- Rakefile
|
|
49
|
+
- VERSION
|
|
50
|
+
- lib/lingua.rb
|
|
51
|
+
- lib/lingua/en/readability.rb
|
|
52
|
+
- lib/lingua/en/sentence.rb
|
|
53
|
+
- lib/lingua/en/syllable.rb
|
|
54
|
+
- lib/lingua/en/syllable/dict
|
|
55
|
+
- lib/lingua/en/syllable/dictionary.rb
|
|
56
|
+
- lib/lingua/en/syllable/guess.rb
|
|
57
|
+
- spec/spec.opts
|
|
58
|
+
- spec/spec_helper.rb
|
|
59
|
+
has_rdoc: true
|
|
60
|
+
homepage: http://github.com/dbalatero/lingua
|
|
61
|
+
licenses: []
|
|
62
|
+
|
|
63
|
+
post_install_message:
|
|
64
|
+
rdoc_options:
|
|
65
|
+
- --charset=UTF-8
|
|
66
|
+
require_paths:
|
|
67
|
+
- lib
|
|
68
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
69
|
+
requirements:
|
|
70
|
+
- - ">="
|
|
71
|
+
- !ruby/object:Gem::Version
|
|
72
|
+
segments:
|
|
73
|
+
- 0
|
|
74
|
+
version: "0"
|
|
75
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
76
|
+
requirements:
|
|
77
|
+
- - ">="
|
|
78
|
+
- !ruby/object:Gem::Version
|
|
79
|
+
segments:
|
|
80
|
+
- 0
|
|
81
|
+
version: "0"
|
|
82
|
+
requirements: []
|
|
83
|
+
|
|
84
|
+
rubyforge_project:
|
|
85
|
+
rubygems_version: 1.3.6
|
|
86
|
+
signing_key:
|
|
87
|
+
specification_version: 3
|
|
88
|
+
summary: This is a maintained version of Ruby's Lingua port.
|
|
89
|
+
test_files:
|
|
90
|
+
- spec/spec_helper.rb
|