lingua 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.gitignore +23 -0
- data/LICENSE +20 -0
- data/README.rdoc +17 -0
- data/Rakefile +45 -0
- data/VERSION +1 -0
- data/lib/lingua/en/readability.rb +143 -0
- data/lib/lingua/en/sentence.rb +62 -0
- data/lib/lingua/en/syllable/dict +0 -0
- data/lib/lingua/en/syllable/dictionary.rb +107 -0
- data/lib/lingua/en/syllable/guess.rb +92 -0
- data/lib/lingua/en/syllable.rb +39 -0
- data/lib/lingua.rb +6 -0
- data/spec/spec.opts +1 -0
- data/spec/spec_helper.rb +9 -0
- metadata +90 -0
data/.document
ADDED
data/.gitignore
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2009 David Balatero
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.rdoc
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
= lingua
|
2
|
+
|
3
|
+
Provides sentence splitting, syllable counting, and text-quality (readability) algorithms for English text.
|
4
|
+
|
5
|
+
== Note on Patches/Pull Requests
|
6
|
+
|
7
|
+
* Fork the project.
|
8
|
+
* Make your feature addition or bug fix.
|
9
|
+
* Add tests for it. This is important so I don't break it in a
|
10
|
+
future version unintentionally.
|
11
|
+
* Commit, do not mess with rakefile, version, or history.
|
12
|
+
(if you want to have your own version, that is fine but bump version in a commit by itself I can ignore when I pull)
|
13
|
+
* Send me a pull request. Bonus points for topic branches.
|
14
|
+
|
15
|
+
== Copyright
|
16
|
+
|
17
|
+
Copyright (c) 2010 David Balatero. See LICENSE for details.
|
data/Rakefile
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
require 'rubygems'
require 'rake'

# Gem packaging tasks. Jeweler is optional: when it cannot be loaded we only
# print a hint and carry on, so the spec and rdoc tasks below stay usable.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "lingua"
    gem.summary = %Q{This is a maintained version of Ruby's Lingua port.}
    gem.description = %Q{Provides sentence splitting, syllable, and text-quality algorithms.}
    gem.email = "dbalatero@gmail.com"
    gem.homepage = "http://github.com/dbalatero/lingua"
    gem.authors = ["David Balatero"]
    gem.add_development_dependency "rspec", ">= 1.2.9"
    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
end

# Run the specs under spec/.
require 'spec/rake/spectask'
Spec::Rake::SpecTask.new(:spec) do |spec|
  spec.libs << 'lib' << 'spec'
  spec.spec_files = FileList['spec/**/*_spec.rb']
end

# Run the specs with rcov coverage enabled.
Spec::Rake::SpecTask.new(:rcov) do |spec|
  spec.libs << 'lib' << 'spec'
  spec.pattern = 'spec/**/*_spec.rb'
  spec.rcov = true
end

# :check_dependencies is not defined in this Rakefile; presumably it is
# provided by the Jeweler tasks above (TODO confirm). Only wire it in when it
# actually exists, so `rake spec` still works after the LoadError fallback.
task :spec => :check_dependencies if Rake::Task.task_defined?('check_dependencies')

task :default => :spec

# Generate RDoc for the README and everything under lib/.
require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
  # strip so a trailing newline in VERSION does not end up in the title
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "lingua #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.5.0
|
@@ -0,0 +1,143 @@
|
|
1
|
+
module Lingua
|
2
|
+
module EN
|
3
|
+
|
4
|
+
# The class Lingua::EN::Readability takes English text and analyses its formal
# characteristics: paragraph, sentence, word and syllable counts, plus the
# Flesch, Flesch-Kincaid and Gunning Fog readability scores derived from them.
class Readability
  require 'lingua/en/syllable'
  require 'lingua/en/sentence'

  attr_reader :text, :paragraphs, :sentences, :words, :frequencies

  # The constructor accepts the text to be analysed, and returns a report
  # object which gives access to the statistics documented below. The text is
  # copied, split into paragraphs and sentences, and all words are counted
  # immediately.
  def initialize(text)
    @text = text.dup
    @paragraphs = text.split(/\n\s*\n\s*/)
    @sentences = Lingua::EN::Sentence.sentences(@text)
    @words = []
    @frequencies = Hash.new(0) # unseen words report zero occurrences
    @syllables = 0
    @complex_words = 0
    count_words
  end

  # The number of paragraphs in the sample. A paragraph is defined as a
  # newline followed by one or more empty or whitespace-only lines.
  def num_paragraphs
    @paragraphs.length
  end

  # The number of sentences in the sample. The meaning of a "sentence" is
  # defined by Lingua::EN::Sentence.
  def num_sentences
    @sentences.length
  end

  # The number of characters in the sample.
  def num_chars
    @text.length
  end
  alias :num_characters :num_chars

  # The total number of words used in the sample. Numbers as digits are not
  # counted.
  def num_words
    @words.length
  end

  # The total number of syllables in the text sample. Just for completeness.
  def num_syllables
    @syllables
  end

  # The number of different unique words used in the text sample. Words are
  # compared exactly as they appear in the text (case-sensitive).
  def num_unique_words
    @frequencies.keys.length
  end

  # An array containing each unique word used in the text sample.
  def unique_words
    @frequencies.keys
  end

  # The number of occurrences of the word +word+ in the text sample.
  def occurrences(word)
    @frequencies[word]
  end

  # The average number of words per sentence. This is a Float division, so
  # the result is NaN when the sample contains no words and no sentences.
  def words_per_sentence
    @words.length.to_f / @sentences.length.to_f
  end

  # The average number of syllables per word. The syllable count is performed
  # by Lingua::EN::Syllable, and so may not be completely accurate, especially
  # if the Carnegie-Mellon Pronouncing Dictionary is not installed. The result
  # is NaN when the sample contains no words.
  def syllables_per_word
    @syllables.to_f / @words.length.to_f
  end

  # Flesch-Kincaid level of the text sample. This measure scores text based
  # on the American school grade system; a score of 7.0 would indicate that
  # the text is readable by a seventh grader. A score of 7.0 to 8.0 is
  # regarded as optimal for ordinary text.
  def kincaid
    (11.8 * syllables_per_word) + (0.39 * words_per_sentence) - 15.59
  end

  # Flesch reading ease of the text sample. A higher score indicates text that
  # is easier to read. The score is on a 100-point scale, and a score of 60-70
  # is regarded as optimal for ordinary text.
  def flesch
    206.835 - (1.015 * words_per_sentence) - (84.6 * syllables_per_word)
  end

  # The Gunning Fog Index of the text sample. The index indicates the number
  # of years of formal education that a reader of average intelligence would
  # need to comprehend the text. A higher score indicates harder text; a value
  # of around 12 is indicated as ideal for ordinary text.
  def fog
    (words_per_sentence + percent_fog_complex_words) * 0.4
  end

  # The percentage of words that are defined as "complex" for the purpose of
  # the Fog Index. These are non-hyphenated words of three or more syllables.
  def percent_fog_complex_words
    (@complex_words.to_f / @words.length.to_f) * 100
  end

  # Return a nicely formatted report on the sample, showing most of the useful
  # statistics about the text sample.
  def report
    template = "Number of paragraphs %d \n" \
               "Number of sentences %d \n" \
               "Number of words %d \n" \
               "Number of characters %d \n\n" \
               "Average words per sentence %.2f \n" \
               "Average syllables per word %.2f \n\n" \
               "Flesch score %2.2f \n" \
               "Flesch-Kincaid grade level %2.2f \n" \
               "Fog Index %2.2f \n"

    sprintf(template,
            num_paragraphs, num_sentences, num_words, num_characters,
            words_per_sentence, syllables_per_word,
            flesch, kincaid, fog)
  end

  private

  # Scan the text for words (a letter followed by letters, hyphens or
  # apostrophes) and update the word list, the per-word frequencies, the
  # syllable total and the complex-word count used by the Fog Index.
  def count_words
    @text.scan(/\b([a-z][a-z\-']*)\b/i).each do |match|
      word = match[0]
      @words.push word
      @frequencies[word] += 1
      syllables = Lingua::EN::Syllable.syllables(word)
      @syllables += syllables
      # for Fog Index: three or more syllables and not hyphenated
      @complex_words += 1 if syllables > 2 && word !~ /-/
    end
  end
end
|
142
|
+
end
|
143
|
+
end
|
@@ -0,0 +1,62 @@
|
|
1
|
+
module Lingua
|
2
|
+
module EN
|
3
|
+
# The module Lingua::EN::Sentence takes English text, and attempts to
|
4
|
+
# split it up into sentences, respecting abbreviations.
|
5
|
+
|
6
|
+
module Sentence
  EOS = "\001" unless defined?(EOS) # temporary end of sentence marker

  Titles = [ 'jr', 'mr', 'mrs', 'ms', 'dr', 'prof', 'sr', 'sen', 'rep',
             'rev', 'gov', 'atty', 'supt', 'det', 'rev', 'col', 'gen', 'lt',
             'cmdr', 'adm', 'capt', 'sgt', 'cpl', 'maj' ] unless defined?(Titles)

  Entities = [ 'dept', 'univ', 'uni', 'assn', 'bros', 'inc', 'ltd', 'co',
               'corp', 'plc' ] unless defined?(Entities)

  Months = [ 'jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
             'aug', 'sep', 'oct', 'nov', 'dec', 'sept' ] unless defined?(Months)

  Days = [ 'mon', 'tue', 'wed', 'thu',
           'fri', 'sat', 'sun' ] unless defined?(Days)

  Misc = [ 'vs', 'etc', 'no', 'esp', 'cf' ] unless defined?(Misc)

  Streets = [ 'ave', 'bld', 'blvd', 'cl', 'ct',
              'cres', 'dr', 'rd', 'st' ] unless defined?(Streets)

  # Abbreviations after which a full stop does not end a sentence. Extended
  # at runtime through Sentence.abbreviation.
  @@abbreviations = Titles + Entities + Months + Days + Streets + Misc

  # Split the passed text into individual sentences, trim these and return
  # as an array. A sentence is marked by one of the punctuation marks ".", "?"
  # or "!" (optionally followed by a closing quote or bracket) followed by
  # whitespace. Sequences of full stops (such as an ellipsis marker "...") and
  # stops after a known abbreviation are ignored. The argument itself is not
  # modified. An empty string yields an empty array.
  def self.sentences(text)
    text = text.dup

    # initial split after punctuation - have to preserve trailing whitespace
    # for the ellipsis correction next
    text.gsub!(/([\.?!](?:\"|\'|\)|\]|\})?)(\s+)/) { "#{$1}#{EOS}#{$2}" }

    # correct ellipsis marks and rows of stops
    text.gsub!(/(\.\.\.*)#{EOS}/) { $1 }

    # correct abbreviations. The leading \b makes sure only whole words are
    # treated as abbreviations; without it every word that merely ends in one
    # ("best." ends in "st", "piano." ends in "no") swallowed the sentence
    # break. The regex is rebuilt on each call because the list can grow.
    # Entries are interpolated into the pattern as-is.
    text.gsub!(/\b(#{@@abbreviations.join("|")})\.#{EOS}/i) { "#{$1}." }

    # split on EOS marker, strip gets rid of trailing whitespace
    text.split(EOS).map { |sentence| sentence.strip }
  end

  # Add a list of abbreviations to the list that's used to detect false
  # sentence ends. Return the current list of abbreviations in use. Calling
  # it without arguments just returns the current list.
  def self.abbreviation(*abbreviations)
    @@abbreviations += abbreviations
    @@abbreviations
  end
end
|
61
|
+
end
|
62
|
+
end
|
Binary file
|
@@ -0,0 +1,107 @@
|
|
1
|
+
module Lingua
|
2
|
+
module EN
|
3
|
+
module Syllable
|
4
|
+
|
5
|
+
# Syllable counting backed by a dbm-style pronunciation dictionary stored in
# the file 'dict' next to this source file.
module Dictionary
  # Raised when a word is not present in the dictionary.
  class LookUpError < IndexError
  end

  @@dictionary = nil
  @@dbmclass = nil
  @@dbmext = nil

  # use an available dbm-style hash
  [ 'gdbm', 'dbm' ].each do |dbm|
    begin
      require dbm
      @@dbmclass = Module.const_get(dbm.upcase)
    rescue LoadError, StandardError
      # a failed require raises LoadError, which a bare rescue does not
      # catch - name it explicitly so the next library is actually tried
      next
    end
    break
  end

  if @@dbmclass.nil?
    raise LoadError,
          "no dbm class available for Lingua::EN::Syllable::Dictionary"
  end

  # Look up word in the dbm dictionary and return its number of syllables.
  # The lookup is case-insensitive; when the word is not found and contains
  # apostrophes, it is retried once with the apostrophes removed.
  # Raises LookUpError if the word is not in the dictionary, and LoadError
  # (from load_dictionary) if the dictionary turns out to be empty when it is
  # first opened.
  def Dictionary.syllables(word)
    load_dictionary if @@dictionary.nil?

    word = word.upcase
    begin
      pronounce = @@dictionary.fetch(word)
    rescue IndexError
      if word =~ /'/
        # bounded retry: after this the word has no apostrophe left
        word = word.delete "'"
        retry
      end
      raise LookUpError, "word #{word} not in dictionary"
    end

    # entries are phonemes joined by "-"; every phoneme that starts with a
    # vowel letter counts as one syllable
    pronounce.split(/-/).grep(/^[AEIUO]/).length
  end

  # Return the dbm dictionary object, loading it on first use.
  def Dictionary.dictionary
    load_dictionary if @@dictionary.nil?
    @@dictionary
  end

  # Convert a text file dictionary into dbm files. Returns the file names
  # of the created dbms (paths inside +output_dir+).
  #
  # Every file already present in +output_dir+ is deleted first. Lines of
  # +source_file+ that do not start with a capital letter are skipped, as are
  # alternative pronunciations (words ending in "(<digit>)"). If reading the
  # source fails, the partially written files are removed and the error is
  # re-raised.
  def Dictionary.make_dictionary(source_file, output_dir)
    Dir.mkdir(output_dir) unless File.directory?(output_dir)

    # clean old dictionary dbms
    remove_files(output_dir)

    dbm = @@dbmclass.new(File.join(output_dir, 'dict'))

    begin
      IO.foreach(source_file) do |line|
        next if line !~ /^[A-Z]/
        line.chomp!
        # word, then its phonemes, separated by runs of spaces (presumably
        # the CMU dictionary layout - TODO confirm). Splitting on an
        # optional space would break the line into single characters.
        (word, *phonemes) = line.split(/ +/)
        next if word =~ /\(\d\) ?$/ # ignore alternative pronunciations
        dbm.store(word, phonemes.join("-"))
      end
    rescue
      # close and clean up the files we created in output_dir
      dbm.close
      remove_files(output_dir)
      raise
    end

    dbm.close

    Dir.entries(output_dir).reject { |x| x =~ /^\.\.?$/ }.
      map { |x| File.join(output_dir, x) }
  end

  # Delete every entry (except "." and "..") inside +dir+.
  def Dictionary.remove_files(dir)
    Dir.foreach(dir) do |x|
      next if x =~ /^\.\.?$/
      File.unlink(File.join(dir, x))
    end
  end

  # Open the dbm file 'dict' located in the same directory as this source
  # file. Raises LoadError when the opened dictionary has no entries.
  # (Intended for internal use; a bare `private` has no effect on methods
  # defined as `def Dictionary.x`, so it stays callable as before.)
  def Dictionary.load_dictionary
    @@dictionary = @@dbmclass.new(File.join(File.dirname(__FILE__), 'dict'))
    if @@dictionary.empty?
      raise LoadError, "dictionary file not found"
    end
  end
end
|
105
|
+
end
|
106
|
+
end
|
107
|
+
end
|
@@ -0,0 +1,92 @@
|
|
1
|
+
module Lingua
|
2
|
+
module EN
|
3
|
+
module Syllable
|
4
|
+
# Uses english word patterns to guess the number of syllables. A single module
|
5
|
+
# method is made available, +syllables+, which, when passed an english word,
|
6
|
+
# will return the number of syllables it estimates are in the word.
|
7
|
+
# English orthography (the representation of spoken sounds as written signs) is
|
8
|
+
# not regular. The same spoken sound can be represented in multiple different
|
9
|
+
# ways in written English (e.g. rough/cuff), and the same written letters
|
10
|
+
# can be pronounced in different ways in different words (e.g. rough/bough).
|
11
|
+
# As the same series of letters can be pronounced in different ways, it is not
|
12
|
+
# possible to write an algorithm which can always guess the number of syllables
|
13
|
+
# in an english word correctly. However, it is possible to use frequently
|
14
|
+
# recurring patterns in english (such as "a final -e is usually silent") to
|
15
|
+
# guess with a level of accuracy that is acceptable for applications like
|
16
|
+
# syllable counting for readability scoring. This module implements such an
|
17
|
+
# algorithm.
|
18
|
+
# This module is inspired by the Perl Lingua::EN::Syllable module. However, it
|
19
|
+
# uses a different (though not larger) set of patterns to compensate for the
|
20
|
+
# 'special cases' which arise out of English's irregular orthography. A number
|
21
|
+
# of extra patterns (particularly for derived word forms) means that this module
|
22
|
+
# is somewhat more accurate than the Perl original. It also omits a number of
|
23
|
+
# patterns found in the original which seem to me to apply to such a small number
|
24
|
+
# of cases, or to be of dubious value. Testing the guesses against the Carnegie
|
25
|
+
# Mellon Pronouncing Dictionary, this module guesses right around 90% of the
|
26
|
+
# time, as against about 85% of the time for the Perl module. However, the
|
27
|
+
# dictionary contains a large number of foreign loan words and proper names, and
|
28
|
+
# so when the algorithm is tested against 'real world' english, its accuracy
|
29
|
+
# is a good deal better. Testing against a range of samples, it guesses right
|
30
|
+
# about 95-97% of the time.
|
31
|
+
# Pattern-based estimate of the number of syllables in an English word.
module Guess
  # special cases - 1 syllable less than expected
  SubSyl = [
    /[^aeiou]e$/, # give, love, bone, done, ride ...
    /[aeiou](?:([cfghklmnprsvwz])\1?|ck|sh|[rt]ch)e[ds]$/,
    # (passive) past participles and 3rd person sing present verbs:
    # bared, liked, called, tricked, bashed, matched

    /.e(?:ly|less(?:ly)?|ness?|ful(?:ly)?|ments?)$/,
    # nominal, adjectival and adverbial derivatives from -e$ roots:
    # absolutely, nicely, likeness, basement, hopeless
    # hopeful, tastefully, wasteful

    /ion/, # action, diction, fiction
    /[ct]ia[nl]/, # special(ly), initial, physician, christian
    /[^cx]iou/, # illustrious, NOT spacious, gracious, anxious, noxious
    /sia$/, # amnesia, polynesia
    /.gue$/ # dialogue, intrigue, colleague
  ] unless defined?(SubSyl)

  # special cases - 1 syllable more than expected
  AddSyl = [
    /i[aiou]/, # alias, science, phobia
    /[dls]ien/, # salient, gradient, transient
    /[aeiouym]ble$/, # -Vble, plus -mble
    /[aeiou]{3}/, # agreeable
    /^mc/, # mcwhatever
    /ism$/, # sexism, racism
    /(?:([^aeiouy])\1|ck|mp|ng)le$/, # bubble, cattle, cackle, sample, angle
    /dnt$/, # couldn't (apostrophes are removed before matching)
    /[aeiou]y[aeiou]/ # annoying, layer
  ] unless defined?(AddSyl)

  # special cases not actually used - these seem to me to be either very
  # marginal or actually break more stuff than they fix
  NotUsed = [
    /^coa[dglx]./, # +1 coagulate, coaxial, coalition, coalesce - marginal
    /[^gq]ua[^auieo]/, # +1 'du-al' - only for some speakers, and breaks
    /riet/, # variety, parietal, notoriety - marginal?
  ] unless defined?(NotUsed)

  # Estimate the number of syllables in +word+.
  #
  # Single-character input always counts as one syllable. Otherwise the word
  # is lower-cased and stripped of apostrophes, every run of vowels
  # (a, e, i, o, u, y) is counted once, one syllable is subtracted for each
  # matching SubSyl pattern and one added for each matching AddSyl pattern.
  # The result is never less than 1.
  def self.syllables(word)
    return 1 if word.length == 1

    word = word.downcase.delete("'")

    syllables = word.scan(/[aeiouy]+/).length

    # special cases
    syllables -= SubSyl.count { |pat| pat.match(word) }
    syllables += AddSyl.count { |pat| pat.match(word) }

    syllables < 1 ? 1 : syllables # no vowels?
  end
end
|
90
|
+
end
|
91
|
+
end
|
92
|
+
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
module Lingua
|
2
|
+
module EN
|
3
|
+
# The module Lingua::EN::Syllable contains a single class method, +syllables+,
# which will use the most accurate technique available to determine the number
# of syllables in a string containing a word passed to it.
# The exact definition of the function depends on the availability of the
# Carnegie Mellon Pronouncing Dictionary on the system. If it is available,
# the number of syllables as determined by the dictionary will be returned. If
# the dictionary is not available, or if a word not contained in the dictionary
# is passed, it will return the number of syllables as determined by the
# module Lingua::EN::Syllable::Guess. For more details, see there and
# Lingua::EN::Syllable::Dictionary.
module Syllable
  # use dictionary if possible
  begin
    require 'lingua/en/syllable/dictionary.rb'
    require 'lingua/en/syllable/guess.rb'

    # Number of syllables in +word+: dictionary lookup first, pattern-based
    # guess as the fallback.
    def Syllable.syllables(word)
      begin
        return Dictionary::syllables(word)
      rescue Dictionary::LookUpError, LoadError
        # LookUpError: the word is not in the dictionary.
        # LoadError: the dictionary is opened lazily on the first lookup and
        # raises LoadError when it has no entries; that happens at call time,
        # after the load-time rescue below can no longer catch it.
        return Guess::syllables(word)
      end
    end
  rescue LoadError # dictionary not available?
    require 'lingua/en/syllable/guess.rb'

    # Number of syllables in +word+, guessed from spelling patterns only.
    def Syllable.syllables(word)
      Guess::syllables word
    end
  end
end
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
if __FILE__ == $0
|
37
|
+
ARGV.each { | word | puts "'#{word}' : " +
|
38
|
+
Lingua::EN::Syllable::syllables(word).to_s }
|
39
|
+
end
|
data/lib/lingua.rb
ADDED
data/spec/spec.opts
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
--color
|
data/spec/spec_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,90 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: lingua
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 0
|
7
|
+
- 5
|
8
|
+
- 0
|
9
|
+
version: 0.5.0
|
10
|
+
platform: ruby
|
11
|
+
authors:
|
12
|
+
- David Balatero
|
13
|
+
autorequire:
|
14
|
+
bindir: bin
|
15
|
+
cert_chain: []
|
16
|
+
|
17
|
+
date: 2010-04-11 00:00:00 -07:00
|
18
|
+
default_executable:
|
19
|
+
dependencies:
|
20
|
+
- !ruby/object:Gem::Dependency
|
21
|
+
name: rspec
|
22
|
+
prerelease: false
|
23
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
24
|
+
requirements:
|
25
|
+
- - ">="
|
26
|
+
- !ruby/object:Gem::Version
|
27
|
+
segments:
|
28
|
+
- 1
|
29
|
+
- 2
|
30
|
+
- 9
|
31
|
+
version: 1.2.9
|
32
|
+
type: :development
|
33
|
+
version_requirements: *id001
|
34
|
+
description: Provides sentence splitting, syllable, and text-quality algorithms.
|
35
|
+
email: dbalatero@gmail.com
|
36
|
+
executables: []
|
37
|
+
|
38
|
+
extensions: []
|
39
|
+
|
40
|
+
extra_rdoc_files:
|
41
|
+
- LICENSE
|
42
|
+
- README.rdoc
|
43
|
+
files:
|
44
|
+
- .document
|
45
|
+
- .gitignore
|
46
|
+
- LICENSE
|
47
|
+
- README.rdoc
|
48
|
+
- Rakefile
|
49
|
+
- VERSION
|
50
|
+
- lib/lingua.rb
|
51
|
+
- lib/lingua/en/readability.rb
|
52
|
+
- lib/lingua/en/sentence.rb
|
53
|
+
- lib/lingua/en/syllable.rb
|
54
|
+
- lib/lingua/en/syllable/dict
|
55
|
+
- lib/lingua/en/syllable/dictionary.rb
|
56
|
+
- lib/lingua/en/syllable/guess.rb
|
57
|
+
- spec/spec.opts
|
58
|
+
- spec/spec_helper.rb
|
59
|
+
has_rdoc: true
|
60
|
+
homepage: http://github.com/dbalatero/lingua
|
61
|
+
licenses: []
|
62
|
+
|
63
|
+
post_install_message:
|
64
|
+
rdoc_options:
|
65
|
+
- --charset=UTF-8
|
66
|
+
require_paths:
|
67
|
+
- lib
|
68
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
69
|
+
requirements:
|
70
|
+
- - ">="
|
71
|
+
- !ruby/object:Gem::Version
|
72
|
+
segments:
|
73
|
+
- 0
|
74
|
+
version: "0"
|
75
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
76
|
+
requirements:
|
77
|
+
- - ">="
|
78
|
+
- !ruby/object:Gem::Version
|
79
|
+
segments:
|
80
|
+
- 0
|
81
|
+
version: "0"
|
82
|
+
requirements: []
|
83
|
+
|
84
|
+
rubyforge_project:
|
85
|
+
rubygems_version: 1.3.6
|
86
|
+
signing_key:
|
87
|
+
specification_version: 3
|
88
|
+
summary: This is a maintained version of Ruby's Lingua port.
|
89
|
+
test_files:
|
90
|
+
- spec/spec_helper.rb
|