text-metrics 0.0.1 → 1.0.0.beta2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +38 -1
- data/README.md +70 -32
- data/UPGRADING.md +73 -0
- data/lib/text-metrics.rb +6 -0
- data/lib/text_metrics/dictionaries/english_word_syllable_database.txt +126052 -0
- data/lib/text_metrics/levenshtein.rb +46 -0
- data/lib/text_metrics/processors/american_english.rb +38 -10
- data/lib/text_metrics/processors/base.rb +117 -126
- data/lib/text_metrics/processors/french.rb +32 -14
- data/lib/text_metrics/version.rb +1 -1
- data/lib/text_metrics.rb +28 -25
- metadata +12 -14
- data/lib/text_metrics/dictionnaries/en_us.txt +0 -2945
- data/lib/text_metrics/dictionnaries/fr.txt +0 -1462
- data/lib/text_metrics/dictionnaries/french_word_syllable_database.yml +0 -125345
- data/lib/text_metrics/dictionnaries/lexique-383.csv +0 -142695
- /data/lib/text_metrics/{dictionnaries → dictionaries}/french_word_syllable_exceptions.yml +0 -0
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module TextMetrics
  # Levenshtein edit distance between two strings, plus a normalized similarity score.
  # Comparison is a property of two texts, so it lives here rather than on a single-text analyzer.
  module Levenshtein
    module_function

    # Raw edit distance: the number of single-character insertions, deletions or
    # substitutions needed to turn +first+ into +second+. Case-sensitive.
    #
    # Both arguments are coerced with #to_s, so +nil+ is treated as the empty string.
    # Runs in O(m * n) time but keeps only two rows of the DP table, so memory is
    # proportional to the shorter text rather than to the product of both lengths.
    #
    # @param first [#to_s] the text to transform
    # @param second [#to_s] the text to reach
    # @return [Integer] the edit distance (0 when the texts are identical)
    def distance(first, second)
      # Split once up front: indexing a String by character position inside the inner
      # loop is not constant-time for multibyte text, whereas Array access is.
      source = first.to_s.chars
      target = second.to_s.chars

      return target.length if source.empty?
      return source.length if target.empty?

      # The distance is symmetric, so keep the shorter text on the row axis.
      source, target = target, source if target.length > source.length

      previous_row = (0..target.length).to_a

      source.each_with_index do |source_char, i|
        current_row = [i + 1]

        target.each_with_index do |target_char, j|
          cost = source_char == target_char ? 0 : 1
          current_row << [
            previous_row[j + 1] + 1, # deletion
            current_row[j] + 1, # insertion
            previous_row[j] + cost # substitution
          ].min
        end

        previous_row = current_row
      end

      previous_row.last
    end

    # Similarity as a 0–100 score: 100.0 means identical, 0.0 means nothing in common.
    # Computed as (max_length - distance) / max_length * 100, rounded to two decimals.
    # Two empty texts are considered identical.
    #
    # @param first [#to_s]
    # @param second [#to_s]
    # @return [Float] similarity percentage
    def similarity(first, second)
      first = first.to_s
      second = second.to_s
      max_length = [first.length, second.length].max
      return 100.0 if max_length.zero?

      ((max_length - distance(first, second)).to_f / max_length * 100).round(2)
    end
  end
end
|
|
@@ -5,25 +5,53 @@ require "text_metrics/processors/base"
|
|
|
5
5
|
module TextMetrics
|
|
6
6
|
module Processors
|
|
7
7
|
class AmericanEnglish < TextMetrics::Processors::Base
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
8
|
+
SYLLABLE_DATABASE_PATH = File.join(GEM_PATH, "dictionaries/english_word_syllable_database.txt").freeze
|
|
9
|
+
DATABASE_LOAD_MUTEX = Mutex.new
|
|
10
|
+
|
|
11
|
+
class << self
  # Word => syllable-count table parsed from SYLLABLE_DATABASE_PATH.
  #
  # Built lazily on first call and cached on the class, so requiring the gem does not
  # pay the load cost. The first check returns the cached table without taking the lock;
  # the second check, inside DATABASE_LOAD_MUTEX, ensures only one caller parses the file
  # when several race on first use. The returned Hash is frozen.
  #
  # @return [Hash{String => Integer}] frozen lookup table
  def syllable_database
    return @syllable_database if @syllable_database

    DATABASE_LOAD_MUTEX.synchronize do
      @syllable_database ||= load_syllable_database
    end
  end

  private

  # Parses the database file, expecting one "<word> <count>" entry per line.
  #
  # Lines that do not carry a numeric count (blank lines, a word on its own, comment-like
  # lines) are skipped instead of being stored with a nil key or a count of 0: a 0 entry
  # would be returned by the lookup and hide the hyphenation fallback for that word.
  #
  # @return [Hash{String => Integer}] frozen lookup table
  def load_syllable_database
    database = {}
    File.foreach(SYLLABLE_DATABASE_PATH) do |line|
      word, count = line.split(" ", 2)
      # With a limit, split keeps a trailing empty field ("word \n" => ["word", ""]),
      # so check for a leading digit rather than just nil.
      next unless count&.match?(/\A\d/)

      database[word] = count.to_i
    end
    database.freeze
  end
end
|
|
12
36
|
|
|
13
|
-
|
|
37
|
+
def initialize(text, language: :en_us)
|
|
38
|
+
super
|
|
14
39
|
end
|
|
15
40
|
|
|
16
|
-
def
|
|
17
|
-
|
|
18
|
-
syllables_per_word = syllables_per_word_average
|
|
19
|
-
flesch = 0.39 * sentence_length + 11.8 * syllables_per_word - 15.59
|
|
41
|
+
def flesch_reading_ease
|
|
42
|
+
return 0.0 if words_count.zero?
|
|
20
43
|
|
|
21
|
-
|
|
44
|
+
(206.835 - 1.015 * average_words_per_sentence - 84.6 * average_syllables_per_word).round(2)
|
|
22
45
|
end
|
|
23
46
|
|
|
24
47
|
private
|
|
25
48
|
|
|
49
|
+
# CMUdict is the source of truth; fall back to hyphenation for out-of-vocabulary words.
|
|
26
50
|
def count_syllables_in_word(word)
|
|
51
|
+
self.class.syllable_database.fetch(word.downcase) { hyphenated_syllable_count(word) }
|
|
52
|
+
end
|
|
53
|
+
|
|
54
|
+
def hyphenated_syllable_count(word)
|
|
27
55
|
hyphen_dictionary.visualise(word).count("-") + 1
|
|
28
56
|
end
|
|
29
57
|
|
|
@@ -5,43 +5,52 @@ require "text/hyphen"
|
|
|
5
5
|
module TextMetrics
|
|
6
6
|
module Processors
|
|
7
7
|
class Base
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
8
|
+
GEM_PATH = File.dirname(__FILE__, 2).freeze
|
|
9
|
+
|
|
10
|
+
# The public metric surface. #to_h and the individual readers are both derived
|
|
11
|
+
# from this list, so they can never drift apart.
|
|
12
|
+
METRICS = %i[
|
|
13
|
+
words_count
|
|
14
|
+
characters_count
|
|
15
|
+
sentences_count
|
|
16
|
+
syllables_count
|
|
17
|
+
punctuation_count
|
|
18
|
+
syllables_per_word_average
|
|
19
|
+
letters_per_word_average
|
|
20
|
+
words_per_sentence_average
|
|
21
|
+
characters_per_sentence_average
|
|
22
|
+
words_per_punctuation_average
|
|
23
|
+
punctuation_per_sentence_average
|
|
24
|
+
flesch_reading_ease
|
|
25
|
+
flesch_kincaid_grade
|
|
26
|
+
lix
|
|
27
|
+
smog_index
|
|
28
|
+
gunning_fog_index
|
|
29
|
+
coleman_liau_index
|
|
30
|
+
].freeze
|
|
31
|
+
|
|
32
|
+
attr_reader :text, :language
|
|
33
|
+
|
|
34
|
+
def initialize(text, language: nil)
|
|
35
|
+
@text = (text || "").squeeze(" ")
|
|
36
|
+
@language = language
|
|
37
|
+
end
|
|
38
|
+
|
|
39
|
+
# Every metric in one hash. Single source of truth for the public surface.
|
|
40
|
+
# Memoized — the analyzer is immutable once built.
|
|
41
|
+
def to_h
|
|
42
|
+
@to_h ||= METRICS.to_h { |metric| [metric, public_send(metric)] }
|
|
43
|
+
end
|
|
44
|
+
|
|
45
|
+
# counts
|
|
41
46
|
def words_count
|
|
42
47
|
words.size
|
|
43
48
|
end
|
|
44
49
|
|
|
50
|
+
def characters_count(ignore_spaces: true)
|
|
51
|
+
ignore_spaces ? text.delete(" ").length : text.length
|
|
52
|
+
end
|
|
53
|
+
|
|
45
54
|
def sentences_count
|
|
46
55
|
return 0 if words_count.zero?
|
|
47
56
|
|
|
@@ -52,81 +61,79 @@ module TextMetrics
|
|
|
52
61
|
words.sum { |word| count_syllables_in_word(word) }
|
|
53
62
|
end
|
|
54
63
|
|
|
55
|
-
def
|
|
56
|
-
|
|
57
|
-
end
|
|
58
|
-
|
|
59
|
-
def punctuations_count
|
|
60
|
-
punctuations.size
|
|
64
|
+
def punctuation_count
|
|
65
|
+
punctuation_marks.size
|
|
61
66
|
end
|
|
62
67
|
|
|
63
|
-
#
|
|
64
|
-
|
|
68
|
+
# averages — rounded for display only. The readability scores below are computed
|
|
69
|
+
# from the full-precision ratios (#average_*), not from these rounded values.
|
|
65
70
|
def syllables_per_word_average
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
(syllables_count.to_f / words_count).round(1)
|
|
71
|
+
average_syllables_per_word.round(1)
|
|
69
72
|
end
|
|
70
73
|
|
|
71
74
|
def letters_per_word_average
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
(characters_count.to_f / words_count).round(2)
|
|
75
|
+
average_letters_per_word.round(2)
|
|
75
76
|
end
|
|
76
77
|
|
|
77
78
|
def words_per_sentence_average
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
(words_count.to_f / sentences_count).round(2)
|
|
79
|
+
average_words_per_sentence.round(2)
|
|
81
80
|
end
|
|
82
81
|
|
|
83
82
|
def characters_per_sentence_average
|
|
84
|
-
return 0.0 if
|
|
83
|
+
return 0.0 if sentences_count.zero?
|
|
85
84
|
|
|
86
85
|
(characters_count.to_f / sentences_count).round(2)
|
|
87
86
|
end
|
|
88
87
|
|
|
89
|
-
def
|
|
90
|
-
return 0.0 if
|
|
88
|
+
def words_per_punctuation_average
|
|
89
|
+
return 0.0 if words_count.zero? || punctuation_count.zero?
|
|
91
90
|
|
|
92
|
-
(
|
|
91
|
+
(words_count.to_f / punctuation_count).round(2)
|
|
93
92
|
end
|
|
94
93
|
|
|
95
|
-
def
|
|
96
|
-
return 0.0 if
|
|
94
|
+
def punctuation_per_sentence_average
|
|
95
|
+
return 0.0 if punctuation_count.zero? || sentences_count.zero?
|
|
97
96
|
|
|
98
|
-
(
|
|
97
|
+
(punctuation_count.to_f / sentences_count).round(2)
|
|
99
98
|
end
|
|
100
99
|
|
|
101
|
-
# readability scores
|
|
100
|
+
# readability scores — computed from full-precision ratios, rounded only at the end,
|
|
101
|
+
# and returned unclamped (a Flesch score can legitimately exceed 100 or go negative).
|
|
102
|
+
|
|
103
|
+
# Language-specific; subclasses supply the constants.
|
|
102
104
|
def flesch_reading_ease
|
|
103
105
|
raise NotImplementedError
|
|
104
106
|
end
|
|
105
107
|
|
|
108
|
+
# Flesch-Kincaid Grade Level (US school grade). The same formula is used for every
|
|
109
|
+
# language — there is no validated non-English adaptation.
|
|
106
110
|
def flesch_kincaid_grade
|
|
107
|
-
|
|
111
|
+
return 0.0 if words_count.zero?
|
|
112
|
+
|
|
113
|
+
(0.39 * average_words_per_sentence + 11.8 * average_syllables_per_word - 15.59).round(1)
|
|
108
114
|
end
|
|
109
115
|
|
|
110
116
|
# SMOG index: 1.043 * sqrt(30 * polysyllabic_words / sentences) + 3.1291,
# rounded to one decimal.
#
# Returns 0.0 when the text has fewer than three sentences. That guard also means
# the divisor is never zero, and the division is Float arithmetic (30.0 * ...),
# which does not raise ZeroDivisionError, so no rescue is needed.
#
# @return [Float] the SMOG grade, or 0.0 for texts shorter than three sentences
def smog_index
  return 0.0 if sentences_count < 3

  (1.043 * Math.sqrt(30.0 * count_polysyllabic_words / sentences_count) + 3.1291).round(1)
end
|
|
123
|
+
|
|
124
|
+
def gunning_fog_index
|
|
125
|
+
return 0.0 if words_count.zero?
|
|
126
|
+
|
|
127
|
+
(0.4 * (average_words_per_sentence + 100.0 * count_polysyllabic_words / words_count)).round(1)
|
|
121
128
|
end
|
|
122
129
|
|
|
123
130
|
def coleman_liau_index
|
|
124
|
-
return 0.0 if
|
|
131
|
+
return 0.0 if words_count.zero?
|
|
132
|
+
|
|
133
|
+
letters_per_100_words = average_letters_per_word * 100
|
|
134
|
+
sentences_per_100_words = sentences_count.to_f / words_count * 100
|
|
125
135
|
|
|
126
|
-
|
|
127
|
-
sentences = (1.to_f / words_per_sentence_average * 100).round(2)
|
|
128
|
-
coleman = 0.0588 * letters - 0.296 * sentences - 15.8
|
|
129
|
-
coleman.round(2).clamp(0.0, 20.0)
|
|
136
|
+
(0.0588 * letters_per_100_words - 0.296 * sentences_per_100_words - 15.8).round(2)
|
|
130
137
|
end
|
|
131
138
|
|
|
132
139
|
def lix
|
|
@@ -134,39 +141,56 @@ module TextMetrics
|
|
|
134
141
|
|
|
135
142
|
long_words = words.count { |word| word.length > 6 }
|
|
136
143
|
|
|
137
|
-
|
|
138
|
-
|
|
144
|
+
(average_words_per_sentence + 100.0 * long_words / words_count).round(2)
|
|
145
|
+
end
|
|
146
|
+
|
|
147
|
+
private
|
|
148
|
+
|
|
149
|
+
# full-precision ratios feeding the readability formulas
|
|
150
|
+
def average_syllables_per_word
|
|
151
|
+
return 0.0 if words_count.zero?
|
|
139
152
|
|
|
140
|
-
|
|
153
|
+
syllables_count.to_f / words_count
|
|
141
154
|
end
|
|
142
155
|
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
distance = levenshtein_distance(@text, other_text)
|
|
146
|
-
return distance unless normalize
|
|
156
|
+
def average_letters_per_word
|
|
157
|
+
return 0.0 if words_count.zero?
|
|
147
158
|
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
159
|
+
letters_count.to_f / words_count
|
|
160
|
+
end
|
|
161
|
+
|
|
162
|
+
def average_words_per_sentence
|
|
163
|
+
return 0.0 if sentences_count.zero?
|
|
164
|
+
|
|
165
|
+
words_count.to_f / sentences_count
|
|
166
|
+
end
|
|
155
167
|
|
|
156
|
-
|
|
168
|
+
# Count of alphabetic characters only (letters), as required by Coleman-Liau and the
|
|
169
|
+
# letters-per-word metric — distinct from #characters_count, which includes digits
|
|
170
|
+
# and punctuation.
|
|
171
|
+
def letters_count
|
|
172
|
+
@letters_count ||= text.scan(/[[:alpha:]]/).size
|
|
173
|
+
end
|
|
174
|
+
|
|
175
|
+
# Subclasses provide the language-specific syllable counting.
|
|
176
|
+
def count_syllables_in_word(word)
|
|
177
|
+
raise NotImplementedError
|
|
178
|
+
end
|
|
179
|
+
|
|
180
|
+
def count_polysyllabic_words
|
|
181
|
+
words.count { |word| count_syllables_in_word(word) >= 3 }
|
|
157
182
|
end
|
|
158
183
|
|
|
159
184
|
# tokenizers
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
@punctuations ||= text.scan(/[.,!?;:]/)
|
|
185
|
+
def punctuation_marks
|
|
186
|
+
@punctuation_marks ||= text.scan(/[.,!?;:]/)
|
|
163
187
|
end
|
|
164
188
|
|
|
165
189
|
def words
|
|
166
190
|
@words ||= begin
|
|
167
191
|
normalized_text = text.downcase.strip
|
|
168
192
|
|
|
169
|
-
# Split
|
|
193
|
+
# Split into words, including hyphenated words, and excluding numbers
|
|
170
194
|
normalized_text.scan(/\b[A-Za-zÀ-ÖØ-öø-ÿ'-]+\b/)
|
|
171
195
|
end
|
|
172
196
|
end
|
|
@@ -174,39 +198,6 @@ module TextMetrics
|
|
|
174
198
|
def sentences
|
|
175
199
|
@sentences ||= text.scan(/(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|!)(?=\s|$)/)
|
|
176
200
|
end
|
|
177
|
-
|
|
178
|
-
private
|
|
179
|
-
|
|
180
|
-
def levenshtein_distance(s1, s2)
|
|
181
|
-
m = s1.length
|
|
182
|
-
n = s2.length
|
|
183
|
-
|
|
184
|
-
# Return if one of the strings is empty
|
|
185
|
-
return n if m == 0
|
|
186
|
-
return m if n == 0
|
|
187
|
-
|
|
188
|
-
# Create a matrix
|
|
189
|
-
matrix = Array.new(m + 1) { Array.new(n + 1) }
|
|
190
|
-
|
|
191
|
-
# Initialize the first row and column
|
|
192
|
-
(0..m).each { |i| matrix[i][0] = i }
|
|
193
|
-
(0..n).each { |j| matrix[0][j] = j }
|
|
194
|
-
|
|
195
|
-
# Fill in the matrix
|
|
196
|
-
(1..m).each do |i|
|
|
197
|
-
(1..n).each do |j|
|
|
198
|
-
cost = (s1[i - 1] == s2[j - 1]) ? 0 : 1
|
|
199
|
-
matrix[i][j] = [
|
|
200
|
-
matrix[i - 1][j] + 1, # Deletion
|
|
201
|
-
matrix[i][j - 1] + 1, # Insertion
|
|
202
|
-
matrix[i - 1][j - 1] + cost # Substitution
|
|
203
|
-
].min
|
|
204
|
-
end
|
|
205
|
-
end
|
|
206
|
-
|
|
207
|
-
# Return the Levenshtein distance
|
|
208
|
-
matrix[m][n]
|
|
209
|
-
end
|
|
210
201
|
end
|
|
211
202
|
end
|
|
212
203
|
end
|
|
@@ -6,30 +6,48 @@ require "yaml"
|
|
|
6
6
|
module TextMetrics
|
|
7
7
|
module Processors
|
|
8
8
|
class French < TextMetrics::Processors::Base
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
SYLLABLE_EXCEPTIONS = YAML.load_file(SYLLABLE_EXCEPTIONS_PATH).freeze
|
|
9
|
+
SYLLABLE_EXCEPTIONS_PATH = File.join(GEM_PATH, "dictionaries/french_word_syllable_exceptions.yml").freeze
|
|
10
|
+
EXCEPTIONS_LOAD_MUTEX = Mutex.new
|
|
12
11
|
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
12
|
+
class << self
|
|
13
|
+
# Syllable counts for the words the heuristic gets wrong (derived from Lexique),
|
|
14
|
+
# loaded once and shared across all instances and threads. Lazy so requiring the gem
|
|
15
|
+
# (or using only English/Levenshtein) doesn't pay the YAML load; the mutex guarantees
|
|
16
|
+
# the file is parsed exactly once under concurrent first use, and the double check keeps
|
|
17
|
+
# the common path lock-free. The result is frozen, so concurrent reads are safe.
|
|
18
|
+
def syllable_exceptions
|
|
19
|
+
return @syllable_exceptions if @syllable_exceptions
|
|
17
20
|
|
|
18
|
-
|
|
21
|
+
EXCEPTIONS_LOAD_MUTEX.synchronize do
|
|
22
|
+
@syllable_exceptions ||= YAML.load_file(SYLLABLE_EXCEPTIONS_PATH).freeze
|
|
23
|
+
end
|
|
24
|
+
end
|
|
19
25
|
end
|
|
20
26
|
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
27
|
+
# +with_syllable_exceptions+ is an internal toggle used by the dictionary-generation
|
|
28
|
+
# scripts to run the bare heuristic; the public API always leaves it on.
|
|
29
|
+
def initialize(text, language: :fr, with_syllable_exceptions: true)
|
|
30
|
+
super(text, language: language)
|
|
31
|
+
@with_syllable_exceptions = with_syllable_exceptions
|
|
32
|
+
end
|
|
25
33
|
|
|
26
|
-
|
|
34
|
+
# French Flesch Reading Ease — the Kandel-Moles (1958) adaptation:
|
|
35
|
+
# 207 - 1.015 * (words / sentences) - 73.6 * (syllables / words).
|
|
36
|
+
def flesch_reading_ease
|
|
37
|
+
return 0.0 if words_count.zero?
|
|
38
|
+
|
|
39
|
+
(207 - 1.015 * average_words_per_sentence - 73.6 * average_syllables_per_word).round(2)
|
|
27
40
|
end
|
|
28
41
|
|
|
29
42
|
private
|
|
30
43
|
|
|
44
|
+
attr_reader :with_syllable_exceptions
|
|
45
|
+
|
|
31
46
|
def count_syllables_in_word(word)
|
|
32
|
-
|
|
47
|
+
if with_syllable_exceptions
|
|
48
|
+
exceptions = self.class.syllable_exceptions
|
|
49
|
+
return exceptions[word].to_i if exceptions.key?(word)
|
|
50
|
+
end
|
|
33
51
|
|
|
34
52
|
word = word.downcase.gsub(/[^a-zàâäéèêëîïôöùûüç]/, "")
|
|
35
53
|
|
data/lib/text_metrics/version.rb
CHANGED
data/lib/text_metrics.rb
CHANGED
|
@@ -1,41 +1,44 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require "text_metrics/version"
|
|
4
|
-
require "text_metrics/
|
|
4
|
+
require "text_metrics/levenshtein"
|
|
5
5
|
require "text_metrics/processors/american_english"
|
|
6
|
-
require "
|
|
6
|
+
require "text_metrics/processors/french"
|
|
7
7
|
|
|
8
8
|
module TextMetrics
|
|
9
9
|
class Error < StandardError; end
|
|
10
10
|
|
|
11
|
-
|
|
12
|
-
extend Forwardable
|
|
13
|
-
def_delegators :text_metrics_processor, :words_count, :characters_count, :syllables_count,
|
|
14
|
-
:sentences_count, :words_per_sentence_average, :syllables_per_word_average,
|
|
15
|
-
:letters_per_word_average, :words_per_sentence_average, :characters_per_sentence_average,
|
|
16
|
-
:flesch_reading_ease, :flesch_kincaid_grade, :all, :levenshtein_distance_from
|
|
17
|
-
|
|
18
|
-
PROCESSORS = {
|
|
19
|
-
"fr" => Processors::French,
|
|
20
|
-
"en_us" => Processors::AmericanEnglish
|
|
21
|
-
}
|
|
11
|
+
DEFAULT_LANGUAGE = :en_us
|
|
22
12
|
|
|
23
|
-
|
|
13
|
+
PROCESSORS = {
|
|
14
|
+
en_us: Processors::AmericanEnglish,
|
|
15
|
+
fr: Processors::French
|
|
16
|
+
}.freeze
|
|
24
17
|
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
18
|
+
# Build an analyzer for +text+ in the given +language+ (:en_us or :fr).
|
|
19
|
+
# Returns the language-specific processor, which exposes every metric and #to_h.
|
|
20
|
+
def self.new(text, language: DEFAULT_LANGUAGE)
|
|
21
|
+
language = resolve_language(language)
|
|
22
|
+
PROCESSORS.fetch(language).new(text, language: language)
|
|
23
|
+
end
|
|
30
24
|
|
|
31
|
-
|
|
25
|
+
# Raw Levenshtein edit distance between two texts.
|
|
26
|
+
def self.distance(text, other)
|
|
27
|
+
Levenshtein.distance(text, other)
|
|
28
|
+
end
|
|
32
29
|
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
30
|
+
# Levenshtein similarity between two texts, as a 0–100 score (100.0 == identical).
|
|
31
|
+
def self.similarity(text, other)
|
|
32
|
+
Levenshtein.similarity(text, other)
|
|
36
33
|
end
|
|
37
34
|
|
|
38
|
-
|
|
39
|
-
|
|
35
|
+
# Coerce to a known language symbol, or raise a helpful error.
|
|
36
|
+
# Handles nil, strings and symbols without leaking a NoMethodError.
|
|
37
|
+
def self.resolve_language(language)
|
|
38
|
+
resolved = language.to_s.to_sym
|
|
39
|
+
return resolved if PROCESSORS.key?(resolved)
|
|
40
|
+
|
|
41
|
+
raise Error, "Unknown language #{language.inspect}. Available languages: #{PROCESSORS.keys.join(", ")}"
|
|
40
42
|
end
|
|
43
|
+
private_class_method :resolve_language
|
|
41
44
|
end
|
metadata
CHANGED
|
@@ -1,14 +1,13 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: text-metrics
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.0.
|
|
4
|
+
version: 1.0.0.beta2
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Adrien POLY
|
|
8
|
-
autorequire:
|
|
9
8
|
bindir: bin
|
|
10
9
|
cert_chain: []
|
|
11
|
-
date:
|
|
10
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
|
12
11
|
dependencies:
|
|
13
12
|
- !ruby/object:Gem::Dependency
|
|
14
13
|
name: text-hyphen
|
|
@@ -108,8 +107,9 @@ dependencies:
|
|
|
108
107
|
- - ">="
|
|
109
108
|
- !ruby/object:Gem::Version
|
|
110
109
|
version: '0'
|
|
111
|
-
description:
|
|
112
|
-
|
|
110
|
+
description: Text Metrics computes readability scores (Flesch Reading Ease, Flesch-Kincaid
|
|
111
|
+
Grade, SMOG, Coleman-Liau, LIX) along with word, sentence, syllable and character
|
|
112
|
+
statistics, plus Levenshtein distance and similarity. English and French are supported.
|
|
113
113
|
email:
|
|
114
114
|
- adrienpoly@gmail.com
|
|
115
115
|
executables: []
|
|
@@ -119,12 +119,12 @@ files:
|
|
|
119
119
|
- CHANGELOG.md
|
|
120
120
|
- LICENSE.txt
|
|
121
121
|
- README.md
|
|
122
|
+
- UPGRADING.md
|
|
123
|
+
- lib/text-metrics.rb
|
|
122
124
|
- lib/text_metrics.rb
|
|
123
|
-
- lib/text_metrics/
|
|
124
|
-
- lib/text_metrics/
|
|
125
|
-
- lib/text_metrics/
|
|
126
|
-
- lib/text_metrics/dictionnaries/french_word_syllable_exceptions.yml
|
|
127
|
-
- lib/text_metrics/dictionnaries/lexique-383.csv
|
|
125
|
+
- lib/text_metrics/dictionaries/english_word_syllable_database.txt
|
|
126
|
+
- lib/text_metrics/dictionaries/french_word_syllable_exceptions.yml
|
|
127
|
+
- lib/text_metrics/levenshtein.rb
|
|
128
128
|
- lib/text_metrics/processors/american_english.rb
|
|
129
129
|
- lib/text_metrics/processors/base.rb
|
|
130
130
|
- lib/text_metrics/processors/french.rb
|
|
@@ -138,7 +138,6 @@ metadata:
|
|
|
138
138
|
documentation_uri: https://github.com/plume-app/text-metrics
|
|
139
139
|
homepage_uri: https://github.com/plume-app/text-metrics
|
|
140
140
|
source_code_uri: https://github.com/plume-app/text-metrics
|
|
141
|
-
post_install_message:
|
|
142
141
|
rdoc_options: []
|
|
143
142
|
require_paths:
|
|
144
143
|
- lib
|
|
@@ -153,8 +152,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
153
152
|
- !ruby/object:Gem::Version
|
|
154
153
|
version: '0'
|
|
155
154
|
requirements: []
|
|
156
|
-
rubygems_version: 3.
|
|
157
|
-
signing_key:
|
|
155
|
+
rubygems_version: 3.6.9
|
|
158
156
|
specification_version: 4
|
|
159
|
-
summary:
|
|
157
|
+
summary: Readability scores and text statistics for English and French
|
|
160
158
|
test_files: []
|