text-metrics 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,35 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "text_metrics/processors/base"
4
+
5
module TextMetrics
  module Processors
    # Readability processor for American English ("en_us") text.
    #
    # Provides the two Flesch formulas with the coefficients used for English
    # and counts syllables through the Text::Hyphen hyphenation dictionary.
    class AmericanEnglish < TextMetrics::Processors::Base
      # Flesch reading-ease score.
      #
      # @return [Float] score rounded to 2 decimals and clamped to 0.0..100.0
      def flesch_reading_ease
        words_per_sentence = words_per_sentence_average
        syllables = syllables_per_word_average
        score = 206.835 - 1.015 * words_per_sentence - 84.6 * syllables

        score.round(2).clamp(0.0, 100.0)
      end

      # Flesch-Kincaid grade level.
      #
      # @return [Float] grade rounded to 1 decimal and clamped to 0.0..18.0
      def flesch_kincaid_grade
        words_per_sentence = words_per_sentence_average
        syllables = syllables_per_word_average
        grade = 0.39 * words_per_sentence + 11.8 * syllables - 15.59

        grade.round(1).clamp(0.0, 18.0)
      end

      private

      # Counts syllables as the number of hyphenation points reported by the
      # dictionary, plus one.
      #
      # @param word [String]
      # @return [Integer]
      def count_syllables_in_word(word)
        hyphenated = hyphen_dictionary.visualise(word)
        hyphenated.count("-") + 1
      end

      # Lazily built hyphenator, created once per processor instance.
      def hyphen_dictionary
        return @hyphen_dictionary if @hyphen_dictionary

        @hyphen_dictionary = Text::Hyphen.new(language: "en_us", left: 0, right: 0)
      end
    end
  end
end
@@ -0,0 +1,212 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "text/hyphen"
4
+
5
module TextMetrics
  module Processors
    # Language-independent text statistics and readability scores.
    #
    # Subclasses are expected to provide a +count_syllables_in_word(word)+
    # method (used by every syllable-based metric) and to override
    # +flesch_reading_ease+ and +flesch_kincaid_grade+, which raise
    # NotImplementedError here.
    class Base
      attr_reader :text, :with_syllable_exceptions

      # @param text [String, nil] text to analyse; nil becomes "" and runs of
      #   spaces are collapsed to a single space
      # @param with_syllable_exceptions [Boolean] stored and exposed through
      #   the reader for subclasses; not read by Base itself
      def initialize(text:, with_syllable_exceptions: true)
        @text = text&.squeeze(" ") || ""
        @with_syllable_exceptions = with_syllable_exceptions
      end

      # Every metric in a single hash (memoized).
      #
      # @return [Hash{Symbol => Numeric}]
      def all
        @all ||= {
          words_count: words_count,
          characters_count: characters_count,
          sentences_count: sentences_count,
          syllables_count: syllables_count,
          punctuations_count: punctuations_count,
          syllables_per_word_average: syllables_per_word_average,
          letters_per_word_average: letters_per_word_average,
          words_per_sentence_average: words_per_sentence_average,
          words_per_punctuations_average: words_per_punctuations_average,
          characters_per_sentence_average: characters_per_sentence_average,
          punctuations_per_sentence_average: punctuations_per_sentence_average,
          flesch_reading_ease: flesch_reading_ease,
          flesch_kincaid_grade: flesch_kincaid_grade,
          lix: lix,
          smog_index: smog_index,
          coleman_liau_index: coleman_liau_index
        }
      end

      # _count methods

      # @param ignore_spaces [Boolean] when true, space characters are not counted
      # @return [Integer] number of characters in the (space-squeezed) text
      def characters_count(ignore_spaces: true)
        ignore_spaces ? text.delete(" ").length : text.length
      end

      # @return [Integer] number of word tokens (see #words)
      def words_count
        words.size
      end

      # @return [Integer] 0 when there are no words; otherwise at least 1, even
      #   if no sentence terminator was found
      def sentences_count
        return 0 if words_count.zero?

        [1, sentences.size].max
      end

      # @return [Integer] total syllables over all words
      def syllables_count
        words.sum { |word| count_syllables_in_word(word) }
      end

      # Number of words with three or more syllables.
      # The name is misspelled but kept because it is part of the public API;
      # see #poly_syllables_count for the correctly spelled equivalent.
      #
      # @return [Integer]
      def poly_syllabes_count
        words.count { |word| count_syllables_in_word(word) >= 3 }
      end

      # Correctly spelled companion of #poly_syllabes_count. It delegates
      # rather than aliases so that a subclass overriding the original name
      # is still honoured.
      #
      # @return [Integer]
      def poly_syllables_count
        poly_syllabes_count
      end

      # @return [Integer] number of punctuation marks (see #punctuations)
      def punctuations_count
        punctuations.size
      end

      # _average methods

      # @return [Float] syllables per word, rounded to 1 decimal (0.0 when empty)
      def syllables_per_word_average
        return 0.0 if words_count.zero? || syllables_count.zero?

        (syllables_count.to_f / words_count).round(1)
      end

      # @return [Float] characters (spaces excluded) per word, rounded to 2 decimals
      def letters_per_word_average
        return 0.0 if words_count.zero? || characters_count.zero?

        (characters_count.to_f / words_count).round(2)
      end

      # @return [Float] words per sentence, rounded to 2 decimals
      def words_per_sentence_average
        return 0.0 if words_count.zero? || sentences_count.zero?

        (words_count.to_f / sentences_count).round(2)
      end

      # @return [Float] characters (spaces excluded) per sentence, rounded to 2 decimals
      def characters_per_sentence_average
        return 0.0 if characters_count.zero? || sentences_count.zero?

        (characters_count.to_f / sentences_count).round(2)
      end

      # @return [Float] punctuation marks per sentence, rounded to 2 decimals
      def punctuations_per_sentence_average
        return 0.0 if punctuations_count.zero? || sentences_count.zero?

        (punctuations_count.to_f / sentences_count).round(2)
      end

      # @return [Float] words per punctuation mark, rounded to 2 decimals
      def words_per_punctuations_average
        return 0.0 if words_count.zero? || punctuations_count.zero?

        (words_count.to_f / punctuations_count).round(2)
      end

      # readability scores

      # @raise [NotImplementedError] always; language processors override this
      def flesch_reading_ease
        raise NotImplementedError
      end

      # @raise [NotImplementedError] always; language processors override this
      def flesch_kincaid_grade
        raise NotImplementedError
      end

      # SMOG index. Returns 0.0 when the text has fewer than three sentences.
      #
      # @return [Float] rounded to 1 decimal
      def smog_index
        return 0.0 if sentences_count < 3

        # Float arithmetic with a divisor >= 3: no ZeroDivisionError is possible.
        smog = 1.043 * Math.sqrt(30.0 * poly_syllabes_count / sentences_count) + 3.1291
        smog.round(1)
      end

      # Coleman-Liau index.
      #
      # @return [Float] rounded to 2 decimals and clamped to 0.0..20.0
      def coleman_liau_index
        return 0.0 if words_per_sentence_average.zero? || letters_per_word_average.zero?

        letters = (letters_per_word_average * 100).round(2)
        sentences = (1.to_f / words_per_sentence_average * 100).round(2)
        coleman = 0.0588 * letters - 0.296 * sentences - 15.8
        coleman.round(2).clamp(0.0, 20.0)
      end

      # LIX score: average sentence length plus the percentage of words longer
      # than six characters.
      #
      # @return [Float] rounded to 2 decimals and clamped to 0.0..100.0
      def lix
        return 0.0 if words_count.zero?

        long_words = words.count { |word| word.length > 6 }

        per_long_words = 100.0 * long_words / words_count
        lix = words_per_sentence_average + per_long_words

        lix.round(2).clamp(0.0, 100.0)
      end

      # similarity

      # Levenshtein comparison between this text and +other_text+.
      #
      # +other_text+ receives the same normalisation as the constructor
      # argument (nil becomes "", runs of spaces are squeezed), so a text is
      # always 100% similar to itself.
      #
      # @param other_text [String, nil]
      # @param normalize [Boolean] when false, return the raw edit distance
      # @return [Integer, Float] the edit distance, or a similarity score in
      #   0.0..100.0 rounded to 2 decimals when +normalize+ is true
      def levenshtein_distance_from(other_text, normalize: true)
        other = other_text.to_s.squeeze(" ")
        distance = levenshtein_distance(text, other)
        return distance unless normalize

        # Normalize to a score out of 100
        max_length = [text.length, other.length].max
        return 100.0 if max_length.zero?

        ((max_length - distance).to_f / max_length * 100).round(2)
      end

      # tokenizers

      # @return [Array<String>] every ".", ",", "!", "?", ";" or ":" in the text
      def punctuations
        @punctuations ||= text.scan(/[.,!?;:]/)
      end

      # @return [Array<String>] downcased word tokens
      def words
        @words ||= begin
          normalized_text = text.downcase.strip

          # Split the sentence into words, including hyphenated words, and excluding numbers
          normalized_text.scan(/\b[A-Za-zÀ-ÖØ-öø-ÿ'-]+\b/)
        end
      end

      # Sentence boundaries, returned as zero-width matches: one empty string
      # per position that follows ".", "?" or "!" and precedes whitespace or an
      # end of line. The two negative lookbehinds skip terminators that look
      # like abbreviations such as "e.g." or "Mr.".
      #
      # @return [Array<String>] only the size of the array is meaningful
      def sentences
        @sentences ||= text.scan(/(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|!)(?=\s|$)/)
      end

      private

      # Levenshtein edit distance computed with two rolling rows, so memory is
      # proportional to the length of +s2+ rather than to the full matrix.
      #
      # @param s1 [String]
      # @param s2 [String]
      # @return [Integer]
      def levenshtein_distance(s1, s2)
        # Return if one of the strings is empty
        return s2.length if s1.empty?
        return s1.length if s2.empty?

        target = s2.chars
        # Row 0 of the matrix: turning "" into each prefix of s2.
        previous = (0..target.size).to_a

        s1.each_char.with_index(1) do |source_char, i|
          # Column 0: turning the first i chars of s1 into "".
          current = [i]

          target.each_with_index do |target_char, j|
            cost = (source_char == target_char) ? 0 : 1
            current << [
              previous[j + 1] + 1, # Deletion
              current[j] + 1,      # Insertion
              previous[j] + cost   # Substitution
            ].min
          end

          previous = current
        end

        previous.last
      end
    end
  end
end
@@ -0,0 +1,55 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "text_metrics/processors/base"
4
+ require "yaml"
5
+
6
module TextMetrics
  module Processors
    # Readability processor for French ("fr") text.
    #
    # Provides the two Flesch formulas with French coefficients and a
    # heuristic syllable counter backed by a YAML dictionary of exceptions.
    class French < TextMetrics::Processors::Base
      GEM_PATH = File.dirname(__FILE__, 2).freeze
      SYLLABLE_EXCEPTIONS_PATH = File.join(GEM_PATH, "dictionnaries/french_word_syllable_exceptions.yml").freeze
      # Mapping of word => syllable count, loaded once when the class is defined.
      SYLLABLE_EXCEPTIONS = YAML.load_file(SYLLABLE_EXCEPTIONS_PATH).freeze

      # Vowels, accented forms included, used to delimit syllables.
      VOWELS = "aàâeéèêëiîïoôöuùûüy"
      # One maximal run of vowels; each run counts as one syllable.
      VOWEL_GROUP = /[#{VOWELS}]+/
      # Everything that is not a letter handled by the heuristic.
      NON_LETTERS = /[^a-zàâäéèêëîïôöùûüç]/
      # Trailing 'e', 'es' or 'ent', treated as silent by the heuristic.
      SILENT_ENDING = /(e|es|ent)$/
      # Non-vowel followed by a final 'r' (e.g. 'arbre' once its 'e' is stripped).
      CONSONANT_R_ENDING = /[^aeiouy]r$/
      private_constant :VOWELS, :VOWEL_GROUP, :NON_LETTERS, :SILENT_ENDING, :CONSONANT_R_ENDING

      # Flesch reading-ease score with French coefficients.
      #
      # @return [Float] score rounded to 2 decimals and clamped to 0.0..100.0
      def flesch_reading_ease
        sentence_length = words_per_sentence_average
        syllables_per_word = syllables_per_word_average
        flesch = 206.835 - 1.015 * sentence_length - 73.6 * syllables_per_word

        flesch.round(2).clamp(0.0, 100.0)
      end

      # Flesch-Kincaid grade level with French coefficients.
      #
      # @return [Float] grade rounded to 1 decimal and clamped to 0.0..18.0
      def flesch_kincaid_grade
        sentence_length = words_per_sentence_average
        syllables_per_word = syllables_per_word_average
        flesch = (0.55 * sentence_length) + (11.76 * syllables_per_word) - 15.79

        flesch.round(1).clamp(0.0, 18.0)
      end

      private

      # Counts the syllables of one word.
      #
      # When +with_syllable_exceptions+ is enabled and the word is listed in
      # SYLLABLE_EXCEPTIONS, the dictionary value wins. Otherwise the count is
      # the number of vowel groups left after the heuristics below, and never
      # less than 1: short words such as "le" or "de" lose their only vowel
      # to the silent-ending rule and would otherwise count zero syllables.
      #
      # @param word [String]
      # @return [Integer]
      def count_syllables_in_word(word)
        return SYLLABLE_EXCEPTIONS[word].to_i if with_syllable_exceptions && SYLLABLE_EXCEPTIONS.key?(word)

        letters = word.downcase.gsub(NON_LETTERS, "")

        # Remove a final 'e', 'es' or 'ent' unless the word is a single letter
        letters = letters.sub(SILENT_ENDING, "") unless letters.size == 1

        # Handle special case for words ending with a consonant followed by 'r' (e.g., 'arbre'):
        # re-append an 'e' so the final consonant cluster counts as a syllable.
        letters += "e" if CONSONANT_R_ENDING.match?(letters)

        # Every word token is worth at least one syllable.
        [letters.scan(VOWEL_GROUP).size, 1].max
      end
    end
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
+ module TextMetrics # :nodoc:
4
# Release version of the text-metrics gem.
+ VERSION = "0.0.1"
5
+ end
@@ -0,0 +1,41 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "text_metrics/version"
4
+ require "text_metrics/processors/french"
5
+ require "text_metrics/processors/american_english"
6
+ require "forwardable"
7
+
8
module TextMetrics
  # Base error for the gem; raised when an unsupported language is requested.
  class Error < StandardError; end

  # Facade that selects the processor for a language and forwards the metric
  # methods to it.
  class TextMetrics
    extend Forwardable

    # Supported language codes and the processor class handling each of them.
    PROCESSORS = {
      "fr" => Processors::French,
      "en_us" => Processors::AmericanEnglish
    }.freeze

    def_delegators :text_metrics_processor,
      :all,
      :words_count, :characters_count, :syllables_count, :sentences_count,
      :punctuations_count, :poly_syllabes_count,
      :words_per_sentence_average, :syllables_per_word_average,
      :letters_per_word_average, :characters_per_sentence_average,
      :punctuations_per_sentence_average, :words_per_punctuations_average,
      :flesch_reading_ease, :flesch_kincaid_grade,
      :lix, :smog_index, :coleman_liau_index,
      :levenshtein_distance_from

    attr_reader :text, :language, :text_metrics_processor

    # @param text [String, nil] text to analyse
    # @param language [String, Symbol] one of the PROCESSORS keys
    # @raise [TextMetrics::Error] when the language has no processor
    def initialize(text:, language: "en_us")
      @text = text
      @language = language
      @text_metrics_processor = processor_for(language).new(text: text)
    end

    private

    # Looks up the processor class for +language+.
    #
    # @param language [String, Symbol]
    # @return [Class] the processor class
    # @raise [TextMetrics::Error] when the language is unknown
    def processor_for(language)
      PROCESSORS.fetch(language.to_s) do
        raise Error, "Unknown language: #{language}, available languages: #{PROCESSORS.keys}"
      end
    end
  end

  # Convenience constructor: TextMetrics.new(text: ..., language: ...)
  # builds a TextMetrics::TextMetrics instance.
  def self.new(text:, language: "en_us")
    TextMetrics.new(text: text, language: language)
  end
end
metadata ADDED
@@ -0,0 +1,160 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: text-metrics
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Adrien POLY
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2026-06-18 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: text-hyphen
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: 1.5.0
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: 1.5.0
27
+ - !ruby/object:Gem::Dependency
28
+ name: bundler
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '1.15'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '1.15'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '13.0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '13.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: minitest
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '5.0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '5.0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: guard
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: guard-minitest
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: byebug
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ description: A Ruby gem to compute various metrics for text, currently focusing on
112
+ English and French
113
+ email:
114
+ - adrienpoly@gmail.com
115
+ executables: []
116
+ extensions: []
117
+ extra_rdoc_files: []
118
+ files:
119
+ - CHANGELOG.md
120
+ - LICENSE.txt
121
+ - README.md
122
+ - lib/text_metrics.rb
123
+ - lib/text_metrics/dictionnaries/en_us.txt
124
+ - lib/text_metrics/dictionnaries/fr.txt
125
+ - lib/text_metrics/dictionnaries/french_word_syllable_database.yml
126
+ - lib/text_metrics/dictionnaries/french_word_syllable_exceptions.yml
127
+ - lib/text_metrics/dictionnaries/lexique-383.csv
128
+ - lib/text_metrics/processors/american_english.rb
129
+ - lib/text_metrics/processors/base.rb
130
+ - lib/text_metrics/processors/french.rb
131
+ - lib/text_metrics/version.rb
132
+ homepage: https://github.com/plume-app/text-metrics
133
+ licenses:
134
+ - MIT
135
+ metadata:
136
+ bug_tracker_uri: https://github.com/plume-app/text-metrics/issues
137
+ changelog_uri: https://github.com/plume-app/text-metrics/blob/main/CHANGELOG.md
138
+ documentation_uri: https://github.com/plume-app/text-metrics
139
+ homepage_uri: https://github.com/plume-app/text-metrics
140
+ source_code_uri: https://github.com/plume-app/text-metrics
141
+ post_install_message:
142
+ rdoc_options: []
143
+ require_paths:
144
+ - lib
145
+ required_ruby_version: !ruby/object:Gem::Requirement
146
+ requirements:
147
+ - - ">="
148
+ - !ruby/object:Gem::Version
149
+ version: '3.1'
150
+ required_rubygems_version: !ruby/object:Gem::Requirement
151
+ requirements:
152
+ - - ">="
153
+ - !ruby/object:Gem::Version
154
+ version: '0'
155
+ requirements: []
156
+ rubygems_version: 3.4.19
157
+ signing_key:
158
+ specification_version: 4
159
+ summary: A Ruby gem to compute various metrics for text
160
+ test_files: []