textstat 0.1.9 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/counter.rb +6 -6
- data/lib/dictionaries/da.txt +2000 -0
- data/lib/dictionaries/de.txt +2000 -0
- data/lib/dictionaries/en_uk.txt +2945 -0
- data/lib/dictionaries/es.txt +2000 -0
- data/lib/dictionaries/et.txt +2000 -0
- data/lib/dictionaries/fi.txt +2000 -0
- data/lib/dictionaries/fr.txt +2000 -0
- data/lib/dictionaries/hr.txt +1980 -0
- data/lib/dictionaries/hu.txt +2000 -0
- data/lib/dictionaries/id.txt +2000 -0
- data/lib/dictionaries/is.txt +2000 -0
- data/lib/dictionaries/it.txt +2000 -0
- data/lib/dictionaries/la.txt +2000 -0
- data/lib/dictionaries/no2.txt +2000 -0
- data/lib/dictionaries/pl.txt +2000 -0
- data/lib/dictionaries/pt.txt +2000 -0
- data/lib/dictionaries/ru.txt +2000 -0
- data/lib/dictionaries/sv.txt +2000 -0
- data/lib/textstat/basic_stats.rb +156 -0
- data/lib/textstat/dictionary_manager.rb +156 -0
- data/lib/textstat/main.rb +137 -0
- data/lib/textstat/readability_formulas.rb +363 -0
- data/lib/textstat/version.rb +21 -2
- data/lib/textstat.rb +36 -313
- metadata +217 -21
- data/spec/textstat_spec.rb +0 -197
@@ -0,0 +1,156 @@
|
|
1
|
+
module TextStat
|
2
|
+
# Basic text statistics calculations
|
3
|
+
#
|
4
|
+
# This module provides fundamental text analysis methods such as counting
|
5
|
+
# characters, words, syllables, and sentences. These statistics form the
|
6
|
+
# foundation for more advanced readability calculations.
|
7
|
+
#
|
8
|
+
# @author Jakub Polak
|
9
|
+
# @since 1.0.0
|
10
|
+
# @example Basic usage
|
11
|
+
# text = "Hello world! This is a test."
|
12
|
+
# TextStat.char_count(text) # => 23
|
13
|
+
# TextStat.lexicon_count(text) # => 6
|
14
|
+
# TextStat.syllable_count(text) # => 6
|
15
|
+
# TextStat.sentence_count(text) # => 2
|
16
|
+
module BasicStats
|
17
|
+
# Count characters in text
#
# When +ignore_spaces+ is true, every whitespace character matched by +\s+
# (space, tab, newline, carriage return, form feed, vertical tab) is removed
# before counting, so line breaks in multi-line input are not counted as
# characters.
#
# @param text [String] the text to analyze
# @param ignore_spaces [Boolean] whether to ignore whitespace in counting
# @return [Integer] number of characters
# @example
#   TextStat.char_count("Hello world!")         # => 11
#   TextStat.char_count("Hello world!", false)  # => 12
#   TextStat.char_count("Hello\nworld!")        # => 11
def char_count(text, ignore_spaces = true)
  text = text.gsub(/\s+/, '') if ignore_spaces
  text.length
end
|
29
|
+
|
30
|
+
# Count words (lexicons) in text
#
# With +remove_punctuation+ enabled, every character that is neither a
# letter nor whitespace is deleted before the text is split on whitespace.
# Digits are therefore dropped as well, so a purely numeric token does not
# count as a word. Letters are matched with the POSIX +[[:alpha:]]+ class,
# which covers non-ASCII letters in UTF-8 strings, so accented and
# non-Latin words are preserved instead of being erased.
#
# @param text [String] the text to analyze
# @param remove_punctuation [Boolean] whether to remove punctuation before counting
# @return [Integer] number of words
# @example
#   TextStat.lexicon_count("Hello, world!")         # => 2
#   TextStat.lexicon_count("Hello, world!", false)  # => 2
def lexicon_count(text, remove_punctuation = true)
  text = text.gsub(/[^[:alpha:]\s]/, '') if remove_punctuation
  # String#split without arguments already collapses runs of whitespace.
  text.split.count
end
|
42
|
+
|
43
|
+
# Count syllables in text using hyphenation
#
# The text is lower-cased, stripped of every character that is neither a
# letter nor whitespace, and split into words. Each word is passed to
# Text::Hyphen#visualise; the number of '-' marks in the result plus one is
# taken as that word's syllable count.
#
# Non-letter characters have to be removed first: a stray '-' token would
# otherwise be counted as two syllables, and digits or punctuation-only
# tokens as one each.
#
# @param text [String] the text to analyze
# @param language [String] language code for hyphenation dictionary
# @return [Integer] number of syllables (0 for empty or letter-free text)
# @example
#   TextStat.syllable_count("beautiful")       # => 3
#   TextStat.syllable_count("hello", "en_us")  # => 2
#   TextStat.syllable_count("bonjour", "fr")   # => 2
# @see TextStat::DictionaryManager.supported_languages
def syllable_count(text, language = 'en_us')
  return 0 if text.empty?

  # gsub is non-destructive, so its result must be kept. [[:alpha:]] keeps
  # accented and non-Latin letters needed by the non-English dictionaries.
  words = text.downcase.gsub(/[^[:alpha:]\s]/, '').split
  return 0 if words.empty?

  dictionary = Text::Hyphen.new(language: language, left: 0, right: 0)
  words.sum { |word| dictionary.visualise(word).count('-') + 1 }
end
|
70
|
+
|
71
|
+
# Count sentences in text
#
# A sentence boundary is a terminator (. ? !), optionally followed by
# closing quote/bracket characters (' " \ ) ]), then a single space or
# newline, then an ASCII capital letter. The result is the number of such
# boundaries plus one, so any text -- including an empty string -- counts as
# at least one sentence.
#
# Abbreviations are not recognised: "Dr. Smith" contains a boundary because
# the period is followed by a space and a capital letter.
#
# @param text [String] the text to analyze
# @return [Integer] number of sentences (always >= 1)
# @example
#   TextStat.sentence_count("Hello world! How are you?")      # => 2
#   TextStat.sentence_count('He said "Stop." Then he left.')  # => 2
#   TextStat.sentence_count("Dr. Smith went to the U.S.A.")   # => 2
def sentence_count(text)
  text.scan(/[.?!]['"\\)\]]*[ \n][A-Z]/).count + 1
end
|
84
|
+
|
85
|
+
# Calculate average sentence length
#
# Divides the word count (lexicon_count) by the sentence count
# (sentence_count) and rounds to one decimal place.
#
# Float division never raises ZeroDivisionError (it yields NaN or
# Infinity), so a zero sentence count is checked explicitly instead of
# being rescued.
#
# @param text [String] the text to analyze
# @return [Float] average number of words per sentence, 0.0 if no sentences
# @example
#   TextStat.avg_sentence_length("Hello world! How are you?")  # => 2.5
def avg_sentence_length(text)
  sentences = sentence_count(text)
  return 0.0 if sentences.zero?

  (lexicon_count(text).to_f / sentences).round(1)
end
|
97
|
+
|
98
|
+
# Calculate average syllables per word
#
# Divides syllable_count by lexicon_count and rounds to one decimal place.
#
# The word count is checked up front: dividing a Float by zero yields NaN or
# Infinity rather than raising ZeroDivisionError, so a rescue clause would
# never fire for text without words.
#
# @param text [String] the text to analyze
# @param language [String] language code for hyphenation dictionary
# @return [Float] average number of syllables per word, 0.0 if no words
# @example
#   TextStat.avg_syllables_per_word("beautiful morning")  # => 2.5
def avg_syllables_per_word(text, language = 'en_us')
  words = lexicon_count(text)
  return 0.0 if words.zero?

  (syllable_count(text, language).to_f / words).round(1)
end
|
113
|
+
|
114
|
+
# Calculate average letters per word
#
# Divides char_count (called with its default arguments) by lexicon_count
# and rounds to two decimal places.
#
# The word count is checked explicitly because Float division by zero
# returns NaN/Infinity instead of raising ZeroDivisionError.
#
# @param text [String] the text to analyze
# @return [Float] average number of letters per word, 0.0 if no words
# @example
#   TextStat.avg_letter_per_word("hello world")  # => 5.0
def avg_letter_per_word(text)
  words = lexicon_count(text)
  return 0.0 if words.zero?

  (char_count(text).to_f / words).round(2)
end
|
126
|
+
|
127
|
+
# Calculate average sentences per word
#
# Divides sentence_count by lexicon_count and rounds to two decimal places.
#
# The word count is checked explicitly because Float division by zero
# returns Infinity/NaN instead of raising ZeroDivisionError.
#
# @param text [String] the text to analyze
# @return [Float] average number of sentences per word, 0.0 if no words
# @example
#   TextStat.avg_sentence_per_word("Hello world! How are you?")  # => 0.4
def avg_sentence_per_word(text)
  words = lexicon_count(text)
  return 0.0 if words.zero?

  (sentence_count(text).to_f / words).round(2)
end
|
139
|
+
|
140
|
+
# Count polysyllabic words (3+ syllables)
#
# Splits the text on whitespace and counts the tokens for which
# syllable_count reports three or more syllables.
#
# @param text [String] the text to analyze
# @param language [String] language code for hyphenation dictionary
# @return [Integer] number of polysyllabic words
# @example
#   TextStat.polysyllab_count("beautiful complicated")  # => 2
def polysyllab_count(text, language = 'en_us')
  text.split.count { |word| syllable_count(word, language) >= 3 }
end
|
155
|
+
end
|
156
|
+
end
|
@@ -0,0 +1,156 @@
|
|
1
|
+
require 'set'
|
2
|
+
|
3
|
+
module TextStat
  # Dictionary management with in-memory caching
  #
  # Loads language-specific "easy word" lists from text files (one word per
  # line, "<language>.txt") and keeps each loaded list in a module-level
  # cache so the file is read only once per language. The cache is keyed by
  # language code and is emptied whenever the dictionary path changes.
  #
  # @author Jakub Polak
  # @since 1.0.0
  # @example Caching
  #   TextStat.difficult_words(text, 'en_us')       # reads en_us.txt
  #   TextStat.difficult_words(text, 'en_us')       # served from the cache
  #   TextStat::DictionaryManager.cache_size        # => 1
  #   TextStat::DictionaryManager.cached_languages  # => ['en_us']
  module DictionaryManager
    # Loaded dictionaries, keyed by language code
    @dictionary_cache = {}
    # Directory holding the dictionary files; nil until set or first read
    @dictionary_path = nil

    class << self
      attr_accessor :dictionary_cache

      # Set dictionary path
      #
      # Dictionaries are cached by language code only, so the cache is
      # cleared when the path actually changes; otherwise words loaded from
      # the previous directory would keep being returned.
      #
      # @param path [String] path to dictionary directory
      # @return [String] the set path
      def dictionary_path=(path)
        @dictionary_cache.clear unless path == @dictionary_path
        @dictionary_path = path
      end

      # Load dictionary with automatic caching
      #
      # Reads "<dictionary_path>/<language>.txt" line by line into a Set and
      # stores it in the cache. A missing file results in an empty Set,
      # which is cached as well.
      #
      # @param language [String] language code (e.g., 'en_us', 'es', 'fr')
      # @return [Set] set of easy words for the specified language
      # @example
      #   dict = TextStat::DictionaryManager.load_dictionary('en_us')
      #   dict.include?('hello')  # => true
      # @see #supported_languages
      def load_dictionary(language)
        cached = @dictionary_cache[language]
        return cached if cached

        dictionary_file = File.join(dictionary_path, "#{language}.txt")
        easy_words = Set.new
        if File.exist?(dictionary_file)
          # Stream the file instead of reading it fully into memory first.
          File.foreach(dictionary_file) { |line| easy_words << line.chomp }
        end

        @dictionary_cache[language] = easy_words
      end

      # Clear all cached dictionaries
      #
      # @return [Hash] empty cache hash
      # @example
      #   TextStat::DictionaryManager.clear_cache
      #   TextStat::DictionaryManager.cache_size  # => 0
      def clear_cache
        @dictionary_cache.clear
      end

      # Get list of cached languages
      #
      # @return [Array<String>] array of language codes currently in cache
      # @example
      #   TextStat::DictionaryManager.cached_languages  # => ['en_us', 'es', 'fr']
      def cached_languages
        @dictionary_cache.keys
      end

      # Get number of cached dictionaries
      #
      # @return [Integer] number of dictionaries currently in cache
      # @example
      #   TextStat::DictionaryManager.cache_size  # => 3
      def cache_size
        @dictionary_cache.size
      end

      # Get path to dictionary files
      #
      # Defaults to "<TextStat::GEM_PATH>/lib/dictionaries" when no path has
      # been set.
      #
      # @return [String] path to dictionary directory
      # @example
      #   TextStat::DictionaryManager.dictionary_path
      #   # => "/path/to/gem/lib/dictionaries"
      def dictionary_path
        @dictionary_path ||= File.join(TextStat::GEM_PATH, 'lib', 'dictionaries')
      end
    end

    # Count difficult words in text
    #
    # A word is considered difficult when it is
    # 1. not in the language's easy words dictionary, and
    # 2. reported by syllable_count as having more than one syllable.
    #
    # The text is lower-cased and stripped of everything except letters,
    # digits and whitespace before being split into words. Whitespace other
    # than the plain space is kept so that words separated by a newline or
    # tab are not glued together, and non-ASCII letters are kept so that
    # accented words can match their dictionary entries.
    #
    # Relies on a syllable_count(word, language) method being available on
    # the including object.
    #
    # @param text [String] the text to analyze
    # @param language [String] language code for dictionary selection
    # @param return_words [Boolean] whether to return the words or their count
    # @return [Integer, Set] number of difficult words or set of difficult words
    # @example Count difficult words
    #   TextStat.difficult_words("This is a comprehensive analysis")  # => 2
    # @example Get list of difficult words
    #   words = TextStat.difficult_words("comprehensive analysis", 'en_us', true)
    #   words.to_a  # => ["comprehensive", "analysis"]
    def difficult_words(text, language = 'en_us', return_words = false)
      easy_words = DictionaryManager.load_dictionary(language)

      difficult = Set.new
      text.downcase.gsub(/[^[:alnum:]\s]/, '').split.each do |word|
        next if easy_words.include?(word)

        difficult.add(word) if syllable_count(word, language) > 1
      end

      return_words ? difficult : difficult.length
    end
  end
end
|
@@ -0,0 +1,137 @@
|
|
1
|
+
require 'text-hyphen'
|
2
|
+
require_relative 'basic_stats'
|
3
|
+
require_relative 'dictionary_manager'
|
4
|
+
require_relative 'readability_formulas'
|
5
|
+
|
6
|
+
module TextStat
  # Path to the TextStat gem installation directory
  #
  # Three directory levels above this file (lib/textstat/main.rb), i.e. the
  # gem root. DictionaryManager uses it to build the default dictionary
  # directory. File.expand_path guarantees an absolute path even when this
  # file was loaded through a relative path.
  #
  # @return [String] absolute path to gem root directory
  # @example
  #   TextStat::GEM_PATH  # => "/path/to/gems/textstat-1.0.0"
  GEM_PATH = File.expand_path('../../..', __FILE__)

  # Main class providing text readability analysis
  #
  # Mixes BasicStats, DictionaryManager and ReadabilityFormulas into a single
  # class. Every public instance method can also be called on the class
  # itself: unknown class-level calls are forwarded to a fresh instance.
  #
  # @author Jakub Polak
  # @since 1.0.0
  # @example Creating an instance
  #   analyzer = TextStat::Main.new
  #   analyzer.flesch_reading_ease("Sample text")
  #
  # @example Using class methods (backward compatibility)
  #   TextStat::Main.flesch_reading_ease("Sample text")
  #   TextStat.flesch_reading_ease("Sample text")
  class Main
    include BasicStats
    include DictionaryManager
    include ReadabilityFormulas

    # Legacy class methods for backward compatibility
    class << self
      # Forward unknown class-level calls to a new instance
      #
      # Only methods the instance publicly responds to are forwarded (via
      # public_send); anything else falls through to the default
      # NoMethodError handling.
      #
      # @param method_name [Symbol] the method name being called
      # @param args [Array] method arguments
      # @param kwargs [Hash] keyword arguments
      # @param block [Proc] block if provided
      # @return [Object] result of the method call
      # @private
      def method_missing(method_name, *args, **kwargs, &block)
        instance = new
        return super unless instance.respond_to?(method_name)

        instance.public_send(method_name, *args, **kwargs, &block)
      end

      # Check if method exists for delegation
      #
      # @param method_name [Symbol] the method name to check
      # @param include_private [Boolean] whether to include private methods
      # @return [Boolean] true if method exists
      # @private
      def respond_to_missing?(method_name, include_private = false)
        new.respond_to?(method_name, include_private) || super
      end

      # Set dictionary path (shared by all instances via DictionaryManager)
      #
      # @param path [String] path to dictionary directory
      # @return [String] the set path
      # @example
      #   TextStat::Main.dictionary_path = "/custom/dictionaries"
      def dictionary_path=(path)
        DictionaryManager.dictionary_path = path
      end

      # Get current dictionary path
      #
      # @return [String] current dictionary path
      # @example
      #   TextStat::Main.dictionary_path  # => "/path/to/dictionaries"
      def dictionary_path
        DictionaryManager.dictionary_path
      end

      # Clear all cached dictionaries
      #
      # @return [Hash] empty cache
      # @example
      #   TextStat::Main.clear_dictionary_cache
      def clear_dictionary_cache
        DictionaryManager.clear_cache
      end

      # Load dictionary for specified language
      #
      # @param language [String] language code
      # @return [Set] set of easy words for the language
      # @example
      #   TextStat::Main.load_dictionary('en_us')
      def load_dictionary(language)
        DictionaryManager.load_dictionary(language)
      end
    end
  end
end
|
112
|
+
|
113
|
+
# For backward compatibility, expose TextStat module class methods
# This ensures that TextStat.method_name works like TextStat::Main.method_name
TextStat.extend(Module.new do
  # Handle method delegation at module level
  #
  # Forwards the call to TextStat::Main when Main publicly responds to it.
  # public_send is used on purpose: calling a private method with an
  # explicit receiver (e.g. TextStat.exit or TextStat.system) also lands in
  # method_missing, and a plain send would then invoke Kernel's private
  # methods on Main. Unsupported names fall through to super and raise the
  # usual NoMethodError.
  #
  # @param method_name [Symbol] the method name being called
  # @param args [Array] method arguments
  # @param kwargs [Hash] keyword arguments
  # @param block [Proc] block if provided
  # @return [Object] result of the method call
  # @private
  def method_missing(method_name, *args, **kwargs, &block)
    return super unless TextStat::Main.respond_to?(method_name)

    TextStat::Main.public_send(method_name, *args, **kwargs, &block)
  end

  # Check if method exists for delegation
  #
  # @param method_name [Symbol] the method name to check
  # @param include_private [Boolean] whether to include private methods
  # @return [Boolean] true if method exists
  # @private
  def respond_to_missing?(method_name, include_private = false)
    TextStat::Main.respond_to?(method_name, include_private) || super
  end
end)
|