textstat 0.1.9 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,197 @@
1
module TextStat
  # Basic text statistics calculations
  #
  # Provides the fundamental counts (characters, words, syllables, sentences)
  # and the simple averages derived from them. These statistics form the
  # foundation for more advanced readability calculations.
  #
  # Word cleaning is ASCII-only: NON_ALPHA_REGEX removes every character that
  # is not a-z, A-Z or whitespace, which includes digits and accented letters.
  #
  # @author Jakub Polak
  # @since 1.0.0
  # @example Basic usage
  #   text = "Hello world! This is a test."
  #   TextStat.char_count(text)      # => 23
  #   TextStat.lexicon_count(text)   # => 6
  #   TextStat.syllable_count(text)  # => 6
  #   TextStat.sentence_count(text)  # => 2
  module BasicStats
    # Matches every character that is neither an ASCII letter nor whitespace.
    NON_ALPHA_REGEX = /[^a-zA-Z\s]/.freeze

    # Matches a sentence boundary: terminal punctuation, optional closing
    # quote/backslash/bracket characters, one space or newline, then an ASCII
    # capital letter.
    SENTENCE_BOUNDARY_REGEX = /[.?!]['\\)\]]*[ \n][A-Z]/.freeze

    # Text::Hyphen instances keyed by language code, so a hyphenator is built
    # at most once per language.
    @hyphenator_cache = {}

    class << self
      attr_accessor :hyphenator_cache

      # Get or create a cached Text::Hyphen instance for the specified language
      #
      # @param language [String] language code
      # @return [Text::Hyphen] cached hyphenator instance
      # @private
      def get_hyphenator(language)
        @hyphenator_cache[language] ||= Text::Hyphen.new(language: language, left: 0, right: 0)
      end

      # Clear all cached hyphenators
      #
      # @return [Hash] the emptied cache
      # @private
      def clear_hyphenator_cache
        @hyphenator_cache.clear
      end
    end

    # Count characters in text
    #
    # Only the space character is ignored; tabs and newlines are still counted.
    #
    # @param text [String] the text to analyze
    # @param ignore_spaces [Boolean] whether to leave spaces out of the count
    # @return [Integer] number of characters
    # @example
    #   TextStat.char_count("Hello world!")         # => 11
    #   TextStat.char_count("Hello world!", false)  # => 12
    def char_count(text, ignore_spaces = true)
      text = text.delete(' ') if ignore_spaces
      text.length
    end

    # Count words (lexicons) in text
    #
    # With +remove_punctuation+ enabled, everything except ASCII letters and
    # whitespace is dropped first, so a token made only of digits or symbols
    # does not count as a word.
    #
    # @param text [String] the text to analyze
    # @param remove_punctuation [Boolean] whether to remove punctuation before counting
    # @return [Integer] number of words
    # @example
    #   TextStat.lexicon_count("Hello, world!")         # => 2
    #   TextStat.lexicon_count("Hello, world!", false)  # => 2
    def lexicon_count(text, remove_punctuation = true)
      text = text.gsub(NON_ALPHA_REGEX, '') if remove_punctuation
      # String#split without arguments already collapses whitespace runs.
      text.split.size
    end

    # Count syllables in text using hyphenation
    #
    # Each whitespace-separated token is hyphenated with the cached
    # hyphenator for +language+; a token contributes one syllable plus one
    # per hyphen in the hyphenated form.
    #
    # Punctuation is deliberately NOT stripped here: earlier releases built a
    # cleaned copy of the text but never used it, and results stay compatible
    # with that behavior.
    #
    # @param text [String] the text to analyze
    # @param language [String] language code for hyphenation dictionary
    # @return [Integer] number of syllables
    # @example
    #   TextStat.syllable_count("beautiful")       # => 3
    #   TextStat.syllable_count("hello", "en_us")  # => 2
    #   TextStat.syllable_count("bonjour", "fr")   # => 2
    def syllable_count(text, language = 'en_us')
      return 0 if text.empty?

      hyphenator = BasicStats.get_hyphenator(language)
      text.downcase.split.sum { |word| hyphenator.visualise(word).count('-') + 1 }
    end

    # Count sentences in text
    #
    # Counts the boundaries matched by SENTENCE_BOUNDARY_REGEX (terminal
    # punctuation, a space or newline, then a capital letter) and adds one.
    # The result is therefore never less than 1, even for empty text, and an
    # abbreviation followed by a capitalized word counts as a boundary.
    #
    # @param text [String] the text to analyze
    # @return [Integer] number of sentences
    # @example
    #   TextStat.sentence_count("Hello world! How are you?")     # => 2
    #   TextStat.sentence_count("Dr. Smith went to the U.S.A.")  # => 2
    def sentence_count(text)
      text.scan(SENTENCE_BOUNDARY_REGEX).size + 1
    end

    # Calculate average sentence length
    #
    # No zero guard is needed because #sentence_count is always at least 1.
    #
    # @param text [String] the text to analyze
    # @return [Float] average number of words per sentence, rounded to 1 decimal
    # @example
    #   TextStat.avg_sentence_length("Hello world! How are you?")  # => 2.5
    def avg_sentence_length(text)
      (lexicon_count(text).to_f / sentence_count(text)).round(1)
    end

    # Calculate average syllables per word
    #
    # @param text [String] the text to analyze
    # @param language [String] language code for hyphenation dictionary
    # @return [Float] average number of syllables per word, rounded to
    #   1 decimal; 0.0 when the text contains no words
    # @example
    #   TextStat.avg_syllables_per_word("beautiful morning")  # => 2.5
    def avg_syllables_per_word(text, language = 'en_us')
      syllables = syllable_count(text, language)
      words = lexicon_count(text)
      # Float division by zero yields NaN/Infinity rather than raising, so
      # the empty case has to be handled explicitly.
      return 0.0 if words.zero?

      (syllables.to_f / words).round(1)
    end

    # Calculate average letters per word
    #
    # @param text [String] the text to analyze
    # @return [Float] average number of non-space characters per word,
    #   rounded to 2 decimals; 0.0 when the text contains no words
    # @example
    #   TextStat.avg_letter_per_word("hello world")  # => 5.0
    def avg_letter_per_word(text)
      words = lexicon_count(text)
      return 0.0 if words.zero?

      (char_count(text).to_f / words).round(2)
    end

    # Calculate average sentences per word
    #
    # @param text [String] the text to analyze
    # @return [Float] number of sentences divided by number of words,
    #   rounded to 2 decimals; 0.0 when the text contains no words
    # @example
    #   TextStat.avg_sentence_per_word("Hello world! How are you?")  # => 0.4
    def avg_sentence_per_word(text)
      words = lexicon_count(text)
      return 0.0 if words.zero?

      (sentence_count(text).to_f / words).round(2)
    end

    # Count polysyllabic words (3+ syllables)
    #
    # Unlike #syllable_count, the text is cleaned with NON_ALPHA_REGEX before
    # it is split, so only ASCII letters reach the hyphenator.
    #
    # @param text [String] the text to analyze
    # @param language [String] language code for hyphenation dictionary
    # @return [Integer] number of polysyllabic words
    # @example
    #   TextStat.polysyllab_count("beautiful complicated")  # => 2
    def polysyllab_count(text, language = 'en_us')
      words = text.downcase.gsub(NON_ALPHA_REGEX, '').split
      # Return before touching the hyphenator when there is nothing to count.
      return 0 if words.empty?

      hyphenator = BasicStats.get_hyphenator(language)
      words.count { |word| hyphenator.visualise(word).count('-') + 1 >= 3 }
    end
  end
end
@@ -0,0 +1,168 @@
1
+ require 'set'
2
+
3
module TextStat
  # Dictionary management with in-memory caching
  #
  # This module loads the language-specific "easy word" dictionaries used to
  # identify difficult words. Each dictionary is read from disk once and then
  # served from an in-memory cache until the cache is cleared or the
  # dictionary directory is changed.
  #
  # @author Jakub Polak
  # @since 1.0.0
  # @example Cache behavior
  #   TextStat.difficult_words(text, 'en_us')       # reads en_us.txt from disk
  #   TextStat.difficult_words(text, 'en_us')       # served from the cache
  #   TextStat::DictionaryManager.cache_size        # => 1
  #   TextStat::DictionaryManager.cached_languages  # => ['en_us']
  #
  # @example Multi-language support
  #   TextStat.difficult_words(english_text, 'en_us')
  #   TextStat.difficult_words(spanish_text, 'es')
  #   TextStat.difficult_words(french_text, 'fr')
  #   TextStat::DictionaryManager.cache_size  # => 3
  module DictionaryManager
    # Loaded dictionaries keyed by language code.
    @dictionary_cache = {}
    # Directory holding "<language>.txt" files; resolved lazily by .dictionary_path.
    @dictionary_path = nil

    class << self
      attr_accessor :dictionary_cache

      # Set dictionary path
      #
      # Changing the directory discards all cached dictionaries so that the
      # next lookup reads from the new location instead of serving stale data.
      #
      # @param path [String] path to dictionary directory
      # @return [String] the set path
      def dictionary_path=(path)
        @dictionary_cache.clear unless path == @dictionary_path
        @dictionary_path = path
      end

      # Load dictionary with automatic caching
      #
      # Returns the cached dictionary for +language+ when there is one;
      # otherwise reads "<language>.txt" from the dictionary directory and
      # caches the result. A missing file yields (and caches) an empty set.
      #
      # @param language [String] language code (e.g., 'en_us', 'es', 'fr')
      # @return [Set] set of easy words for the specified language
      # @example
      #   dict = TextStat::DictionaryManager.load_dictionary('en_us')
      #   dict.include?('hello')          # => true
      #   dict.include?('comprehensive')  # => false
      def load_dictionary(language)
        @dictionary_cache[language] ||= read_dictionary(language)
      end

      # Clear all cached dictionaries
      #
      # Removes all dictionaries from the memory cache. Useful for memory
      # management in long-running applications or when switching between
      # different sets of languages.
      #
      # @return [Hash] empty cache hash
      # @example
      #   TextStat::DictionaryManager.cache_size  # => 3
      #   TextStat::DictionaryManager.clear_cache
      #   TextStat::DictionaryManager.cache_size  # => 0
      def clear_cache
        @dictionary_cache.clear
      end

      # Get list of cached languages
      #
      # @return [Array<String>] array of language codes currently in cache
      # @example
      #   TextStat::DictionaryManager.cached_languages  # => ['en_us', 'es', 'fr']
      def cached_languages
        @dictionary_cache.keys
      end

      # Get number of cached dictionaries
      #
      # @return [Integer] number of dictionaries currently in cache
      # @example
      #   TextStat::DictionaryManager.cache_size  # => 3
      def cache_size
        @dictionary_cache.size
      end

      # Get path to dictionary files
      #
      # Defaults to "lib/dictionaries" under TextStat::GEM_PATH until a path
      # is assigned explicitly.
      #
      # @return [String] path to dictionary directory
      # @example
      #   TextStat::DictionaryManager.dictionary_path
      #   # => "/path/to/gem/lib/dictionaries"
      def dictionary_path
        @dictionary_path ||= File.join(TextStat::GEM_PATH, 'lib', 'dictionaries')
      end

      private

      # Read "<language>.txt" line by line into a Set, one word per line.
      # Surrounding whitespace is stripped and blank lines are skipped.
      #
      # @param language [String] language code
      # @return [Set] words found in the file; empty when the file is missing
      def read_dictionary(language)
        dictionary_file = File.join(dictionary_path, "#{language}.txt")
        easy_words = Set.new
        return easy_words unless File.exist?(dictionary_file)

        File.foreach(dictionary_file, chomp: true) do |line|
          word = line.strip
          easy_words << word unless word.empty?
        end
        easy_words
      end
    end

    # Count difficult words in text
    #
    # A word is considered difficult when both conditions hold:
    # 1. It is not in the language's easy words dictionary
    # 2. It has more than one syllable
    #
    # The text is lowercased and stripped of everything except ASCII letters,
    # digits and whitespace before it is split into words. Each distinct
    # difficult word is counted once.
    #
    # @param text [String] the text to analyze
    # @param language [String] language code for dictionary selection
    # @param return_words [Boolean] whether to return the words or their count
    # @return [Integer, Set] number of difficult words or set of difficult words
    # @example Count difficult words
    #   TextStat.difficult_words("This is a comprehensive analysis")  # => 2
    #
    # @example Get list of difficult words
    #   words = TextStat.difficult_words("comprehensive analysis", 'en_us', true)
    #   words.to_a  # => ["comprehensive", "analysis"]
    #
    # @example Multi-language support
    #   TextStat.difficult_words(spanish_text, 'es')  # Spanish dictionary
    #   TextStat.difficult_words(french_text, 'fr')   # French dictionary
    def difficult_words(text, language = 'en_us', return_words = false)
      easy_words = DictionaryManager.load_dictionary(language)

      # Keep all whitespace (not just spaces) so that words separated by a
      # newline or tab are not glued together into one token.
      words = text.downcase.gsub(/[^0-9a-z\s]/i, '').split
      return(return_words ? Set.new : 0) if words.empty?

      hyphenator = BasicStats.get_hyphenator(language)
      difficult = words.each_with_object(Set.new) do |word, found|
        next if easy_words.include?(word)

        # One syllable plus one per hyphen in the hyphenated form.
        syllables = hyphenator.visualise(word).count('-') + 1
        found << word if syllables > 1
      end

      return_words ? difficult : difficult.length
    end
  end
end
@@ -0,0 +1,137 @@
1
+ require 'text-hyphen'
2
+ require_relative 'basic_stats'
3
+ require_relative 'dictionary_manager'
4
+ require_relative 'readability_formulas'
5
+
6
module TextStat
  # Root directory of the TextStat gem
  #
  # Computed by walking three directory levels up from this file. Used
  # internally to locate dictionary files and other gem resources.
  #
  # @return [String] path to gem root directory
  # @example
  #   TextStat::GEM_PATH  # => "/path/to/gems/textstat-1.0.0"
  GEM_PATH = File.dirname(File.dirname(File.dirname(__FILE__)))

  # Main class providing text readability analysis
  #
  # Mixes BasicStats, DictionaryManager and ReadabilityFormulas into a single
  # class so that every analysis method is available on one object.
  #
  # Calls made on the class itself are forwarded to a freshly created
  # instance, which keeps class-level usage working.
  #
  # @author Jakub Polak
  # @since 1.0.0
  # @example Creating an instance
  #   analyzer = TextStat::Main.new
  #   analyzer.flesch_reading_ease("Sample text")  # => 83.32
  #
  # @example Using class methods (backward compatibility)
  #   TextStat::Main.flesch_reading_ease("Sample text")  # => 83.32
  #   TextStat.flesch_reading_ease("Sample text")        # => 83.32
  class Main
    include BasicStats
    include DictionaryManager
    include ReadabilityFormulas

    # Class-level API: delegation to instances plus dictionary helpers
    class << self
      # Forward unknown class-level calls to a new instance
      #
      # A new instance is created for every call. When that instance does
      # not respond to the method, the default lookup takes over.
      #
      # @param method_name [Symbol] the method name being called
      # @param args [Array] method arguments
      # @param kwargs [Hash] keyword arguments
      # @param block [Proc] block if provided
      # @return [Object] result of the method call
      # @private
      def method_missing(method_name, *args, **kwargs, &block)
        delegate = new
        return super unless delegate.respond_to?(method_name)

        delegate.send(method_name, *args, **kwargs, &block)
      end

      # Report whether a class-level call would be delegated
      #
      # @param method_name [Symbol] the method name to check
      # @param include_private [Boolean] whether to include private methods
      # @return [Boolean] true if a new instance responds to the method
      # @private
      def respond_to_missing?(method_name, include_private = false)
        new.respond_to?(method_name, include_private) || super
      end

      # Set dictionary path for all instances
      #
      # Forwards to DictionaryManager, which holds the path.
      #
      # @param path [String] path to dictionary directory
      # @return [String] the set path
      # @example
      #   TextStat::Main.dictionary_path = "/custom/dictionaries"
      def dictionary_path=(path)
        DictionaryManager.dictionary_path = path
      end

      # Get current dictionary path
      #
      # @return [String] current dictionary path
      # @example
      #   TextStat::Main.dictionary_path  # => "/path/to/dictionaries"
      def dictionary_path
        DictionaryManager.dictionary_path
      end

      # Clear all cached dictionaries
      #
      # @return [Hash] empty cache
      # @example
      #   TextStat::Main.clear_dictionary_cache
      def clear_dictionary_cache
        DictionaryManager.clear_cache
      end

      # Load dictionary for specified language
      #
      # @param language [String] language code
      # @return [Set] set of easy words for the language
      # @example
      #   TextStat::Main.load_dictionary('en_us')
      def load_dictionary(language)
        DictionaryManager.load_dictionary(language)
      end
    end
  end
end
112
+
113
# For backward compatibility, expose the analysis methods on the TextStat
# module itself, so that TextStat.method_name works like
# TextStat::Main.method_name.
TextStat.extend(Module.new do
  # Handle method delegation at module level
  #
  # Forwards with public_send rather than send. Ruby also routes calls to
  # private methods made with an explicit receiver (e.g. TextStat.system)
  # through method_missing, and send would then execute the private Kernel
  # method on TextStat::Main. public_send keeps those calls failing with
  # NoMethodError.
  #
  # @param method_name [Symbol] the method name being called
  # @param args [Array] method arguments
  # @param kwargs [Hash] keyword arguments
  # @param block [Proc] block if provided
  # @return [Object] result of the method call
  # @raise [NoMethodError] if TextStat::Main cannot handle the call publicly
  # @private
  def method_missing(method_name, *args, **kwargs, &block)
    TextStat::Main.public_send(method_name, *args, **kwargs, &block)
  end

  # Check if method exists for delegation
  #
  # @param method_name [Symbol] the method name to check
  # @param include_private [Boolean] whether to include private methods
  # @return [Boolean] true if TextStat::Main responds to the method
  # @private
  def respond_to_missing?(method_name, include_private = false)
    TextStat::Main.respond_to?(method_name, include_private) || super
  end
end)