textstat 0.1.9 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,156 @@
1
module TextStat
  # Basic text statistics calculations
  #
  # Counting primitives (characters, words, syllables, sentences) and the
  # simple averages derived from them. The methods are plain instance methods
  # so the module can be mixed into any object.
  #
  # Syllable-based methods call +Text::Hyphen+, which must already be loaded
  # by the file that includes this module.
  #
  # @author Jakub Polak
  # @since 1.0.0
  # @example Basic usage
  #   text = "Hello world! This is a test."
  #   TextStat.char_count(text)     # => 23
  #   TextStat.lexicon_count(text)  # => 6
  #   TextStat.sentence_count(text) # => 2
  module BasicStats
    # Everything that is neither a letter (in any script) nor whitespace.
    # Unicode-aware so that non-English words are not stripped down to their
    # ASCII letters (or deleted entirely) before being counted.
    NON_LETTER = /[^\p{L}\s]/.freeze

    # Sentence terminator, optional closing quote/bracket, whitespace, then an
    # upper-case ASCII letter that starts the next sentence.
    SENTENCE_BOUNDARY = /[.?!]['"\\)\]]*\s+[A-Z]/.freeze

    private_constant :NON_LETTER, :SENTENCE_BOUNDARY

    # Count characters in text
    #
    # @param text [String] the text to analyze
    # @param ignore_spaces [Boolean] when true, space characters (only ' ',
    #   not tabs or newlines) are excluded from the count
    # @return [Integer] number of characters
    # @example
    #   TextStat.char_count("Hello world!")        # => 11
    #   TextStat.char_count("Hello world!", false) # => 12
    def char_count(text, ignore_spaces = true)
      text = text.delete(' ') if ignore_spaces
      text.length
    end

    # Count words (lexicons) in text
    #
    # @param text [String] the text to analyze
    # @param remove_punctuation [Boolean] when true, every character that is
    #   not a letter or whitespace is removed first, so tokens made only of
    #   digits or punctuation are not counted
    # @return [Integer] number of whitespace-separated words
    # @example
    #   TextStat.lexicon_count("Hello, world!")        # => 2
    #   TextStat.lexicon_count("Hello, world!", false) # => 2
    def lexicon_count(text, remove_punctuation = true)
      text = text.gsub(NON_LETTER, '') if remove_punctuation
      text.split.size
    end

    # Count syllables in text using hyphenation
    #
    # The text is lower-cased, stripped of non-letters and split on
    # whitespace; each word contributes one syllable plus one per hyphenation
    # point reported by +Text::Hyphen#visualise+.
    #
    # @param text [String] the text to analyze
    # @param language [String] language code passed to +Text::Hyphen+
    # @return [Integer] number of syllables (0 when the text has no words)
    # @example Results depend on the hyphenation patterns of the language
    #   TextStat.syllable_count("beautiful")      # => 3
    #   TextStat.syllable_count("bonjour", "fr")  # => 2
    def syllable_count(text, language = 'en_us')
      words = hyphenation_words(text)
      return 0 if words.empty?

      count_syllables(words, build_hyphenator(language))
    end

    # Count sentences in text
    #
    # Counts boundaries of the form: one of .!? , optional closing quote or
    # bracket, whitespace, upper-case ASCII letter - and adds one for the
    # final sentence. Consequently the result is never below 1 (even for an
    # empty string) and abbreviations followed by a capitalised word are
    # counted as boundaries.
    #
    # @param text [String] the text to analyze
    # @return [Integer] number of sentences (always >= 1)
    # @example
    #   TextStat.sentence_count("Hello world! How are you?")     # => 2
    #   TextStat.sentence_count('He said "Go." Then he left.')   # => 2
    #   TextStat.sentence_count("Dr. Smith went to the U.S.A.")  # => 2
    def sentence_count(text)
      text.scan(SENTENCE_BOUNDARY).size + 1
    end

    # Calculate average sentence length
    #
    # @param text [String] the text to analyze
    # @return [Float] words per sentence, rounded to 1 decimal place
    # @example
    #   TextStat.avg_sentence_length("Hello world! How are you?") # => 2.5
    def avg_sentence_length(text)
      rounded_ratio(lexicon_count(text), sentence_count(text), 1)
    end

    # Calculate average syllables per word
    #
    # @param text [String] the text to analyze
    # @param language [String] language code passed to +Text::Hyphen+
    # @return [Float] syllables per word rounded to 1 decimal place, or 0.0
    #   when the text contains no words
    # @example
    #   TextStat.avg_syllables_per_word("beautiful morning") # => 2.5
    def avg_syllables_per_word(text, language = 'en_us')
      rounded_ratio(syllable_count(text, language), lexicon_count(text), 1)
    end

    # Calculate average letters per word
    #
    # @param text [String] the text to analyze
    # @return [Float] non-space characters per word rounded to 2 decimal
    #   places, or 0.0 when the text contains no words
    # @example
    #   TextStat.avg_letter_per_word("hello world") # => 5.0
    def avg_letter_per_word(text)
      rounded_ratio(char_count(text), lexicon_count(text), 2)
    end

    # Calculate average sentences per word
    #
    # @param text [String] the text to analyze
    # @return [Float] sentences per word rounded to 2 decimal places, or 0.0
    #   when the text contains no words
    # @example
    #   TextStat.avg_sentence_per_word("Hello world! How are you?") # => 0.4
    def avg_sentence_per_word(text)
      rounded_ratio(sentence_count(text), lexicon_count(text), 2)
    end

    # Count polysyllabic words (3+ syllables)
    #
    # @param text [String] the text to analyze
    # @param language [String] language code passed to +Text::Hyphen+
    # @return [Integer] number of whitespace-separated tokens with at least
    #   three syllables
    # @example
    #   TextStat.polysyllab_count("beautiful complicated") # => 2
    def polysyllab_count(text, language = 'en_us')
      tokens = text.split
      return 0 if tokens.empty?

      # One hyphenator for the whole text instead of one per word.
      hyphenator = build_hyphenator(language)
      tokens.count { |token| count_syllables(hyphenation_words(token), hyphenator) >= 3 }
    end

    private

    # Divide and round, returning 0.0 for a zero denominator. Float division
    # never raises ZeroDivisionError (it yields NaN or Infinity), so the zero
    # case has to be checked explicitly.
    def rounded_ratio(numerator, denominator, digits)
      return 0.0 if denominator.zero?

      (numerator.to_f / denominator).round(digits)
    end

    # Lower-cased, letters-only words of +text+, ready for hyphenation.
    def hyphenation_words(text)
      text.downcase.gsub(NON_LETTER, '').split
    end

    # Hyphenator that may break anywhere in a word (no left/right minimum).
    def build_hyphenator(language)
      Text::Hyphen.new(language: language, left: 0, right: 0)
    end

    # Total syllables of +words+: hyphenation points plus one, per word.
    def count_syllables(words, hyphenator)
      words.sum { |word| hyphenator.visualise(word).count('-') + 1 }
    end
  end
end
@@ -0,0 +1,156 @@
1
+ require 'set'
2
+
3
module TextStat
  # Dictionary management with in-memory caching
  #
  # Loads language-specific "easy word" lists from
  # <dictionary_path>/<language>.txt (one word per line) and keeps each list
  # in a module-level cache so the file is read at most once per language.
  #
  # The module has two faces:
  # * module-level methods (+load_dictionary+, +clear_cache+, ...) that own
  #   the cache, and
  # * the mixin method +difficult_words+, which expects the including object
  #   to also provide +syllable_count(word, language)+.
  #
  # @author Jakub Polak
  # @since 1.0.0
  # @example Inspecting the cache
  #   TextStat::DictionaryManager.load_dictionary('en_us')
  #   TextStat::DictionaryManager.cache_size       # => 1
  #   TextStat::DictionaryManager.cached_languages # => ['en_us']
  module DictionaryManager
    # language code => Set of easy words
    @dictionary_cache = {}
    # Lazily resolved by .dictionary_path when nil
    @dictionary_path = nil

    class << self
      attr_accessor :dictionary_cache

      # Set dictionary path
      #
      # Changing the directory discards every cached dictionary, because the
      # cache is keyed by language only and would otherwise keep serving
      # word lists read from the previous directory.
      #
      # @param path [String] path to dictionary directory
      # @return [String] the set path
      def dictionary_path=(path)
        clear_cache unless path == @dictionary_path
        @dictionary_path = path
      end

      # Load dictionary with automatic caching
      #
      # Returns the cached set when the language was loaded before; otherwise
      # reads the dictionary file and caches the result. A missing file is
      # not an error: it yields (and caches) an empty set.
      #
      # @param language [String] language code (e.g., 'en_us', 'es', 'fr')
      # @return [Set] set of easy words for the specified language
      # @example
      #   dict = TextStat::DictionaryManager.load_dictionary('en_us')
      #   dict.include?('hello') # => true when 'hello' is listed in en_us.txt
      def load_dictionary(language)
        @dictionary_cache[language] ||= read_dictionary(language)
      end

      # Clear all cached dictionaries
      #
      # @return [Hash] the (now empty) cache hash
      # @example
      #   TextStat::DictionaryManager.clear_cache
      #   TextStat::DictionaryManager.cache_size # => 0
      def clear_cache
        @dictionary_cache.clear
      end

      # Get list of cached languages
      #
      # @return [Array<String>] language codes currently in cache
      # @example
      #   TextStat::DictionaryManager.cached_languages # => ['en_us', 'es', 'fr']
      def cached_languages
        @dictionary_cache.keys
      end

      # Get number of cached dictionaries
      #
      # @return [Integer] number of dictionaries currently in cache
      def cache_size
        @dictionary_cache.size
      end

      # Get path to dictionary files
      #
      # Defaults to <TextStat::GEM_PATH>/lib/dictionaries until a custom path
      # is assigned.
      #
      # @return [String] path to dictionary directory
      def dictionary_path
        @dictionary_path ||= File.join(TextStat::GEM_PATH, 'lib', 'dictionaries')
      end

      private

      # Read <dictionary_path>/<language>.txt into a Set, one entry per line
      # with the line terminator removed. Returns an empty Set when the file
      # does not exist.
      def read_dictionary(language)
        dictionary_file = File.join(dictionary_path, "#{language}.txt")
        return Set.new unless File.exist?(dictionary_file)

        Set.new(File.foreach(dictionary_file, chomp: true))
      end
    end

    # Count difficult words in text
    #
    # A word is difficult when it is not in the language's easy-word
    # dictionary and +syllable_count+ reports more than one syllable for it.
    # Words are compared lower-cased with everything except letters and
    # digits removed; any whitespace (including newlines and tabs) separates
    # words. Each distinct word is counted once.
    #
    # @param text [String] the text to analyze
    # @param language [String] language code for dictionary selection
    # @param return_words [Boolean] whether to return the words or their count
    # @return [Integer, Set] number of difficult words or set of difficult words
    # @example Get the difficult words
    #   words = TextStat.difficult_words("comprehensive analysis", 'en_us', true)
    #   words.to_a # => ["comprehensive", "analysis"] if neither is in en_us.txt
    def difficult_words(text, language = 'en_us', return_words = false)
      easy_words = DictionaryManager.load_dictionary(language)

      # \s (not just ' ') must survive the filter, otherwise words on
      # adjacent lines are glued together; \p{L} keeps accented letters so
      # non-English words still match their dictionary entries.
      candidates = text.downcase.gsub(/[^0-9\p{L}\s]/, '').split.uniq

      hard_words = candidates.reject { |word| easy_words.include?(word) }
                             .select { |word| syllable_count(word, language) > 1 }
      diff_words_set = Set.new(hard_words)

      return_words ? diff_words_set : diff_words_set.length
    end
  end
end
@@ -0,0 +1,137 @@
1
+ require 'text-hyphen'
2
+ require_relative 'basic_stats'
3
+ require_relative 'dictionary_manager'
4
+ require_relative 'readability_formulas'
5
+
6
module TextStat
  # Path to the TextStat gem installation directory
  #
  # Three levels above this file. Expanded to an absolute path so that
  # resources located through it stay reachable even if this file was loaded
  # through a relative path or the process later changes its working
  # directory.
  #
  # @return [String] absolute path to gem root directory
  # @example
  #   TextStat::GEM_PATH # => "/path/to/gems/textstat-1.0.0"
  GEM_PATH = File.expand_path('../../..', __FILE__)

  # Main class providing text readability analysis
  #
  # Mixes BasicStats, DictionaryManager and ReadabilityFormulas into a single
  # class. Every public instance method can also be called on the class
  # itself: unknown class-level calls are forwarded to a fresh instance.
  #
  # @author Jakub Polak
  # @since 1.0.0
  # @example Instance and class-level use
  #   TextStat::Main.new.char_count("Hello world!") # => 11
  #   TextStat::Main.char_count("Hello world!")     # => 11
  class Main
    include BasicStats
    include DictionaryManager
    include ReadabilityFormulas

    # Legacy class methods for backward compatibility
    class << self
      # Forward class-level calls to a new instance
      #
      # Only public instance methods are forwarded; anything else falls
      # through to the default NoMethodError. The lookup is done on the
      # class, so no instance is allocated for names that cannot be served.
      #
      # @param method_name [Symbol] the method name being called
      # @param args [Array] method arguments
      # @param kwargs [Hash] keyword arguments
      # @param block [Proc] block if provided
      # @return [Object] result of the method call
      # @private
      def method_missing(method_name, *args, **kwargs, &block)
        return super unless public_method_defined?(method_name)

        new.public_send(method_name, *args, **kwargs, &block)
      end

      # Report the instance methods reachable through class-level delegation
      #
      # @param method_name [Symbol] the method name to check
      # @param include_private [Boolean] whether to include private methods
      # @return [Boolean] true if method exists
      # @private
      def respond_to_missing?(method_name, include_private = false)
        delegable =
          if include_private
            method_defined?(method_name) || private_method_defined?(method_name)
          else
            public_method_defined?(method_name)
          end
        delegable || super
      end

      # Set dictionary path for all instances
      #
      # @param path [String] path to dictionary directory
      # @return [String] the set path
      # @example
      #   TextStat::Main.dictionary_path = "/custom/dictionaries"
      def dictionary_path=(path)
        DictionaryManager.dictionary_path = path
      end

      # Get current dictionary path
      #
      # @return [String] current dictionary path
      def dictionary_path
        DictionaryManager.dictionary_path
      end

      # Clear all cached dictionaries
      #
      # @return [Hash] empty cache
      def clear_dictionary_cache
        DictionaryManager.clear_cache
      end

      # Load dictionary for specified language
      #
      # @param language [String] language code
      # @return [Set] set of easy words for the language
      # @example
      #   TextStat::Main.load_dictionary('en_us')
      def load_dictionary(language)
        DictionaryManager.load_dictionary(language)
      end
    end
  end
end
112
+
113
# For backward compatibility, expose TextStat module class methods
# This ensures that TextStat.method_name works like TextStat::Main.method_name
TextStat.extend(Module.new do
  # Handle method delegation at module level
  #
  # Forwards only what TextStat::Main answers publicly. Ruby also routes
  # calls to *private* methods made with an explicit receiver (for example
  # TextStat.system or TextStat.exit) through method_missing, so forwarding
  # with +send+ would expose Main's private Kernel methods; +public_send+
  # behind a +respond_to?+ guard keeps them unreachable and lets unknown
  # names raise the usual NoMethodError.
  #
  # @param method_name [Symbol] the method name being called
  # @param args [Array] method arguments
  # @param kwargs [Hash] keyword arguments
  # @param block [Proc] block if provided
  # @return [Object] result of the method call
  # @private
  def method_missing(method_name, *args, **kwargs, &block)
    return super unless TextStat::Main.respond_to?(method_name)

    TextStat::Main.public_send(method_name, *args, **kwargs, &block)
  end

  # Check if method exists for delegation
  #
  # Mirrors method_missing: only names TextStat::Main responds to publicly
  # are reported as delegable.
  #
  # @param method_name [Symbol] the method name to check
  # @param include_private [Boolean] forwarded to the default implementation
  # @return [Boolean] true if method exists
  # @private
  def respond_to_missing?(method_name, include_private = false)
    TextStat::Main.respond_to?(method_name) || super
  end
end)