string_to_number 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/ci.yml +81 -0
- data/.github/workflows/release.yml +62 -0
- data/.rubocop.yml +110 -0
- data/CLAUDE.md +23 -85
- data/Gemfile +9 -0
- data/Gemfile.lock +32 -1
- data/README.md +53 -163
- data/Rakefile +5 -1
- data/SECURITY.md +25 -0
- data/benchmark.rb +41 -40
- data/docs/ARCHITECTURE.md +131 -0
- data/docs/demo.gif +0 -0
- data/lib/string_to_number/parser.rb +49 -79
- data/lib/string_to_number/to_number.rb +21 -22
- data/lib/string_to_number/version.rb +3 -1
- data/lib/string_to_number.rb +9 -7
- data/microbenchmark.rb +81 -80
- data/performance_comparison.rb +34 -35
- data/profile.rb +44 -45
- data/string_to_number.gemspec +5 -6
- metadata +15 -51
- data/.travis.yml +0 -5
- /data/{LICENSE.txt → LICENSE} +0 -0
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
module StringToNumber
|
|
4
4
|
# High-performance French text to number parser
|
|
5
|
-
#
|
|
5
|
+
#
|
|
6
6
|
# This class provides a clean, optimized implementation that maintains
|
|
7
7
|
# compatibility with the original algorithm while adding significant
|
|
8
8
|
# performance improvements through caching and memoization.
|
|
@@ -21,21 +21,21 @@ module StringToNumber
|
|
|
21
21
|
MULTIPLIERS = StringToNumber::ToNumber::POWERS_OF_TEN.freeze
|
|
22
22
|
|
|
23
23
|
# Pre-compiled regex patterns for optimal performance
|
|
24
|
-
MULTIPLIER_KEYS = MULTIPLIERS.keys
|
|
24
|
+
MULTIPLIER_KEYS = MULTIPLIERS.keys
|
|
25
|
+
.reject { |k| %w[un dix].include?(k) }
|
|
25
26
|
.sort_by(&:length).reverse.freeze
|
|
26
|
-
MULTIPLIER_PATTERN = /(?<f>.*?)\s?(?<m>#{MULTIPLIER_KEYS.join('|')})
|
|
27
|
-
QUATRE_VINGT_PATTERN = /(quatre
|
|
28
|
-
|
|
27
|
+
MULTIPLIER_PATTERN = /(?<f>.*?)\s?(?<m>#{MULTIPLIER_KEYS.join('|')})/.freeze
|
|
28
|
+
QUATRE_VINGT_PATTERN = /(?<base>quatre[-\s]vingt(?:s?)(?:[-\s]dix)?)(?:[-\s]?)(?<suffix>\w*)/.freeze
|
|
29
|
+
|
|
29
30
|
# Cache configuration
|
|
30
31
|
MAX_CACHE_SIZE = 1000
|
|
31
32
|
private_constant :MAX_CACHE_SIZE
|
|
32
33
|
|
|
33
|
-
# Thread-safe
|
|
34
|
-
@
|
|
35
|
-
@
|
|
36
|
-
@
|
|
34
|
+
# Thread-safe LRU cache using Hash insertion order (Ruby 1.9+)
|
|
35
|
+
@cache = {}
|
|
36
|
+
@cache_hits = 0
|
|
37
|
+
@cache_lookups = 0
|
|
37
38
|
@cache_mutex = Mutex.new
|
|
38
|
-
@instance_mutex = Mutex.new
|
|
39
39
|
|
|
40
40
|
class << self
|
|
41
41
|
# Convert French text to number using cached parser instance
|
|
@@ -45,32 +45,38 @@ module StringToNumber
|
|
|
45
45
|
# @raise [ArgumentError] if text is not a string
|
|
46
46
|
def convert(text)
|
|
47
47
|
validate_input!(text)
|
|
48
|
-
|
|
48
|
+
|
|
49
49
|
normalized = normalize_text(text)
|
|
50
50
|
return 0 if normalized.empty?
|
|
51
51
|
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
52
|
+
@cache_mutex.synchronize do
|
|
53
|
+
@cache_lookups += 1
|
|
54
|
+
|
|
55
|
+
if @cache.key?(normalized)
|
|
56
|
+
@cache_hits += 1
|
|
57
|
+
# Delete and reinsert to move to end (most recently used)
|
|
58
|
+
value = @cache.delete(normalized)
|
|
59
|
+
@cache[normalized] = value
|
|
60
|
+
return value
|
|
61
|
+
end
|
|
62
|
+
end
|
|
63
|
+
|
|
64
|
+
result = new(normalized).parse_optimized(normalized)
|
|
65
|
+
|
|
66
|
+
@cache_mutex.synchronize do
|
|
67
|
+
@cache.delete(@cache.first[0]) if @cache.size >= MAX_CACHE_SIZE
|
|
68
|
+
@cache[normalized] = result
|
|
69
|
+
end
|
|
55
70
|
|
|
56
|
-
# Get or create parser instance and convert
|
|
57
|
-
parser = get_cached_instance(normalized)
|
|
58
|
-
result = parser.parse_optimized(normalized)
|
|
59
|
-
|
|
60
|
-
# Cache the result
|
|
61
|
-
cache_conversion(normalized, result)
|
|
62
71
|
result
|
|
63
72
|
end
|
|
64
73
|
|
|
65
74
|
# Clear all caches
|
|
66
75
|
def clear_caches!
|
|
67
76
|
@cache_mutex.synchronize do
|
|
68
|
-
@
|
|
69
|
-
@
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
@instance_mutex.synchronize do
|
|
73
|
-
@instance_cache.clear
|
|
77
|
+
@cache.clear
|
|
78
|
+
@cache_hits = 0
|
|
79
|
+
@cache_lookups = 0
|
|
74
80
|
end
|
|
75
81
|
end
|
|
76
82
|
|
|
@@ -78,10 +84,9 @@ module StringToNumber
|
|
|
78
84
|
def cache_stats
|
|
79
85
|
@cache_mutex.synchronize do
|
|
80
86
|
{
|
|
81
|
-
conversion_cache_size: @
|
|
87
|
+
conversion_cache_size: @cache.size,
|
|
82
88
|
conversion_cache_limit: MAX_CACHE_SIZE,
|
|
83
|
-
|
|
84
|
-
cache_hit_ratio: calculate_hit_ratio
|
|
89
|
+
cache_hit_ratio: @cache_lookups.zero? ? 0.0 : @cache_hits.to_f / @cache_lookups
|
|
85
90
|
}
|
|
86
91
|
end
|
|
87
92
|
end
|
|
@@ -95,42 +100,6 @@ module StringToNumber
|
|
|
95
100
|
def normalize_text(text)
|
|
96
101
|
text.to_s.downcase.strip
|
|
97
102
|
end
|
|
98
|
-
|
|
99
|
-
def get_cached_conversion(normalized_text)
|
|
100
|
-
@cache_mutex.synchronize do
|
|
101
|
-
if @conversion_cache.key?(normalized_text)
|
|
102
|
-
# Update LRU order
|
|
103
|
-
@cache_access_order.delete(normalized_text)
|
|
104
|
-
@cache_access_order.push(normalized_text)
|
|
105
|
-
return @conversion_cache[normalized_text]
|
|
106
|
-
end
|
|
107
|
-
end
|
|
108
|
-
nil
|
|
109
|
-
end
|
|
110
|
-
|
|
111
|
-
def cache_conversion(normalized_text, result)
|
|
112
|
-
@cache_mutex.synchronize do
|
|
113
|
-
# LRU eviction
|
|
114
|
-
if @conversion_cache.size >= MAX_CACHE_SIZE
|
|
115
|
-
oldest = @cache_access_order.shift
|
|
116
|
-
@conversion_cache.delete(oldest)
|
|
117
|
-
end
|
|
118
|
-
|
|
119
|
-
@conversion_cache[normalized_text] = result
|
|
120
|
-
@cache_access_order.push(normalized_text)
|
|
121
|
-
end
|
|
122
|
-
end
|
|
123
|
-
|
|
124
|
-
def get_cached_instance(normalized_text)
|
|
125
|
-
@instance_mutex.synchronize do
|
|
126
|
-
@instance_cache[normalized_text] ||= new(normalized_text)
|
|
127
|
-
end
|
|
128
|
-
end
|
|
129
|
-
|
|
130
|
-
def calculate_hit_ratio
|
|
131
|
-
return 0.0 if @cache_access_order.empty?
|
|
132
|
-
@conversion_cache.size.to_f / @cache_access_order.size
|
|
133
|
-
end
|
|
134
103
|
end
|
|
135
104
|
|
|
136
105
|
# Initialize parser with normalized text
|
|
@@ -147,12 +116,12 @@ module StringToNumber
|
|
|
147
116
|
# but with performance optimizations
|
|
148
117
|
def parse_optimized(text)
|
|
149
118
|
return 0 if text.nil? || text.empty?
|
|
150
|
-
|
|
119
|
+
|
|
151
120
|
# Direct lookup (fastest path)
|
|
152
121
|
return WORD_VALUES[text] if WORD_VALUES.key?(text)
|
|
153
122
|
|
|
154
123
|
# Use the proven extraction algorithm from the original implementation
|
|
155
|
-
extract_optimized(text
|
|
124
|
+
extract_optimized(text)
|
|
156
125
|
end
|
|
157
126
|
|
|
158
127
|
private
|
|
@@ -160,14 +129,14 @@ module StringToNumber
|
|
|
160
129
|
# Optimized version of the original extract method
|
|
161
130
|
# This maintains the exact logic of the working implementation
|
|
162
131
|
# but with performance improvements
|
|
163
|
-
def extract_optimized(sentence,
|
|
132
|
+
def extract_optimized(sentence, detail: false)
|
|
164
133
|
return 0 if sentence.nil? || sentence.empty?
|
|
165
|
-
|
|
134
|
+
|
|
166
135
|
# Direct lookup
|
|
167
136
|
return WORD_VALUES[sentence] if WORD_VALUES.key?(sentence)
|
|
168
137
|
|
|
169
138
|
# Main pattern matching using pre-compiled regex
|
|
170
|
-
if result = MULTIPLIER_PATTERN.match(sentence)
|
|
139
|
+
if (result = MULTIPLIER_PATTERN.match(sentence))
|
|
171
140
|
# Remove matched portion
|
|
172
141
|
sentence = sentence.gsub(result[0], '') if result[0]
|
|
173
142
|
|
|
@@ -178,7 +147,7 @@ module StringToNumber
|
|
|
178
147
|
|
|
179
148
|
# Handle compound numbers
|
|
180
149
|
if higher_multiple_exists?(result[:m], sentence)
|
|
181
|
-
details = extract_optimized(sentence,
|
|
150
|
+
details = extract_optimized(sentence, detail: true)
|
|
182
151
|
factor = (factor * multiple_of_ten) + details[:factor]
|
|
183
152
|
multiple_of_ten = details[:multiple_of_ten]
|
|
184
153
|
sentence = details[:sentence]
|
|
@@ -193,19 +162,19 @@ module StringToNumber
|
|
|
193
162
|
}
|
|
194
163
|
end
|
|
195
164
|
|
|
196
|
-
|
|
165
|
+
extract_optimized(sentence) + (factor * multiple_of_ten)
|
|
197
166
|
|
|
198
167
|
# Quatre-vingt special handling
|
|
199
|
-
elsif m = QUATRE_VINGT_PATTERN.match(sentence)
|
|
200
|
-
normalize_str = m[
|
|
168
|
+
elsif (m = QUATRE_VINGT_PATTERN.match(sentence))
|
|
169
|
+
normalize_str = m[:base].tr(' ', '-')
|
|
201
170
|
normalize_str = normalize_str[0...-1] if normalize_str[-1] == 's'
|
|
202
171
|
|
|
203
172
|
sentence = sentence.gsub(m[0], '')
|
|
204
173
|
|
|
205
|
-
|
|
206
|
-
|
|
174
|
+
extract_optimized(sentence) +
|
|
175
|
+
WORD_VALUES[normalize_str] + (WORD_VALUES[m[:suffix]] || 0)
|
|
207
176
|
else
|
|
208
|
-
|
|
177
|
+
match_optimized(sentence)
|
|
209
178
|
end
|
|
210
179
|
end
|
|
211
180
|
|
|
@@ -213,8 +182,9 @@ module StringToNumber
|
|
|
213
182
|
def match_optimized(sentence)
|
|
214
183
|
return 0 if sentence.nil?
|
|
215
184
|
|
|
216
|
-
sentence.tr('-', ' ').split
|
|
185
|
+
sentence.tr('-', ' ').split.reverse.sum do |word|
|
|
217
186
|
next 0 if word == 'et'
|
|
187
|
+
|
|
218
188
|
WORD_VALUES[word] || (MULTIPLIERS[word] ? 10 * MULTIPLIERS[word] : 0)
|
|
219
189
|
end
|
|
220
190
|
end
|
|
@@ -227,4 +197,4 @@ module StringToNumber
|
|
|
227
197
|
end
|
|
228
198
|
end
|
|
229
199
|
end
|
|
230
|
-
end
|
|
200
|
+
end
|
|
@@ -47,8 +47,8 @@ module StringToNumber
|
|
|
47
47
|
'quatre-vingt' => 80, # Standard French: "four-twenty" (singular)
|
|
48
48
|
'huitante' => 80, # Swiss French alternative
|
|
49
49
|
'quatre-vingt-dix' => 90, # Standard French: "four-twenty-ten"
|
|
50
|
-
'quatre-vingts-dix' => 90
|
|
51
|
-
'nonante' => 90
|
|
50
|
+
'quatre-vingts-dix' => 90, # Alternative with plural "vingts"
|
|
51
|
+
'nonante' => 90 # Belgian/Swiss French alternative
|
|
52
52
|
}.freeze
|
|
53
53
|
|
|
54
54
|
# POWERS_OF_TEN maps French number words to their power of 10 exponents
|
|
@@ -100,7 +100,7 @@ module StringToNumber
|
|
|
100
100
|
'trigintillion' => 93,
|
|
101
101
|
'untrigintillion' => 96,
|
|
102
102
|
'duotrigintillion' => 99,
|
|
103
|
-
'googol' => 100
|
|
103
|
+
'googol' => 100 # Special case: 10^100
|
|
104
104
|
}.freeze
|
|
105
105
|
|
|
106
106
|
# Initialize the ToNumber parser with a French sentence
|
|
@@ -111,7 +111,7 @@ module StringToNumber
|
|
|
111
111
|
# Sort keys by length (longest first) to ensure longer matches are preferred
|
|
112
112
|
# This prevents "cent" from matching before "cents" in "cinq cents"
|
|
113
113
|
sorted_keys = POWERS_OF_TEN.keys.reject { |k| %w[un dix].include?(k) }.sort_by(&:length).reverse
|
|
114
|
-
@keys = sorted_keys.join('|')
|
|
114
|
+
@keys = sorted_keys.join('|') # Create regex alternation pattern
|
|
115
115
|
# Normalize input to lowercase for case-insensitive matching
|
|
116
116
|
@sentence = sentence&.downcase || ''
|
|
117
117
|
end
|
|
@@ -133,10 +133,10 @@ module StringToNumber
|
|
|
133
133
|
def extract(sentence, keys, detail: false)
|
|
134
134
|
# Base cases: handle empty/nil input
|
|
135
135
|
return 0 if sentence.nil? || sentence.empty?
|
|
136
|
-
|
|
136
|
+
|
|
137
137
|
# Ensure case-insensitive matching
|
|
138
138
|
sentence = sentence.downcase
|
|
139
|
-
|
|
139
|
+
|
|
140
140
|
# Direct lookup for simple cases (e.g., "vingt" -> 20)
|
|
141
141
|
return EXCEPTIONS[sentence] unless EXCEPTIONS[sentence].nil?
|
|
142
142
|
|
|
@@ -146,19 +146,19 @@ module StringToNumber
|
|
|
146
146
|
# (?<f>.*?) - Non-greedy capture of factor part (before multiplier)
|
|
147
147
|
# \s? - Optional space
|
|
148
148
|
# (?<m>#{keys}) - Named capture of multiplier from keys pattern
|
|
149
|
-
if result = /(?<f>.*?)\s?(?<m>#{keys})/.match(sentence)
|
|
149
|
+
if (result = /(?<f>.*?)\s?(?<m>#{keys})/.match(sentence))
|
|
150
150
|
# Remove the matched portion from sentence for further processing
|
|
151
|
-
sentence.gsub!(
|
|
151
|
+
sentence.gsub!(::Regexp.last_match(0), '') if ::Regexp.last_match(0)
|
|
152
152
|
|
|
153
153
|
# Parse the factor part (number before the multiplier)
|
|
154
154
|
# Example: "cinq" -> 5, "deux cent" -> 200
|
|
155
155
|
factor = EXCEPTIONS[result[:f]] || match(result[:f])
|
|
156
|
-
|
|
156
|
+
|
|
157
157
|
# Handle implicit factor of 1 for standalone multipliers
|
|
158
158
|
# Example: "million" -> factor=1, but only for top-level calls
|
|
159
159
|
# For recursive calls (detail=true), keep factor as 0 to avoid double-counting
|
|
160
160
|
factor = 1 if factor.zero? && !detail
|
|
161
|
-
|
|
161
|
+
|
|
162
162
|
# Calculate the multiplier value (10^exponent)
|
|
163
163
|
# Example: "cents" -> 10^2 = 100, "millions" -> 10^6 = 1,000,000
|
|
164
164
|
multiple_of_ten = 10**(POWERS_OF_TEN[result[:m]] || 0)
|
|
@@ -192,19 +192,18 @@ module StringToNumber
|
|
|
192
192
|
|
|
193
193
|
# Final calculation: process any remaining sentence + current factor*multiplier
|
|
194
194
|
# Example: For "trois millions cinq cents", this handles the "cinq cents" part
|
|
195
|
-
|
|
195
|
+
extract(sentence, keys) + (factor * multiple_of_ten)
|
|
196
196
|
|
|
197
197
|
# Special case handling for "quatre-vingt" variations
|
|
198
198
|
# This complex regex handles the irregular French "eighty" patterns:
|
|
199
199
|
# - "quatre-vingt" / "quatre vingts" (with/without 's')
|
|
200
200
|
# - "quatre-vingt-dix" / "quatre vingts dix" (90)
|
|
201
201
|
# - Space vs hyphen variations
|
|
202
|
-
elsif m = /(quatre
|
|
202
|
+
elsif (m = /(?<base>quatre[-\s]vingt(?:s?)(?:[-\s]dix)?)(?:[-\s]?)(?<suffix>\w*)/.match(sentence))
|
|
203
203
|
# Normalize spacing to hyphens for consistent lookup
|
|
204
|
-
normalize_str = m[
|
|
205
|
-
|
|
204
|
+
normalize_str = m[:base].tr(' ', '-')
|
|
205
|
+
|
|
206
206
|
# Remove trailing 's' from "quatre-vingts" if present
|
|
207
|
-
# Bug fix: use [-1] instead of [length] for last character
|
|
208
207
|
normalize_str = normalize_str[0...-1] if normalize_str[-1] == 's'
|
|
209
208
|
|
|
210
209
|
# Remove the matched portion from sentence
|
|
@@ -212,11 +211,11 @@ module StringToNumber
|
|
|
212
211
|
|
|
213
212
|
# Return sum of: remaining sentence + normalized quatre-vingt value + any suffix
|
|
214
213
|
# Example: "quatre-vingt-cinq" -> EXCEPTIONS["quatre-vingt"] + EXCEPTIONS["cinq"]
|
|
215
|
-
|
|
216
|
-
|
|
214
|
+
extract(sentence, keys) +
|
|
215
|
+
EXCEPTIONS[normalize_str] + (EXCEPTIONS[m[:suffix]] || 0)
|
|
217
216
|
else
|
|
218
217
|
# Fallback: use match() method for simple word combinations
|
|
219
|
-
|
|
218
|
+
match(sentence)
|
|
220
219
|
end
|
|
221
220
|
end
|
|
222
221
|
|
|
@@ -229,11 +228,11 @@ module StringToNumber
|
|
|
229
228
|
|
|
230
229
|
# Process words in reverse order for proper French number logic
|
|
231
230
|
# Example: "vingt et un" -> ["un", "et", "vingt"] -> 1 + 0 + 20 = 21
|
|
232
|
-
sentence.downcase.tr('-', ' ').split
|
|
231
|
+
sentence.downcase.tr('-', ' ').split.reverse.sum do |word|
|
|
233
232
|
# Handle French "et" (and) conjunction by ignoring it in calculations
|
|
234
233
|
# Example: "vingt et un" -> ignore "et", sum "vingt" + "un"
|
|
235
234
|
next 0 if word == 'et'
|
|
236
|
-
|
|
235
|
+
|
|
237
236
|
# Look up word value in either EXCEPTIONS or POWERS_OF_TEN
|
|
238
237
|
if EXCEPTIONS[word].nil? && POWERS_OF_TEN[word].nil?
|
|
239
238
|
# Unknown words contribute 0 to the sum
|
|
@@ -241,8 +240,8 @@ module StringToNumber
|
|
|
241
240
|
else
|
|
242
241
|
# Use EXCEPTIONS value if available, otherwise use 10 * power_of_ten
|
|
243
242
|
# Example: "dix" -> EXCEPTIONS["dix"] = 10
|
|
244
|
-
# "cent" -> 10 * POWERS_OF_TEN["cent"] = 10 * 2 = 100
|
|
245
|
-
|
|
243
|
+
# "cent" -> 10 * POWERS_OF_TEN["cent"] = 10 * 2 = 20 (note: not 10**2 = 100)
|
|
244
|
+
EXCEPTIONS[word] || (10 * POWERS_OF_TEN[word])
|
|
246
245
|
end
|
|
247
246
|
end
|
|
248
247
|
end
|
data/lib/string_to_number.rb
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
1
3
|
require 'string_to_number/version'
|
|
2
4
|
|
|
3
5
|
# Load original implementation first for constant definitions
|
|
@@ -78,20 +80,20 @@ module StringToNumber
|
|
|
78
80
|
#
|
|
79
81
|
def valid_french_number?(text)
|
|
80
82
|
return false unless text.respond_to?(:to_s)
|
|
81
|
-
|
|
83
|
+
|
|
82
84
|
normalized = text.to_s.downcase.strip
|
|
83
85
|
return false if normalized.empty?
|
|
84
|
-
|
|
86
|
+
|
|
85
87
|
# Check if any words are recognized French number words
|
|
86
88
|
words = normalized.tr('-', ' ').split(/\s+/)
|
|
87
89
|
recognized_words = words.count do |word|
|
|
88
|
-
word == 'et' ||
|
|
89
|
-
|
|
90
|
-
|
|
90
|
+
word == 'et' ||
|
|
91
|
+
Parser::WORD_VALUES.key?(word) ||
|
|
92
|
+
Parser::MULTIPLIERS.key?(word)
|
|
91
93
|
end
|
|
92
|
-
|
|
94
|
+
|
|
93
95
|
# Require at least 50% recognized words for validation
|
|
94
96
|
recognized_words.to_f / words.size >= 0.5
|
|
95
97
|
end
|
|
96
98
|
end
|
|
97
|
-
end
|
|
99
|
+
end
|