string_to_number 0.1.4 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/ci.yml +83 -0
- data/.rubocop.yml +110 -0
- data/.tool-versions +1 -0
- data/CLAUDE.md +103 -0
- data/Gemfile +9 -0
- data/Gemfile.lock +33 -2
- data/README.md +184 -25
- data/Rakefile +5 -1
- data/benchmark.rb +178 -0
- data/lib/string_to_number/parser.rb +232 -0
- data/lib/string_to_number/to_number.rb +145 -38
- data/lib/string_to_number/version.rb +3 -1
- data/lib/string_to_number.rb +91 -2
- data/logo.png +0 -0
- data/microbenchmark.rb +227 -0
- data/performance_comparison.rb +154 -0
- data/profile.rb +130 -0
- data/string_to_number.gemspec +5 -6
- metadata +14 -45
data/benchmark.rb
ADDED
@@ -0,0 +1,178 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
# Performance benchmark script for StringToNumber gem
|
5
|
+
# Run with: ruby benchmark.rb
|
6
|
+
|
7
|
+
require_relative 'lib/string_to_number'
|
8
|
+
require 'benchmark'
|
9
|
+
|
10
|
+
# Console benchmark harness for StringToNumber.in_numbers.
#
# Times conversions per input category, prints a colour-coded summary, then
# runs a retained-object check and an input-length scalability check.
#
# NOTE(review): every input is converted many times in a row; if
# StringToNumber.in_numbers caches results, the timings mostly reflect cache
# hits rather than parsing cost - confirm this is the intended measurement.
class StringToNumberBenchmark
  # Test data organized by complexity
  TEST_CASES = {
    simple: %w[
      un vingt cent mille
    ],
    medium: [
      'vingt et un', 'deux cent cinquante', 'mille deux cent'
    ],
    complex: [
      'trois milliards cinq cents millions',
      'soixante-quinze million trois cent quarante six mille sept cent quatre-vingt-dix neuf'
    ],
    edge_cases: %w[
      VINGT une septante quatre-vingts
    ]
  }.freeze

  # Iterations used when timing a single input on its own.
  SAMPLE_ITERATIONS = 1000
  # Longest input excerpt shown in the per-case breakdown.
  PREVIEW_LENGTH = 50
  private_constant :SAMPLE_ITERATIONS, :PREVIEW_LENGTH

  # Runs the full benchmark suite and prints every report to stdout.
  #
  # @return [nil]
  def self.run_benchmark
    print_header
    warm_up

    total_results = {}
    TEST_CASES.each do |category, test_cases|
      total_results[category] = report_category(category, test_cases)
    end

    print_summary(total_results)

    puts
    puts 'Memory efficiency test...'
    test_memory_usage

    puts
    puts 'Scalability test...'
    test_scalability
  end

  # Times +iterations+ conversions of every entry in +test_cases+.
  #
  # @param test_cases [Array<String>] inputs to convert
  # @param iterations [Integer] conversions performed per input
  # @return [Hash{Symbol => Float}] :total_time (seconds), :avg_time_ms and
  #   :ops_per_sec; the last two are 0.0 when nothing measurable happened
  #   (no inputs, no iterations, or an elapsed time of zero)
  def self.benchmark_category(test_cases, iterations = 2000)
    total_time = Benchmark.realtime do
      test_cases.each do |text|
        iterations.times { StringToNumber.in_numbers(text) }
      end
    end

    total_conversions = test_cases.size * iterations
    # Avoid Infinity/NaN (and FloatDomainError when the caller rounds to an
    # integer) for empty workloads or a zero clock reading.
    if total_conversions.zero? || total_time.zero?
      return { total_time: total_time, avg_time_ms: 0.0, ops_per_sec: 0.0 }
    end

    {
      total_time: total_time,
      avg_time_ms: (total_time / total_conversions) * 1000,
      ops_per_sec: total_conversions / total_time
    }
  end

  # Reports how many live objects remain after converting every test case
  # 500 times, or a notice when ObjectSpace.count_objects is unavailable.
  #
  # @return [nil]
  def self.test_memory_usage
    unless Object.const_defined?(:ObjectSpace) && ObjectSpace.respond_to?(:count_objects)
      puts 'Memory tracking not available on this platform'
      return
    end

    inputs = TEST_CASES.values.flatten
    initial_objects = live_object_count

    # Perform intensive operations
    500.times do
      inputs.each { |text| StringToNumber.in_numbers(text) }
    end

    object_growth = live_object_count - initial_objects

    puts "Object creation: #{object_growth} new objects (#{object_growth > 1000 ? '🔴 High' : '🟢 Low'})"
  end

  # Prints per-input timings relative to the shortest input and a verdict on
  # how the longest input compares with it.
  #
  # @return [nil]
  def self.test_scalability
    inputs = [
      'un', # 2 chars
      'vingt et un', # 11 chars
      'mille deux cent trente-quatre', # 29 chars
      'trois milliards cinq cents millions deux cent mille et une' # 58 chars
    ]

    puts 'Input length vs. performance:'

    results = inputs.map do |input|
      { length: input.length, time: average_ms(input), input: input }
    end
    baseline = results.first[:time]

    results.each do |result|
      complexity_ratio = slowdown_ratio(result[:time], baseline)
      status = if complexity_ratio < 5
                 '🟢'
               else
                 complexity_ratio < 10 ? '🟡' : '🔴'
               end

      puts " #{result[:length].to_s.rjust(2)} chars: #{result[:time].round(4)}ms #{status} " \
           "(#{complexity_ratio.round(1)}x baseline)"
    end

    # Check if performance degrades reasonably
    worst_ratio = slowdown_ratio(results.last[:time], baseline)
    if worst_ratio < 10
      puts "✅ Scalability: Good (#{worst_ratio.round(1)}x degradation)"
    else
      puts "❌ Scalability: Poor (#{worst_ratio.round(1)}x degradation)"
    end
  end

  # Prints the banner with the Ruby version and platform.
  def self.print_header
    puts 'StringToNumber Performance Benchmark'
    puts '=' * 50
    puts "Ruby version: #{RUBY_VERSION}"
    puts "Platform: #{RUBY_PLATFORM}"
    puts
  end

  # Converts every test case once before any timing starts.
  def self.warm_up
    puts 'Warming up...'
    TEST_CASES.values.flatten.each { |text| StringToNumber.in_numbers(text) }
    puts
  end

  # Benchmarks one category, prints its figures and returns the result hash.
  def self.report_category(category, test_cases)
    puts "#{category.to_s.capitalize} Numbers:"
    puts '-' * 30

    results = benchmark_category(test_cases)

    puts "Cases: #{test_cases.size}"
    puts "Total time: #{results[:total_time].round(4)}s"
    puts "Average per conversion: #{results[:avg_time_ms].round(4)}ms"
    puts "Conversions per second: #{results[:ops_per_sec].round(0)}"
    puts

    # Show individual case performance for complex numbers
    print_case_breakdown(test_cases) if category == :complex
    results
  end

  # Prints the average time of each input, with long inputs abbreviated.
  def self.print_case_breakdown(test_cases)
    puts 'Individual case breakdown:'
    test_cases.each_with_index do |text, index|
      avg_ms = average_ms(text)
      # Keep exactly PREVIEW_LENGTH characters so the ellipsis only appears
      # when something was actually cut off.
      preview = "#{text[0, PREVIEW_LENGTH]}#{'...' if text.length > PREVIEW_LENGTH}"
      puts " #{index + 1}. #{avg_ms.round(4)}ms - '#{preview}'"
    end
    puts
  end

  # Prints one status line per benchmarked category.
  def self.print_summary(total_results)
    puts '=' * 50
    puts 'PERFORMANCE SUMMARY'
    puts '=' * 50

    total_results.each do |category, results|
      status = performance_status(results[:avg_time_ms])
      puts "#{category.to_s.capitalize.ljust(12)} #{status.ljust(15)} #{results[:avg_time_ms].round(4)}ms avg"
    end
  end

  # Maps an average conversion time in milliseconds to a status label.
  def self.performance_status(avg_time_ms)
    case avg_time_ms
    when 0..0.1 then '🟢 Excellent'
    when 0.1..0.5 then '🟡 Good'
    when 0.5..1.0 then '🟠 Acceptable'
    else '🔴 Needs optimization'
    end
  end

  # Average milliseconds per conversion of +text+ over +iterations+ runs.
  def self.average_ms(text, iterations = SAMPLE_ITERATIONS)
    elapsed = Benchmark.realtime do
      iterations.times { StringToNumber.in_numbers(text) }
    end
    (elapsed / iterations) * 1000
  end

  # Ratio of +time+ to +baseline+; equal timings (including 0 vs 0) count as
  # 1.0 so the report never shows NaN.
  def self.slowdown_ratio(time, baseline)
    return 1.0 if time == baseline

    time / baseline
  end

  # Live objects after a full GC. count_objects[:TOTAL] counts heap slots,
  # free ones included, so the free slots are subtracted.
  def self.live_object_count
    GC.start
    counts = ObjectSpace.count_objects
    counts[:TOTAL] - counts[:FREE]
  end

  private_class_method :print_header, :warm_up, :report_category, :print_case_breakdown,
                       :print_summary, :performance_status, :average_ms, :slowdown_ratio,
                       :live_object_count
end
|
176
|
+
|
177
|
+
# Run the benchmark
|
178
|
+
StringToNumberBenchmark.run_benchmark if __FILE__ == $PROGRAM_NAME
|
@@ -0,0 +1,232 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module StringToNumber
|
4
|
+
# High-performance French text to number parser
|
5
|
+
#
|
6
|
+
# This class provides a clean, optimized implementation that maintains
|
7
|
+
# compatibility with the original algorithm while adding significant
|
8
|
+
# performance improvements through caching and memoization.
|
9
|
+
#
|
10
|
+
# @example Basic usage
|
11
|
+
# parser = StringToNumber::Parser.new
|
12
|
+
# parser.parse('vingt et un') #=> 21
|
13
|
+
# parser.parse('trois millions') #=> 3_000_000
|
14
|
+
#
|
15
|
+
# @example Class method usage
|
16
|
+
# StringToNumber::Parser.convert('mille deux cent') #=> 1200
|
17
|
+
#
|
18
|
+
class Parser
|
19
|
+
# Import the proven data structures from the original implementation
|
20
|
+
WORD_VALUES = StringToNumber::ToNumber::EXCEPTIONS.freeze
|
21
|
+
MULTIPLIERS = StringToNumber::ToNumber::POWERS_OF_TEN.freeze
|
22
|
+
|
23
|
+
# Pre-compiled regex patterns for optimal performance
|
24
|
+
MULTIPLIER_KEYS = MULTIPLIERS.keys.reject { |k| %w[un dix].include?(k) }
|
25
|
+
.sort_by(&:length).reverse.freeze
|
26
|
+
MULTIPLIER_PATTERN = /(?<f>.*?)\s?(?<m>#{MULTIPLIER_KEYS.join('|')})/.freeze
|
27
|
+
QUATRE_VINGT_PATTERN = /(quatre(-|\s)vingt(s?)((-|\s)dix)?)((-|\s)?)(\w*)/.freeze
|
28
|
+
|
29
|
+
# Cache configuration
|
30
|
+
MAX_CACHE_SIZE = 1000
|
31
|
+
private_constant :MAX_CACHE_SIZE
|
32
|
+
|
33
|
+
# Thread-safe class-level caches
|
34
|
+
@conversion_cache = {}
|
35
|
+
@cache_access_order = []
|
36
|
+
@instance_cache = {}
|
37
|
+
@cache_mutex = Mutex.new
|
38
|
+
@instance_mutex = Mutex.new
|
39
|
+
|
40
|
+
class << self
  # Convert French text to a number, consulting the conversion cache first.
  #
  # @param text [#to_s] French number text to convert; it is normalized with
  #   to_s.downcase.strip, so nil and blank input yield 0
  # @return [Integer] The numeric value
  # @raise [ArgumentError] if text does not respond to to_s
  def convert(text)
    validate_input!(text)

    normalized = normalize_text(text)
    return 0 if normalized.empty?

    # Check conversion cache first
    cached_result = get_cached_conversion(normalized)
    return cached_result if cached_result

    # Get or create parser instance and convert
    parser = get_cached_instance(normalized)
    result = parser.parse_optimized(normalized)

    # Cache the result
    cache_conversion(normalized, result)
    result
  end

  # Clear all caches and reset the hit/miss counters
  def clear_caches!
    @cache_mutex.synchronize do
      @conversion_cache.clear
      @cache_hits = 0
      @cache_misses = 0
    end

    @instance_mutex.synchronize { @instance_cache.clear }
  end

  # Get cache statistics
  #
  # @return [Hash{Symbol => Numeric}] sizes of both caches, the conversion
  #   cache limit and the hit ratio of conversion-cache lookups
  def cache_stats
    # Each cache is read under the mutex that guards its writers.
    instance_count = @instance_mutex.synchronize { @instance_cache.size }

    @cache_mutex.synchronize do
      {
        conversion_cache_size: @conversion_cache.size,
        conversion_cache_limit: MAX_CACHE_SIZE,
        instance_cache_size: instance_count,
        cache_hit_ratio: calculate_hit_ratio
      }
    end
  end

  private

  # NOTE(review): every Object responds to to_s, so this only rejects
  # BasicObject-style inputs; nil, numbers and symbols pass through and are
  # stringified by normalize_text. Tighten only after checking callers.
  def validate_input!(text)
    raise ArgumentError, 'Input must be a string' unless text.respond_to?(:to_s)
  end

  def normalize_text(text)
    text.to_s.downcase.strip
  end

  # Looks up a cached conversion and records the hit or miss.
  # Ruby hashes iterate in insertion order, so re-inserting a key on every
  # hit keeps the least recently used entry at the front without a separate
  # access-order list.
  def get_cached_conversion(normalized_text)
    @cache_mutex.synchronize do
      unless @conversion_cache.key?(normalized_text)
        @cache_misses = @cache_misses.to_i + 1
        return nil
      end

      @cache_hits = @cache_hits.to_i + 1
      value = @conversion_cache.delete(normalized_text)
      @conversion_cache[normalized_text] = value
    end
  end

  # Stores a conversion, evicting the least recently used entry when full.
  def cache_conversion(normalized_text, result)
    @cache_mutex.synchronize do
      # Two threads may miss on the same text and both store it; dropping
      # any existing entry first keeps a single entry per key.
      @conversion_cache.delete(normalized_text)
      @conversion_cache.shift if @conversion_cache.size >= MAX_CACHE_SIZE

      @conversion_cache[normalized_text] = result
    end
  end

  # Returns the parser instance kept for +normalized_text+, creating it on
  # first use. The cache is capped at MAX_CACHE_SIZE (oldest entry dropped
  # first) so a stream of distinct inputs cannot grow it without bound.
  def get_cached_instance(normalized_text)
    @instance_mutex.synchronize do
      @instance_cache.fetch(normalized_text) do
        @instance_cache.shift if @instance_cache.size >= MAX_CACHE_SIZE
        @instance_cache[normalized_text] = new(normalized_text)
      end
    end
  end

  # Fraction of conversion-cache lookups that were hits since the last
  # clear_caches!; 0.0 before any lookup. Callers must hold @cache_mutex.
  def calculate_hit_ratio
    hits = @cache_hits.to_i
    lookups = hits + @cache_misses.to_i
    return 0.0 if lookups.zero?

    hits.to_f / lookups
  end
end
|
136
|
+
|
137
|
+
# Builds a parser bound to +text+, stored in the class's normalized form.
#
# @param text [#to_s] French number text; defaults to the empty string
def initialize(text = '')
  owner = self.class
  # normalize_text is a private class-level helper, hence the send.
  @normalized_text = owner.send(:normalize_text, text)
end
|
141
|
+
|
142
|
+
# Parse French number text to its numeric value.
#
# With no argument, converts the text given to the constructor. An explicit
# +text+ is converted instead, which makes the documented usage
# `Parser.new.parse('vingt et un')` work.
#
# @param text [#to_s, nil] optional text overriding the constructor's
# @return [Integer] The numeric value
def parse(text = nil)
  self.class.convert(text.nil? ? @normalized_text : text)
end
|
146
|
+
|
147
|
+
# Converts already-normalized +text+ without touching the class caches.
#
# Blank input yields 0, a direct WORD_VALUES entry is returned as is, and
# anything else is handed to the recursive extraction.
#
# @param text [String, nil] normalized French number text
# @return [Integer] The numeric value
def parse_optimized(text)
  return 0 if text.nil? || text.empty?

  # Direct lookup first; the block only runs when the text is not a known word.
  WORD_VALUES.fetch(text) do
    extract_optimized(text, MULTIPLIER_KEYS.join('|'))
  end
end
|
158
|
+
|
159
|
+
private
|
160
|
+
|
161
|
+
# Recursively converts +sentence+ by peeling off "factor + multiplier"
# groups (e.g. "cinq cents") and summing what remains.
#
# @param sentence [String, nil] normalized French number text
# @param keys [String] multiplier alternation; only forwarded to recursive
#   calls, since matching uses the pre-compiled MULTIPLIER_PATTERN
# @param detail [Boolean] when true and a multiplier group is found, return
#   the partial parse instead of the final number
# @return [Integer, Hash] the numeric value, or
#   { factor:, multiple_of_ten:, sentence: } in detail mode
def extract_optimized(sentence, keys, detail: false)
  return 0 if sentence.nil? || sentence.empty?

  # Direct lookup
  return WORD_VALUES[sentence] if WORD_VALUES.key?(sentence)

  if (result = MULTIPLIER_PATTERN.match(sentence))
    # Drop only the text this match consumed. Deleting every occurrence of
    # the matched string also swallowed later, identical groups, so
    # "deux cent mille deux cent" lost its trailing "deux cent".
    sentence = result.pre_match + result.post_match

    # Extract factor
    factor = WORD_VALUES[result[:f]] || match_optimized(result[:f])
    # A bare multiplier ("mille") implies a factor of one, except while
    # resolving a higher multiplier, where the caller supplies the factor.
    factor = 1 if factor.zero? && !detail
    multiple_of_ten = 10**(MULTIPLIERS[result[:m]] || 0)

    # Handle compound numbers: "cinq cents millions" folds the "cinq cents"
    # group into the factor of the higher multiplier that follows.
    if higher_multiple_exists?(result[:m], sentence)
      details = extract_optimized(sentence, keys, detail: true)
      factor = (factor * multiple_of_ten) + details[:factor]
      multiple_of_ten = details[:multiple_of_ten]
      sentence = details[:sentence]
    end

    return { factor: factor, multiple_of_ten: multiple_of_ten, sentence: sentence } if detail

    extract_optimized(sentence, keys) + (factor * multiple_of_ten)
  elsif (m = QUATRE_VINGT_PATTERN.match(sentence))
    # Quatre-vingt special handling: normalise separators to hyphens and
    # strip a trailing plural "s" so the form can be looked up directly.
    normalize_str = m[1].tr(' ', '-')
    normalize_str = normalize_str[0...-1] if normalize_str[-1] == 's'

    sentence = m.pre_match + m.post_match

    # m[8] is the word following the quatre-vingt form, e.g. "neuf".
    extract_optimized(sentence, keys) +
      WORD_VALUES[normalize_str] + (WORD_VALUES[m[8]] || 0)
  else
    match_optimized(sentence)
  end
end
|
212
|
+
|
213
|
+
# Sums the values of the individual words in +sentence+.
#
# Hyphens are treated as spaces, "et" and unknown words contribute 0, and a
# word found only in MULTIPLIERS contributes 10 raised to its exponent.
#
# @param sentence [String, nil] French words to add up
# @return [Integer] the sum, or 0 for nil
def match_optimized(sentence)
  return 0 if sentence.nil?

  sentence.tr('-', ' ').split.sum do |word|
    next 0 if word == 'et'

    # MULTIPLIERS stores exponents, so the value is 10**exponent
    # ("cent" => 2 => 100); multiplying by 10 gave 20 for "cent".
    WORD_VALUES[word] || (MULTIPLIERS[word] ? 10**MULTIPLIERS[word] : 0)
  end
end
|
223
|
+
|
224
|
+
# Tells whether +sentence+ still contains a multiplier word whose power of
# ten is greater than that of +multiple+.
#
# The check is a plain substring test against every MULTIPLIERS key.
#
# @param multiple [String] multiplier word just matched (e.g. "cents")
# @param sentence [String] remaining text to scan
# @return [Boolean]
def higher_multiple_exists?(multiple, sentence)
  threshold = MULTIPLIERS[multiple]

  MULTIPLIERS.each_pair do |candidate, exponent|
    return true if exponent > threshold && sentence.include?(candidate)
  end

  false
end
|
231
|
+
end
|
232
|
+
end
|
@@ -1,13 +1,23 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
3
|
module StringToNumber
|
4
|
+
# ToNumber class handles the conversion of French text to numbers
|
5
|
+
# It uses a complex recursive parsing algorithm to handle French number grammar
|
4
6
|
class ToNumber
|
5
7
|
attr_accessor :sentence, :keys
|
6
8
|
|
9
|
+
# EXCEPTIONS contains direct mappings from French words to their numeric values
|
10
|
+
# This includes:
|
11
|
+
# - Basic numbers 0-90
|
12
|
+
# - Feminine forms ("une" for "un")
|
13
|
+
# - Regional variations (Belgian/Swiss French: "septante", "huitante", "nonante")
|
14
|
+
# - Special cases for "quatre-vingt" variations with/without 's'
|
15
|
+
# - Compound numbers like "dix-sept", "soixante-dix"
|
7
16
|
EXCEPTIONS = {
|
8
|
-
'zéro' => 0,
|
9
|
-
'zero' => 0,
|
10
|
-
'un' => 1,
|
17
|
+
'zéro' => 0, # Zero with accent
|
18
|
+
'zero' => 0, # Zero without accent
|
19
|
+
'un' => 1, # Masculine "one"
|
20
|
+
'une' => 1, # Feminine "one"
|
11
21
|
'deux' => 2,
|
12
22
|
'trois' => 3,
|
13
23
|
'quatre' => 4,
|
@@ -23,29 +33,44 @@ module StringToNumber
|
|
23
33
|
'quatorze' => 14,
|
24
34
|
'quinze' => 15,
|
25
35
|
'seize' => 16,
|
26
|
-
'dix-sept' => 17,
|
27
|
-
'dix-huit' => 18,
|
28
|
-
'dix-neuf' => 19,
|
36
|
+
'dix-sept' => 17, # Compound: "ten-seven"
|
37
|
+
'dix-huit' => 18, # Compound: "ten-eight"
|
38
|
+
'dix-neuf' => 19, # Compound: "ten-nine"
|
29
39
|
'vingt' => 20,
|
30
40
|
'trente' => 30,
|
31
41
|
'quarante' => 40,
|
32
42
|
'cinquante' => 50,
|
33
43
|
'soixante' => 60,
|
34
|
-
'soixante-dix' => 70,
|
35
|
-
'
|
36
|
-
'quatre-
|
37
|
-
'quatre-vingt
|
38
|
-
'
|
44
|
+
'soixante-dix' => 70, # Standard French: "sixty-ten"
|
45
|
+
'septante' => 70, # Belgian/Swiss French alternative
|
46
|
+
'quatre-vingts' => 80, # Standard French: "four-twenties" (plural)
|
47
|
+
'quatre-vingt' => 80, # Standard French: "four-twenty" (singular)
|
48
|
+
'huitante' => 80, # Swiss French alternative
|
49
|
+
'quatre-vingt-dix' => 90, # Standard French: "four-twenty-ten"
|
50
|
+
'quatre-vingts-dix' => 90, # Alternative with plural "vingts"
|
51
|
+
'nonante' => 90 # Belgian/Swiss French alternative
|
39
52
|
}.freeze
|
40
53
|
|
54
|
+
# POWERS_OF_TEN maps French number words to their power of 10 exponents
|
55
|
+
# Used for multipliers like "cent" (10^2), "mille" (10^3), "million" (10^6)
|
56
|
+
# Includes both singular and plural forms for proper French grammar
|
57
|
+
# Uses French number scale where "billion" = 10^12 (not 10^9 as in English)
|
41
58
|
POWERS_OF_TEN = {
|
42
|
-
'un' => 0,
|
43
|
-
'dix' => 1,
|
44
|
-
'cent' => 2,
|
45
|
-
'
|
46
|
-
'
|
47
|
-
'
|
48
|
-
'
|
59
|
+
'un' => 0, # 10^0 = 1 (ones place)
|
60
|
+
'dix' => 1, # 10^1 = 10 (tens place)
|
61
|
+
'cent' => 2, # 10^2 = 100 (hundreds, singular)
|
62
|
+
'cents' => 2, # 10^2 = 100 (hundreds, plural)
|
63
|
+
'mille' => 3, # 10^3 = 1,000 (thousands, singular)
|
64
|
+
'milles' => 3, # 10^3 = 1,000 (thousands, plural)
|
65
|
+
'million' => 6, # 10^6 = 1,000,000 (millions, singular)
|
66
|
+
'millions' => 6, # 10^6 = 1,000,000 (millions, plural)
|
67
|
+
'milliard' => 9, # 10^9 = 1,000,000,000 (English "billion", singular)
|
68
|
+
'milliards' => 9, # 10^9 = 1,000,000,000 (English "billion", plural)
|
69
|
+
'billion' => 12, # 10^12 = 1,000,000,000,000 (long-scale billion = English "trillion", singular)
|
70
|
+
'billions' => 12, # 10^12 = 1,000,000,000,000 (long-scale billion = English "trillion", plural)
|
71
|
+
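'trillion' => 15, # 10^15 (singular); NOTE(review): same exponent as 'quadrillion' below, while a long-scale trillion is 10^18 - confirm the intended value
|
72
|
+
'trillions' => 15, # 10^15 (plural); NOTE(review): same exponent as 'quadrillion' below, while a long-scale trillion is 10^18 - confirm the intended value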
|
73
|
+
# Extended list of large number names for completeness
|
49
74
|
'quadrillion' => 15,
|
50
75
|
'quintillion' => 18,
|
51
76
|
'sextillion' => 21,
|
@@ -75,42 +100,88 @@ module StringToNumber
|
|
75
100
|
'trigintillion' => 93,
|
76
101
|
'untrigintillion' => 96,
|
77
102
|
'duotrigintillion' => 99,
|
78
|
-
'googol' => 100
|
103
|
+
'googol' => 100 # Special case: 10^100
|
79
104
|
}.freeze
|
80
105
|
|
106
|
+
# Prepares a converter for one French sentence.
#
# @param sentence [String, nil] The French text to convert; nil is treated
#   as the empty string
def initialize(sentence = '')
  # Multiplier words, minus 'un' and 'dix' which the parser treats as plain
  # values, ordered longest first so "cents" is tried before "cent".
  multiplier_words = POWERS_OF_TEN.keys - %w[un dix]
  @keys = multiplier_words.sort_by(&:length).reverse.join('|')

  # Lowercase once so all later matching is case-insensitive.
  lowered = sentence&.downcase
  @sentence = lowered || ''
end
|
85
118
|
|
119
|
+
# Converts the sentence given to the constructor into a number by running
# the recursive extraction with this instance's multiplier pattern.
#
# @return [Integer] The numeric value of the French text
def to_number
  source_text = @sentence
  extract(source_text, keys)
end
|
89
124
|
|
90
125
|
private
|
91
126
|
|
127
|
+
# Main recursive extraction method that parses French number patterns
|
128
|
+
# This is the core of the parsing algorithm
|
129
|
+
# @param sentence [String] The French text to parse
|
130
|
+
# @param keys [String] Regex pattern of power-of-ten multipliers
|
131
|
+
# @param detail [Boolean] If true, returns detailed parsing info for recursion
|
132
|
+
# @return [Integer, Hash] Numeric value or detailed parsing hash
|
92
133
|
def extract(sentence, keys, detail: false)
|
134
|
+
# Base cases: handle empty/nil input
|
93
135
|
return 0 if sentence.nil? || sentence.empty?
|
136
|
+
|
137
|
+
# Ensure case-insensitive matching
|
138
|
+
sentence = sentence.downcase
|
139
|
+
|
140
|
+
# Direct lookup for simple cases (e.g., "vingt" -> 20)
|
94
141
|
return EXCEPTIONS[sentence] unless EXCEPTIONS[sentence].nil?
|
95
142
|
|
96
|
-
|
97
|
-
|
98
|
-
|
143
|
+
# Main parsing logic: look for pattern "factor + multiplier"
|
144
|
+
# Example: "cinq cents" -> factor="cinq", multiplier="cents"
|
145
|
+
# Regex explanation:
|
146
|
+
# (?<f>.*?) - Non-greedy capture of factor part (before multiplier)
|
147
|
+
# \s? - Optional space
|
148
|
+
# (?<m>#{keys}) - Named capture of multiplier from keys pattern
|
149
|
+
if (result = /(?<f>.*?)\s?(?<m>#{keys})/.match(sentence))
|
150
|
+
# Remove the matched portion from sentence for further processing
|
151
|
+
sentence.gsub!(::Regexp.last_match(0), '') if ::Regexp.last_match(0)
|
152
|
+
|
153
|
+
# Parse the factor part (number before the multiplier)
|
154
|
+
# Example: "cinq" -> 5, "deux cent" -> 200
|
155
|
+
factor = EXCEPTIONS[result[:f]] || match(result[:f])
|
99
156
|
|
100
|
-
#
|
101
|
-
factor
|
102
|
-
|
157
|
+
# Handle implicit factor of 1 for standalone multipliers
|
158
|
+
# Example: "million" -> factor=1, but only for top-level calls
|
159
|
+
# For recursive calls (detail=true), keep factor as 0 to avoid double-counting
|
160
|
+
factor = 1 if factor.zero? && !detail
|
161
|
+
|
162
|
+
# Calculate the multiplier value (10^exponent)
|
163
|
+
# Example: "cents" -> 10^2 = 100, "millions" -> 10^6 = 1,000,000
|
103
164
|
multiple_of_ten = 10**(POWERS_OF_TEN[result[:m]] || 0)
|
104
165
|
|
105
|
-
#
|
166
|
+
# Handle compound numbers with higher-order multipliers
|
167
|
+
# Example: "cinq cents millions" - after matching "cinq cents",
|
168
|
+
# check if "millions" (a higher multiplier than "cents") remains
|
106
169
|
if /#{higher_multiple(result[:m]).keys.join('|')}/.match(sentence)
|
170
|
+
# Recursively process the higher multiplier
|
107
171
|
details = extract(sentence, keys, detail: true)
|
108
172
|
|
109
|
-
|
173
|
+
# Combine the current factor*multiplier with the higher multiplier
|
174
|
+
# Example: For "cinq cents millions":
|
175
|
+
# - factor = 5, multiple_of_ten = 100 (from "cinq cents")
|
176
|
+
# - details[:factor] = 0, details[:multiple_of_ten] = 1000000 (from "millions")
|
177
|
+
# - result: factor = (5 * 100) + 0 = 500, multiple_of_ten = 1000000
|
178
|
+
# - final: 500 * 1000000 = 500,000,000
|
179
|
+
factor = (factor * multiple_of_ten) + details[:factor]
|
110
180
|
multiple_of_ten = details[:multiple_of_ten]
|
111
|
-
sentence
|
181
|
+
sentence = details[:sentence]
|
112
182
|
end
|
113
183
|
|
184
|
+
# Return detailed parsing info for recursive calls
|
114
185
|
if detail
|
115
186
|
return {
|
116
187
|
factor: factor,
|
@@ -119,33 +190,69 @@ module StringToNumber
|
|
119
190
|
}
|
120
191
|
end
|
121
192
|
|
122
|
-
|
193
|
+
# Final calculation: process any remaining sentence + current factor*multiplier
|
194
|
+
# Example: For "trois millions cinq cents", this handles the "cinq cents" part
|
195
|
+
extract(sentence, keys) + (factor * multiple_of_ten)
|
123
196
|
|
124
|
-
|
197
|
+
# Special case handling for "quatre-vingt" variations
|
198
|
+
# This complex regex handles the irregular French "eighty" patterns:
|
199
|
+
# - "quatre-vingt" / "quatre vingts" (with/without 's')
|
200
|
+
# - "quatre-vingt-dix" / "quatre vingts dix" (90)
|
201
|
+
# - Space vs hyphen variations
|
202
|
+
elsif (m = /(quatre(-|\s)vingt(s?)((-|\s)dix)?)((-|\s)?)(\w*)/.match(sentence))
|
203
|
+
# Normalize spacing to hyphens for consistent lookup
|
125
204
|
normalize_str = m[1].tr(' ', '-')
|
126
|
-
normalize_str = normalize_str[0...-1] if normalize_str[normalize_str.length] == 's'
|
127
205
|
|
206
|
+
# Remove trailing 's' from "quatre-vingts" if present
|
207
|
+
# Bug fix: use [-1] instead of [length] for last character
|
208
|
+
normalize_str = normalize_str[0...-1] if normalize_str[-1] == 's'
|
209
|
+
|
210
|
+
# Remove the matched portion from sentence
|
128
211
|
sentence.gsub!(m[0], '')
|
129
212
|
|
130
|
-
|
131
|
-
|
213
|
+
# Return sum of: remaining sentence + normalized quatre-vingt value + any suffix
|
214
|
+
# Example: "quatre-vingt-cinq" -> EXCEPTIONS["quatre-vingt"] + EXCEPTIONS["cinq"]
|
215
|
+
extract(sentence, keys) +
|
216
|
+
EXCEPTIONS[normalize_str] + (EXCEPTIONS[m[8]] || 0)
|
132
217
|
else
|
133
|
-
|
218
|
+
# Fallback: use match() method for simple word combinations
|
219
|
+
match(sentence)
|
134
220
|
end
|
135
221
|
end
|
136
222
|
|
223
|
+
# Fallback for word sequences that contain no multiplier pattern: adds up
# the value of each individual word.
#
# Hyphens are treated as spaces, "et" and unknown words contribute 0, and a
# word found only in POWERS_OF_TEN contributes 10 raised to its exponent.
#
# @param sentence [String, nil] French text to parse as individual words
# @return [Integer, nil] Sum of the word values, or nil when sentence is nil
def match(sentence)
  return if sentence.nil?

  # Example: "vingt et un" -> 20 + 0 + 1 = 21
  sentence.downcase.tr('-', ' ').split.sum do |word|
    # "et" (and) only joins words and carries no value
    next 0 if word == 'et'

    exponent = POWERS_OF_TEN[word]
    # POWERS_OF_TEN stores exponents, so the value is 10**exponent
    # ("cent" => 2 => 100); multiplying by 10 gave 20 for "cent".
    EXCEPTIONS[word] || (exponent.nil? ? 0 : 10**exponent)
  end
end
|
148
249
|
|
250
|
+
# Helper method to find multipliers with higher powers than the given one
|
251
|
+
# Used to detect when compound numbers have higher-order multipliers
|
252
|
+
# @param multiple [String] The current multiplier word (e.g., "cents")
|
253
|
+
# @return [Hash] Hash of multipliers with higher powers of 10
|
254
|
+
# Example: higher_multiple("cents") returns {"mille"=>3, "million"=>6, ...}
|
255
|
+
# because 10^3, 10^6, etc. are all > 10^2 (cents)
|
149
256
|
def higher_multiple(multiple)
|
150
257
|
POWERS_OF_TEN.select do |_k, v|
|
151
258
|
v > POWERS_OF_TEN[multiple]
|