hpoydar-chronic_duration 0.5.0 → 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +4 -0
- data/lib/chronic_duration.rb +2 -1
- data/lib/numerizer.rb +97 -0
- data/spec/chronic_duration_spec.rb +3 -1
- metadata +3 -2
data/README.rdoc
CHANGED
data/lib/chronic_duration.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
require 'numerizer'
|
1
2
|
module ChronicDuration
|
2
3
|
extend self
|
3
4
|
|
@@ -99,7 +100,7 @@ private
|
|
99
100
|
end
|
100
101
|
|
101
102
|
def cleanup(string)
|
102
|
-
res = filter_by_type(string)
|
103
|
+
res = filter_by_type(Numerizer.numerize(string))
|
103
104
|
res = res.gsub(float_matcher) {|n| " #{n} "}.squeeze(' ').strip
|
104
105
|
res = filter_through_white_list(res)
|
105
106
|
end
|
data/lib/numerizer.rb
ADDED
@@ -0,0 +1,97 @@
|
|
1
|
+
require 'strscan'
|
2
|
+
|
3
|
+
# Converts English number words embedded in a string into digits, e.g.
# "three mins four sec" => "3 mins 4 sec".
#
# The conversion runs as a series of in-place regex passes. Every number the
# passes produce is temporarily prefixed with the marker "<num>" so later
# passes can tell generated numbers apart from surrounding text; the markers
# are stripped just before the result is returned.
class Numerizer
  # [pattern source, replacement] pairs for words that map directly to a
  # number. Patterns are interpolated into a case-insensitive regex and are
  # applied in this order.
  DIRECT_NUMS = [
    ['eleven', '11'],
    ['twelve', '12'],
    ['thirteen', '13'],
    ['fourteen', '14'],
    ['fifteen', '15'],
    ['sixteen', '16'],
    ['seventeen', '17'],
    ['eighteen', '18'],
    ['nineteen', '19'],
    ['ninteen', '19'], # Common misspelling
    ['zero', '0'],
    ['one', '1'],
    ['two', '2'],
    ['three', '3'],
    # The trailing (\W|$) group keeps these from matching the start of a
    # longer word (four vs. fourty, six vs. sixty, ...); the captured
    # delimiter is put back via \1.
    ['four(\W|$)', '4\1'],
    ['five', '5'],
    ['six(\W|$)', '6\1'],
    ['seven(\W|$)', '7\1'],
    ['eight(\W|$)', '8\1'],
    ['nine(\W|$)', '9\1'],
    ['ten', '10'],
    # Intended: a standalone 'a' means 1, except at the end of the string.
    # NOTE(review): [\b^$] is a character class (backspace, '^' or '$'), so
    # this pattern practically never matches. Left unchanged on purpose,
    # because turning every standalone "a" into 1 would change how existing
    # inputs parse -- confirm the desired behaviour before fixing.
    ['\ba[\b^$]', '1']
  ].freeze

  # [word, value] pairs for the multiples of ten.
  TEN_PREFIXES = [
    ['twenty', 20],
    ['thirty', 30],
    ['forty', 40],
    ['fourty', 40], # Common misspelling
    ['fifty', 50],
    ['sixty', 60],
    ['seventy', 70],
    ['eighty', 80],
    ['ninety', 90]
  ].freeze

  # [word, value] pairs for magnitude words, smallest first. The order matters:
  # "three thousand two hundred" must resolve "two hundred" before "thousand".
  BIG_PREFIXES = [
    ['hundred', 100],
    ['thousand', 1000],
    ['million', 1_000_000],
    ['billion', 1_000_000_000],
    ['trillion', 1_000_000_000_000]
  ].freeze

  # Returns a copy of +string+ with number words replaced by digits.
  # The argument itself is never modified.
  #
  #   Numerizer.numerize('two hours and twenty minutes') # => "2 hours and 20 minutes"
  #   Numerizer.numerize('two and a half hours')         # => "2.5 hours"
  def self.numerize(string)
    string = string.dup

    # Preprocess: collapse runs of spaces and turn hyphens between non-digits
    # into spaces (this mutilates hyphenated words, which does not matter for
    # number extraction).
    string.gsub!(/ +|([^\d])-([^\d])/, '\1 \2')
    # Hide the 'a' of "a half" so it cannot be read as a 1; the half itself is
    # added back in the fractional pass below.
    string.gsub!(/a half/, 'haAlf')

    # Easy/direct replacements.
    DIRECT_NUMS.each do |word, digits|
      string.gsub!(/#{word}/i, "<num>#{digits}")
    end

    # Tens followed by a single already-converted digit: "twenty <num>3" => 23.
    TEN_PREFIXES.each do |word, value|
      string.gsub!(/(?:#{word}) *<num>(\d(?=[^\d]|$))*/i) do
        "<num>#{value + Regexp.last_match(1).to_i}"
      end
    end

    # Remaining tens that stand on their own.
    TEN_PREFIXES.each do |word, value|
      string.gsub!(/#{word}/i) { "<num>#{value}" }
    end

    # Hundreds, thousands, millions, etc.
    BIG_PREFIXES.each do |word, value|
      string.gsub!(/(?:<num>)?(\d*)( *)#{word}/i) do
        multiplier = Regexp.last_match(1)
        gap = Regexp.last_match(2)
        if multiplier.empty?
          # A bare "hundred" means one hundred, not zero; the whitespace in
          # front of it belongs to the surrounding text, so keep it.
          "#{gap}<num>#{value}"
        else
          "<num>#{value * multiplier.to_i}"
        end
      end
      andition(string)
    end

    # Fractional addition. Kept separate from the passes above because float
    # arithmetic would litter the intermediate strings with extraneous ".0"s.
    string.gsub!(/(\d+)(?: | and |-)*haAlf/i) { (Regexp.last_match(1).to_f + 0.5).to_s }

    string.gsub(/<num>/, '')
  end

  # Internal helper for .numerize (not part of the intended public API; it is
  # left callable only so existing callers keep working).
  #
  # Merges adjacent marked numbers in +string+ in place by adding them:
  # "<num>100 and <num>5" => "<num>105". Two numbers are merged when they are
  # joined by " and ", or when the first has more digits than the second
  # ("<num>3000 <num>200"). After every merge the search restarts from the
  # beginning, because the merged number may combine with a neighbour.
  #
  # Offsets come from MatchData, which reports character positions, so text
  # containing multibyte characters is spliced correctly.
  def self.andition(string)
    pattern = /<num>(\d+)( | and )<num>(\d+)(?=[^\w]|$)/i
    search_from = 0

    while (found = pattern.match(string, search_from))
      if found[2] =~ /and/ || found[1].size > found[3].size
        string[found.begin(0)...found.end(0)] = "<num>#{found[1].to_i + found[3].to_i}"
        search_from = 0
      else
        # Not mergeable: continue scanning after this pair.
        search_from = found.end(0)
      end
    end
  end
end
|
@@ -19,11 +19,13 @@ describe ChronicDuration, '.parse' do
|
|
19
19
|
'1:20.51' => 60 + 20.51,
|
20
20
|
'4:01:01' => 4 * 3600 + 60 + 1,
|
21
21
|
'3 mins 4 sec' => 3 * 60 + 4,
|
22
|
+
'three mins four sec' => 3 * 60 + 4,
|
22
23
|
'2 hrs 20 min' => 2 * 3600 + 20 * 60,
|
23
24
|
'2h20min' => 2 * 3600 + 20 * 60,
|
24
25
|
'6 mos 1 day' => 6 * 30 * 24 * 3600 + 24 * 3600,
|
25
26
|
'2.5 hrs' => 2.5 * 3600,
|
26
|
-
'47 yrs 6 mos and 4.5d' => 47 * 31557600 + 6 * 30 * 24 * 3600 + 4.5 * 24 * 3600
|
27
|
+
'47 yrs 6 mos and 4.5d' => 47 * 31557600 + 6 * 30 * 24 * 3600 + 4.5 * 24 * 3600,
|
28
|
+
'two hours and twenty minutes' => 2 * 3600 + 20 * 60
|
27
29
|
}
|
28
30
|
|
29
31
|
it "should return nil if the string can't be parsed" do
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: hpoydar-chronic_duration
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.0
|
4
|
+
version: 0.6.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Henry Poydar
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-
|
12
|
+
date: 2009-02-12 00:00:00 -08:00
|
13
13
|
default_executable:
|
14
14
|
dependencies: []
|
15
15
|
|
@@ -23,6 +23,7 @@ extra_rdoc_files: []
|
|
23
23
|
|
24
24
|
files:
|
25
25
|
- lib/chronic_duration.rb
|
26
|
+
- lib/numerizer.rb
|
26
27
|
- MIT-LICENSE
|
27
28
|
- Rakefile
|
28
29
|
- README.rdoc
|