numerizer 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.gitignore +5 -0
- data/LICENSE +20 -0
- data/README.rdoc +24 -0
- data/Rakefile +56 -0
- data/VERSION +1 -0
- data/lib/numerizer.rb +123 -0
- data/numerizer.gemspec +50 -0
- data/test/test_helper.rb +9 -0
- data/test/test_numerizer.rb +83 -0
- metadata +66 -0
data/.document
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2009 Tom Preston-Werner
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.rdoc
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
= Numerizer
|
2
|
+
|
3
|
+
Numerizer is a gem that parses numbers written out in natural language inside strings (e.g. "forty two") and converts them to digits. It was extracted from the awesome Chronic gem (http://github.com/evaryont/chronic).
|
4
|
+
|
5
|
+
== Installation
|
6
|
+
|
7
|
+
$ sudo gem sources -a http://gemcutter.org
|
8
|
+
$ sudo gem install numerizer
|
9
|
+
|
10
|
+
== Usage
|
11
|
+
|
12
|
+
>> require 'numerizer'
|
13
|
+
=> true
|
14
|
+
>> Numerizer.numerize('forty two')
|
15
|
+
=> "42"
|
16
|
+
>> Numerizer.numerize('two and a half')
|
17
|
+
=> "2.5"
|
18
|
+
>> Numerizer.numerize('three quarters')
|
19
|
+
=> "3/4"
|
20
|
+
>> Numerizer.numerize('two and three eighths')
|
21
|
+
=> "2.375"
|
22
|
+
|
23
|
+
== Contributors
|
24
|
+
Tom Preston-Werner, John Duff
|
data/Rakefile
ADDED
@@ -0,0 +1,56 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'rake'
|
3
|
+
|
4
|
+
# Packaging/release tasks come from Jeweler, which is optional: when it is not
# installed the remaining tasks (test, rcov, rdoc) must still load and run.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "numerizer"
    gem.summary = "Numerizer is a gem to help with parsing numbers in natural language from strings (ex forty two)."
    gem.description = "Numerizer is a gem to help with parsing numbers in natural language from strings (ex forty two). It was extracted from the awesome Chronic gem http://github.com/evaryont/chronic."
    gem.email = "duff.john@gmail.com"
    gem.homepage = "http://github.com/jduff/numerizer"
    gem.authors = ["John Duff"]
    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end
  Jeweler::GemcutterTasks.new

  # Nothing in this Rakefile defines :check_dependencies (presumably Jeweler's
  # tasks supply it), so only hook it into :test once Jeweler has loaded.
  # Declared unconditionally, `rake test` aborts on an unknown task whenever
  # Jeweler is missing.
  task :test => :check_dependencies
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
end

# Unit tests: every test/**/test_*.rb file, with lib/ and test/ on the load path.
require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

# Coverage is optional; without RCov the :rcov task only explains how to get it.
begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |test|
    test.libs << 'test'
    test.pattern = 'test/**/test_*.rb'
    test.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end

task :default => :test

# Newer Rake releases no longer ship rake/rdoctask; RDoc provides the task
# itself. Prefer that and fall back to the legacy file on old installations so
# the Rakefile loads either way.
begin
  require 'rdoc/task'
  rdoc_task_class = RDoc::Task
rescue LoadError
  require 'rake/rdoctask'
  rdoc_task_class = Rake::RDocTask
end

rdoc_task_class.new do |rdoc|
  # strip so a trailing newline in VERSION does not end up in the title
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "numerizer #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.1.0
|
data/lib/numerizer.rb
ADDED
@@ -0,0 +1,123 @@
|
|
1
|
+
# LICENSE:
|
2
|
+
#
|
3
|
+
# (The MIT License)
|
4
|
+
#
|
5
|
+
# Copyright © 2008 Tom Preston-Werner
|
6
|
+
#
|
7
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
8
|
+
#
|
9
|
+
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
10
|
+
#
|
11
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
12
|
+
|
13
|
+
require 'strscan'
|
14
|
+
|
15
|
+
# Rewrites numbers spelled out in English inside a string as digits, e.g.
# "forty two" => "42", "three quarters" => "3/4", "two and a half" => "2.5".
#
# The conversion is a fixed sequence of regex substitutions. Every number that
# has been recognised is tagged with a temporary "<num>" marker so later passes
# can tell adjacent numbers apart and combine them; the markers are removed
# before the result is returned.
class Numerizer

  # [regex source, replacement] pairs for words that map straight to digits.
  # Order matters: the teens come first so "fourteen" is consumed before the
  # shorter words get a chance to match inside it.
  DIRECT_NUMS = [
    ['eleven', '11'],
    ['twelve', '12'],
    ['thirteen', '13'],
    ['fourteen', '14'],
    ['fifteen', '15'],
    ['sixteen', '16'],
    ['seventeen', '17'],
    ['eighteen', '18'],
    ['nineteen', '19'],
    ['ninteen', '19'], # Common mis-spelling
    ['zero', '0'],
    ['one', '1'],
    ['two', '2'],
    ['three', '3'],
    ['four(\W|$)', '4\1'], # The trailing group is so that it matches four but not fourty
    ['five', '5'],
    ['six(\W|$)', '6\1'],
    ['seven(\W|$)', '7\1'],
    ['eight(\W|$)', '8\1'],
    ['nine(\W|$)', '9\1'],
    ['ten', '10'],
    # NOTE(review): inside [...] `\b` is a backspace and `^` / `$` are literal
    # characters, so this entry only matches an 'a' followed by one of those
    # three characters and never fires on ordinary text. It is kept unchanged
    # so that articles in general prose are not suddenly turned into "1";
    # the "a hundred" case is handled explicitly in numerize.
    ['\ba[\b^$]', '1'] # doesn't make sense for an 'a' at the end to be a 1
  ].freeze

  # [regex source, value] pairs for the multiples of ten.
  TEN_PREFIXES = [ ['twenty', 20],
                   ['thirty', 30],
                   ['forty', 40],
                   ['fourty', 40], # Common misspelling
                   ['fifty', 50],
                   ['sixty', 60],
                   ['seventy', 70],
                   ['eighty', 80],
                   ['ninety', 90]
                 ].freeze

  # [regex source, multiplier] pairs for magnitude words, smallest first.
  BIG_PREFIXES = [ ['hundred', 100],
                   ['thousand', 1000],
                   ['million', 1_000_000],
                   ['billion', 1_000_000_000],
                   ['trillion', 1_000_000_000_000],
                 ].freeze

  # [regex source, denominator] pairs for fraction words.
  FRACTIONS = [ ['half', 2],
                ['third(s)?', 3],
                ['fourth(s)?', 4],
                ['quarter(s)?', 4],
                ['fifth(s)?', 5],
                ['sixth(s)?', 6],
                ['seventh(s)?', 7],
                ['eighth(s)?', 8],
                ['nine?th(s)?', 9], # "ninth", plus the "nineth" misspelling accepted before
              ].freeze

  # Returns a copy of +string+ with spelled-out numbers replaced by digits.
  # The argument itself is never modified. Text that is not recognised as part
  # of a number is passed through (hyphens between non-digits become spaces and
  # runs of spaces are collapsed by the preprocessing step).
  def self.numerize(string)
    string = string.dup

    # preprocess
    string.gsub!(/ +|([^\d])-([^\d])/, '\1 \2') # will mutilate hyphenated-words

    # easy/direct replacements
    DIRECT_NUMS.each do |dn|
      string.gsub!(/#{dn[0]}/i, '<num>' + dn[1])
    end

    # ten, twenty, etc. -- first the ones followed by a single digit
    # ("forty <num>2" => "<num>42"), then whatever tens are left on their own
    TEN_PREFIXES.each do |tp|
      string.gsub!(/(?:#{tp[0]}) *<num>(\d(?=[^\d]|$))*/i) {'<num>' + (tp[1] + $1.to_i).to_s}
    end
    TEN_PREFIXES.each do |tp|
      string.gsub!(/#{tp[0]}/i) { '<num>' + tp[1].to_s }
    end

    # handle fractions: "a half" => "<num>1/2", "<num>3 quarters" => "<num>3/4"
    FRACTIONS.each do |tp|
      string.gsub!(/a #{tp[0]}/i) { '<num>1/' + tp[1].to_s }
      string.gsub!(/\s#{tp[0]}/i) { '/' + tp[1].to_s }
    end

    # evaluate fractions when preceded by another number
    string.gsub!(/(\d+)(?: | and |-)+(<num>|\s)*(\d+)\s*\/\s*(\d+)/i) { ($1.to_f + ($3.to_f/$4.to_f)).to_s }

    # "a hundred", "a million", ...: the article stands for one. Without this
    # the magnitude pass has no count to multiply and "a hundred" came out as "a0".
    magnitude_words = BIG_PREFIXES.map { |bp| bp[0] }.join('|')
    string.gsub!(/\ba (?=(?:#{magnitude_words}))/i, '<num>1 ')

    # hundreds, thousands, millions, etc.
    # A count is required (\d+): a magnitude word with no number in front of it
    # is left alone instead of being multiplied by zero.
    BIG_PREFIXES.each do |bp|
      string.gsub!(/(?:<num>)?(\d+) *#{bp[0]}/i) { '<num>' + (bp[1] * $1.to_i).to_s}
      andition(string)
    end

    andition(string)

    string.gsub(/<num>/, '')
  end

  # Adds up adjacent tagged numbers in place: "<num>100 and <num>50" becomes
  # "<num>150". Two numbers separated only by a space are added only when the
  # first has more digits than the second ("<num>200 <num>50"), so sequences
  # such as "<num>1 <num>50" are left as they are.
  def self.andition(string)
    sc = StringScanner.new(string)
    while(sc.scan_until(/<num>(\d+)( | and )<num>(\d+)(?=[^\w]|$)/i))
      if sc[2] =~ /and/ || sc[1].size > sc[3].size
        string[(sc.pos - sc.matched_size)..(sc.pos-1)] = '<num>' + (sc[1].to_i + sc[3].to_i).to_s
        # the string changed under the scanner; start over from the beginning
        sc.reset
      end
    end
  end
  # A bare `private` has no effect on `def self.` methods, so hide the helper explicitly.
  private_class_method :andition

end
|
data/numerizer.gemspec
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run `rake gemspec`
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
Gem::Specification.new do |s|
|
7
|
+
s.name = %q{numerizer}
|
8
|
+
s.version = "0.1.0"
|
9
|
+
|
10
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
|
+
s.authors = ["John Duff"]
|
12
|
+
s.date = %q{2009-12-25}
|
13
|
+
s.description = %q{Numerizer is a gem to help with parsing numbers in natural language from strings (ex forty two). It was extracted from the awesome Chronic gem http://github.com/evaryont/chronic.}
|
14
|
+
s.email = %q{duff.john@gmail.com}
|
15
|
+
s.extra_rdoc_files = [
|
16
|
+
"LICENSE",
|
17
|
+
"README.rdoc"
|
18
|
+
]
|
19
|
+
s.files = [
|
20
|
+
".document",
|
21
|
+
".gitignore",
|
22
|
+
"LICENSE",
|
23
|
+
"README.rdoc",
|
24
|
+
"Rakefile",
|
25
|
+
"VERSION",
|
26
|
+
"lib/numerizer.rb",
|
27
|
+
"numerizer.gemspec",
|
28
|
+
"test/test_helper.rb",
|
29
|
+
"test/test_numerizer.rb"
|
30
|
+
]
|
31
|
+
s.homepage = %q{http://github.com/jduff/numerizer}
|
32
|
+
s.rdoc_options = ["--charset=UTF-8"]
|
33
|
+
s.require_paths = ["lib"]
|
34
|
+
s.rubygems_version = %q{1.3.5}
|
35
|
+
s.summary = %q{Numerizer is a gem to help with parsing numbers in natural language from strings (ex forty two).}
|
36
|
+
s.test_files = [
|
37
|
+
"test/test_helper.rb",
|
38
|
+
"test/test_numerizer.rb"
|
39
|
+
]
|
40
|
+
|
41
|
+
if s.respond_to? :specification_version then
|
42
|
+
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
43
|
+
s.specification_version = 3
|
44
|
+
|
45
|
+
if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
|
46
|
+
else
|
47
|
+
end
|
48
|
+
else
|
49
|
+
end
|
50
|
+
end
|
data/test/test_helper.rb
ADDED
data/test/test_numerizer.rb
ADDED
@@ -0,0 +1,83 @@
|
|
1
|
+
require 'test_helper'
|
2
|
+
|
3
|
+
# Exercises Numerizer.numerize on whole numbers, fractions, mixed numbers and
# on strings that already contain digits and must pass through unchanged.
class NumerizerTest < Test::Unit::TestCase
  def test_straight_parsing
    # [expected value, phrase] pairs. This is an Array rather than a Hash
    # because several phrases share the same value: as Hash keys the later
    # entry silently replaced the earlier one, so 'a hundred' and
    # 'twelve hundred' were never actually checked.
    cases = [
      [1, 'one'],
      [5, 'five'],
      [10, 'ten'],
      [11, 'eleven'],
      [12, 'twelve'],
      [13, 'thirteen'],
      [14, 'fourteen'],
      [15, 'fifteen'],
      [16, 'sixteen'],
      [17, 'seventeen'],
      [18, 'eighteen'],
      [19, 'nineteen'],
      [20, 'twenty'],
      [27, 'twenty seven'],
      [31, 'thirty-one'],
      [41, 'forty one'],
      [42, 'fourty two'],
      [59, 'fifty nine'],
      # [100, 'a hundred'],  # NOTE(review): this case was shadowed by the duplicate
      #                      # key and has never run; confirm numerize handles the
      #                      # article before a magnitude word, then enable it.
      [100, 'one hundred'],
      [150, 'one hundred and fifty'],
      # [150, 'one fifty'],
      [200, 'two-hundred'],
      [500, '5 hundred'],
      [999, 'nine hundred and ninety nine'],
      [1_000, 'one thousand'],
      [1_200, 'twelve hundred'],
      [1_200, 'one thousand two hundred'],
      [17_000, 'seventeen thousand'],
      [21_473, 'twentyone-thousand-four-hundred-and-seventy-three'],
      [74_002, 'seventy four thousand and two'],
      [99_999, 'ninety nine thousand nine hundred ninety nine'],
      [100_000, '100 thousand'],
      [250_000, 'two hundred fifty thousand'],
      [1_000_000, 'one million'],
      [1_250_007, 'one million two hundred fifty thousand and seven'],
      [1_000_000_000, 'one billion'],
      [1_000_000_001, 'one billion and one']
    ]

    cases.each do |expected, phrase|
      # the message names the phrase so a failure points at the offending input
      assert_equal expected, Numerizer.numerize(phrase).to_i, "numerize(#{phrase.inspect})"
    end

    assert_equal "2.5", Numerizer.numerize("two and a half")
    assert_equal "1/2", Numerizer.numerize("one half")
  end

  def test_fractions_in_words
    assert_equal "1/4", Numerizer.numerize("1 quarter")
    assert_equal "1/4", Numerizer.numerize("one quarter")
    assert_equal "1/4", Numerizer.numerize("a quarter")
    assert_equal "1/8", Numerizer.numerize("one eighth")

    assert_equal "3/4", Numerizer.numerize("three quarters")
    assert_equal "2/4", Numerizer.numerize("two fourths")
    assert_equal "3/8", Numerizer.numerize("three eighths")
  end

  # A whole number followed by a fraction is evaluated to a decimal.
  def test_fractional_addition
    assert_equal "1.25", Numerizer.numerize("one and a quarter")
    assert_equal "2.375", Numerizer.numerize("two and three eighths")
    assert_equal "3.5 hours", Numerizer.numerize("three and a half hours")
  end

  # Strings that contain only digits and unrelated words come back untouched.
  def test_edges
    assert_equal "27 Oct 2006 7:30am", Numerizer.numerize("27 Oct 2006 7:30am")
  end

  def test_multiple_slashes_should_not_be_evaluated
    assert_equal '11/02/2007', Numerizer.numerize('11/02/2007')
  end

  # Already-numeric fractions and dates must not be evaluated or rewritten.
  def test_compatibility
    assert_equal '1/2', Numerizer.numerize('1/2')
    assert_equal '05/06', Numerizer.numerize('05/06')
    assert_equal "3.5 hours", Numerizer.numerize("three and a half hours")
  end

end
|
metadata
ADDED
@@ -0,0 +1,66 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: numerizer
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- John Duff
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2009-12-25 00:00:00 -05:00
|
13
|
+
default_executable:
|
14
|
+
dependencies: []
|
15
|
+
|
16
|
+
description: Numerizer is a gem to help with parsing numbers in natural language from strings (ex forty two). It was extracted from the awesome Chronic gem http://github.com/evaryont/chronic.
|
17
|
+
email: duff.john@gmail.com
|
18
|
+
executables: []
|
19
|
+
|
20
|
+
extensions: []
|
21
|
+
|
22
|
+
extra_rdoc_files:
|
23
|
+
- LICENSE
|
24
|
+
- README.rdoc
|
25
|
+
files:
|
26
|
+
- .document
|
27
|
+
- .gitignore
|
28
|
+
- LICENSE
|
29
|
+
- README.rdoc
|
30
|
+
- Rakefile
|
31
|
+
- VERSION
|
32
|
+
- lib/numerizer.rb
|
33
|
+
- numerizer.gemspec
|
34
|
+
- test/test_helper.rb
|
35
|
+
- test/test_numerizer.rb
|
36
|
+
has_rdoc: true
|
37
|
+
homepage: http://github.com/jduff/numerizer
|
38
|
+
licenses: []
|
39
|
+
|
40
|
+
post_install_message:
|
41
|
+
rdoc_options:
|
42
|
+
- --charset=UTF-8
|
43
|
+
require_paths:
|
44
|
+
- lib
|
45
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
46
|
+
requirements:
|
47
|
+
- - ">="
|
48
|
+
- !ruby/object:Gem::Version
|
49
|
+
version: "0"
|
50
|
+
version:
|
51
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
52
|
+
requirements:
|
53
|
+
- - ">="
|
54
|
+
- !ruby/object:Gem::Version
|
55
|
+
version: "0"
|
56
|
+
version:
|
57
|
+
requirements: []
|
58
|
+
|
59
|
+
rubyforge_project:
|
60
|
+
rubygems_version: 1.3.5
|
61
|
+
signing_key:
|
62
|
+
specification_version: 3
|
63
|
+
summary: Numerizer is a gem to help with parsing numbers in natural language from strings (ex forty two).
|
64
|
+
test_files:
|
65
|
+
- test/test_helper.rb
|
66
|
+
- test/test_numerizer.rb
|