herbalist 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
data/.gitignore ADDED
@@ -0,0 +1,5 @@
1
+ *.sw?
2
+ .DS_Store
3
+ coverage
4
+ rdoc
5
+ pkg
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 John Duff
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,36 @@
1
+ = herbalist
2
+
3
+ Herbalist is a natural language measurements parser. It is built on top of the Alchemist[http://github.com/toastyapps/alchemist] gem and draws heavily from the Chronic[http://github.com/evaryont/chronic] gem.
4
+
5
+ == Installation
6
+
7
+ $ sudo gem sources -a http://gemcutter.org
8
+ $ sudo gem install herbalist
9
+
10
+ == Usage
11
+
12
+ >> require 'herbalist'
13
+ >> Herbalist.parse("a quarter cup")
14
+ => #<Alchemist::NumericConversion @value=0.25, @exponent=1.0, @unit_name=:cup>
15
+
16
+ >> Herbalist.parse("it was two and a half football fields")
17
+ => #<Alchemist::NumericConversion @value=2.5, @exponent=1.0, @unit_name=:football_fields>
18
+
19
+ >> Herbalist.parse('twenty-one and a half Miles')
20
+ => #<Alchemist::NumericConversion @value=21.5, @exponent=1.0, @unit_name=:miles>
21
+
22
+ >> Herbalist.parse('fifteen kilowatt Hours')
23
+ => #<Alchemist::NumericConversion @value=15.0, @exponent=1.0, @unit_name=:kilowatt_hours>
24
+
25
+ == Limitations
26
+
27
+ Parsing is limited to units that Alchemist can understand.
28
+
29
+ == TODO
30
+
31
+ - Allow parsing of units directly next to digits (e.g. 4m, 21.5L)
32
+ - Add options to limit which types of units to parse (e.g. only volume or distance)
33
+
34
+ == Copyright
35
+
36
+ Copyright (c) 2009 John Duff. See LICENSE for details.
data/Rakefile ADDED
@@ -0,0 +1,64 @@
1
+ require 'rake/testtask'
2
+ require 'rake/packagetask'
3
+ require 'rake/rdoctask'
4
+ require 'rake'
5
+
6
+ begin
7
+ require 'jeweler'
8
+ Jeweler::Tasks.new do |gem|
9
+ gem.name = "herbalist"
10
+ gem.summary = %Q{Herbalist is a natural language measurements parser.}
11
+ gem.description = %Q{Herbalist is a natural language measurements parser. It is built on top of the Alchemist[http://github.com/toastyapps/alchemist] gem and draws heavily from the Chronic[http://github.com/evaryont/chronic].}
12
+ gem.email = "duff.john@gmail.com"
13
+ gem.homepage = "http://github.com/jduff/herbalist"
14
+ gem.authors = ["jduff"]
15
+
16
+ gem.add_dependency('alchemist', '>= 0.1.2')
17
+ gem.add_dependency('numerizer', '>= 0.1.1')
18
+
19
+ gem.add_development_dependency('jeremymcanally-context')
20
+ # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
21
+ end
22
+
23
+ Jeweler::GemcutterTasks.new
24
+ rescue LoadError
25
+ puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
26
+ end
27
+
28
+ require 'rake/testtask'
29
+ Rake::TestTask.new(:test) do |test|
30
+ test.libs << 'lib' << 'test'
31
+ test.pattern = 'test/**/test_*.rb'
32
+ test.verbose = true
33
+ end
34
+
35
+ begin
36
+ require 'rcov/rcovtask'
37
+ Rcov::RcovTask.new do |test|
38
+ test.libs << 'test'
39
+ test.pattern = 'test/**/test_*.rb'
40
+ test.verbose = true
41
+ end
42
+ rescue LoadError
43
+ task :rcov do
44
+ abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
45
+ end
46
+ end
47
+
48
+ task :test => :check_dependencies
49
+
50
+ task :default => :test
51
+
52
+ require 'rake/rdoctask'
53
+ Rake::RDocTask.new do |rdoc|
54
+ if File.exist?('VERSION')
55
+ version = File.read('VERSION')
56
+ else
57
+ version = ""
58
+ end
59
+
60
+ rdoc.rdoc_dir = 'rdoc'
61
+ rdoc.title = "herbalist #{version}"
62
+ rdoc.rdoc_files.include('README*')
63
+ rdoc.rdoc_files.include('lib/**/*.rb')
64
+ end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.0
data/herbalist.gemspec ADDED
@@ -0,0 +1,60 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run `rake gemspec`
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = %q{herbalist}
8
+ s.version = "0.1.0"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["jduff"]
12
+ s.date = %q{2010-01-02}
13
+ s.description = %q{Herbalist is a natural language measurements parser. It is built on top of the Alchemist[http://github.com/toastyapps/alchemist] gem and draws heavily from the Chronic[http://github.com/evaryont/chronic].}
14
+ s.email = %q{duff.john@gmail.com}
15
+ s.extra_rdoc_files = [
16
+ "LICENSE",
17
+ "README.rdoc"
18
+ ]
19
+ s.files = [
20
+ ".document",
21
+ ".gitignore",
22
+ "LICENSE",
23
+ "README.rdoc",
24
+ "Rakefile",
25
+ "VERSION",
26
+ "herbalist.gemspec",
27
+ "lib/herbalist.rb",
28
+ "lib/herbalist/herbalist.rb",
29
+ "test/test_helper.rb",
30
+ "test/test_herbalist.rb"
31
+ ]
32
+ s.homepage = %q{http://github.com/jduff/herbalist}
33
+ s.rdoc_options = ["--charset=UTF-8"]
34
+ s.require_paths = ["lib"]
35
+ s.rubygems_version = %q{1.3.5}
36
+ s.summary = %q{Herbalist is a natural language measurements parser.}
37
+ s.test_files = [
38
+ "test/test_helper.rb",
39
+ "test/test_herbalist.rb"
40
+ ]
41
+
42
+ if s.respond_to? :specification_version then
43
+ current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
44
+ s.specification_version = 3
45
+
46
+ if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
47
+ s.add_runtime_dependency(%q<alchemist>, [">= 0.1.2"])
48
+ s.add_runtime_dependency(%q<numerizer>, [">= 0.1.1"])
49
+ s.add_development_dependency(%q<jeremymcanally-context>, [">= 0"])
50
+ else
51
+ s.add_dependency(%q<alchemist>, [">= 0.1.2"])
52
+ s.add_dependency(%q<numerizer>, [">= 0.1.1"])
53
+ s.add_dependency(%q<jeremymcanally-context>, [">= 0"])
54
+ end
55
+ else
56
+ s.add_dependency(%q<alchemist>, [">= 0.1.2"])
57
+ s.add_dependency(%q<numerizer>, [">= 0.1.1"])
58
+ s.add_dependency(%q<jeremymcanally-context>, [">= 0"])
59
+ end
60
+ end
@@ -0,0 +1,159 @@
1
+ module Herbalist
2
+ # collect up all the possible unit types that Alchemist can handle
3
+ POSSIBLE_UNITS = Alchemist.conversion_table.collect{|k,v| v.keys}.flatten.uniq
4
+ MULTIWORD_UNITS = POSSIBLE_UNITS.collect{|u| u.to_s}.grep(/_/)
5
+
6
+ class << self
7
+ def parse(text)
8
+ text = text.dup
9
+ puts "TEXT: #{text}" if Herbalist.debug
10
+ @tokens = self.tokenize(text).select { |token| token.tagged? }
11
+ puts "TOKENS: #{@tokens}" if Herbalist.debug
12
+
13
+ return nil unless @tokens.length>1
14
+
15
+ # at the moment all we handle is a number followed by a unit
16
+ last_number = nil
17
+ result = nil
18
+ @tokens.each do |token|
19
+ if last_number && (unit=token.get_tag(:unit))
20
+ result = last_number.send(unit.value)
21
+ break
22
+ elsif (num=token.get_tag(:number))
23
+ last_number = num.value
24
+ end
25
+ end
26
+
27
+ return result
28
+ end
29
+
30
+ def tokenize(text)
31
+ # cleanup the string before tokenizing
32
+ text = normalize(text)
33
+ @tokens = text.split(' ').collect { |word| Token.new(word) }
34
+ @tokens = Tag.scan(@tokens)
35
+ end
36
+
37
+ private
38
+ def normalize(text)
39
+ # use Numerizer to convert any numbers written as words to digits in the string
40
+ text = Numerizer.numerize(text)
41
+ puts "NUMERIZED: #{text}" if Herbalist.debug
42
+
43
+ text = evaluate_fractions(text)
44
+ puts "FRACTIONED: #{text}" if Herbalist.debug
45
+
46
+ text = normalize_multiword(text)
47
+ puts "MULTIWORDED: #{text}" if Herbalist.debug
48
+ return text
49
+ end
50
+
51
+ # takes fractions in the string (1/4) and converts them to floats (0.25)
52
+ def evaluate_fractions(text)
53
+ text.gsub(/(\d+)\/(\d+)/) { ($1.to_f/$2.to_f).to_s }
54
+ end
55
+
56
+ def normalize_multiword(text)
57
+ MULTIWORD_UNITS.each do |unit|
58
+ text = text.gsub(/#{unit.split('_').join(' ')}/i, unit)
59
+ end
60
+ text
61
+ end
62
+ end
63
+
64
+ # based on the Token class found in Chronic
65
+ class Token
66
+ attr_accessor :word, :tags
67
+
68
+ def initialize(word)
69
+ @word = word
70
+ @tags = []
71
+ end
72
+
73
+ # Tag this token with the specified tag
74
+ def tag(new_tag)
75
+ @tags << new_tag
76
+ end
77
+
78
+ # Remove all tags of the given type
79
+ def untag(tag_type)
80
+ @tags = @tags.select { |m| m.type!=tag_type }
81
+ end
82
+
83
+ # Return true if this token has any tags
84
+ def tagged?
85
+ @tags.size > 0
86
+ end
87
+
88
+ # Return the first Tag that matches the given type
89
+ def get_tag(tag_type)
90
+ matches = @tags.select { |m| m.type==tag_type }
91
+ return matches.first
92
+ end
93
+
94
+ # Print this Token in a pretty way
95
+ def to_s
96
+ "#{@word}(#{@tags.join(', ')})"
97
+ end
98
+ end
99
+
100
+
101
+ class Tag
102
+ attr_accessor :value, :type
103
+
104
+ def initialize(type, value)
105
+ @type = type
106
+ @value = value
107
+ end
108
+
109
+ # scan the given tokens and tag any matches
110
+ def self.scan(tokens)
111
+ tokens.each do |token|
112
+ if t = self.scan_for_numbers(token) then token.tag(t) end
113
+ if t = self.scan_for_units(token) then token.tag(t) end
114
+ end
115
+ tokens
116
+ end
117
+
118
+ # check the token to see if it is a number
119
+ # then tag it
120
+ def self.scan_for_numbers(token)
121
+ if token.word =~ /(^|\W)(\d*\.\d+)($|\W)/ || token.word =~ /(^|\W)(\d+)($|\W)/
122
+ return Tag.new(:number, $2.to_f)
123
+ end
124
+ return nil
125
+ end
126
+
127
+ # check the token and see if it is a type of unit that Alchemist can handle
128
+ # then tag the token with that unit type
129
+ def self.scan_for_units(token)
130
+ return nil if token.get_tag(:number)
131
+ # all units
132
+ POSSIBLE_UNITS.each do |unit|
133
+ if token.word.length<=2 # special matching for short forms ex. Mi
134
+ return Tag.new(:unit, unit) if token.word == unit.to_s
135
+ elsif token.word =~ /(^|\W)#{unit.to_s}($|\W)/i
136
+ return Tag.new(:unit, unit)
137
+ end
138
+ end
139
+
140
+ # try si units with prefixes (kilo, deca etc)
141
+ Alchemist.unit_prefixes.each do |prefix, value|
142
+ if token.word =~ /^#{prefix.to_s}.+/i
143
+ Alchemist.si_units.each do |unit|
144
+ if unit.to_s=~/#{token.word.gsub(/^#{prefix.to_s}/i,'')}$/i
145
+ return Tag.new(:unit, "#{prefix}#{unit}")
146
+ end
147
+ end
148
+ end
149
+ end
150
+
151
+ return nil
152
+ end
153
+
154
+ def to_s
155
+ "#{type}-#{value}"
156
+ end
157
+ end
158
+
159
+ end
data/lib/herbalist.rb ADDED
@@ -0,0 +1,22 @@
1
+ require 'numerizer'
2
+ require 'alchemist'
3
+
4
+ require 'herbalist/herbalist'
5
+
6
+ # reopen Alchemist and create an accessor for si_units
7
+ # hopefully a future patch will mean we can remove this
8
+ unless Alchemist.respond_to?(:si_units)
9
+ module Alchemist
10
+ def self.si_units
11
+ @@si_units
12
+ end
13
+ end
14
+ end
15
+
16
+ module Herbalist
17
+ class << self
18
+ attr_accessor :debug
19
+ end
20
+
21
+ self.debug = false
22
+ end
@@ -0,0 +1,10 @@
1
+ require 'rubygems'
2
+ require 'test/unit'
3
+ require 'context'
4
+
5
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
6
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
7
+ require 'herbalist'
8
+
9
+ class Test::Unit::TestCase
10
+ end
@@ -0,0 +1,81 @@
1
+ require File.join(File.dirname(__FILE__), 'test_helper')
2
+
3
+ class HerbalistTest < Test::Unit::TestCase
4
+ # Herbalist.debug=true
5
+
6
+ should "parse number of cups" do
7
+ assert_equal 1.cup, Herbalist.parse('1 cup')
8
+ assert_equal 2.cups, Herbalist.parse('2 cups')
9
+ assert_equal 2.5.cups, Herbalist.parse('2.5 cups')
10
+ end
11
+
12
+ should "parse number of meters" do
13
+ assert_equal 1.meter, Herbalist.parse('1 meter')
14
+ assert_equal 2.meters, Herbalist.parse('2 meters')
15
+ assert_equal 2.5.meters, Herbalist.parse('2.5 meters')
16
+ end
17
+
18
+ should "parse number of kilometers" do
19
+ assert_equal 1.kilometer, Herbalist.parse('1 kilometer')
20
+ assert_equal 2.kilometers, Herbalist.parse('2 kilometers')
21
+ assert_equal 2.5.kilometers, Herbalist.parse('2.5 kilometers')
22
+ end
23
+
24
+ should "parse mixed case units" do
25
+ assert_equal 1.kilometer, Herbalist.parse('1 Kilometer')
26
+ assert_equal 2.kilometers, Herbalist.parse('2 KiloMeters')
27
+ assert_equal 2.5.miles, Herbalist.parse('2.5 Miles')
28
+ end
29
+
30
+ should "parse numbers in words" do
31
+ assert_equal 1.kilometer, Herbalist.parse('one Kilometer')
32
+ assert_equal 15.kilometers, Herbalist.parse('fifteen Kilometers')
33
+ assert_equal 21.5.miles, Herbalist.parse('twenty-one and a half Miles')
34
+ end
35
+
36
+ should "parse fractions in words" do
37
+ assert_equal 0.25.cup, Herbalist.parse('a quarter cup')
38
+ assert_equal 0.5.teaspoon, Herbalist.parse('a half teaspoon')
39
+ assert_equal 1.25.liters, Herbalist.parse('one and a quarter liters')
40
+ end
41
+
42
+ should "parse multiword units" do
43
+ assert_equal 1.cubic_meter, Herbalist.parse('one Cubic Meter')
44
+ assert_equal 15.kilowatt_hours, Herbalist.parse('fifteen kilowatt Hours')
45
+ assert_equal 21.5.tropical_years, Herbalist.parse('21.5 tropical years')
46
+ end
47
+
48
+ should "parse units from within a string of other text" do
49
+ assert_equal 1.cup, Herbalist.parse("1 Cup onion chopped")
50
+ assert_equal 1.25.ounces, Herbalist.parse("1 Package (1.25 ounces) taco seasoning mix")
51
+ assert_equal 16.ounces, Herbalist.parse("1 Can (16 ounces) tomatoes, undrained")
52
+ assert_equal 6.cups, Herbalist.parse("and 6 Cups turkey broth")
53
+
54
+ assert_equal 2.5.football_fields, Herbalist.parse("it was two and a half football fields")
55
+ assert_equal 15.meters, Herbalist.parse("John ran 15 meters")
56
+ end
57
+
58
+ should "not match can to ccandela units" do
59
+ assert_equal nil, Herbalist.parse("1 Can")
60
+ end
61
+
62
+ should "ignore units of measure that alchemist does not understand" do
63
+ assert_equal nil, Herbalist.parse('2 awesomes')
64
+ end
65
+
66
+ # add tests for all units Alchemist can handle
67
+ Alchemist.conversion_table.collect{|k,v| v.keys}.flatten.uniq.each do |unit|
68
+ should "parse #{unit}" do
69
+ assert_equal 1.send(unit), Herbalist.parse("1 #{unit}")
70
+ assert_equal 2.5.send(unit), Herbalist.parse("2.5 #{unit}")
71
+ end
72
+
73
+ # tests for all multi word units
74
+ if (words = unit.to_s.split('_')).length > 1
75
+ should "parse multiword unit '#{words.join(' ')}'" do
76
+ assert_equal 1.send(unit), Herbalist.parse("1 #{words.join(' ')}")
77
+ assert_equal 2.5.send(unit), Herbalist.parse("2.5 #{words.join(' ')}")
78
+ end
79
+ end
80
+ end
81
+ end
metadata ADDED
@@ -0,0 +1,96 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: herbalist
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - jduff
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2010-01-02 00:00:00 -05:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: alchemist
17
+ type: :runtime
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: 0.1.2
24
+ version:
25
+ - !ruby/object:Gem::Dependency
26
+ name: numerizer
27
+ type: :runtime
28
+ version_requirement:
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: 0.1.1
34
+ version:
35
+ - !ruby/object:Gem::Dependency
36
+ name: jeremymcanally-context
37
+ type: :development
38
+ version_requirement:
39
+ version_requirements: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ version: "0"
44
+ version:
45
+ description: Herbalist is a natural language measurements parser. It is built on top of the Alchemist[http://github.com/toastyapps/alchemist] gem and draws heavily from the Chronic[http://github.com/evaryont/chronic].
46
+ email: duff.john@gmail.com
47
+ executables: []
48
+
49
+ extensions: []
50
+
51
+ extra_rdoc_files:
52
+ - LICENSE
53
+ - README.rdoc
54
+ files:
55
+ - .document
56
+ - .gitignore
57
+ - LICENSE
58
+ - README.rdoc
59
+ - Rakefile
60
+ - VERSION
61
+ - herbalist.gemspec
62
+ - lib/herbalist.rb
63
+ - lib/herbalist/herbalist.rb
64
+ - test/test_helper.rb
65
+ - test/test_herbalist.rb
66
+ has_rdoc: true
67
+ homepage: http://github.com/jduff/herbalist
68
+ licenses: []
69
+
70
+ post_install_message:
71
+ rdoc_options:
72
+ - --charset=UTF-8
73
+ require_paths:
74
+ - lib
75
+ required_ruby_version: !ruby/object:Gem::Requirement
76
+ requirements:
77
+ - - ">="
78
+ - !ruby/object:Gem::Version
79
+ version: "0"
80
+ version:
81
+ required_rubygems_version: !ruby/object:Gem::Requirement
82
+ requirements:
83
+ - - ">="
84
+ - !ruby/object:Gem::Version
85
+ version: "0"
86
+ version:
87
+ requirements: []
88
+
89
+ rubyforge_project:
90
+ rubygems_version: 1.3.5
91
+ signing_key:
92
+ specification_version: 3
93
+ summary: Herbalist is a natural language measurements parser.
94
+ test_files:
95
+ - test/test_helper.rb
96
+ - test/test_herbalist.rb