herbalist 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ README.rdoc
2
+ lib/**/*.rb
3
+ bin/*
4
+ features/**/*.feature
5
+ LICENSE
data/.gitignore ADDED
@@ -0,0 +1,5 @@
1
+ *.sw?
2
+ .DS_Store
3
+ coverage
4
+ rdoc
5
+ pkg
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 John Duff
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,36 @@
1
+ = herbalist
2
+
3
+ Herbalist is a natural language measurements parser. It is built on top of the Alchemist[http://github.com/toastyapps/alchemist] gem and draws heavily from the Chronic[http://github.com/evaryont/chronic] gem.
4
+
5
+ == Installation
6
+
7
+ $ sudo gem sources -a http://gemcutter.org
8
+ $ sudo gem install herbalist
9
+
10
+ == Usage
11
+
12
+ >> require 'herbalist'
13
+ >> Herbalist.parse("a quarter cup")
14
+ => #<Alchemist::NumericConversion @value=0.25, @exponent=1.0, @unit_name=:cup>
15
+
16
+ >> Herbalist.parse("it was two and a half football fields")
17
+ => #<Alchemist::NumericConversion @value=2.5, @exponent=1.0, @unit_name=:football_fields>
18
+
19
+ >> Herbalist.parse('twenty-one and a half Miles')
20
+ => #<Alchemist::NumericConversion @value=21.5, @exponent=1.0, @unit_name=:miles>
21
+
22
+ >> Herbalist.parse('fifteen kilowatt Hours')
23
+ => #<Alchemist::NumericConversion @value=15.0, @exponent=1.0, @unit_name=:kilowatt_hours>
24
+
25
+ == Limitations
26
+
27
+ Parsing is limited to units that Alchemist can understand.
28
+
29
+ == TODO
30
+
31
+ -Allow parsing of units directly next to digits (e.g. 4m, 21.5L)
32
+ -Add options to limit which type of units to parse (only volume or distance)
33
+
34
+ == Copyright
35
+
36
+ Copyright (c) 2009 John Duff. See LICENSE for details.
data/Rakefile ADDED
@@ -0,0 +1,64 @@
1
+ require 'rake/testtask'
2
+ require 'rake/packagetask'
3
+ require 'rake/rdoctask'
4
+ require 'rake'
5
+
6
# Gem packaging and release tasks. Jeweler is optional: when it (or one of
# its dependencies) cannot be loaded, only a hint is printed and the
# remaining tasks are still defined.
begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "herbalist"
    gem.summary = %Q{Herbalist is a natural language measurements parser.}
    gem.description = %Q{Herbalist is a natural language measurements parser. It is built on top of the Alchemist[http://github.com/toastyapps/alchemist] gem and draws heavily from the Chronic[http://github.com/evaryont/chronic].}
    gem.email = "duff.john@gmail.com"
    gem.homepage = "http://github.com/jduff/herbalist"
    gem.authors = ["jduff"]

    gem.add_dependency('alchemist', '>= 0.1.2')
    gem.add_dependency('numerizer', '>= 0.1.1')

    gem.add_development_dependency('jeremymcanally-context')
    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end

  Jeweler::GemcutterTasks.new
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
end

# Unit tests: every test/**/test_*.rb file, with lib/ and test/ on the load path.
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

# Coverage task. Without rcov installed, `rake rcov` aborts with an
# installation hint instead of being undefined.
begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |test|
    test.libs << 'test'
    test.pattern = 'test/**/test_*.rb'
    test.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end

# :check_dependencies is not defined anywhere in this Rakefile (presumably it
# comes from the Jeweler tasks above), so only hook it in when it exists.
# Otherwise `rake test` would fail with an unknown-task error whenever the
# Jeweler LoadError branch was taken.
task :test => :check_dependencies if Rake::Task.task_defined?(:check_dependencies)

task :default => :test

# API documentation built from the README and everything under lib/.
Rake::RDocTask.new do |rdoc|
  # strip the trailing newline so it does not end up inside the title
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "herbalist #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.0
data/herbalist.gemspec ADDED
@@ -0,0 +1,60 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run `rake gemspec`
4
+ # -*- encoding: utf-8 -*-
5
+
6
+ Gem::Specification.new do |s|
7
+ s.name = %q{herbalist}
8
+ s.version = "0.1.0"
9
+
10
+ s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
+ s.authors = ["jduff"]
12
+ s.date = %q{2010-01-02}
13
+ s.description = %q{Herbalist is a natural language measurements parser. It is built on top of the Alchemist[http://github.com/toastyapps/alchemist] gem and draws heavily from the Chronic[http://github.com/evaryont/chronic].}
14
+ s.email = %q{duff.john@gmail.com}
15
+ s.extra_rdoc_files = [
16
+ "LICENSE",
17
+ "README.rdoc"
18
+ ]
19
+ s.files = [
20
+ ".document",
21
+ ".gitignore",
22
+ "LICENSE",
23
+ "README.rdoc",
24
+ "Rakefile",
25
+ "VERSION",
26
+ "herbalist.gemspec",
27
+ "lib/herbalist.rb",
28
+ "lib/herbalist/herbalist.rb",
29
+ "test/test_helper.rb",
30
+ "test/test_herbalist.rb"
31
+ ]
32
+ s.homepage = %q{http://github.com/jduff/herbalist}
33
+ s.rdoc_options = ["--charset=UTF-8"]
34
+ s.require_paths = ["lib"]
35
+ s.rubygems_version = %q{1.3.5}
36
+ s.summary = %q{Herbalist is a natural language measurements parser.}
37
+ s.test_files = [
38
+ "test/test_helper.rb",
39
+ "test/test_herbalist.rb"
40
+ ]
41
+
42
+ if s.respond_to? :specification_version then
43
+ current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
44
+ s.specification_version = 3
45
+
46
+ if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
47
+ s.add_runtime_dependency(%q<alchemist>, [">= 0.1.2"])
48
+ s.add_runtime_dependency(%q<numerizer>, [">= 0.1.1"])
49
+ s.add_development_dependency(%q<jeremymcanally-context>, [">= 0"])
50
+ else
51
+ s.add_dependency(%q<alchemist>, [">= 0.1.2"])
52
+ s.add_dependency(%q<numerizer>, [">= 0.1.1"])
53
+ s.add_dependency(%q<jeremymcanally-context>, [">= 0"])
54
+ end
55
+ else
56
+ s.add_dependency(%q<alchemist>, [">= 0.1.2"])
57
+ s.add_dependency(%q<numerizer>, [">= 0.1.1"])
58
+ s.add_dependency(%q<jeremymcanally-context>, [">= 0"])
59
+ end
60
+ end
@@ -0,0 +1,159 @@
1
+ module Herbalist
2
# Every unit name Alchemist can convert, gathered from the unit hashes stored
# under each entry of its conversion table (duplicates removed).
POSSIBLE_UNITS = Alchemist.conversion_table.map { |_kind, units| units.keys }.flatten.uniq
# The unit names (as Strings) that contain an underscore, i.e. are made of
# several words.
MULTIWORD_UNITS = POSSIBLE_UNITS.map { |unit| unit.to_s }.select { |name| name =~ /_/ }
5
+
6
class << self
  # Find the first measurement mentioned in +text+.
  #
  # The text is tokenized, words that received no tag are dropped, and the
  # first number that is later followed by a unit is converted by sending the
  # unit name to that number.
  #
  # text - the String to search; nil is treated as "nothing to parse".
  #
  # Returns the result of <tt>number.send(unit)</tt> (the README examples show
  # an Alchemist::NumericConversion), or nil when no number/unit pair exists.
  def parse(text)
    return nil if text.nil?

    text = text.dup
    puts "TEXT: #{text}" if Herbalist.debug
    # locals instead of a module-level ivar: nothing reads the tokens afterwards
    tokens = tokenize(text).select { |token| token.tagged? }
    puts "TOKENS: #{tokens}" if Herbalist.debug

    return nil unless tokens.length > 1

    # at the moment all we handle is a number followed by a unit
    last_number = nil
    tokens.each do |token|
      unit = token.get_tag(:unit)
      return last_number.send(unit.value) if last_number && unit

      number = token.get_tag(:number)
      last_number = number.value if number
    end

    nil
  end

  # Split +text+ into Tokens and let Tag.scan tag the numbers and units.
  #
  # text - the String to tokenize; it is normalized first.
  #
  # Returns the Array of Token objects, tagged or not.
  def tokenize(text)
    # cleanup the string before tokenizing
    words = normalize(text).split(' ')
    Tag.scan(words.collect { |word| Token.new(word) })
  end

  private

  # Rewrite +text+ into a form the tokenizer can handle: number words become
  # digits, fractions become decimals and multi-word units become single
  # underscore-joined words. Each stage is printed when Herbalist.debug is set.
  def normalize(text)
    # use Numerizer to convert any numbers in words to digits in the string
    text = Numerizer.numerize(text)
    puts "NUMERIZED: #{text}" if Herbalist.debug

    text = evaluate_fractions(text)
    puts "FRACTIONED: #{text}" if Herbalist.debug

    text = normalize_multiword(text)
    puts "MULTIWORDED: #{text}" if Herbalist.debug
    text
  end

  # takes fractions in the string (1/4) and converts them to floats (0.25)
  def evaluate_fractions(text)
    text.gsub(%r{(\d+)/(\d+)}) do
      (Regexp.last_match(1).to_f / Regexp.last_match(2).to_f).to_s
    end
  end

  # Replace every multi-word unit written with spaces ("football fields") by
  # its underscore form ("football_fields"), ignoring case.
  def normalize_multiword(text)
    MULTIWORD_UNITS.each do |unit|
      # Escape each word so a unit name can never be read as regex syntax, and
      # accept any run of whitespace between the words, since the tokenizer
      # splits on any whitespace as well.
      words = unit.split('_').map { |word| Regexp.escape(word) }
      text = text.gsub(/#{words.join('\s+')}/i, unit)
    end
    text
  end
end
63
+
64
# based on the Token class found in Chronic
#
# One word of the input text together with the tags that were attached to it.
class Token
  attr_accessor :word, :tags

  # word - the String this token wraps. A new token has no tags.
  def initialize(word)
    @word = word
    @tags = []
  end

  # Tag this token with the specified tag. Returns the updated tag list.
  def tag(new_tag)
    @tags << new_tag
  end

  # Remove all tags whose type equals tag_type. Returns the remaining tags.
  def untag(tag_type)
    @tags = @tags.reject { |t| t.type == tag_type }
  end

  # Return true if this token has any tags
  def tagged?
    !@tags.empty?
  end

  # Return the first tag whose type equals tag_type, or nil if there is none
  def get_tag(tag_type)
    @tags.find { |t| t.type == tag_type }
  end

  # Print this Token in a pretty way: the word followed by its tags
  def to_s
    "#{@word}(#{@tags.join(', ')})"
  end
end
99
+
100
+
101
# A label attached to a Token: a +type+ (:number or :unit) plus the parsed
# +value+ (a Float for numbers, a unit name for units).
class Tag
  attr_accessor :value, :type

  def initialize(type, value)
    @type = type
    @value = value
  end

  # scan the given tokens and tag any matches
  #
  # Numbers are tagged first because scan_for_units skips tokens that already
  # carry a :number tag. Returns the same tokens.
  def self.scan(tokens)
    tokens.each do |token|
      number = scan_for_numbers(token)
      token.tag(number) if number

      unit = scan_for_units(token)
      token.tag(unit) if unit
    end
    tokens
  end

  # check the token to see if it is a number
  #
  # A decimal anywhere in the word wins over a plain integer. Returns a
  # :number Tag holding the Float value, or nil.
  def self.scan_for_numbers(token)
    match = token.word.match(/(^|\W)(\d*\.\d+)($|\W)/) ||
            token.word.match(/(^|\W)(\d+)($|\W)/)
    match ? Tag.new(:number, match[2].to_f) : nil
  end

  # check the token and see if it is a type of unit that Alchemist can handle
  #
  # Returns a :unit Tag, or nil when the token is a number or no unit matches.
  def self.scan_for_units(token)
    return nil if token.get_tag(:number)

    word = token.word

    # all units
    POSSIBLE_UNITS.each do |unit|
      if word.length <= 2 # special matching for short forms ex. Mi
        return Tag.new(:unit, unit) if word == unit.to_s
      elsif word =~ /(^|\W)#{Regexp.escape(unit.to_s)}($|\W)/i
        return Tag.new(:unit, unit)
      end
    end

    # try si units with prefixes (kilo, deca etc)
    Alchemist.unit_prefixes.each do |prefix, _value|
      prefixed = /\A#{Regexp.escape(prefix.to_s)}(.+)\z/i.match(word)
      next unless prefixed

      # Compare the text after the prefix with plain string equality (ignoring
      # case). The word comes from user input, so using it as a regex would
      # raise on text like "milk)", and a suffix-only match would accept
      # "kilos" for any unit that merely ends in "s".
      remainder = prefixed[1]
      unit = Alchemist.si_units.find { |si| si.to_s.casecmp(remainder) == 0 }
      return Tag.new(:unit, "#{prefix}#{unit}") if unit
    end

    nil
  end

  def to_s
    "#{type}-#{value}"
  end
end
158
+
159
+ end
data/lib/herbalist.rb ADDED
@@ -0,0 +1,22 @@
1
+ require 'numerizer'
2
+ require 'alchemist'
3
+
4
+ require 'herbalist/herbalist'
5
+
6
# Alchemist keeps its SI unit list in the @@si_units class variable. Reopen
# the module to add a reader for it, but only when the loaded Alchemist does
# not already respond to si_units.
# hopefully a future patch will mean we can remove this
module Alchemist
  def self.si_units
    @@si_units
  end
end unless Alchemist.respond_to?(:si_units)
15
+
16
module Herbalist
  # Reader for the module-wide debug flag (off by default, see below).
  def self.debug
    @debug
  end

  # Writer for the module-wide debug flag.
  def self.debug=(flag)
    @debug = flag
  end

  self.debug = false
end
@@ -0,0 +1,10 @@
1
+ require 'rubygems'
2
+ require 'test/unit'
3
+ require 'context'
4
+
5
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
6
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
7
+ require 'herbalist'
8
+
9
+ class Test::Unit::TestCase
10
+ end
@@ -0,0 +1,81 @@
1
+ require File.join(File.dirname(__FILE__), 'test_helper')
2
+
3
# Tests for Herbalist.parse: hand-written examples first, then one generated
# test per unit found in Alchemist's conversion table.
class HerbalistTest < Test::Unit::TestCase
  # Herbalist.debug=true

  should "parse number of cups" do
    assert_equal 1.cup, Herbalist.parse('1 cup')
    assert_equal 2.cups, Herbalist.parse('2 cups')
    assert_equal 2.5.cups, Herbalist.parse('2.5 cups')
  end

  should "parse number of meters" do
    assert_equal 1.meter, Herbalist.parse('1 meter')
    assert_equal 2.meters, Herbalist.parse('2 meters')
    assert_equal 2.5.meters, Herbalist.parse('2.5 meters')
  end

  should "parse number of kilometers" do
    assert_equal 1.kilometer, Herbalist.parse('1 kilometer')
    assert_equal 2.kilometers, Herbalist.parse('2 kilometers')
    assert_equal 2.5.kilometers, Herbalist.parse('2.5 kilometers')
  end

  should "parse mixed case units" do
    assert_equal 1.kilometer, Herbalist.parse('1 Kilometer')
    assert_equal 2.kilometers, Herbalist.parse('2 KiloMeters')
    assert_equal 2.5.miles, Herbalist.parse('2.5 Miles')
  end

  should "parse numbers in words" do
    assert_equal 1.kilometer, Herbalist.parse('one Kilometer')
    assert_equal 15.kilometers, Herbalist.parse('fifteen Kilometers')
    assert_equal 21.5.miles, Herbalist.parse('twenty-one and a half Miles')
  end

  should "parse fractions in words" do
    assert_equal 0.25.cup, Herbalist.parse('a quarter cup')
    assert_equal 0.5.teaspoon, Herbalist.parse('a half teaspoon')
    assert_equal 1.25.liters, Herbalist.parse('one and a quarter liters')
  end

  should "parse multiword units" do
    assert_equal 1.cubic_meter, Herbalist.parse('one Cubic Meter')
    assert_equal 15.kilowatt_hours, Herbalist.parse('fifteen kilowatt Hours')
    assert_equal 21.5.tropical_years, Herbalist.parse('21.5 tropical years')
  end

  should "parse units from within a string of other text" do
    assert_equal 1.cup, Herbalist.parse("1 Cup onion chopped")
    assert_equal 1.25.ounces, Herbalist.parse("1 Package (1.25 ounces) taco seasoning mix")
    assert_equal 16.ounces, Herbalist.parse("1 Can (16 ounces) tomatoes, undrained")
    assert_equal 6.cups, Herbalist.parse("and 6 Cups turkey broth")

    assert_equal 2.5.football_fields, Herbalist.parse("it was two and a half football fields")
    assert_equal 15.meters, Herbalist.parse("John ran 15 meters")
  end

  # "Can" starts with the "c" prefix but what follows is not a unit, so it
  # must not be read as a prefixed candela.
  should "not match can to candela units" do
    assert_nil Herbalist.parse("1 Can")
  end

  should "ignore units of measure that alchemist does not understand" do
    assert_nil Herbalist.parse('2 awesomes')
  end

  # add tests for all units Alchemist can handle
  Alchemist.conversion_table.collect{|k,v| v.keys}.flatten.uniq.each do |unit|
    should "parse #{unit}" do
      assert_equal 1.send(unit), Herbalist.parse("1 #{unit}")
      assert_equal 2.5.send(unit), Herbalist.parse("2.5 #{unit}")
    end

    # tests for all multi word units
    if (words = unit.to_s.split('_')).length > 1
      should "parse multiword unit '#{words.join(' ')}'" do
        assert_equal 1.send(unit), Herbalist.parse("1 #{words.join(' ')}")
        assert_equal 2.5.send(unit), Herbalist.parse("2.5 #{words.join(' ')}")
      end
    end
  end
end
metadata ADDED
@@ -0,0 +1,96 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: herbalist
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - jduff
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2010-01-02 00:00:00 -05:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: alchemist
17
+ type: :runtime
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: 0.1.2
24
+ version:
25
+ - !ruby/object:Gem::Dependency
26
+ name: numerizer
27
+ type: :runtime
28
+ version_requirement:
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: 0.1.1
34
+ version:
35
+ - !ruby/object:Gem::Dependency
36
+ name: jeremymcanally-context
37
+ type: :development
38
+ version_requirement:
39
+ version_requirements: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ version: "0"
44
+ version:
45
+ description: Herbalist is a natural language measurements parser. It is built on top of the Alchemist[http://github.com/toastyapps/alchemist] gem and draws heavily from the Chronic[http://github.com/evaryont/chronic].
46
+ email: duff.john@gmail.com
47
+ executables: []
48
+
49
+ extensions: []
50
+
51
+ extra_rdoc_files:
52
+ - LICENSE
53
+ - README.rdoc
54
+ files:
55
+ - .document
56
+ - .gitignore
57
+ - LICENSE
58
+ - README.rdoc
59
+ - Rakefile
60
+ - VERSION
61
+ - herbalist.gemspec
62
+ - lib/herbalist.rb
63
+ - lib/herbalist/herbalist.rb
64
+ - test/test_helper.rb
65
+ - test/test_herbalist.rb
66
+ has_rdoc: true
67
+ homepage: http://github.com/jduff/herbalist
68
+ licenses: []
69
+
70
+ post_install_message:
71
+ rdoc_options:
72
+ - --charset=UTF-8
73
+ require_paths:
74
+ - lib
75
+ required_ruby_version: !ruby/object:Gem::Requirement
76
+ requirements:
77
+ - - ">="
78
+ - !ruby/object:Gem::Version
79
+ version: "0"
80
+ version:
81
+ required_rubygems_version: !ruby/object:Gem::Requirement
82
+ requirements:
83
+ - - ">="
84
+ - !ruby/object:Gem::Version
85
+ version: "0"
86
+ version:
87
+ requirements: []
88
+
89
+ rubyforge_project:
90
+ rubygems_version: 1.3.5
91
+ signing_key:
92
+ specification_version: 3
93
+ summary: Herbalist is a natural language measurements parser.
94
+ test_files:
95
+ - test/test_helper.rb
96
+ - test/test_herbalist.rb