fuzzy_strings 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
data/.document ADDED
@@ -0,0 +1,5 @@
1
+ lib/**/*.rb
2
+ bin/*
3
+ -
4
+ features/**/*.feature
5
+ LICENSE.txt
data/.rspec ADDED
@@ -0,0 +1 @@
1
+ --color
data/Gemfile ADDED
@@ -0,0 +1,13 @@
1
# Fetch gems over TLS so downloads cannot be tampered with in transit.
source "https://rubygems.org"

# Add dependencies required to use your gem here.
# Example:
#   gem "activesupport", ">= 2.3.5"

# Add dependencies to develop your gem here.
# Include everything needed to run rake, tests, features, etc.
group :development do
  gem "rspec", "~> 2.1.0"
  gem "bundler", "~> 1.0.0"
  gem "jeweler", "~> 1.5.1"
  gem "rcov", ">= 0"
end
data/Gemfile.lock ADDED
@@ -0,0 +1,28 @@
1
+ GEM
2
+ remote: http://rubygems.org/
3
+ specs:
4
+ diff-lcs (1.1.2)
5
+ git (1.2.5)
6
+ jeweler (1.5.1)
7
+ bundler (~> 1.0.0)
8
+ git (>= 1.2.5)
9
+ rake
10
+ rake (0.8.7)
11
+ rcov (0.9.9)
12
+ rspec (2.1.0)
13
+ rspec-core (~> 2.1.0)
14
+ rspec-expectations (~> 2.1.0)
15
+ rspec-mocks (~> 2.1.0)
16
+ rspec-core (2.1.0)
17
+ rspec-expectations (2.1.0)
18
+ diff-lcs (~> 1.1.2)
19
+ rspec-mocks (2.1.0)
20
+
21
+ PLATFORMS
22
+ ruby
23
+
24
+ DEPENDENCIES
25
+ bundler (~> 1.0.0)
26
+ jeweler (~> 1.5.1)
27
+ rcov
28
+ rspec (~> 2.1.0)
data/LICENSE.txt ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2011 Hartog C. de Mik
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,19 @@
1
+ = fuzzy_strings
2
+
3
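+ Fuzzy string matching based on the cost of the operations (insertions, deletions, substitutions and transpositions) needed to make two strings the same.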
4
+
5
+ == Contributing to fuzzy_strings
6
+
7
+ * Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet
8
+ * Check out the issue tracker to make sure no one has already requested it and/or contributed it
9
+ * Fork the project
10
+ * Start a feature/bugfix branch
11
+ * Commit and push until you are happy with your contribution
12
+ * Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
13
+ * Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or a change there is otherwise necessary, that is fine, but please isolate it in its own commit so I can cherry-pick around it.
14
+
15
+ == Copyright
16
+
17
+ Copyright (c) 2011 Hartog C. de Mik. See LICENSE.txt for
18
+ further details.
19
+
data/Rakefile ADDED
@@ -0,0 +1,50 @@
1
require 'rubygems'
require 'bundler'

# Make the gems from the Gemfile available; explain how to recover and exit
# with Bundler's status code when they are not installed.
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => e
  $stderr.puts e.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit e.status_code
end
require 'rake'

require 'jeweler'
Jeweler::Tasks.new do |gem|
  # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
  gem.name = "fuzzy_strings"
  gem.homepage = "http://simplic.it/tools/fuzzy_strings"
  gem.license = "MIT"
  gem.summary = %Q{Fuzzy String Matching POC}
  gem.description = %Q{Fuzzy String Matching based on Cost Of Operation}
  gem.email = "hartog.de.mik@gmail.com"
  gem.authors = ["Hartog C. de Mik"]
  # Include your dependencies below. Runtime dependencies are required when using your gem,
  # and development dependencies are only needed for development (ie running rake tasks, tests, etc)
  #  gem.add_runtime_dependency 'jabber4r', '> 0.1'
  #  gem.add_development_dependency 'rspec', '> 1.2.3'
end
Jeweler::RubygemsDotOrgTasks.new

require 'rspec/core'
require 'rspec/core/rake_task'

# rake spec: run every spec file under spec/
RSpec::Core::RakeTask.new(:spec) do |spec|
  spec.pattern = FileList['spec/**/*_spec.rb']
end

# rake rcov: run the same specs with rcov enabled
RSpec::Core::RakeTask.new(:rcov) do |spec|
  spec.pattern = 'spec/**/*_spec.rb'
  spec.rcov = true
end

task :default => :spec

require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
  # Strip the VERSION contents so a trailing newline does not end up in the
  # generated documentation title.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "fuzzy_strings #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 1.0.0
@@ -0,0 +1,207 @@
1
# Match words based on the operations needed to make two words the same:
# insertion, deletion, substitution or transposition.
#
#   cot   => coat  (a must be inserted to get the same word)
#   coat  => cot   (a must be deleted to get the same word)
#   cost  => coat  (s must be substituted with a to get the same word)
#   foo   => floor (l and r must be inserted)
#   floor => foo   (l and r must be deleted)
#   cost  => cots  (s and t must be substituted (cost=2) or transposed (cost=1))
#
# The cost is the number of operations needed to make the two words the same.
#
# == Usage
#   fs = FuzzyStrings.new("pattern")
#   match = fs.compare("pattren")
#   puts match.match?
#   # true
#   puts match.score
#   # 2
#   puts match
#   # { d: 0, i: 0, s: 2, t: 1 }
#
# == Unicode?
# It is assumed that all strings are utf-8
#
class FuzzyStrings
  # pack/unpack directive that turns a string into an array of codepoints
  CODEPOINTS = 'U*'

  # Placeholder codepoint marking a position in the shorter word where the
  # longer word has an extra character. This is why NULL characters are not
  # allowed in the compared strings.
  GAP = 0

  # A Match object holds all the costs of operations for a comparison and can
  # define a match for you
  #
  class Match
    attr_accessor :insertions, :deletions, :substitutions, :transpositions

    # A fresh Match has a cost of 0 for every operation.
    def initialize
      @insertions = 0
      @deletions = 0
      @substitutions = 0
      @transpositions = 0
    end

    # Is it a match?
    #
    # By default checks if the cost of all the operations is no greater than 3
    #
    # == Options
    # [:score]          * Total cost of operations (see #score) is no greater
    #                     than X.
    #                   * If specified, no other criterion is checked.
    # [:max]            * Each individual operation count (deletions,
    #                     insertions, substitutions and transpositions) must
    #                     be no greater than X. So the score may be 3, but
    #                     there can't be 2 deletions if X = 1.
    #                   * If specified (and :score is not), no other
    #                     criterion is checked.
    # [:deletions]      * The amount of deletions is no greater than X
    # [:insertions]     * The amount of insertions is no greater than X
    # [:substitutions]  * The amount of substitutions is no greater than X
    # [:transpositions] * The amount of transpositions is no greater than X
    #
    # Returns true or false.
    #
    def match?(opts = { :score => 3 })
      counts = {
        :deletions      => deletions,
        :insertions     => insertions,
        :substitutions  => substitutions,
        :transpositions => transpositions
      }

      if opts[:score]
        # combined operations
        score <= opts[:score]
      elsif opts[:max]
        counts.values.all? { |count| count <= opts[:max] }
      else
        # only the operations that were given a limit are checked
        counts.all? { |operation, count| !opts[operation] || count <= opts[operation] }
      end
    end

    # The total cost of the operations.
    #
    # Normally uses substitutions (which is more expensive).
    #
    # Specify use_transpositions as true to get the transposition cost instead
    # of the substitution cost.
    #
    def score(use_transpositions = false)
      (use_transpositions ? transpositions : substitutions) + insertions + deletions
    end
    alias_method :cost, :score

    def to_s # :nodoc:
      "{ d: #{@deletions}, i: #{@insertions}, s: #{@substitutions}, t: #{@transpositions} }"
    end
  end

  # string1 is the base pattern every compared string is measured against.
  # It is converted with to_s; when that fails the empty string is used.
  def initialize(string1)
    @string1 = stringify(string1)
  end

  # Compare a given string to the base pattern. The compared string is the one
  # operated upon (so with cot as the pattern and coat as the compared string,
  # the result is a deletion).
  #
  # Pass true as no_transpositions to skip counting transpositions.
  #
  # Raises ArgumentError when either string contains a NULL character.
  #
  # Returns a FuzzyStrings::Match object
  #
  def compare(string2, no_transpositions = false)
    other = stringify(string2)
    match = Match.new

    return match if @string1 == other

    pattern_seq = @string1.unpack(CODEPOINTS)
    other_seq   = other.unpack(CODEPOINTS)

    if pattern_seq.include?(GAP) || other_seq.include?(GAP)
      raise ArgumentError, "Strings cannot contain NULL-bytes due to internal semantics"
    end

    # The length difference is reported from the compared string's point of
    # view: a longer compared string has characters to delete, a shorter one
    # needs characters inserted. Deciding this on the sequences (instead of
    # comparing a re-packed string) keeps it independent of string encodings.
    if pattern_seq.length < other_seq.length
      short, long, mode = pattern_seq, other_seq, :deletions
    else
      short, long, mode = other_seq, pattern_seq, :insertions
    end

    short = find_insertions(short, long, mode, match)
    find_substitutions(short, long, match)
    # only a literal true disables the pass, as it always has
    find_transpositions(short, long, match) unless no_transpositions == true

    match
  end

  private

  # Convert any value to a String, falling back to "" when to_s fails.
  def stringify(value)
    value.to_s
  rescue StandardError
    ""
  end

  # Add one insertion or deletion (depending on mode) to the match.
  def count_length_operation(match, mode)
    if mode == :insertions
      match.insertions += 1
    else
      match.deletions += 1
    end
  end

  # Find insertions (or deletions, depending on mode) by aligning the shorter
  # sequence with the longer one.
  #
  # A GAP is placed on every position where the longer sequence has an extra
  # character. Returns the aligned copy of short; short itself is untouched.
  def find_insertions(short, long, mode, match)
    # when both are equal in length no insertions can happen
    return short if short.length == long.length

    aligned = short.dup

    long.each_with_index do |long_chr, i|
      short_chr = aligned[i]
      next if long_chr == short_chr

      # It is only an insertion when the next character of the long sequence
      # is the one the short sequence has at this position.
      following = long[i + 1]
      next if following.nil? || following != short_chr

      # Array#insert keeps every existing character, also for a gap at index 0
      aligned.insert(i, GAP)
      count_length_operation(match, mode)
    end

    # Pad the short sequence with gaps until equal in length; every padded
    # position is a trailing insertion/deletion and is counted as such.
    while long.length > aligned.length
      aligned << GAP
      count_length_operation(match, mode)
    end

    aligned
  end

  # Compare characters position by position; positions where either side is a
  # GAP are skipped.
  def find_substitutions(short, long, match)
    short.each_with_index do |short_chr, i|
      long_chr = long[i]
      next if short_chr == GAP || long_chr == GAP

      match.substitutions += 1 if short_chr != long_chr
    end
  end

  # Compare characters by 2 and find transposed characters
  # (when given cost, cots, ts is transposed)
  #
  # Gaps are removed from the short sequence before pairing.
  def find_transpositions(short, long, match)
    compact = short.reject { |chr| chr == GAP }

    (0...(compact.length - 1)).each do |i|
      short_pair = compact[i, 2]
      long_pair  = long[i, 2]
      next if short_pair == long_pair

      # different pairs that hold the same two characters are swapped
      match.transpositions += 1 if (short_pair & long_pair).length == 2
    end
  end
end
200
+
201
# extend String
class String
  # Fuzzy-compare other against this string, using this string as the base
  # pattern.
  #
  # Pass true as no_transpositions to skip counting transpositions; the flag
  # is handed to FuzzyStrings#compare unchanged.
  #
  # Returns whatever FuzzyStrings#compare returns.
  def fuzzy_match(other, no_transpositions = false)
    FuzzyStrings.new(self).compare(other, no_transpositions)
  end
end
@@ -0,0 +1,80 @@
1
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')

# Every example compares a candidate string against the base pattern "pattern".
describe "FuzzyStrings" do
  before(:all) do
    @fs = FuzzyStrings.new("pattern")
  end

  it "matches equal strings with 0 costs" do
    match = @fs.compare("pattern")
    match.cost.should == 0
  end

  it "matches 1 transposition and 2 substitutions on 'pattren'" do
    match = @fs.compare("pattren")
    match.substitutions.should == 2
    match.transpositions.should == 1
    match.insertions.should == 0
    match.deletions.should == 0
  end

  it "matches 2 deletions on 'patterned'" do
    match = @fs.compare("patterned")
    match.substitutions.should == 0
    match.transpositions.should == 0
    match.insertions.should == 0
    match.deletions.should == 2
  end

  it "matches 1 insertions on 'patten'" do
    match = @fs.compare("patten")
    match.substitutions.should == 0
    match.transpositions.should == 0
    match.insertions.should == 1
    match.deletions.should == 0
  end

  # 1 del, 1 sub
  it "matches on 'patterer' with a max of 2" do
    match = @fs.compare('patterer')
    match.match?(:max => 2).should == true
    match.match?(:max => 0).should == false
  end

  # 4 del
  it "does not match 'patternless' with a max of 3" do
    match = @fs.compare("patternless")
    match.match?(:max => 3).should == false
  end

  # 1 del, 5 subst
  it "does not match 'papadums' with a max of 2" do
    match = @fs.compare("papadums")
    match.match?(:max => 2).should == false
  end

  # :-)
  it "does go well with the chicken!" do
    match = @fs.compare("chicken")
    match.match?.should == false
  end

  # 1 del, 5 subst
  it "does extended matching 1" do
    match = @fs.compare('papadums')
    match.match?(:deletions => 1, :substitutions => 5).should == true
    match.match?(:deletions => 0).should == false
  end

  # 4 subst, 2 transp
  it "does extended matching 2" do
    match = @fs.compare('ptatenr')
    match.match?(:substitutions => 4, :transpositions => 1).should == false
    match.match?(:substitutions => 2, :transpositions => 2).should == false
    match.match?(:substitutions => 4, :transpositions => 2).should == true
  end

  # 2 ins, 2 subst
  it "does extended matching 3" do
    match = @fs.compare('patat')
    match.match?(:insertions => 2, :substitutions => 2).should == true
    match.match?(:insertions => 1).should == false
  end
end
@@ -0,0 +1,12 @@
1
# Put the gem's lib directory and the spec directory itself on the load path.
spec_dir = File.dirname(__FILE__)
$LOAD_PATH.unshift(File.join(spec_dir, '..', 'lib'))
$LOAD_PATH.unshift(spec_dir)

require 'rspec'
require 'fuzzy_strings'

# Load the supporting files (custom matchers, macros, etc.) found in
# ./support/ and its subdirectories.
Dir["#{spec_dir}/support/**/*.rb"].each do |support_file|
  require support_file
end

RSpec.configure do |config|

end
metadata ADDED
@@ -0,0 +1,140 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: fuzzy_strings
3
+ version: !ruby/object:Gem::Version
4
+ hash: 23
5
+ prerelease: false
6
+ segments:
7
+ - 1
8
+ - 0
9
+ - 0
10
+ version: 1.0.0
11
+ platform: ruby
12
+ authors:
13
+ - Hartog C. de Mik
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2011-05-04 00:00:00 +02:00
19
+ default_executable:
20
+ dependencies:
21
+ - !ruby/object:Gem::Dependency
22
+ version_requirements: &id001 !ruby/object:Gem::Requirement
23
+ none: false
24
+ requirements:
25
+ - - ~>
26
+ - !ruby/object:Gem::Version
27
+ hash: 11
28
+ segments:
29
+ - 2
30
+ - 1
31
+ - 0
32
+ version: 2.1.0
33
+ requirement: *id001
34
+ prerelease: false
35
+ type: :development
36
+ name: rspec
37
+ - !ruby/object:Gem::Dependency
38
+ version_requirements: &id002 !ruby/object:Gem::Requirement
39
+ none: false
40
+ requirements:
41
+ - - ~>
42
+ - !ruby/object:Gem::Version
43
+ hash: 23
44
+ segments:
45
+ - 1
46
+ - 0
47
+ - 0
48
+ version: 1.0.0
49
+ requirement: *id002
50
+ prerelease: false
51
+ type: :development
52
+ name: bundler
53
+ - !ruby/object:Gem::Dependency
54
+ version_requirements: &id003 !ruby/object:Gem::Requirement
55
+ none: false
56
+ requirements:
57
+ - - ~>
58
+ - !ruby/object:Gem::Version
59
+ hash: 1
60
+ segments:
61
+ - 1
62
+ - 5
63
+ - 1
64
+ version: 1.5.1
65
+ requirement: *id003
66
+ prerelease: false
67
+ type: :development
68
+ name: jeweler
69
+ - !ruby/object:Gem::Dependency
70
+ version_requirements: &id004 !ruby/object:Gem::Requirement
71
+ none: false
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ hash: 3
76
+ segments:
77
+ - 0
78
+ version: "0"
79
+ requirement: *id004
80
+ prerelease: false
81
+ type: :development
82
+ name: rcov
83
+ description: Fuzzy String Matching based on Cost Of Operation
84
+ email: hartog.de.mik@gmail.com
85
+ executables: []
86
+
87
+ extensions: []
88
+
89
+ extra_rdoc_files:
90
+ - LICENSE.txt
91
+ - README.rdoc
92
+ files:
93
+ - .document
94
+ - .rspec
95
+ - Gemfile
96
+ - Gemfile.lock
97
+ - LICENSE.txt
98
+ - README.rdoc
99
+ - Rakefile
100
+ - VERSION
101
+ - lib/fuzzy_strings.rb
102
+ - spec/fuzzy_strings_spec.rb
103
+ - spec/spec_helper.rb
104
+ has_rdoc: true
105
+ homepage: http://simplic.it/tools/fuzzy_strings
106
+ licenses:
107
+ - MIT
108
+ post_install_message:
109
+ rdoc_options: []
110
+
111
+ require_paths:
112
+ - lib
113
+ required_ruby_version: !ruby/object:Gem::Requirement
114
+ none: false
115
+ requirements:
116
+ - - ">="
117
+ - !ruby/object:Gem::Version
118
+ hash: 3
119
+ segments:
120
+ - 0
121
+ version: "0"
122
+ required_rubygems_version: !ruby/object:Gem::Requirement
123
+ none: false
124
+ requirements:
125
+ - - ">="
126
+ - !ruby/object:Gem::Version
127
+ hash: 3
128
+ segments:
129
+ - 0
130
+ version: "0"
131
+ requirements: []
132
+
133
+ rubyforge_project:
134
+ rubygems_version: 1.3.7
135
+ signing_key:
136
+ specification_version: 3
137
+ summary: Fuzzy String Matching POC
138
+ test_files:
139
+ - spec/fuzzy_strings_spec.rb
140
+ - spec/spec_helper.rb