amatch 0.2.3 → 0.2.4

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGES CHANGED
@@ -1,3 +1,7 @@
1
+ 2009-08-25 (0.2.4)
2
+ * Included Jaro and Jaro-Winkler metrics implementation of Kevin Ballard
3
+ <kevin@rapleaf.com>. Thanks a lot.
4
+ * Made the extension compile under Ruby 1.9.
1
5
  2006-06-25 (0.2.3)
2
6
  * Fixed agrep.rb to use the new API.
3
7
  2005-10-11 (0.2.2)
@@ -1,12 +1,12 @@
1
- GNU GENERAL PUBLIC LICENSE
2
- Version 2, June 1991
1
+ GNU GENERAL PUBLIC LICENSE
2
+ Version 2, June 1991
3
3
 
4
4
  Copyright (C) 1989, 1991 Free Software Foundation, Inc.
5
5
  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
6
6
  Everyone is permitted to copy and distribute verbatim copies
7
7
  of this license document, but changing it is not allowed.
8
8
 
9
- Preamble
9
+ Preamble
10
10
 
11
11
  The licenses for most software are designed to take away your
12
12
  freedom to share and change it. By contrast, the GNU General Public
@@ -56,7 +56,7 @@ patent must be licensed for everyone's free use or not licensed at all.
56
56
  The precise terms and conditions for copying, distribution and
57
57
  modification follow.
58
58
 
59
- GNU GENERAL PUBLIC LICENSE
59
+ GNU GENERAL PUBLIC LICENSE
60
60
  TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
61
61
 
62
62
  0. This License applies to any program or other work which contains
@@ -255,7 +255,7 @@ make exceptions for this. Our decision will be guided by the two goals
255
255
  of preserving the free status of all derivatives of our free software and
256
256
  of promoting the sharing and reuse of software generally.
257
257
 
258
- NO WARRANTY
258
+ NO WARRANTY
259
259
 
260
260
  11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
261
261
  FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
@@ -277,9 +277,9 @@ YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
277
277
  PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
278
278
  POSSIBILITY OF SUCH DAMAGES.
279
279
 
280
- END OF TERMS AND CONDITIONS
280
+ END OF TERMS AND CONDITIONS
281
281
 
282
- How to Apply These Terms to Your New Programs
282
+ How to Apply These Terms to Your New Programs
283
283
 
284
284
  If you develop a new program, and you want it to be of the greatest
285
285
  possible use to the public, the best way to achieve this is to make it
data/README ADDED
@@ -0,0 +1,25 @@
1
+ Installation
2
+ ============
3
+
4
+ Just type into the command line as root:
5
+
6
+ # ruby install.rb
7
+
8
+ If you have installed rake (rake.rubyforge.org), you can also type:
9
+
10
+ # rake install
11
+
12
+ To install this extension as a gem, type:
13
+
14
+ # gem install amatch
15
+
16
+ Author
17
+ ======
18
+
19
+ Florian Frank <flori@ping.de>
20
+
21
+ License
22
+ =======
23
+
24
+ GNU General Public License, Version 2 (GPLv2)
25
+
data/Rakefile CHANGED
@@ -1,32 +1,33 @@
1
- # vim: set et sw=2 ts=2:
2
- require 'rake/clean'
3
- require 'rake/testtask'
4
- require 'rake/gempackagetask'
5
- require 'rake/rdoctask'
6
- require 'rbconfig'
1
+ # vim: set filetype=ruby et sw=2 ts=2:
7
2
 
3
+ begin
4
+ require 'rake/gempackagetask'
5
+ rescue LoadError
6
+ end
7
+ require 'rbconfig'
8
8
  include Config
9
+ require 'rake/clean'
10
+ CLEAN.include 'coverage', 'doc'
11
+ require 'rake/testtask'
9
12
 
13
+ MAKE = ENV['MAKE'] || %w[gmake make].find { |c| system(c, '-v') }
14
+ PKG_NAME = 'amatch'
10
15
  PKG_VERSION = File.read('VERSION').chomp
11
- PKG_FILES = FileList['**/*']
12
- PKG_FILES.exclude(/CVS/)
13
- PKG_FILES.exclude(/^pkg/)
14
- PKG_FILES.exclude(/^doc/)
16
+ PKG_FILES = FileList["**/*"].exclude(/^(pkg|coverage|doc)/)
17
+ PKG_DOC_FILES = [ "ext/amatch.c" ].concat(Dir['lib/**/*.rb']) << 'doc-main.txt'
15
18
 
16
19
  task :default => :test
17
20
 
18
21
  desc "Run unit tests"
19
- task :test => :compile do
20
- cd 'tests' do
21
- ruby %{-I../ext runner.rb}
22
- end
22
+ task :test => :compile_ext do
23
+ sh %{testrb -Iext:lib tests/test_*.rb}
23
24
  end
24
25
 
25
26
  desc "Compiling library"
26
- task :compile do
27
+ task :compile_ext do
27
28
  cd 'ext' do
28
29
  ruby %{extconf.rb}
29
- sh "make"
30
+ sh MAKE
30
31
  end
31
32
  end
32
33
 
@@ -40,72 +41,73 @@ end
40
41
 
41
42
  desc "Removing generated files"
42
43
  task :clean do
43
- rm_rf 'doc'
44
- cd 'ext' do
44
+ cd 'ext' do
45
45
  ruby 'extconf.rb'
46
- sh "make distclean" if File.exist?('Makefile')
46
+ sh "#{MAKE} distclean" if File.exist?('Makefile')
47
47
  end
48
48
  end
49
49
 
50
- Rake::RDocTask.new do |rd|
51
- rd.main = 'Amatch'
52
- rd.rdoc_files.include("ext/amatch.c")
53
- rd.rdoc_dir = 'doc'
50
+ desc "Build the documentation"
51
+ task :doc do
52
+ sh "rdoc -m doc-main.txt -t '#{PKG_NAME} - Approximate Matching' #{PKG_DOC_FILES * ' '}"
54
53
  end
55
54
 
56
- spec = Gem::Specification.new do |s|
57
- #### Basic information.
58
-
59
- s.name = 'amatch'
60
- s.version = PKG_VERSION
61
- s.summary = "Approximate String Matching library"
62
- s.description = <<EOF
55
+ if defined? Gem
56
+ spec = Gem::Specification.new do |s|
57
+ s.name = 'amatch'
58
+ s.version = PKG_VERSION
59
+ s.summary = "Approximate String Matching library"
60
+ s.description = <<EOF
63
61
  Amatch is a library for approximate string matching and searching in strings.
64
62
  Several algorithms can be used to do this, and it's also possible to compute a
65
63
  similarity metric number between 0.0 and 1.0 for two given strings.
66
64
  EOF
67
65
 
68
- #### Dependencies and requirements.
66
+ s.files = PKG_FILES
69
67
 
70
- #s.add_dependency('log4r', '> 1.0.4')
71
- #s.requirements << ""
68
+ s.extensions << "ext/extconf.rb"
72
69
 
73
- s.files = PKG_FILES
70
+ s.require_path = 'ext'
74
71
 
75
- #### C code extensions.
72
+ s.bindir = "bin"
73
+ s.executables = ["agrep.rb"]
74
+ s.default_executable = "agrep.rb"
76
75
 
77
- s.extensions << "ext/extconf.rb"
76
+ s.has_rdoc = true
77
+ s.extra_rdoc_files.concat PKG_DOC_FILES
78
+ s.rdoc_options << '--main' << 'doc-main.txt' <<
79
+ '--title' << "#{PKG_NAME} - Approximate Matching"
80
+ s.test_files.concat Dir['tests/test_*.rb']
78
81
 
79
- #### Load-time details: library and application (you will need one or both).
80
-
81
- s.require_path = 'ext' # Use these for libraries.
82
- s.autorequire = 'amatch'
83
-
84
- s.bindir = "bin" # Use these for applications.
85
- s.executables = ["agrep.rb"]
86
- s.default_executable = "agrep.rb"
87
-
88
- #### Documentation and testing.
89
-
90
- s.has_rdoc = true
91
- #s.extra_rdoc_files = FileList['ext/amatch.c']
92
- s.rdoc_options <<
93
- '--title' << 'Amatch -- Approximate Matching' <<
94
- '--main' << 'Amatch' <<
95
- '--line-numbers'
96
- s.test_files << 'tests/runner.rb'
97
-
98
- #### Author and project details.
82
+ s.author = "Florian Frank"
83
+ s.email = "flori@ping.de"
84
+ s.homepage = "http://amatch.rubyforge.org"
85
+ s.rubyforge_project = "amatch"
86
+ end
99
87
 
100
- s.author = "Florian Frank"
101
- s.email = "flori@ping.de"
102
- s.homepage = "http://amatch.rubyforge.org"
103
- s.rubyforge_project = "amatch"
88
+ Rake::GemPackageTask.new(spec) do |pkg|
89
+ pkg.need_tar = true
90
+ pkg.package_files += PKG_FILES
91
+ end
104
92
  end
105
93
 
106
- Rake::GemPackageTask.new(spec) do |pkg|
107
- pkg.need_tar = true
108
- pkg.package_files += PKG_FILES
94
+ desc m = "Writing version information for #{PKG_VERSION}"
95
+ task :version do
96
+ puts m
97
+ File.open(File.join('lib', 'amatch', 'version.rb'), 'w') do |v|
98
+ v.puts <<EOT
99
+ module Amatch
100
+ # Amatch version
101
+ VERSION = '#{PKG_VERSION}'
102
+ VERSION_ARRAY = VERSION.split(/\\./).map { |x| x.to_i } # :nodoc:
103
+ VERSION_MAJOR = VERSION_ARRAY[0] # :nodoc:
104
+ VERSION_MINOR = VERSION_ARRAY[1] # :nodoc:
105
+ VERSION_BUILD = VERSION_ARRAY[2] # :nodoc:
109
106
  end
107
+ EOT
108
+ end
109
+ end
110
+
110
111
 
111
- task :release => [ :clean, :package ]
112
+ desc "Prepare a new release"
113
+ task :release => [ :clean, :version, :package ]
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.2.3
1
+ 0.2.4
@@ -1,8 +1,4 @@
1
1
  #! /usr/bin/env ruby
2
- # vim: set et sw=2 ts=2:
3
- #
4
- ## $Id: agrep.rb,v 1.5 2006/09/26 15:59:48 flori Exp $
5
- #
6
2
 
7
3
  require 'amatch'
8
4
  require 'getoptlong'
@@ -0,0 +1,115 @@
1
+ == amatch - Approximate Matching Extension for Ruby
2
+
3
+ === Description
4
+
5
+ This is a collection of classes that can be used for Approximate
6
+ matching, searching, and comparing of Strings. They implement algorithms
7
+ that compute the Levenshtein edit distance, Sellers edit distance, the
8
+ Hamming distance, the longest common subsequence length, the longest common
9
+ substring length, the pair distance metric, and the Jaro and Jaro-Winkler metrics.
10
+
11
+ === Author
12
+
13
+ Florian Frank mailto:flori@ping.de
14
+
15
+ === License
16
+
17
+ This is free software; you can redistribute it and/or modify it under
18
+ the terms of the GNU General Public License Version 2 as published by
19
+ the Free Software Foundation: http://www.gnu.org/copyleft/gpl.html
20
+
21
+ === Download
22
+
23
+ The latest version of <b>amatch</b> can be found at
24
+
25
+ * http://rubyforge.org/frs/?group_id=390
26
+
27
+ Online Documentation should be located at
28
+
29
+ * http://amatch.rubyforge.org
30
+
31
+ === Examples
32
+ require 'amatch'
33
+ # => true
34
+ include Amatch
35
+ # => Object
36
+
37
+ m = Sellers.new("pattern")
38
+ # => #<Amatch::Sellers:0x40366324>
39
+ m.match("pattren")
40
+ # => 2.0
41
+ m.substitution = m.insertion = 3
42
+ # => 3
43
+ m.match("pattren")
44
+ # => 4.0
45
+ m.reset_weights
46
+ # => #<Amatch::Sellers:0x40366324>
47
+ m.match(["pattren","parent"])
48
+ # => [2.0, 4.0]
49
+ m.search("abcpattrendef")
50
+ # => 2.0
51
+
52
+ m = Levenshtein.new("pattern")
53
+ # => #<Amatch::Levenshtein:0x4035919c>
54
+ m.match("pattren")
55
+ # => 2
56
+ m.search("abcpattrendef")
57
+ # => 2
58
+ "pattern language".levenshtein_similar("language of patterns")
59
+ # => 0.2
60
+
61
+ m = Hamming.new("pattern")
62
+ # => #<Amatch::Hamming:0x40350858>
63
+ m.match("pattren")
64
+ # => 2
65
+ "pattern language".hamming_similar("language of patterns")
66
+ # => 0.1
67
+
68
+ m = PairDistance.new("pattern")
69
+ # => #<Amatch::PairDistance:0x40349be8>
70
+ m.match("pattr en")
71
+ # => 0.545454545454545
72
+ m.match("pattr en", nil)
73
+ # => 0.461538461538462
74
+ m.match("pattr en", /t+/)
75
+ # => 0.285714285714286
76
+ "pattern language".pair_distance_similar("language of patterns")
77
+ # => 0.928571428571429
78
+
79
+ m = LongestSubsequence.new("pattern")
80
+ # => #<Amatch::LongestSubsequence:0x4033e900>
81
+ m.match("pattren")
82
+ # => 6
83
+ "pattern language".longest_subsequence_similar("language of patterns")
84
+ # => 0.4
85
+
86
+ m = LongestSubstring.new("pattern")
87
+ # => #<Amatch::LongestSubstring:0x403378d0>
88
+ m.match("pattren")
89
+ # => 4
90
+ "pattern language".longest_substring_similar("language of patterns")
91
+ # => 0.4
92
+
93
+ m = Jaro.new("pattern")
94
+ # => #<Amatch::Jaro:0x363b70>
95
+ m.match("paTTren")
96
+ # => 0.952380952380952
97
+ m.ignore_case = false
98
+ m.match("paTTren")
99
+ # => 0.742857142857143
100
+ "pattern language".jaro_similar("language of patterns")
101
+ # => 0.672222222222222
102
+
103
+ m = JaroWinkler.new("pattern")
104
+ # => #<Amatch::JaroWinkler:0x3530b8>
105
+ m.match("paTTren")
106
+ # => 0.971428571712403
107
+ m.ignore_case = false
108
+ m.match("paTTren")
109
+ # => 0.79428571505206
110
+ m.scaling_factor = 0.05
111
+ m.match("pattren")
112
+ # => 0.961904762046678
113
+ "pattern language".jarowinkler_similar("language of patterns")
114
+ # => 0.672222222222222
115
+
@@ -1,5 +1,7 @@
1
1
  #include "ruby.h"
2
2
  #include "pair.h"
3
+ #include <ctype.h>
4
+ #include "common.h"
3
5
 
4
6
  /*
5
7
  * Document-method: pattern
@@ -19,7 +21,8 @@
19
21
 
20
22
 
21
23
  static VALUE rb_mAmatch, rb_cLevenshtein, rb_cSellers, rb_cHamming,
22
- rb_cPairDistance, rb_cLongestSubsequence, rb_cLongestSubstring;
24
+ rb_cPairDistance, rb_cLongestSubsequence, rb_cLongestSubstring,
25
+ rb_cJaro, rb_cJaroWinkler;
23
26
 
24
27
  static ID id_split, id_to_f;
25
28
 
@@ -62,10 +65,10 @@ static void type##_pattern_set(type *amatch, VALUE pattern) \
62
65
  { \
63
66
  Check_Type(pattern, T_STRING); \
64
67
  free(amatch->pattern); \
65
- amatch->pattern_len = RSTRING(pattern)->len; \
68
+ amatch->pattern_len = RSTRING_LEN(pattern); \
66
69
  amatch->pattern = ALLOC_N(char, amatch->pattern_len); \
67
- MEMCPY(amatch->pattern, RSTRING(pattern)->ptr, char, \
68
- RSTRING(pattern)->len); \
70
+ MEMCPY(amatch->pattern, RSTRING_PTR(pattern), char, \
71
+ RSTRING_LEN(pattern)); \
69
72
  } \
70
73
  static VALUE rb_##type##_pattern(VALUE self) \
71
74
  { \
@@ -80,16 +83,16 @@ static VALUE rb_##type##_pattern_set(VALUE self, VALUE pattern) \
80
83
  }
81
84
 
82
85
  #define DEF_ITERATE_STRINGS(type) \
83
- static VALUE type##_iterate_strings(type *amatch, VALUE strings, \
84
- VALUE (*match_function) (type *amatch, VALUE strings)) \
86
+ static VALUE type##_iterate_strings(type *amatch, VALUE strings, \
87
+ VALUE (*match_function) (type *amatch, VALUE strings)) \
85
88
  { \
86
89
  if (TYPE(strings) == T_STRING) { \
87
90
  return match_function(amatch, strings); \
88
91
  } else { \
89
92
  Check_Type(strings, T_ARRAY); \
90
93
  int i; \
91
- VALUE result = rb_ary_new2(RARRAY(strings)->len); \
92
- for (i = 0; i < RARRAY(strings)->len; i++) { \
94
+ VALUE result = rb_ary_new2(RARRAY_LEN(strings)); \
95
+ for (i = 0; i < RARRAY_LEN(strings); i++) { \
93
96
  VALUE string = rb_ary_entry(strings, i); \
94
97
  if (TYPE(string) != T_STRING) { \
95
98
  rb_raise(rb_eTypeError, \
@@ -130,17 +133,25 @@ VALUE function(VALUE self, VALUE value) \
130
133
  obj = rb_funcall(obj, id_to_f, 0, 0); \
131
134
  else \
132
135
  Check_Type(obj, T_FLOAT)
133
- #define FLOAT2C(obj) RFLOAT(obj)->value
136
+ #define FLOAT2C(obj) (RFLOAT_VALUE(obj))
137
+
138
+ #define CAST2BOOL(obj) \
139
+ if (obj == Qfalse || obj == Qnil) \
140
+ obj = Qfalse; \
141
+ else \
142
+ obj = Qtrue;
143
+ #define BOOL2C(obj) (obj == Qtrue)
144
+ #define C2BOOL(obj) (obj ? Qtrue : Qfalse)
134
145
 
135
146
  #define OPTIMIZE_TIME \
136
- if (amatch->pattern_len < RSTRING(string)->len) { \
147
+ if (amatch->pattern_len < RSTRING_LEN(string)) { \
137
148
  a_ptr = amatch->pattern; \
138
149
  a_len = amatch->pattern_len; \
139
- b_ptr = RSTRING(string)->ptr; \
140
- b_len = RSTRING(string)->len; \
150
+ b_ptr = RSTRING_PTR(string); \
151
+ b_len = RSTRING_LEN(string); \
141
152
  } else { \
142
- a_ptr = RSTRING(string)->ptr; \
143
- a_len = RSTRING(string)->len; \
153
+ a_ptr = RSTRING_PTR(string); \
154
+ a_len = RSTRING_LEN(string); \
144
155
  b_ptr = amatch->pattern; \
145
156
  b_len = amatch->pattern_len; \
146
157
  }
@@ -148,8 +159,8 @@ VALUE function(VALUE self, VALUE value) \
148
159
  #define DONT_OPTIMIZE \
149
160
  a_ptr = amatch->pattern; \
150
161
  a_len = amatch->pattern_len; \
151
- b_ptr = RSTRING(string)->ptr; \
152
- b_len = RSTRING(string)->len; \
162
+ b_ptr = RSTRING_PTR(string); \
163
+ b_len = RSTRING_LEN(string); \
153
164
 
154
165
  /*
155
166
  * C structures of the Amatch classes
@@ -192,6 +203,27 @@ typedef struct PairDistanceStruct {
192
203
  DEF_ALLOCATOR(PairDistance)
193
204
  DEF_PATTERN_ACCESSOR(PairDistance)
194
205
 
206
+ typedef struct JaroStruct {
207
+ char *pattern;
208
+ int pattern_len;
209
+ int ignore_case;
210
+ } Jaro;
211
+
212
+ DEF_ALLOCATOR(Jaro)
213
+ DEF_PATTERN_ACCESSOR(Jaro)
214
+ DEF_ITERATE_STRINGS(Jaro)
215
+
216
+ typedef struct JaroWinklerStruct {
217
+ char *pattern;
218
+ int pattern_len;
219
+ int ignore_case;
220
+ float scaling_factor;
221
+ } JaroWinkler;
222
+
223
+ DEF_ALLOCATOR(JaroWinkler)
224
+ DEF_PATTERN_ACCESSOR(JaroWinkler)
225
+ DEF_ITERATE_STRINGS(JaroWinkler)
226
+
195
227
  /*
196
228
  * Levenshtein edit distances are computed here:
197
229
  */
@@ -616,6 +648,123 @@ static VALUE LongestSubstring_similar(General *amatch, VALUE string)
616
648
  return rb_float_new(((double) result) / b_len);
617
649
  }
618
650
 
651
+ /*
652
+ * Jaro computation
653
+ */
654
+
655
+ #define COMPUTE_JARO \
656
+ l[0] = ALLOC_N(int, a_len); \
657
+ MEMZERO(l[0], int, a_len); \
658
+ l[1] = ALLOC_N(int, b_len); \
659
+ MEMZERO(l[1], int, b_len); \
660
+ max_dist = ((a_len > b_len ? a_len : b_len) / 2) - 1; \
661
+ m = 0; \
662
+ for (i = 0; i < a_len; i++) { \
663
+ low = (i > max_dist ? i - max_dist : 0); \
664
+ high = (i + max_dist < b_len ? i + max_dist : b_len); \
665
+ for (j = low; j <= high; j++) { \
666
+ if (!l[1][j] && a_ptr[i] == b_ptr[j]) { \
667
+ l[0][i] = 1; \
668
+ l[1][j] = 1; \
669
+ m++; \
670
+ break; \
671
+ } \
672
+ } \
673
+ } \
674
+ if (m == 0) { \
675
+ result = 0.0; \
676
+ } else { \
677
+ k = t = 0; \
678
+ for (i = 0; i < a_len; i++) { \
679
+ if (l[0][i]) { \
680
+ for (j = k; j < b_len; j++) { \
681
+ if (l[1][j]) { \
682
+ k = j + 1; \
683
+ break; \
684
+ } \
685
+ } \
686
+ if (a_ptr[i] != b_ptr[j]) { \
687
+ t++; \
688
+ } \
689
+ } \
690
+ } \
691
+ t = t / 2; \
692
+ result = (((double)m)/a_len + ((double)m)/b_len + ((double)(m-t))/m)/3.0; \
693
+ }
694
+
695
+ #define LOWERCASE_STRINGS \
696
+ char *ying = ALLOC_N(char, a_len); \
697
+ MEMCPY(ying, a_ptr, char, a_len); \
698
+ a_ptr = ying; \
699
+ char *yang = ALLOC_N(char, b_len); \
700
+ MEMCPY(yang, b_ptr, char, b_len); \
701
+ b_ptr = yang; \
702
+ for (i = 0; i < a_len; i++) { \
703
+ if (islower(a_ptr[i])) a_ptr[i] = toupper(a_ptr[i]); \
704
+ } \
705
+ for (i = 0; i < b_len; i++) { \
706
+ if (islower(b_ptr[i])) b_ptr[i] = toupper(b_ptr[i]); \
707
+ }
708
+
709
+ #define FREE_STRINGS \
710
+ xfree(a_ptr); \
711
+ xfree(b_ptr);
712
+
713
+ static VALUE Jaro_match(Jaro *amatch, VALUE string)
714
+ {
715
+ char *a_ptr, *b_ptr;
716
+ int a_len, b_len, max_dist, m, t, i, j, k, low, high;
717
+ int *l[2];
718
+ double result;
719
+
720
+ Check_Type(string, T_STRING);
721
+ OPTIMIZE_TIME
722
+ if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
723
+ if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
724
+ if (amatch->ignore_case) {
725
+ LOWERCASE_STRINGS
726
+ }
727
+ COMPUTE_JARO
728
+ if (amatch->ignore_case) {
729
+ FREE_STRINGS
730
+ }
731
+ return rb_float_new(result);
732
+ }
733
+
734
+ /*
735
+ * Jaro-Winkler computation
736
+ */
737
+
738
+ static VALUE JaroWinkler_match(JaroWinkler *amatch, VALUE string)
739
+ {
740
+ char *a_ptr, *b_ptr;
741
+ int a_len, b_len, max_dist, m, t, i, j, k, low, high, n;
742
+ int *l[2];
743
+ double result;
744
+
745
+ Check_Type(string, T_STRING);
746
+ OPTIMIZE_TIME
747
+ if (a_len == 0 && b_len == 0) return rb_float_new(1.0);
748
+ if (a_len == 0 || b_len == 0) return rb_float_new(0.0);
749
+ if (amatch->ignore_case) {
750
+ LOWERCASE_STRINGS
751
+ }
752
+ COMPUTE_JARO
753
+ n = 0;
754
+ for (i = 0; i < (a_len >= 4 ? 4 : a_len); i++) {
755
+ if (a_ptr[i] == b_ptr[i]) {
756
+ n++;
757
+ } else {
758
+ break;
759
+ }
760
+ }
761
+ result = result + n*amatch->scaling_factor*(1-result);
762
+ if (amatch->ignore_case) {
763
+ FREE_STRINGS
764
+ }
765
+ return rb_float_new(result);
766
+ }
767
+
619
768
  /*
620
769
  * Ruby API
621
770
  */
@@ -951,8 +1100,8 @@ static VALUE rb_PairDistance_match(int argc, VALUE *argv, VALUE self)
951
1100
  } else {
952
1101
  Check_Type(strings, T_ARRAY);
953
1102
  int i;
954
- result = rb_ary_new2(RARRAY(strings)->len);
955
- for (i = 0; i < RARRAY(strings)->len; i++) {
1103
+ result = rb_ary_new2(RARRAY_LEN(strings));
1104
+ for (i = 0; i < RARRAY_LEN(strings); i++) {
956
1105
  VALUE string = rb_ary_entry(strings, i);
957
1106
  if (TYPE(string) != T_STRING) {
958
1107
  rb_raise(rb_eTypeError,
@@ -1214,104 +1363,183 @@ static VALUE rb_str_longest_substring_similar(VALUE self, VALUE strings)
1214
1363
  VALUE amatch = rb_LongestSubstring_new(rb_cLongestSubstring, self);
1215
1364
  return rb_LongestSubstring_similar(amatch, strings);
1216
1365
  }
1366
+
1367
+ /*
1368
+ * Document-class: Amatch::Jaro
1369
+ *
1370
+ * This class computes the Jaro metric for two strings.
1371
+ * The Jaro metric computes the similarity between 0 (no match)
1372
+ * and 1 (exact match) by looking for matching and transposed characters.
1373
+ */
1374
+ DEF_RB_FREE(Jaro, Jaro)
1375
+
1376
+ /*
1377
+ * Document-method: ignore_case
1378
+ *
1379
+ * call-seq: ignore_case -> true/false
1380
+ *
1381
+ * Returns whether case is ignored when computing matching characters.
1382
+ */
1383
+ DEF_RB_READER(Jaro, rb_Jaro_ignore_case, ignore_case, C2BOOL)
1384
+
1385
+ /*
1386
+ * Document-method: ignore_case=
1387
+ *
1388
+ * call-seq: ignore_case=(true/false)
1389
+ *
1390
+ * Sets whether case is ignored when computing matching characters.
1391
+ */
1392
+ DEF_RB_WRITER(Jaro, rb_Jaro_ignore_case_set, ignore_case,
1393
+ int, CAST2BOOL, BOOL2C, != Qundef)
1394
+
1395
+ /*
1396
+ * call-seq: new(pattern)
1397
+ *
1398
+ * Creates a new Amatch::Jaro instance from <code>pattern</code>.
1399
+ */
1400
+ static VALUE rb_Jaro_initialize(VALUE self, VALUE pattern)
1401
+ {
1402
+ GET_STRUCT(Jaro)
1403
+ Jaro_pattern_set(amatch, pattern);
1404
+ amatch->ignore_case = 1;
1405
+ return self;
1406
+ }
1407
+
1408
+ DEF_CONSTRUCTOR(Jaro, Jaro)
1217
1409
 
1218
1410
  /*
1219
- * = amatch - Approximate Matching Extension for Ruby
1411
+ * call-seq: match(strings) -> results
1220
1412
  *
1221
- * == Description
1413
+ * Uses this Amatch::Jaro instance to match
1414
+ * Jaro#pattern against <code>strings</code>, that is compute the
1415
+ * jaro metric with the strings. <code>strings</code> has to be
1416
+ * either a String or an Array of Strings. The returned <code>results</code>
1417
+ * are either a Float or an Array of Floats respectively.
1418
+ */
1419
+ static VALUE rb_Jaro_match(VALUE self, VALUE strings)
1420
+ {
1421
+ GET_STRUCT(Jaro)
1422
+ return Jaro_iterate_strings(amatch, strings, Jaro_match);
1423
+ }
1424
+
1425
+ /*
1426
+ * call-seq: jaro_similar(strings) -> results
1222
1427
  *
1223
- * This is a collection of classes that can be used for Approximate
1224
- * matching, searching, and comparing of Strings. They implement algorithms
1225
- * that compute the Levenshtein edit distance, Sellers edit distance, the
1226
- * Hamming distance, the longest common subsequence length, the longest common
1227
- * substring length, and the pair distance metric.
1428
+ * If called on a String, this string is used as an
1429
+ * Amatch::Jaro#pattern to match against <code>strings</code>. It
1430
+ * returns a Jaro metric number between 0.0 for very
1431
+ * dissimilar strings and 1.0 for an exact match. <code>strings</code> has to be
1432
+ * either a String or an Array of Strings. The returned <code>results</code>
1433
+ * are either a Float or an Array of Floats respectively.
1434
+ */
1435
+ static VALUE rb_str_jaro_similar(VALUE self, VALUE strings)
1436
+ {
1437
+ VALUE amatch = rb_Jaro_new(rb_cJaro, self);
1438
+ return rb_Jaro_match(amatch, strings);
1439
+ }
1440
+
1441
+ /*
1442
+ * Document-class: Amatch::JaroWinkler
1228
1443
  *
1229
- * == Author
1444
+ * This class computes the Jaro-Winkler metric for two strings.
1445
+ * The Jaro-Winkler metric computes the similarity between 0 (no match)
1446
+ * and 1 (exact match) by looking for matching and transposed characters.
1230
1447
  *
1231
- * Florian Frank mailto:flori@ping.de
1448
+ * It is a variant of the Jaro metric, with additional weighting towards
1449
+ * common prefixes.
1450
+ */
1451
+ DEF_RB_FREE(JaroWinkler, JaroWinkler)
1452
+
1453
+ /*
1454
+ * Document-method: ignore_case
1232
1455
  *
1233
- * == License
1456
+ * call-seq: ignore_case -> true/false
1234
1457
  *
1235
- * This is free software; you can redistribute it and/or modify it under
1236
- * the terms of the GNU General Public License Version 2 as published by
1237
- * the Free Software Foundation: http://www.gnu.org/copyleft/gpl.html
1458
+ * Returns whether case is ignored when computing matching characters.
1459
+ * Default is true.
1460
+ */
1461
+ DEF_RB_READER(JaroWinkler, rb_JaroWinkler_ignore_case, ignore_case, C2BOOL)
1462
+
1463
+ /*
1464
+ * Document-method: scaling_factor
1238
1465
  *
1239
- * == Download
1466
+ * call-seq: scaling_factor -> weight
1240
1467
  *
1241
- * The latest version of <b>amatch</b> can be found at
1468
+ * The scaling factor is how much weight to give common prefixes.
1469
+ * Default is 0.1.
1470
+ */
1471
+ DEF_RB_READER(JaroWinkler, rb_JaroWinkler_scaling_factor, scaling_factor, rb_float_new)
1472
+
1473
+ /*
1474
+ * Document-method: ignore_case=
1242
1475
  *
1243
- * * http://rubyforge.org/frs/?group_id=390
1476
+ * call-seq: ignore_case=(true/false)
1244
1477
  *
1245
- * Online Documentation should be located at
1478
+ * Sets whether case is ignored when computing matching characters.
1479
+ */
1480
+ DEF_RB_WRITER(JaroWinkler, rb_JaroWinkler_ignore_case_set, ignore_case,
1481
+ int, CAST2BOOL, BOOL2C, != Qundef)
1482
+
1483
+ /*
1484
+ * Document-method: scaling_factor=
1485
+ *
1486
+ * call-seq: scaling_factor=(weight)
1487
+ *
1488
+ * Sets the weight to give common prefixes.
1489
+ */
1490
+ DEF_RB_WRITER(JaroWinkler, rb_JaroWinkler_scaling_factor_set, scaling_factor,
1491
+ double, CAST2FLOAT, FLOAT2C, >= 0)
1492
+
1493
+ /*
1494
+ * call-seq: new(pattern)
1246
1495
  *
1247
- * * http://amatch.rubyforge.org
1496
+ * Creates a new Amatch::JaroWinkler instance from <code>pattern</code>.
1497
+ */
1498
+ static VALUE rb_JaroWinkler_initialize(VALUE self, VALUE pattern)
1499
+ {
1500
+ GET_STRUCT(JaroWinkler)
1501
+ JaroWinkler_pattern_set(amatch, pattern);
1502
+ amatch->ignore_case = 1;
1503
+ amatch->scaling_factor = 0.1;
1504
+ return self;
1505
+ }
1506
+
1507
+ DEF_CONSTRUCTOR(JaroWinkler, JaroWinkler)
1508
+
1509
+ /*
1510
+ * call-seq: match(strings) -> results
1248
1511
  *
1249
- * == Examples
1250
- * require 'amatch'
1251
- * # => true
1252
- * include Amatch
1253
- * # => Object
1254
- *
1255
- * m = Sellers.new("pattern")
1256
- * # => #<Amatch::Sellers:0x40366324>
1257
- * m.match("pattren")
1258
- * # => 2.0
1259
- * m.substitution = m.insertion = 3
1260
- * # => 3
1261
- * m.match("pattren")
1262
- * # => 4.0
1263
- * m.reset_weights
1264
- * # => #<Amatch::Sellers:0x40366324>
1265
- * m.match(["pattren","parent"])
1266
- * # => [2.0, 4.0]
1267
- * m.search("abcpattrendef")
1268
- * # => 2.0
1269
- *
1270
- * m = Levenshtein.new("pattern")
1271
- * # => #<Amatch::Levenshtein:0x4035919c>
1272
- * m.match("pattren")
1273
- * # => 2
1274
- * m.search("abcpattrendef")
1275
- * # => 2
1276
- * "pattern language".levenshtein_similar("language of patterns")
1277
- * # => 0.2
1278
- *
1279
- * m = Hamming.new("pattern")
1280
- * # => #<Amatch::Hamming:0x40350858>
1281
- * m.match("pattren")
1282
- * # => 2
1283
- * "pattern language".hamming_similar("language of patterns")
1284
- * # => 0.1
1285
- *
1286
- * m = PairDistance.new("pattern")
1287
- * # => #<Amatch::PairDistance:0x40349be8>
1288
- * m.match("pattr en")
1289
- * # => 0.545454545454545
1290
- * m.match("pattr en", nil)
1291
- * # => 0.461538461538462
1292
- * m.match("pattr en", /t+/)
1293
- * # => 0.285714285714286
1294
- * "pattern language".pair_distance_similar("language of patterns")
1295
- * # => 0.928571428571429
1296
- *
1297
- * m = LongestSubsequence.new("pattern")
1298
- * # => #<Amatch::LongestSubsequence:0x4033e900>
1299
- * m.match("pattren")
1300
- * # => 6
1301
- * "pattern language".longest_subsequence_similar("language of patterns")
1302
- * # => 0.4
1303
- *
1304
- * m = LongestSubstring.new("pattern")
1305
- * # => #<Amatch::LongestSubstring:0x403378d0>
1306
- * m.match("pattren")
1307
- * # => 4
1308
- * "pattern language".longest_substring_similar("language of patterns")
1309
- * # => 0.4
1512
+ * Uses this Amatch::JaroWinkler instance to match
1513
+ * JaroWinkler#pattern against <code>strings</code>, that is compute the
1514
+ * Jaro-Winkler metric with the strings. <code>strings</code> has to be
1515
+ * either a String or an Array of Strings. The returned <code>results</code>
1516
+ * are either a Float or an Array of Floats respectively.
1517
+ */
1518
+ static VALUE rb_JaroWinkler_match(VALUE self, VALUE strings)
1519
+ {
1520
+ GET_STRUCT(JaroWinkler)
1521
+ return JaroWinkler_iterate_strings(amatch, strings, JaroWinkler_match);
1522
+ }
1523
+
1524
+ /*
1525
+ * call-seq: jarowinkler_similar(strings) -> results
1310
1526
  *
1527
+ * If called on a String, this string is used as an
1528
+ * Amatch::JaroWinkler#pattern to match against <code>strings</code>. It
1529
+ * returns a Jaro-Winkler metric number between 0.0 for very
1530
+ * dissimilar strings and 1.0 for an exact match. <code>strings</code> has to be
1531
+ * either a String or an Array of Strings. The returned <code>results</code>
1532
+ * are either a Float or an Array of Floats respectively.
1311
1533
  */
1534
+ static VALUE rb_str_jarowinkler_similar(VALUE self, VALUE strings)
1535
+ {
1536
+ VALUE amatch = rb_JaroWinkler_new(rb_cJaro, self);
1537
+ return rb_JaroWinkler_match(amatch, strings);
1538
+ }
1312
1539
 
1313
1540
  void Init_amatch()
1314
1541
  {
1542
+ rb_require("amatch/version");
1315
1543
  rb_mAmatch = rb_define_module("Amatch");
1316
1544
 
1317
1545
  /* Levenshtein */
@@ -1382,7 +1610,32 @@ void Init_amatch()
1382
1610
  rb_define_method(rb_cLongestSubstring, "similar", rb_LongestSubstring_similar, 1);
1383
1611
  rb_define_method(rb_cString, "longest_substring_similar", rb_str_longest_substring_similar, 1);
1384
1612
 
1613
+ /* Jaro */
1614
+ rb_cJaro = rb_define_class_under(rb_mAmatch, "Jaro", rb_cObject);
1615
+ rb_define_alloc_func(rb_cJaro, rb_Jaro_s_allocate);
1616
+ rb_define_method(rb_cJaro, "initialize", rb_Jaro_initialize, 1);
1617
+ rb_define_method(rb_cJaro, "pattern", rb_Jaro_pattern, 0);
1618
+ rb_define_method(rb_cJaro, "pattern=", rb_Jaro_pattern_set, 1);
1619
+ rb_define_method(rb_cJaro, "ignore_case", rb_Jaro_ignore_case, 0);
1620
+ rb_define_method(rb_cJaro, "ignore_case=", rb_Jaro_ignore_case_set, 1);
1621
+ rb_define_method(rb_cJaro, "match", rb_Jaro_match, 1);
1622
+ rb_define_alias(rb_cJaro, "similar", "match");
1623
+ rb_define_method(rb_cString, "jaro_similar", rb_str_jaro_similar, 1);
1624
+
1625
+ /* Jaro-Winkler */
1626
+ rb_cJaroWinkler = rb_define_class_under(rb_mAmatch, "JaroWinkler", rb_cObject);
1627
+ rb_define_alloc_func(rb_cJaroWinkler, rb_JaroWinkler_s_allocate);
1628
+ rb_define_method(rb_cJaroWinkler, "initialize", rb_JaroWinkler_initialize, 1);
1629
+ rb_define_method(rb_cJaroWinkler, "pattern", rb_JaroWinkler_pattern, 0);
1630
+ rb_define_method(rb_cJaroWinkler, "pattern=", rb_JaroWinkler_pattern_set, 1);
1631
+ rb_define_method(rb_cJaroWinkler, "ignore_case", rb_JaroWinkler_ignore_case, 0);
1632
+ rb_define_method(rb_cJaroWinkler, "ignore_case=", rb_JaroWinkler_ignore_case_set, 1);
1633
+ rb_define_method(rb_cJaroWinkler, "scaling_factor", rb_JaroWinkler_scaling_factor, 0);
1634
+ rb_define_method(rb_cJaroWinkler, "scaling_factor=", rb_JaroWinkler_scaling_factor_set, 1);
1635
+ rb_define_method(rb_cJaroWinkler, "match", rb_JaroWinkler_match, 1);
1636
+ rb_define_alias(rb_cJaroWinkler, "similar", "match");
1637
+ rb_define_method(rb_cString, "jarowinkler_similar", rb_str_jarowinkler_similar, 1);
1638
+
1385
1639
  id_split = rb_intern("split");
1386
1640
  id_to_f = rb_intern("to_f");
1387
1641
  }
1388
- /* vim: set et cin sw=4 ts=4: */