amatch 0.2.8 → 0.2.9

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,6 +1,7 @@
1
1
  rvm:
2
2
  - 1.8.7
3
3
  - 1.9.2
4
+ - 1.9.3
4
5
  - ruby-head
5
6
  - ree
6
7
  - rbx
data/CHANGES CHANGED
@@ -1,3 +1,10 @@
1
+ 2011-11-14 (0.2.9)
2
+ * Provide amatch/rude and amatch/polite for require (the latter doesn't
3
+ extend ::String on its own)
4
+ * pair_distance_similar method now can take an optional regexp argument for
5
+ tokenizing.
6
+ 2011-09-26 (0.2.8)
7
+ * Depend on tins library.
1
8
  2011-08-06 (0.2.7)
2
9
  * Fix some violations of ISO C90 standard.
3
10
  2011-07-16 (0.2.6)
data/VERSION CHANGED
@@ -1 +1 @@
1
- 0.2.8
1
+ 0.2.9
@@ -2,38 +2,38 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = "amatch"
5
- s.version = "0.2.8"
5
+ s.version = "0.2.9"
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = ["Florian Frank"]
9
- s.date = "2011-09-25"
9
+ s.date = "2011-11-15"
10
10
  s.description = "Amatch is a library for approximate string matching and searching in strings.\nSeveral algorithms can be used to do this, and it's also possible to compute a\nsimilarity metric number between 0.0 and 1.0 for two given strings.\n"
11
11
  s.email = "flori@ping.de"
12
12
  s.executables = ["agrep.rb"]
13
13
  s.extensions = ["ext/extconf.rb"]
14
- s.extra_rdoc_files = ["README.rdoc", "lib/amatch/version.rb", "ext/amatch.c", "ext/pair.c"]
15
- s.files = [".gitignore", ".travis.yml", "CHANGES", "COPYING", "Gemfile", "README.rdoc", "Rakefile", "VERSION", "amatch.gemspec", "bin/agrep.rb", "ext/amatch.c", "ext/common.h", "ext/extconf.rb", "ext/pair.c", "ext/pair.h", "install.rb", "lib/amatch/.keep", "lib/amatch/version.rb", "tests/test_hamming.rb", "tests/test_jaro.rb", "tests/test_jaro_winkler.rb", "tests/test_levenshtein.rb", "tests/test_longest_subsequence.rb", "tests/test_longest_substring.rb", "tests/test_pair_distance.rb", "tests/test_sellers.rb"]
14
+ s.extra_rdoc_files = ["README.rdoc", "lib/amatch/polite.rb", "lib/amatch/rude.rb", "lib/amatch/version.rb", "lib/amatch.rb", "ext/amatch_ext.c", "ext/pair.c"]
15
+ s.files = [".gitignore", ".travis.yml", "CHANGES", "COPYING", "Gemfile", "README.rdoc", "Rakefile", "VERSION", "amatch.gemspec", "bin/agrep.rb", "ext/amatch_ext.c", "ext/common.h", "ext/extconf.rb", "ext/pair.c", "ext/pair.h", "install.rb", "lib/amatch.rb", "lib/amatch/.keep", "lib/amatch/polite.rb", "lib/amatch/rude.rb", "lib/amatch/version.rb", "tests/test_hamming.rb", "tests/test_jaro.rb", "tests/test_jaro_winkler.rb", "tests/test_levenshtein.rb", "tests/test_longest_subsequence.rb", "tests/test_longest_substring.rb", "tests/test_pair_distance.rb", "tests/test_sellers.rb"]
16
16
  s.homepage = "http://github.com/flori/amatch"
17
17
  s.rdoc_options = ["--title", "Amatch - Approximate Matching", "--main", "README.rdoc"]
18
18
  s.require_paths = ["lib", "ext"]
19
19
  s.rubygems_version = "1.8.10"
20
20
  s.summary = "Approximate String Matching library"
21
- s.test_files = ["tests/test_sellers.rb", "tests/test_jaro.rb", "tests/test_longest_subsequence.rb", "tests/test_longest_substring.rb", "tests/test_hamming.rb", "tests/test_pair_distance.rb", "tests/test_levenshtein.rb", "tests/test_jaro_winkler.rb"]
21
+ s.test_files = ["tests/test_hamming.rb", "tests/test_jaro.rb", "tests/test_jaro_winkler.rb", "tests/test_levenshtein.rb", "tests/test_longest_subsequence.rb", "tests/test_longest_substring.rb", "tests/test_pair_distance.rb", "tests/test_sellers.rb"]
22
22
 
23
23
  if s.respond_to? :specification_version then
24
24
  s.specification_version = 3
25
25
 
26
26
  if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
27
- s.add_development_dependency(%q<gem_hadar>, ["~> 0.1.0"])
27
+ s.add_development_dependency(%q<gem_hadar>, ["~> 0.1.1"])
28
28
  s.add_development_dependency(%q<test-unit>, ["~> 2.3"])
29
29
  s.add_runtime_dependency(%q<tins>, ["~> 0.3"])
30
30
  else
31
- s.add_dependency(%q<gem_hadar>, ["~> 0.1.0"])
31
+ s.add_dependency(%q<gem_hadar>, ["~> 0.1.1"])
32
32
  s.add_dependency(%q<test-unit>, ["~> 2.3"])
33
33
  s.add_dependency(%q<tins>, ["~> 0.3"])
34
34
  end
35
35
  else
36
- s.add_dependency(%q<gem_hadar>, ["~> 0.1.0"])
36
+ s.add_dependency(%q<gem_hadar>, ["~> 0.1.1"])
37
37
  s.add_dependency(%q<test-unit>, ["~> 2.3"])
38
38
  s.add_dependency(%q<tins>, ["~> 0.3"])
39
39
  end
@@ -20,7 +20,7 @@
20
20
  */
21
21
 
22
22
 
23
- static VALUE rb_mAmatch, rb_cLevenshtein, rb_cSellers, rb_cHamming,
23
+ static VALUE rb_mAmatch, rb_mAmatchStringMethods, rb_cLevenshtein, rb_cSellers, rb_cHamming,
24
24
  rb_cPairDistance, rb_cLongestSubsequence, rb_cLongestSubstring,
25
25
  rb_cJaro, rb_cJaroWinkler;
26
26
 
@@ -55,17 +55,17 @@ VALUE rb_##klass##_new(VALUE klass2, VALUE pattern) \
55
55
  static void rb_##klass##_free(type *amatch) \
56
56
  { \
57
57
  MEMZERO(amatch->pattern, char, amatch->pattern_len); \
58
- free(amatch->pattern); \
58
+ xfree(amatch->pattern); \
59
59
  MEMZERO(amatch, type, 1); \
60
- free(amatch); \
60
+ xfree(amatch); \
61
61
  }
62
62
 
63
63
  #define DEF_PATTERN_ACCESSOR(type) \
64
64
  static void type##_pattern_set(type *amatch, VALUE pattern) \
65
65
  { \
66
66
  Check_Type(pattern, T_STRING); \
67
- free(amatch->pattern); \
68
- amatch->pattern_len = RSTRING_LEN(pattern); \
67
+ xfree(amatch->pattern); \
68
+ amatch->pattern_len = (int) RSTRING_LEN(pattern); \
69
69
  amatch->pattern = ALLOC_N(char, amatch->pattern_len); \
70
70
  MEMCPY(amatch->pattern, RSTRING_PTR(pattern), char, \
71
71
  RSTRING_LEN(pattern)); \
@@ -147,21 +147,21 @@ VALUE function(VALUE self, VALUE value) \
147
147
  #define OPTIMIZE_TIME \
148
148
  if (amatch->pattern_len < RSTRING_LEN(string)) { \
149
149
  a_ptr = amatch->pattern; \
150
- a_len = amatch->pattern_len; \
150
+ a_len = (int) amatch->pattern_len; \
151
151
  b_ptr = RSTRING_PTR(string); \
152
- b_len = RSTRING_LEN(string); \
152
+ b_len = (int) RSTRING_LEN(string); \
153
153
  } else { \
154
154
  a_ptr = RSTRING_PTR(string); \
155
- a_len = RSTRING_LEN(string); \
155
+ a_len = (int) RSTRING_LEN(string); \
156
156
  b_ptr = amatch->pattern; \
157
- b_len = amatch->pattern_len; \
157
+ b_len = (int) amatch->pattern_len; \
158
158
  }
159
159
 
160
160
  #define DONT_OPTIMIZE \
161
161
  a_ptr = amatch->pattern; \
162
- a_len = amatch->pattern_len; \
162
+ a_len = (int) amatch->pattern_len; \
163
163
  b_ptr = RSTRING_PTR(string); \
164
- b_len = RSTRING_LEN(string); \
164
+ b_len = (int) RSTRING_LEN(string); \
165
165
 
166
166
  /*
167
167
  * C structures of the Amatch classes
@@ -215,10 +215,10 @@ DEF_PATTERN_ACCESSOR(Jaro)
215
215
  DEF_ITERATE_STRINGS(Jaro)
216
216
 
217
217
  typedef struct JaroWinklerStruct {
218
- char *pattern;
219
- int pattern_len;
220
- int ignore_case;
221
- float scaling_factor;
218
+ char *pattern;
219
+ int pattern_len;
220
+ int ignore_case;
221
+ double scaling_factor;
222
222
  } JaroWinkler;
223
223
 
224
224
  DEF_ALLOCATOR(JaroWinkler)
@@ -271,8 +271,8 @@ static VALUE Levenshtein_match(General *amatch, VALUE string)
271
271
 
272
272
  result = INT2FIX(v[p][b_len]);
273
273
 
274
- free(v[0]);
275
- free(v[1]);
274
+ xfree(v[0]);
275
+ xfree(v[1]);
276
276
 
277
277
  return result;
278
278
  }
@@ -303,8 +303,8 @@ static VALUE Levenshtein_similar(General *amatch, VALUE string)
303
303
  } else {
304
304
  result = rb_float_new(1.0 - ((double) v[p][b_len]) / a_len);
305
305
  }
306
- free(v[0]);
307
- free(v[1]);
306
+ xfree(v[0]);
307
+ xfree(v[1]);
308
308
  return result;
309
309
  }
310
310
 
@@ -332,8 +332,8 @@ static VALUE Levenshtein_search(General *amatch, VALUE string)
332
332
 
333
333
  result = INT2FIX(min);
334
334
 
335
- free(v[0]);
336
- free(v[1]);
335
+ xfree(v[0]);
336
+ xfree(v[1]);
337
337
 
338
338
  return result;
339
339
  }
@@ -385,8 +385,8 @@ static VALUE Sellers_match(Sellers *amatch, VALUE string)
385
385
  COMPUTE_SELLERS_DISTANCE
386
386
 
387
387
  result = rb_float_new(v[p][b_len]);
388
- free(v[0]);
389
- free(v[1]);
388
+ xfree(v[0]);
389
+ xfree(v[1]);
390
390
  return result;
391
391
  }
392
392
 
@@ -430,8 +430,8 @@ static VALUE Sellers_similar(Sellers *amatch, VALUE string)
430
430
  } else {
431
431
  result = rb_float_new(1.0 - v[p][b_len] / (a_len * max_weight));
432
432
  }
433
- free(v[0]);
434
- free(v[1]);
433
+ xfree(v[0]);
434
+ xfree(v[1]);
435
435
  return result;
436
436
  }
437
437
 
@@ -457,8 +457,8 @@ static VALUE Sellers_search(Sellers *amatch, VALUE string)
457
457
  if (v[p][i] < min) min = v[p][i];
458
458
  }
459
459
  result = rb_float_new(min);
460
- free(v[0]);
461
- free(v[1]);
460
+ xfree(v[0]);
461
+ xfree(v[1]);
462
462
 
463
463
  return result;
464
464
  }
@@ -467,8 +467,7 @@ static VALUE Sellers_search(Sellers *amatch, VALUE string)
467
467
  * Pair distances are computed here:
468
468
  */
469
469
 
470
- static VALUE PairDistance_match(
471
- PairDistance *amatch, VALUE string, VALUE regexp, int use_regexp)
470
+ static VALUE PairDistance_match(PairDistance *amatch, VALUE string, VALUE regexp, int use_regexp)
472
471
  {
473
472
  double result;
474
473
  VALUE tokens;
@@ -564,8 +563,8 @@ static VALUE Hamming_similar(General *amatch, VALUE string)
564
563
  c = (c + 1) % 2; \
565
564
  } \
566
565
  result = l[p][0]; \
567
- free(l[0]); \
568
- free(l[1]);
566
+ xfree(l[0]); \
567
+ xfree(l[1]);
569
568
 
570
569
 
571
570
  static VALUE LongestSubsequence_match(General *amatch, VALUE string)
@@ -619,8 +618,8 @@ static VALUE LongestSubsequence_similar(General *amatch, VALUE string)
619
618
  p = c; \
620
619
  c = (c + 1) % 2; \
621
620
  } \
622
- free(l[0]); \
623
- free(l[1]);
621
+ xfree(l[0]); \
622
+ xfree(l[1]);
624
623
 
625
624
  static VALUE LongestSubstring_match(General *amatch, VALUE string)
626
625
  {
@@ -692,8 +691,8 @@ static VALUE LongestSubstring_similar(General *amatch, VALUE string)
692
691
  t = t / 2; \
693
692
  result = (((double)m)/a_len + ((double)m)/b_len + ((double)(m-t))/m)/3.0; \
694
693
  } \
695
- free(l[0]); \
696
- free(l[1]);
694
+ xfree(l[0]); \
695
+ xfree(l[1]);
697
696
 
698
697
 
699
698
  #define LOWERCASE_STRINGS \
@@ -811,7 +810,7 @@ DEF_CONSTRUCTOR(Levenshtein, General)
811
810
  * Uses this Amatch::Levenshtein instance to match Amatch::Levenshtein#pattern
812
811
  * against <code>strings</code>. It returns the number operations, the Sellers
813
812
  * distance. <code>strings</code> has to be either a String or an Array of
814
- * Strings. The returned <code>results</code> are either a Float or an Array of
813
+ * Strings. The returned <code>results</code> is either a Float or an Array of
815
814
  * Floats respectively.
816
815
  */
817
816
  static VALUE rb_Levenshtein_match(VALUE self, VALUE strings)
@@ -827,7 +826,7 @@ static VALUE rb_Levenshtein_match(VALUE self, VALUE strings)
827
826
  * against <code>strings</code>, and compute a Levenshtein distance metric
828
827
  * number between 0.0 for very unsimilar strings and 1.0 for an exact match.
829
828
  * <code>strings</code> has to be either a String or an Array of Strings. The
830
- * returned <code>results</code> are either a Fixnum or an Array of Fixnums
829
+ * returned <code>results</code> is either a Fixnum or an Array of Fixnums
831
830
  * respectively.
832
831
  */
833
832
  static VALUE rb_Levenshtein_similar(VALUE self, VALUE strings)
@@ -843,7 +842,7 @@ static VALUE rb_Levenshtein_similar(VALUE self, VALUE strings)
843
842
  * to match against <code>strings</code>. It returns a Levenshtein distance
844
843
  * metric number between 0.0 for very unsimilar strings and 1.0 for an exact
845
844
  * match. <code>strings</code> has to be either a String or an Array of
846
- * Strings. The returned <code>results</code> are either a Float or an Array of
845
+ * Strings. The returned <code>results</code> is either a Float or an Array of
847
846
  * Floats respectively.
848
847
  */
849
848
  static VALUE rb_str_levenshtein_similar(VALUE self, VALUE strings)
@@ -859,7 +858,7 @@ static VALUE rb_str_levenshtein_similar(VALUE self, VALUE strings)
859
858
  * edit distance (the sum of character operations) as a Fixnum value, by greedy
860
859
  * trimming prefixes or postfixes of the match. <code>strings</code> has
861
860
  * to be either a String or an Array of Strings. The returned
862
- * <code>results</code> are either a Float or an Array of Floats respectively.
861
+ * <code>results</code> is either a Float or an Array of Floats respectively.
863
862
  */
864
863
  static VALUE rb_Levenshtein_search(VALUE self, VALUE strings)
865
864
  {
@@ -998,7 +997,7 @@ DEF_CONSTRUCTOR(Sellers, Sellers)
998
997
  * <code>strings</code>, while taking into account the given weights. It
999
998
  * returns the number of weighted character operations, the Sellers distance.
1000
999
  * <code>strings</code> has to be either a String or an Array of Strings. The
1001
- * returned <code>results</code> are either a Float or an Array of Floats
1000
+ * returned <code>results</code> is either a Float or an Array of Floats
1002
1001
  * respectively.
1003
1002
  */
1004
1003
  static VALUE rb_Sellers_match(VALUE self, VALUE strings)
@@ -1014,7 +1013,7 @@ static VALUE rb_Sellers_match(VALUE self, VALUE strings)
1014
1013
  * against <code>strings</code> (taking into account the given weights), and
1015
1014
  * compute a Sellers distance metric number between 0.0 for very unsimilar
1016
1015
  * strings and 1.0 for an exact match. <code>strings</code> has to be either a
1017
- * String or an Array of Strings. The returned <code>results</code> are either
1016
+ * String or an Array of Strings. The returned <code>results</code> is either
1018
1017
  * a Fixnum or an Array of Fixnums
1019
1018
  * respectively.
1020
1019
  */
@@ -1031,7 +1030,7 @@ static VALUE rb_Sellers_similar(VALUE self, VALUE strings)
1031
1030
  * distance (the sum of weighted character operations) as a Float value, by
1032
1031
  * greedy trimming prefixes or postfixes of the match. <code>strings</code> has
1033
1032
  * to be either a String or an Array of Strings. The returned
1034
- * <code>results</code> are either a Float or an Array of Floats respectively.
1033
+ * <code>results</code> is either a Float or an Array of Floats respectively.
1035
1034
  */
1036
1035
  static VALUE rb_Sellers_search(VALUE self, VALUE strings)
1037
1036
  {
@@ -1089,7 +1088,7 @@ DEF_CONSTRUCTOR(PairDistance, PairDistance)
1089
1088
  * splitting should be omitted, call the method with nil as <code>regexp</code>
1090
1089
  * explicitly.
1091
1090
  *
1092
- * The returned <code>results</code> are either a Float or an
1091
+ * The returned <code>results</code> is either a Float or an
1093
1092
  * Array of Floats respectively.
1094
1093
  */
1095
1094
  static VALUE rb_PairDistance_match(int argc, VALUE *argv, VALUE self)
@@ -1125,19 +1124,30 @@ static VALUE rb_PairDistance_match(int argc, VALUE *argv, VALUE self)
1125
1124
  }
1126
1125
 
1127
1126
  /*
1128
- * call-seq: pair_distance_similar(strings) -> results
1127
+ * call-seq: pair_distance_similar(strings, regexp = nil) -> results
1129
1128
  *
1130
1129
  * If called on a String, this string is used as a Amatch::PairDistance#pattern
1131
1130
  * to match against <code>strings</code> using /\s+/ as the tokenizing regular
1132
1131
  * expression. It returns a pair distance metric number between 0.0 for very
1133
1132
  * unsimilar strings and 1.0 for an exact match. <code>strings</code> has to be
1134
- * either a String or an Array of Strings. The returned <code>results</code>
1135
- * are either a Float or an Array of Floats respectively.
1133
+ * either a String or an Array of Strings.
1134
+ *
1135
+ * The returned <code>results</code> is either a Float or an Array of Floats
1136
+ * respectively.
1136
1137
  */
1137
- static VALUE rb_str_pair_distance_similar(VALUE self, VALUE strings)
1138
+ static VALUE rb_str_pair_distance_similar(int argc, VALUE *argv, VALUE self)
1138
1139
  {
1139
- VALUE amatch = rb_PairDistance_new(rb_cPairDistance, self);
1140
- return rb_PairDistance_match(1, &strings, amatch);
1140
+ VALUE amatch, string, regexp = Qnil;
1141
+ rb_scan_args(argc, argv, "11", &string, &regexp);
1142
+ amatch = rb_PairDistance_new(rb_cPairDistance, self);
1143
+ if (NIL_P(regexp)) {
1144
+ return rb_PairDistance_match(1, &string, amatch);
1145
+ } else {
1146
+ VALUE args[2]; /* room for two VALUEs (string, regexp); alloca(2) reserved only 2 bytes and overran the stack */
1147
+ args[0] = string;
1148
+ args[1] = regexp;
1149
+ return rb_PairDistance_match(2, args, amatch);
1150
+ }
1141
1151
  }
1142
1152
 
1143
1153
  /*
@@ -1175,7 +1185,7 @@ DEF_CONSTRUCTOR(Hamming, General)
1175
1185
  * <code>strings</code>, that is compute the hamming distance between
1176
1186
  * <code>pattern</code> and <code>strings</code>. <code>strings</code> has to
1177
1187
  * be either a String or an Array of Strings. The returned <code>results</code>
1178
- * are either a Fixnum or an Array of Fixnums respectively.
1188
+ * is either a Fixnum or an Array of Fixnums respectively.
1179
1189
  */
1180
1190
  static VALUE rb_Hamming_match(VALUE self, VALUE strings)
1181
1191
  {
@@ -1190,7 +1200,7 @@ static VALUE rb_Hamming_match(VALUE self, VALUE strings)
1190
1200
  * <code>strings</code>, and compute a Hamming distance metric number between
1191
1201
  * 0.0 for very unsimilar strings and 1.0 for an exact match.
1192
1202
  * <code>strings</code> has to be either a String or an Array of Strings. The
1193
- * returned <code>results</code> are either a Fixnum or an Array of Fixnums
1203
+ * returned <code>results</code> is either a Fixnum or an Array of Fixnums
1194
1204
  * respectively.
1195
1205
  */
1196
1206
  static VALUE rb_Hamming_similar(VALUE self, VALUE strings)
@@ -1207,7 +1217,7 @@ static VALUE rb_Hamming_similar(VALUE self, VALUE strings)
1207
1217
  * number between 0.0 for very unsimilar strings and 1.0 for an exact match.
1208
1218
  * <code>strings</code>
1209
1219
  * has to be either a String or an Array of Strings. The returned
1210
- * <code>results</code> are either a Float or an Array of Floats respectively.
1220
+ * <code>results</code> is either a Float or an Array of Floats respectively.
1211
1221
  */
1212
1222
  static VALUE rb_str_hamming_similar(VALUE self, VALUE strings)
1213
1223
  {
@@ -1251,7 +1261,7 @@ DEF_CONSTRUCTOR(LongestSubsequence, General)
1251
1261
  * LongestSubsequence#pattern against <code>strings</code>, that is compute the
1252
1262
  * length of the longest common subsequence. <code>strings</code> has to be
1253
1263
  * either a String or an Array of Strings. The returned <code>results</code>
1254
- * are either a Fixnum or an Array of Fixnums respectively.
1264
+ * is either a Fixnum or an Array of Fixnums respectively.
1255
1265
  */
1256
1266
  static VALUE rb_LongestSubsequence_match(VALUE self, VALUE strings)
1257
1267
  {
@@ -1266,7 +1276,7 @@ static VALUE rb_LongestSubsequence_match(VALUE self, VALUE strings)
1266
1276
  * Amatch::LongestSubsequence#pattern against <code>strings</code>, and compute
1267
1277
  * a longest substring distance metric number between 0.0 for very unsimilar
1268
1278
  * strings and 1.0 for an exact match. <code>strings</code> has to be either a
1269
- * String or an Array of Strings. The returned <code>results</code> are either
1279
+ * String or an Array of Strings. The returned <code>results</code> is either
1270
1280
  * a Fixnum or an Array of Fixnums
1271
1281
  */
1272
1282
  static VALUE rb_LongestSubsequence_similar(VALUE self, VALUE strings)
@@ -1283,7 +1293,7 @@ static VALUE rb_LongestSubsequence_similar(VALUE self, VALUE strings)
1283
1293
  * returns a longest subsequence distance metric number between 0.0 for very
1284
1294
  * unsimilar strings and 1.0 for an exact match. <code>strings</code> has to be
1285
1295
  * either a String or an Array of Strings. The returned <code>results</code>
1286
- * are either a Float or an Array of Floats respectively.
1296
+ * is either a Float or an Array of Floats respectively.
1287
1297
  */
1288
1298
  static VALUE rb_str_longest_subsequence_similar(VALUE self, VALUE strings)
1289
1299
  {
@@ -1328,7 +1338,7 @@ DEF_CONSTRUCTOR(LongestSubstring, General)
1328
1338
  * LongestSubstring#pattern against <code>strings</code>, that is compute the
1329
1339
  * length of the longest common substring. <code>strings</code> has to be
1330
1340
  * either a String or an Array of Strings. The returned <code>results</code>
1331
- * are either a Fixnum or an Array of Fixnums respectively.
1341
+ * is either a Fixnum or an Array of Fixnums respectively.
1332
1342
  */
1333
1343
  static VALUE rb_LongestSubstring_match(VALUE self, VALUE strings)
1334
1344
  {
@@ -1343,7 +1353,7 @@ static VALUE rb_LongestSubstring_match(VALUE self, VALUE strings)
1343
1353
  * Amatch::LongestSubstring#pattern against <code>strings</code>, and compute a
1344
1354
  * longest substring distance metric number between 0.0 for very unsimilar
1345
1355
  * strings and 1.0 for an exact match. <code>strings</code> has to be either a
1346
- * String or an Array of Strings. The returned <code>results</code> are either
1356
+ * String or an Array of Strings. The returned <code>results</code> is either
1347
1357
  * a Fixnum or an Array of Fixnums
1348
1358
  * respectively.
1349
1359
  */
@@ -1361,7 +1371,7 @@ static VALUE rb_LongestSubstring_similar(VALUE self, VALUE strings)
1361
1371
  * returns a longest substring distance metric number between 0.0 for very
1362
1372
  * unsimilar strings and 1.0 for an exact match. <code>strings</code> has to be
1363
1373
  * either a String or an Array of Strings. The returned <code>results</code>
1364
- * are either a Float or an Array of Floats respectively.
1374
+ * is either a Float or an Array of Floats respectively.
1365
1375
  */
1366
1376
  static VALUE rb_str_longest_substring_similar(VALUE self, VALUE strings)
1367
1377
  {
@@ -1419,7 +1429,7 @@ DEF_CONSTRUCTOR(Jaro, Jaro)
1419
1429
  * Jaro#pattern against <code>strings</code>, that is compute the
1420
1430
  * jaro metric with the strings. <code>strings</code> has to be
1421
1431
  * either a String or an Array of Strings. The returned <code>results</code>
1422
- * are either a Float or an Array of Floats respectively.
1432
+ * is either a Float or an Array of Floats respectively.
1423
1433
  */
1424
1434
  static VALUE rb_Jaro_match(VALUE self, VALUE strings)
1425
1435
  {
@@ -1435,7 +1445,7 @@ static VALUE rb_Jaro_match(VALUE self, VALUE strings)
1435
1445
  * returns a Jaro metric number between 0.0 for very
1436
1446
  * unsimilar strings and 1.0 for an exact match. <code>strings</code> has to be
1437
1447
  * either a String or an Array of Strings. The returned <code>results</code>
1438
- * are either a Float or an Array of Floats respectively.
1448
+ * is either a Float or an Array of Floats respectively.
1439
1449
  */
1440
1450
  static VALUE rb_str_jaro_similar(VALUE self, VALUE strings)
1441
1451
  {
@@ -1518,7 +1528,7 @@ DEF_CONSTRUCTOR(JaroWinkler, JaroWinkler)
1518
1528
  * Jaro#pattern against <code>strings</code>, that is compute the
1519
1529
  * jaro metric with the strings. <code>strings</code> has to be
1520
1530
  * either a String or an Array of Strings. The returned <code>results</code>
1521
- * are either a Float or an Array of Floats respectively.
1531
+ * is either a Float or an Array of Floats respectively.
1522
1532
  */
1523
1533
  static VALUE rb_JaroWinkler_match(VALUE self, VALUE strings)
1524
1534
  {
@@ -1542,10 +1552,17 @@ static VALUE rb_str_jarowinkler_similar(VALUE self, VALUE strings)
1542
1552
  return rb_JaroWinkler_match(amatch, strings);
1543
1553
  }
1544
1554
 
1545
- void Init_amatch()
1555
+ /*
1556
+ * This is the namespace module that includes all other classes, modules, and
1557
+ * constants.
1558
+ */
1559
+
1560
+ void Init_amatch_ext()
1546
1561
  {
1547
1562
  rb_require("amatch/version");
1548
1563
  rb_mAmatch = rb_define_module("Amatch");
1564
+ /* This module can be mixed into ::String or its subclasses to provide the similarity methods directly. */
1565
+ rb_mAmatchStringMethods = rb_define_module_under(rb_mAmatch, "StringMethods");
1549
1566
 
1550
1567
  /* Levenshtein */
1551
1568
  rb_cLevenshtein = rb_define_class_under(rb_mAmatch, "Levenshtein", rb_cObject);
@@ -1556,7 +1573,7 @@ void Init_amatch()
1556
1573
  rb_define_method(rb_cLevenshtein, "match", rb_Levenshtein_match, 1);
1557
1574
  rb_define_method(rb_cLevenshtein, "search", rb_Levenshtein_search, 1);
1558
1575
  rb_define_method(rb_cLevenshtein, "similar", rb_Levenshtein_similar, 1);
1559
- rb_define_method(rb_cString, "levenshtein_similar", rb_str_levenshtein_similar, 1);
1576
+ rb_define_method(rb_mAmatchStringMethods, "levenshtein_similar", rb_str_levenshtein_similar, 1);
1560
1577
 
1561
1578
  /* Sellers */
1562
1579
  rb_cSellers = rb_define_class_under(rb_mAmatch, "Sellers", rb_cObject);
@@ -1583,9 +1600,9 @@ void Init_amatch()
1583
1600
  rb_define_method(rb_cHamming, "pattern=", rb_General_pattern_set, 1);
1584
1601
  rb_define_method(rb_cHamming, "match", rb_Hamming_match, 1);
1585
1602
  rb_define_method(rb_cHamming, "similar", rb_Hamming_similar, 1);
1586
- rb_define_method(rb_cString, "hamming_similar", rb_str_hamming_similar, 1);
1603
+ rb_define_method(rb_mAmatchStringMethods, "hamming_similar", rb_str_hamming_similar, 1);
1587
1604
 
1588
- /* Pair Distance Metric */
1605
+ /* Pair Distance Metric / Dice Coefficient */
1589
1606
  rb_cPairDistance = rb_define_class_under(rb_mAmatch, "PairDistance", rb_cObject);
1590
1607
  rb_define_alloc_func(rb_cPairDistance, rb_PairDistance_s_allocate);
1591
1608
  rb_define_method(rb_cPairDistance, "initialize", rb_PairDistance_initialize, 1);
@@ -1593,7 +1610,7 @@ void Init_amatch()
1593
1610
  rb_define_method(rb_cPairDistance, "pattern=", rb_PairDistance_pattern_set, 1);
1594
1611
  rb_define_method(rb_cPairDistance, "match", rb_PairDistance_match, -1);
1595
1612
  rb_define_alias(rb_cPairDistance, "similar", "match");
1596
- rb_define_method(rb_cString, "pair_distance_similar", rb_str_pair_distance_similar, 1);
1613
+ rb_define_method(rb_mAmatchStringMethods, "pair_distance_similar", rb_str_pair_distance_similar, -1);
1597
1614
 
1598
1615
  /* Longest Common Subsequence */
1599
1616
  rb_cLongestSubsequence = rb_define_class_under(rb_mAmatch, "LongestSubsequence", rb_cObject);
@@ -1603,7 +1620,7 @@ void Init_amatch()
1603
1620
  rb_define_method(rb_cLongestSubsequence, "pattern=", rb_General_pattern_set, 1);
1604
1621
  rb_define_method(rb_cLongestSubsequence, "match", rb_LongestSubsequence_match, 1);
1605
1622
  rb_define_method(rb_cLongestSubsequence, "similar", rb_LongestSubsequence_similar, 1);
1606
- rb_define_method(rb_cString, "longest_subsequence_similar", rb_str_longest_subsequence_similar, 1);
1623
+ rb_define_method(rb_mAmatchStringMethods, "longest_subsequence_similar", rb_str_longest_subsequence_similar, 1);
1607
1624
 
1608
1625
  /* Longest Common Substring */
1609
1626
  rb_cLongestSubstring = rb_define_class_under(rb_mAmatch, "LongestSubstring", rb_cObject);
@@ -1613,7 +1630,7 @@ void Init_amatch()
1613
1630
  rb_define_method(rb_cLongestSubstring, "pattern=", rb_General_pattern_set, 1);
1614
1631
  rb_define_method(rb_cLongestSubstring, "match", rb_LongestSubstring_match, 1);
1615
1632
  rb_define_method(rb_cLongestSubstring, "similar", rb_LongestSubstring_similar, 1);
1616
- rb_define_method(rb_cString, "longest_substring_similar", rb_str_longest_substring_similar, 1);
1633
+ rb_define_method(rb_mAmatchStringMethods, "longest_substring_similar", rb_str_longest_substring_similar, 1);
1617
1634
 
1618
1635
  /* Jaro */
1619
1636
  rb_cJaro = rb_define_class_under(rb_mAmatch, "Jaro", rb_cObject);
@@ -1625,7 +1642,7 @@ void Init_amatch()
1625
1642
  rb_define_method(rb_cJaro, "ignore_case=", rb_Jaro_ignore_case_set, 1);
1626
1643
  rb_define_method(rb_cJaro, "match", rb_Jaro_match, 1);
1627
1644
  rb_define_alias(rb_cJaro, "similar", "match");
1628
- rb_define_method(rb_cString, "jaro_similar", rb_str_jaro_similar, 1);
1645
+ rb_define_method(rb_mAmatchStringMethods, "jaro_similar", rb_str_jaro_similar, 1);
1629
1646
 
1630
1647
  /* Jaro-Winkler */
1631
1648
  rb_cJaroWinkler = rb_define_class_under(rb_mAmatch, "JaroWinkler", rb_cObject);
@@ -1639,7 +1656,7 @@ void Init_amatch()
1639
1656
  rb_define_method(rb_cJaroWinkler, "scaling_factor=", rb_JaroWinkler_scaling_factor_set, 1);
1640
1657
  rb_define_method(rb_cJaroWinkler, "match", rb_JaroWinkler_match, 1);
1641
1658
  rb_define_alias(rb_cJaroWinkler, "similar", "match");
1642
- rb_define_method(rb_cString, "jarowinkler_similar", rb_str_jarowinkler_similar, 1);
1659
+ rb_define_method(rb_mAmatchStringMethods, "jarowinkler_similar", rb_str_jarowinkler_similar, 1);
1643
1660
 
1644
1661
  id_split = rb_intern("split");
1645
1662
  id_to_f = rb_intern("to_f");
@@ -3,4 +3,4 @@ require 'rbconfig'
3
3
  if CONFIG['CC'] == 'gcc'
4
4
  CONFIG['CC'] = 'gcc -Wall '
5
5
  end
6
- create_makefile 'amatch'
6
+ create_makefile 'amatch_ext'
data/ext/pair.c CHANGED
@@ -7,7 +7,7 @@ static int predict_length(VALUE tokens)
7
7
  int i, l, result;
8
8
  for (i = 0, result = 0; i < RARRAY_LEN(tokens); i++) {
9
9
  VALUE t = rb_ary_entry(tokens, i);
10
- l = RSTRING_LEN(t) - 1;
10
+ l = (int) RSTRING_LEN(t) - 1;
11
11
  if (l > 0) result += l;
12
12
  }
13
13
  return result;
@@ -0,0 +1,3 @@
1
+ module Amatch
2
+ end
3
+ require 'amatch/rude'
@@ -0,0 +1,7 @@
1
+ module Amatch
2
+ end
3
+ require 'amatch_ext'
4
+ module Amatch
5
+ # alias
6
+ DiceCoefficient = PairDistance
7
+ end
@@ -0,0 +1,7 @@
1
+ require 'amatch_ext'
2
+ module Amatch
3
+ DiceCoefficient = PairDistance
4
+ end
5
+ class ::String
6
+ include ::Amatch::StringMethods
7
+ end
@@ -1,6 +1,6 @@
1
1
  module Amatch
2
2
  # Amatch version
3
- VERSION = '0.2.8'
3
+ VERSION = '0.2.9'
4
4
  VERSION_ARRAY = VERSION.split(/\./).map { |x| x.to_i } # :nodoc:
5
5
  VERSION_MAJOR = VERSION_ARRAY[0] # :nodoc:
6
6
  VERSION_MINOR = VERSION_ARRAY[1] # :nodoc:
@@ -1,5 +1,5 @@
1
1
  require 'test/unit'
2
- require File.dirname(__FILE__) + "/../ext/amatch"
2
+ require 'amatch'
3
3
 
4
4
  class TestJaro < Test::Unit::TestCase
5
5
  include Amatch
@@ -11,6 +11,10 @@ class TestLevenshtein < Test::Unit::TestCase
11
11
  @long = Levenshtein.new('A' * 160)
12
12
  end
13
13
 
14
+ def test_version
15
+ assert_kind_of String, Amatch::VERSION
16
+ end
17
+
14
18
  def test_match
15
19
  assert_equal 4, @simple.match('')
16
20
  assert_equal 0, @simple.match('test')
@@ -15,6 +15,10 @@ class TestPairDistance < Test::Unit::TestCase
15
15
  @long = PairDistance.new('A' * 160)
16
16
  end
17
17
 
18
+ def test_alternative_constant
19
+ assert_equal PairDistance, DiceCoefficient
20
+ end
21
+
18
22
  def test_empty
19
23
  assert_in_delta 1, @empty.match(''), D
20
24
  assert_in_delta 0, @empty.match('not empty'), D
@@ -55,6 +59,8 @@ class TestPairDistance < Test::Unit::TestCase
55
59
  assert_in_delta 0.6, @single.match('test aaa bbb'), D
56
60
  assert_in_delta 0.6, @single.match('bbb aaa test'), D
57
61
  assert_in_delta 0.8571428, @single.pattern.pair_distance_similar('atest'), D
62
+ assert_in_delta 1.0, @france.pattern.pair_distance_similar('of france, republic', /[, ]+/), D
63
+ assert_in_delta 0.9230769, @france.pattern.pair_distance_similar('of france, republik', /[, ]+/), D
58
64
  end
59
65
 
60
66
  def test_csv
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: amatch
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.8
4
+ version: 0.2.9
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,22 +9,22 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2011-09-25 00:00:00.000000000Z
12
+ date: 2011-11-15 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: gem_hadar
16
- requirement: &2156333160 !ruby/object:Gem::Requirement
16
+ requirement: &2152298700 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ~>
20
20
  - !ruby/object:Gem::Version
21
- version: 0.1.0
21
+ version: 0.1.1
22
22
  type: :development
23
23
  prerelease: false
24
- version_requirements: *2156333160
24
+ version_requirements: *2152298700
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: test-unit
27
- requirement: &2156332280 !ruby/object:Gem::Requirement
27
+ requirement: &2152312400 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ~>
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '2.3'
33
33
  type: :development
34
34
  prerelease: false
35
- version_requirements: *2156332280
35
+ version_requirements: *2152312400
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: tins
38
- requirement: &2156331380 !ruby/object:Gem::Requirement
38
+ requirement: &2152310440 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ~>
@@ -43,7 +43,7 @@ dependencies:
43
43
  version: '0.3'
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *2156331380
46
+ version_requirements: *2152310440
47
47
  description: ! 'Amatch is a library for approximate string matching and searching
48
48
  in strings.
49
49
 
@@ -59,8 +59,11 @@ extensions:
59
59
  - ext/extconf.rb
60
60
  extra_rdoc_files:
61
61
  - README.rdoc
62
+ - lib/amatch/polite.rb
63
+ - lib/amatch/rude.rb
62
64
  - lib/amatch/version.rb
63
- - ext/amatch.c
65
+ - lib/amatch.rb
66
+ - ext/amatch_ext.c
64
67
  - ext/pair.c
65
68
  files:
66
69
  - .gitignore
@@ -73,13 +76,16 @@ files:
73
76
  - VERSION
74
77
  - amatch.gemspec
75
78
  - bin/agrep.rb
76
- - ext/amatch.c
79
+ - ext/amatch_ext.c
77
80
  - ext/common.h
78
81
  - ext/extconf.rb
79
82
  - ext/pair.c
80
83
  - ext/pair.h
81
84
  - install.rb
85
+ - lib/amatch.rb
82
86
  - lib/amatch/.keep
87
+ - lib/amatch/polite.rb
88
+ - lib/amatch/rude.rb
83
89
  - lib/amatch/version.rb
84
90
  - tests/test_hamming.rb
85
91
  - tests/test_jaro.rb
@@ -119,11 +125,11 @@ signing_key:
119
125
  specification_version: 3
120
126
  summary: Approximate String Matching library
121
127
  test_files:
122
- - tests/test_sellers.rb
128
+ - tests/test_hamming.rb
123
129
  - tests/test_jaro.rb
130
+ - tests/test_jaro_winkler.rb
131
+ - tests/test_levenshtein.rb
124
132
  - tests/test_longest_subsequence.rb
125
133
  - tests/test_longest_substring.rb
126
- - tests/test_hamming.rb
127
134
  - tests/test_pair_distance.rb
128
- - tests/test_levenshtein.rb
129
- - tests/test_jaro_winkler.rb
135
+ - tests/test_sellers.rb