amatch 0.2.8 → 0.2.9
Sign up to get free protection for your applications and to get access to all the features.
- data/.travis.yml +1 -0
- data/CHANGES +7 -0
- data/VERSION +1 -1
- data/amatch.gemspec +8 -8
- data/ext/{amatch.c → amatch_ext.c} +87 -70
- data/ext/extconf.rb +1 -1
- data/ext/pair.c +1 -1
- data/lib/amatch.rb +3 -0
- data/lib/amatch/polite.rb +7 -0
- data/lib/amatch/rude.rb +7 -0
- data/lib/amatch/version.rb +1 -1
- data/tests/test_jaro.rb +1 -1
- data/tests/test_levenshtein.rb +4 -0
- data/tests/test_pair_distance.rb +6 -0
- metadata +21 -15
data/.travis.yml
CHANGED
data/CHANGES
CHANGED
@@ -1,3 +1,10 @@
|
|
1
|
+
2011-11-14 (0.2.9)
|
2
|
+
* Provide amatch/rude and amatch/polite for require (the latter doesn't
|
3
|
+
extend ::String on its own)
|
4
|
+
* pair_distance_similar method now can take an optional regexp argument for
|
5
|
+
tokenizing.
|
6
|
+
2011-09-26 (0.2.8)
|
7
|
+
* Depend on tins library.
|
1
8
|
2011-08-06 (0.2.7)
|
2
9
|
* Fix some violations of ISO C90 standard.
|
3
10
|
2011-07-16 (0.2.6)
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.2.
|
1
|
+
0.2.9
|
data/amatch.gemspec
CHANGED
@@ -2,38 +2,38 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = "amatch"
|
5
|
-
s.version = "0.2.
|
5
|
+
s.version = "0.2.9"
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Florian Frank"]
|
9
|
-
s.date = "2011-
|
9
|
+
s.date = "2011-11-15"
|
10
10
|
s.description = "Amatch is a library for approximate string matching and searching in strings.\nSeveral algorithms can be used to do this, and it's also possible to compute a\nsimilarity metric number between 0.0 and 1.0 for two given strings.\n"
|
11
11
|
s.email = "flori@ping.de"
|
12
12
|
s.executables = ["agrep.rb"]
|
13
13
|
s.extensions = ["ext/extconf.rb"]
|
14
|
-
s.extra_rdoc_files = ["README.rdoc", "lib/amatch/version.rb", "
|
15
|
-
s.files = [".gitignore", ".travis.yml", "CHANGES", "COPYING", "Gemfile", "README.rdoc", "Rakefile", "VERSION", "amatch.gemspec", "bin/agrep.rb", "ext/
|
14
|
+
s.extra_rdoc_files = ["README.rdoc", "lib/amatch/polite.rb", "lib/amatch/rude.rb", "lib/amatch/version.rb", "lib/amatch.rb", "ext/amatch_ext.c", "ext/pair.c"]
|
15
|
+
s.files = [".gitignore", ".travis.yml", "CHANGES", "COPYING", "Gemfile", "README.rdoc", "Rakefile", "VERSION", "amatch.gemspec", "bin/agrep.rb", "ext/amatch_ext.c", "ext/common.h", "ext/extconf.rb", "ext/pair.c", "ext/pair.h", "install.rb", "lib/amatch.rb", "lib/amatch/.keep", "lib/amatch/polite.rb", "lib/amatch/rude.rb", "lib/amatch/version.rb", "tests/test_hamming.rb", "tests/test_jaro.rb", "tests/test_jaro_winkler.rb", "tests/test_levenshtein.rb", "tests/test_longest_subsequence.rb", "tests/test_longest_substring.rb", "tests/test_pair_distance.rb", "tests/test_sellers.rb"]
|
16
16
|
s.homepage = "http://github.com/flori/amatch"
|
17
17
|
s.rdoc_options = ["--title", "Amatch - Approximate Matching", "--main", "README.rdoc"]
|
18
18
|
s.require_paths = ["lib", "ext"]
|
19
19
|
s.rubygems_version = "1.8.10"
|
20
20
|
s.summary = "Approximate String Matching library"
|
21
|
-
s.test_files = ["tests/
|
21
|
+
s.test_files = ["tests/test_hamming.rb", "tests/test_jaro.rb", "tests/test_jaro_winkler.rb", "tests/test_levenshtein.rb", "tests/test_longest_subsequence.rb", "tests/test_longest_substring.rb", "tests/test_pair_distance.rb", "tests/test_sellers.rb"]
|
22
22
|
|
23
23
|
if s.respond_to? :specification_version then
|
24
24
|
s.specification_version = 3
|
25
25
|
|
26
26
|
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
27
|
-
s.add_development_dependency(%q<gem_hadar>, ["~> 0.1.
|
27
|
+
s.add_development_dependency(%q<gem_hadar>, ["~> 0.1.1"])
|
28
28
|
s.add_development_dependency(%q<test-unit>, ["~> 2.3"])
|
29
29
|
s.add_runtime_dependency(%q<tins>, ["~> 0.3"])
|
30
30
|
else
|
31
|
-
s.add_dependency(%q<gem_hadar>, ["~> 0.1.
|
31
|
+
s.add_dependency(%q<gem_hadar>, ["~> 0.1.1"])
|
32
32
|
s.add_dependency(%q<test-unit>, ["~> 2.3"])
|
33
33
|
s.add_dependency(%q<tins>, ["~> 0.3"])
|
34
34
|
end
|
35
35
|
else
|
36
|
-
s.add_dependency(%q<gem_hadar>, ["~> 0.1.
|
36
|
+
s.add_dependency(%q<gem_hadar>, ["~> 0.1.1"])
|
37
37
|
s.add_dependency(%q<test-unit>, ["~> 2.3"])
|
38
38
|
s.add_dependency(%q<tins>, ["~> 0.3"])
|
39
39
|
end
|
@@ -20,7 +20,7 @@
|
|
20
20
|
*/
|
21
21
|
|
22
22
|
|
23
|
-
static VALUE rb_mAmatch, rb_cLevenshtein, rb_cSellers, rb_cHamming,
|
23
|
+
static VALUE rb_mAmatch, rb_mAmatchStringMethods, rb_cLevenshtein, rb_cSellers, rb_cHamming,
|
24
24
|
rb_cPairDistance, rb_cLongestSubsequence, rb_cLongestSubstring,
|
25
25
|
rb_cJaro, rb_cJaroWinkler;
|
26
26
|
|
@@ -55,17 +55,17 @@ VALUE rb_##klass##_new(VALUE klass2, VALUE pattern) \
|
|
55
55
|
static void rb_##klass##_free(type *amatch) \
|
56
56
|
{ \
|
57
57
|
MEMZERO(amatch->pattern, char, amatch->pattern_len); \
|
58
|
-
|
58
|
+
xfree(amatch->pattern); \
|
59
59
|
MEMZERO(amatch, type, 1); \
|
60
|
-
|
60
|
+
xfree(amatch); \
|
61
61
|
}
|
62
62
|
|
63
63
|
#define DEF_PATTERN_ACCESSOR(type) \
|
64
64
|
static void type##_pattern_set(type *amatch, VALUE pattern) \
|
65
65
|
{ \
|
66
66
|
Check_Type(pattern, T_STRING); \
|
67
|
-
|
68
|
-
amatch->pattern_len = RSTRING_LEN(pattern);
|
67
|
+
xfree(amatch->pattern); \
|
68
|
+
amatch->pattern_len = (int) RSTRING_LEN(pattern); \
|
69
69
|
amatch->pattern = ALLOC_N(char, amatch->pattern_len); \
|
70
70
|
MEMCPY(amatch->pattern, RSTRING_PTR(pattern), char, \
|
71
71
|
RSTRING_LEN(pattern)); \
|
@@ -147,21 +147,21 @@ VALUE function(VALUE self, VALUE value) \
|
|
147
147
|
#define OPTIMIZE_TIME \
|
148
148
|
if (amatch->pattern_len < RSTRING_LEN(string)) { \
|
149
149
|
a_ptr = amatch->pattern; \
|
150
|
-
a_len = amatch->pattern_len;
|
150
|
+
a_len = (int) amatch->pattern_len; \
|
151
151
|
b_ptr = RSTRING_PTR(string); \
|
152
|
-
b_len = RSTRING_LEN(string);
|
152
|
+
b_len = (int) RSTRING_LEN(string); \
|
153
153
|
} else { \
|
154
154
|
a_ptr = RSTRING_PTR(string); \
|
155
|
-
a_len = RSTRING_LEN(string);
|
155
|
+
a_len = (int) RSTRING_LEN(string); \
|
156
156
|
b_ptr = amatch->pattern; \
|
157
|
-
b_len = amatch->pattern_len;
|
157
|
+
b_len = (int) amatch->pattern_len; \
|
158
158
|
}
|
159
159
|
|
160
160
|
#define DONT_OPTIMIZE \
|
161
161
|
a_ptr = amatch->pattern; \
|
162
|
-
a_len = amatch->pattern_len;
|
162
|
+
a_len = (int) amatch->pattern_len; \
|
163
163
|
b_ptr = RSTRING_PTR(string); \
|
164
|
-
b_len = RSTRING_LEN(string);
|
164
|
+
b_len = (int) RSTRING_LEN(string); \
|
165
165
|
|
166
166
|
/*
|
167
167
|
* C structures of the Amatch classes
|
@@ -215,10 +215,10 @@ DEF_PATTERN_ACCESSOR(Jaro)
|
|
215
215
|
DEF_ITERATE_STRINGS(Jaro)
|
216
216
|
|
217
217
|
typedef struct JaroWinklerStruct {
|
218
|
-
char
|
219
|
-
int
|
220
|
-
int
|
221
|
-
|
218
|
+
char *pattern;
|
219
|
+
int pattern_len;
|
220
|
+
int ignore_case;
|
221
|
+
double scaling_factor;
|
222
222
|
} JaroWinkler;
|
223
223
|
|
224
224
|
DEF_ALLOCATOR(JaroWinkler)
|
@@ -271,8 +271,8 @@ static VALUE Levenshtein_match(General *amatch, VALUE string)
|
|
271
271
|
|
272
272
|
result = INT2FIX(v[p][b_len]);
|
273
273
|
|
274
|
-
|
275
|
-
|
274
|
+
xfree(v[0]);
|
275
|
+
xfree(v[1]);
|
276
276
|
|
277
277
|
return result;
|
278
278
|
}
|
@@ -303,8 +303,8 @@ static VALUE Levenshtein_similar(General *amatch, VALUE string)
|
|
303
303
|
} else {
|
304
304
|
result = rb_float_new(1.0 - ((double) v[p][b_len]) / a_len);
|
305
305
|
}
|
306
|
-
|
307
|
-
|
306
|
+
xfree(v[0]);
|
307
|
+
xfree(v[1]);
|
308
308
|
return result;
|
309
309
|
}
|
310
310
|
|
@@ -332,8 +332,8 @@ static VALUE Levenshtein_search(General *amatch, VALUE string)
|
|
332
332
|
|
333
333
|
result = INT2FIX(min);
|
334
334
|
|
335
|
-
|
336
|
-
|
335
|
+
xfree(v[0]);
|
336
|
+
xfree(v[1]);
|
337
337
|
|
338
338
|
return result;
|
339
339
|
}
|
@@ -385,8 +385,8 @@ static VALUE Sellers_match(Sellers *amatch, VALUE string)
|
|
385
385
|
COMPUTE_SELLERS_DISTANCE
|
386
386
|
|
387
387
|
result = rb_float_new(v[p][b_len]);
|
388
|
-
|
389
|
-
|
388
|
+
xfree(v[0]);
|
389
|
+
xfree(v[1]);
|
390
390
|
return result;
|
391
391
|
}
|
392
392
|
|
@@ -430,8 +430,8 @@ static VALUE Sellers_similar(Sellers *amatch, VALUE string)
|
|
430
430
|
} else {
|
431
431
|
result = rb_float_new(1.0 - v[p][b_len] / (a_len * max_weight));
|
432
432
|
}
|
433
|
-
|
434
|
-
|
433
|
+
xfree(v[0]);
|
434
|
+
xfree(v[1]);
|
435
435
|
return result;
|
436
436
|
}
|
437
437
|
|
@@ -457,8 +457,8 @@ static VALUE Sellers_search(Sellers *amatch, VALUE string)
|
|
457
457
|
if (v[p][i] < min) min = v[p][i];
|
458
458
|
}
|
459
459
|
result = rb_float_new(min);
|
460
|
-
|
461
|
-
|
460
|
+
xfree(v[0]);
|
461
|
+
xfree(v[1]);
|
462
462
|
|
463
463
|
return result;
|
464
464
|
}
|
@@ -467,8 +467,7 @@ static VALUE Sellers_search(Sellers *amatch, VALUE string)
|
|
467
467
|
* Pair distances are computed here:
|
468
468
|
*/
|
469
469
|
|
470
|
-
static VALUE PairDistance_match(
|
471
|
-
PairDistance *amatch, VALUE string, VALUE regexp, int use_regexp)
|
470
|
+
static VALUE PairDistance_match(PairDistance *amatch, VALUE string, VALUE regexp, int use_regexp)
|
472
471
|
{
|
473
472
|
double result;
|
474
473
|
VALUE tokens;
|
@@ -564,8 +563,8 @@ static VALUE Hamming_similar(General *amatch, VALUE string)
|
|
564
563
|
c = (c + 1) % 2; \
|
565
564
|
} \
|
566
565
|
result = l[p][0]; \
|
567
|
-
|
568
|
-
|
566
|
+
xfree(l[0]); \
|
567
|
+
xfree(l[1]);
|
569
568
|
|
570
569
|
|
571
570
|
static VALUE LongestSubsequence_match(General *amatch, VALUE string)
|
@@ -619,8 +618,8 @@ static VALUE LongestSubsequence_similar(General *amatch, VALUE string)
|
|
619
618
|
p = c; \
|
620
619
|
c = (c + 1) % 2; \
|
621
620
|
} \
|
622
|
-
|
623
|
-
|
621
|
+
xfree(l[0]); \
|
622
|
+
xfree(l[1]);
|
624
623
|
|
625
624
|
static VALUE LongestSubstring_match(General *amatch, VALUE string)
|
626
625
|
{
|
@@ -692,8 +691,8 @@ static VALUE LongestSubstring_similar(General *amatch, VALUE string)
|
|
692
691
|
t = t / 2; \
|
693
692
|
result = (((double)m)/a_len + ((double)m)/b_len + ((double)(m-t))/m)/3.0; \
|
694
693
|
} \
|
695
|
-
|
696
|
-
|
694
|
+
xfree(l[0]); \
|
695
|
+
xfree(l[1]);
|
697
696
|
|
698
697
|
|
699
698
|
#define LOWERCASE_STRINGS \
|
@@ -811,7 +810,7 @@ DEF_CONSTRUCTOR(Levenshtein, General)
|
|
811
810
|
* Uses this Amatch::Levenshtein instance to match Amatch::Levenshtein#pattern
|
812
811
|
* against <code>strings</code>. It returns the number operations, the Sellers
|
813
812
|
* distance. <code>strings</code> has to be either a String or an Array of
|
814
|
-
* Strings. The returned <code>results</code>
|
813
|
+
* Strings. The returned <code>results</code> is either a Float or an Array of
|
815
814
|
* Floats respectively.
|
816
815
|
*/
|
817
816
|
static VALUE rb_Levenshtein_match(VALUE self, VALUE strings)
|
@@ -827,7 +826,7 @@ static VALUE rb_Levenshtein_match(VALUE self, VALUE strings)
|
|
827
826
|
* against <code>strings</code>, and compute a Levenshtein distance metric
|
828
827
|
* number between 0.0 for very unsimilar strings and 1.0 for an exact match.
|
829
828
|
* <code>strings</code> has to be either a String or an Array of Strings. The
|
830
|
-
* returned <code>results</code>
|
829
|
+
* returned <code>results</code> is either a Fixnum or an Array of Fixnums
|
831
830
|
* respectively.
|
832
831
|
*/
|
833
832
|
static VALUE rb_Levenshtein_similar(VALUE self, VALUE strings)
|
@@ -843,7 +842,7 @@ static VALUE rb_Levenshtein_similar(VALUE self, VALUE strings)
|
|
843
842
|
* to match against <code>strings</code>. It returns a Levenshtein distance
|
844
843
|
* metric number between 0.0 for very unsimilar strings and 1.0 for an exact
|
845
844
|
* match. <code>strings</code> has to be either a String or an Array of
|
846
|
-
* Strings. The returned <code>results</code>
|
845
|
+
* Strings. The returned <code>results</code> is either a Float or an Array of
|
847
846
|
* Floats respectively.
|
848
847
|
*/
|
849
848
|
static VALUE rb_str_levenshtein_similar(VALUE self, VALUE strings)
|
@@ -859,7 +858,7 @@ static VALUE rb_str_levenshtein_similar(VALUE self, VALUE strings)
|
|
859
858
|
* edit distance (the sum of character operations) as a Fixnum value, by greedy
|
860
859
|
* trimming prefixes or postfixes of the match. <code>strings</code> has
|
861
860
|
* to be either a String or an Array of Strings. The returned
|
862
|
-
* <code>results</code>
|
861
|
+
* <code>results</code> is either a Float or an Array of Floats respectively.
|
863
862
|
*/
|
864
863
|
static VALUE rb_Levenshtein_search(VALUE self, VALUE strings)
|
865
864
|
{
|
@@ -998,7 +997,7 @@ DEF_CONSTRUCTOR(Sellers, Sellers)
|
|
998
997
|
* <code>strings</code>, while taking into account the given weights. It
|
999
998
|
* returns the number of weighted character operations, the Sellers distance.
|
1000
999
|
* <code>strings</code> has to be either a String or an Array of Strings. The
|
1001
|
-
* returned <code>results</code>
|
1000
|
+
* returned <code>results</code> is either a Float or an Array of Floats
|
1002
1001
|
* respectively.
|
1003
1002
|
*/
|
1004
1003
|
static VALUE rb_Sellers_match(VALUE self, VALUE strings)
|
@@ -1014,7 +1013,7 @@ static VALUE rb_Sellers_match(VALUE self, VALUE strings)
|
|
1014
1013
|
* against <code>strings</code> (taking into account the given weights), and
|
1015
1014
|
* compute a Sellers distance metric number between 0.0 for very unsimilar
|
1016
1015
|
* strings and 1.0 for an exact match. <code>strings</code> has to be either a
|
1017
|
-
* String or an Array of Strings. The returned <code>results</code>
|
1016
|
+
* String or an Array of Strings. The returned <code>results</code> is either
|
1018
1017
|
* a Fixnum or an Array of Fixnums
|
1019
1018
|
* respectively.
|
1020
1019
|
*/
|
@@ -1031,7 +1030,7 @@ static VALUE rb_Sellers_similar(VALUE self, VALUE strings)
|
|
1031
1030
|
* distance (the sum of weighted character operations) as a Float value, by
|
1032
1031
|
* greedy trimming prefixes or postfixes of the match. <code>strings</code> has
|
1033
1032
|
* to be either a String or an Array of Strings. The returned
|
1034
|
-
* <code>results</code>
|
1033
|
+
* <code>results</code> is either a Float or an Array of Floats respectively.
|
1035
1034
|
*/
|
1036
1035
|
static VALUE rb_Sellers_search(VALUE self, VALUE strings)
|
1037
1036
|
{
|
@@ -1089,7 +1088,7 @@ DEF_CONSTRUCTOR(PairDistance, PairDistance)
|
|
1089
1088
|
* splitting should be omitted, call the method with nil as <code>regexp</code>
|
1090
1089
|
* explicitly.
|
1091
1090
|
*
|
1092
|
-
* The returned <code>results</code>
|
1091
|
+
* The returned <code>results</code> is either a Float or an
|
1093
1092
|
* Array of Floats respectively.
|
1094
1093
|
*/
|
1095
1094
|
static VALUE rb_PairDistance_match(int argc, VALUE *argv, VALUE self)
|
@@ -1125,19 +1124,30 @@ static VALUE rb_PairDistance_match(int argc, VALUE *argv, VALUE self)
|
|
1125
1124
|
}
|
1126
1125
|
|
1127
1126
|
/*
|
1128
|
-
* call-seq: pair_distance_similar(strings) -> results
|
1127
|
+
* call-seq: pair_distance_similar(strings, regexp = nil) -> results
|
1129
1128
|
*
|
1130
1129
|
* If called on a String, this string is used as a Amatch::PairDistance#pattern
|
1131
1130
|
* to match against <code>strings</code> using /\s+/ as the tokenizing regular
|
1132
1131
|
* expression. It returns a pair distance metric number between 0.0 for very
|
1133
1132
|
* unsimilar strings and 1.0 for an exact match. <code>strings</code> has to be
|
1134
|
-
* either a String or an Array of Strings.
|
1135
|
-
*
|
1133
|
+
* either a String or an Array of Strings.
|
1134
|
+
*
|
1135
|
+
* The returned <code>results</code> is either a Float or an Array of Floats
|
1136
|
+
* respectively.
|
1136
1137
|
*/
|
1137
|
-
static VALUE rb_str_pair_distance_similar(VALUE
|
1138
|
+
static VALUE rb_str_pair_distance_similar(int argc, VALUE *argv, VALUE self)
|
1138
1139
|
{
|
1139
|
-
VALUE amatch =
|
1140
|
-
|
1140
|
+
VALUE amatch, string, regexp = Qnil;
|
1141
|
+
rb_scan_args(argc, argv, "11", &string, &regexp);
|
1142
|
+
amatch = rb_PairDistance_new(rb_cPairDistance, self);
|
1143
|
+
if (NIL_P(regexp)) {
|
1144
|
+
return rb_PairDistance_match(1, &string, amatch);
|
1145
|
+
} else {
|
1146
|
+
VALUE *args = alloca(2);
|
1147
|
+
args[0] = string;
|
1148
|
+
args[1] = regexp;
|
1149
|
+
return rb_PairDistance_match(2, args, amatch);
|
1150
|
+
}
|
1141
1151
|
}
|
1142
1152
|
|
1143
1153
|
/*
|
@@ -1175,7 +1185,7 @@ DEF_CONSTRUCTOR(Hamming, General)
|
|
1175
1185
|
* <code>strings</code>, that is compute the hamming distance between
|
1176
1186
|
* <code>pattern</code> and <code>strings</code>. <code>strings</code> has to
|
1177
1187
|
* be either a String or an Array of Strings. The returned <code>results</code>
|
1178
|
-
*
|
1188
|
+
* is either a Fixnum or an Array of Fixnums respectively.
|
1179
1189
|
*/
|
1180
1190
|
static VALUE rb_Hamming_match(VALUE self, VALUE strings)
|
1181
1191
|
{
|
@@ -1190,7 +1200,7 @@ static VALUE rb_Hamming_match(VALUE self, VALUE strings)
|
|
1190
1200
|
* <code>strings</code>, and compute a Hamming distance metric number between
|
1191
1201
|
* 0.0 for very unsimilar strings and 1.0 for an exact match.
|
1192
1202
|
* <code>strings</code> has to be either a String or an Array of Strings. The
|
1193
|
-
* returned <code>results</code>
|
1203
|
+
* returned <code>results</code> is either a Fixnum or an Array of Fixnums
|
1194
1204
|
* respectively.
|
1195
1205
|
*/
|
1196
1206
|
static VALUE rb_Hamming_similar(VALUE self, VALUE strings)
|
@@ -1207,7 +1217,7 @@ static VALUE rb_Hamming_similar(VALUE self, VALUE strings)
|
|
1207
1217
|
* number between 0.0 for very unsimilar strings and 1.0 for an exact match.
|
1208
1218
|
* <code>strings</code>
|
1209
1219
|
* has to be either a String or an Array of Strings. The returned
|
1210
|
-
* <code>results</code>
|
1220
|
+
* <code>results</code> is either a Float or an Array of Floats respectively.
|
1211
1221
|
*/
|
1212
1222
|
static VALUE rb_str_hamming_similar(VALUE self, VALUE strings)
|
1213
1223
|
{
|
@@ -1251,7 +1261,7 @@ DEF_CONSTRUCTOR(LongestSubsequence, General)
|
|
1251
1261
|
* LongestSubsequence#pattern against <code>strings</code>, that is compute the
|
1252
1262
|
* length of the longest common subsequence. <code>strings</code> has to be
|
1253
1263
|
* either a String or an Array of Strings. The returned <code>results</code>
|
1254
|
-
*
|
1264
|
+
* is either a Fixnum or an Array of Fixnums respectively.
|
1255
1265
|
*/
|
1256
1266
|
static VALUE rb_LongestSubsequence_match(VALUE self, VALUE strings)
|
1257
1267
|
{
|
@@ -1266,7 +1276,7 @@ static VALUE rb_LongestSubsequence_match(VALUE self, VALUE strings)
|
|
1266
1276
|
* Amatch::LongestSubsequence#pattern against <code>strings</code>, and compute
|
1267
1277
|
* a longest substring distance metric number between 0.0 for very unsimilar
|
1268
1278
|
* strings and 1.0 for an exact match. <code>strings</code> has to be either a
|
1269
|
-
* String or an Array of Strings. The returned <code>results</code>
|
1279
|
+
* String or an Array of Strings. The returned <code>results</code> is either
|
1270
1280
|
* a Fixnum or an Array of Fixnums
|
1271
1281
|
*/
|
1272
1282
|
static VALUE rb_LongestSubsequence_similar(VALUE self, VALUE strings)
|
@@ -1283,7 +1293,7 @@ static VALUE rb_LongestSubsequence_similar(VALUE self, VALUE strings)
|
|
1283
1293
|
* returns a longest subsequence distance metric number between 0.0 for very
|
1284
1294
|
* unsimilar strings and 1.0 for an exact match. <code>strings</code> has to be
|
1285
1295
|
* either a String or an Array of Strings. The returned <code>results</code>
|
1286
|
-
*
|
1296
|
+
* is either a Float or an Array of Floats respectively.
|
1287
1297
|
*/
|
1288
1298
|
static VALUE rb_str_longest_subsequence_similar(VALUE self, VALUE strings)
|
1289
1299
|
{
|
@@ -1328,7 +1338,7 @@ DEF_CONSTRUCTOR(LongestSubstring, General)
|
|
1328
1338
|
* LongestSubstring#pattern against <code>strings</code>, that is compute the
|
1329
1339
|
* length of the longest common substring. <code>strings</code> has to be
|
1330
1340
|
* either a String or an Array of Strings. The returned <code>results</code>
|
1331
|
-
*
|
1341
|
+
* is either a Fixnum or an Array of Fixnums respectively.
|
1332
1342
|
*/
|
1333
1343
|
static VALUE rb_LongestSubstring_match(VALUE self, VALUE strings)
|
1334
1344
|
{
|
@@ -1343,7 +1353,7 @@ static VALUE rb_LongestSubstring_match(VALUE self, VALUE strings)
|
|
1343
1353
|
* Amatch::LongestSubstring#pattern against <code>strings</code>, and compute a
|
1344
1354
|
* longest substring distance metric number between 0.0 for very unsimilar
|
1345
1355
|
* strings and 1.0 for an exact match. <code>strings</code> has to be either a
|
1346
|
-
* String or an Array of Strings. The returned <code>results</code>
|
1356
|
+
* String or an Array of Strings. The returned <code>results</code> is either
|
1347
1357
|
* a Fixnum or an Array of Fixnums
|
1348
1358
|
* respectively.
|
1349
1359
|
*/
|
@@ -1361,7 +1371,7 @@ static VALUE rb_LongestSubstring_similar(VALUE self, VALUE strings)
|
|
1361
1371
|
* returns a longest substring distance metric number between 0.0 for very
|
1362
1372
|
* unsimilar strings and 1.0 for an exact match. <code>strings</code> has to be
|
1363
1373
|
* either a String or an Array of Strings. The returned <code>results</code>
|
1364
|
-
*
|
1374
|
+
* is either a Float or an Array of Floats respectively.
|
1365
1375
|
*/
|
1366
1376
|
static VALUE rb_str_longest_substring_similar(VALUE self, VALUE strings)
|
1367
1377
|
{
|
@@ -1419,7 +1429,7 @@ DEF_CONSTRUCTOR(Jaro, Jaro)
|
|
1419
1429
|
* Jaro#pattern against <code>strings</code>, that is compute the
|
1420
1430
|
* jaro metric with the strings. <code>strings</code> has to be
|
1421
1431
|
* either a String or an Array of Strings. The returned <code>results</code>
|
1422
|
-
*
|
1432
|
+
* is either a Float or an Array of Floats respectively.
|
1423
1433
|
*/
|
1424
1434
|
static VALUE rb_Jaro_match(VALUE self, VALUE strings)
|
1425
1435
|
{
|
@@ -1435,7 +1445,7 @@ static VALUE rb_Jaro_match(VALUE self, VALUE strings)
|
|
1435
1445
|
* returns a Jaro metric number between 0.0 for very
|
1436
1446
|
* unsimilar strings and 1.0 for an exact match. <code>strings</code> has to be
|
1437
1447
|
* either a String or an Array of Strings. The returned <code>results</code>
|
1438
|
-
*
|
1448
|
+
* is either a Float or an Array of Floats respectively.
|
1439
1449
|
*/
|
1440
1450
|
static VALUE rb_str_jaro_similar(VALUE self, VALUE strings)
|
1441
1451
|
{
|
@@ -1518,7 +1528,7 @@ DEF_CONSTRUCTOR(JaroWinkler, JaroWinkler)
|
|
1518
1528
|
* Jaro#pattern against <code>strings</code>, that is compute the
|
1519
1529
|
* jaro metric with the strings. <code>strings</code> has to be
|
1520
1530
|
* either a String or an Array of Strings. The returned <code>results</code>
|
1521
|
-
*
|
1531
|
+
* is either a Float or an Array of Floats respectively.
|
1522
1532
|
*/
|
1523
1533
|
static VALUE rb_JaroWinkler_match(VALUE self, VALUE strings)
|
1524
1534
|
{
|
@@ -1542,10 +1552,17 @@ static VALUE rb_str_jarowinkler_similar(VALUE self, VALUE strings)
|
|
1542
1552
|
return rb_JaroWinkler_match(amatch, strings);
|
1543
1553
|
}
|
1544
1554
|
|
1545
|
-
|
1555
|
+
/*
|
1556
|
+
* This is the namespace module that includes all other classes, modules, and
|
1557
|
+
* constants.
|
1558
|
+
*/
|
1559
|
+
|
1560
|
+
void Init_amatch_ext()
|
1546
1561
|
{
|
1547
1562
|
rb_require("amatch/version");
|
1548
1563
|
rb_mAmatch = rb_define_module("Amatch");
|
1564
|
+
/* This module can be mixed into ::String or its subclasses to mixin the similary methods directly. */
|
1565
|
+
rb_mAmatchStringMethods = rb_define_module_under(rb_mAmatch, "StringMethods");
|
1549
1566
|
|
1550
1567
|
/* Levenshtein */
|
1551
1568
|
rb_cLevenshtein = rb_define_class_under(rb_mAmatch, "Levenshtein", rb_cObject);
|
@@ -1556,7 +1573,7 @@ void Init_amatch()
|
|
1556
1573
|
rb_define_method(rb_cLevenshtein, "match", rb_Levenshtein_match, 1);
|
1557
1574
|
rb_define_method(rb_cLevenshtein, "search", rb_Levenshtein_search, 1);
|
1558
1575
|
rb_define_method(rb_cLevenshtein, "similar", rb_Levenshtein_similar, 1);
|
1559
|
-
rb_define_method(
|
1576
|
+
rb_define_method(rb_mAmatchStringMethods, "levenshtein_similar", rb_str_levenshtein_similar, 1);
|
1560
1577
|
|
1561
1578
|
/* Sellers */
|
1562
1579
|
rb_cSellers = rb_define_class_under(rb_mAmatch, "Sellers", rb_cObject);
|
@@ -1583,9 +1600,9 @@ void Init_amatch()
|
|
1583
1600
|
rb_define_method(rb_cHamming, "pattern=", rb_General_pattern_set, 1);
|
1584
1601
|
rb_define_method(rb_cHamming, "match", rb_Hamming_match, 1);
|
1585
1602
|
rb_define_method(rb_cHamming, "similar", rb_Hamming_similar, 1);
|
1586
|
-
rb_define_method(
|
1603
|
+
rb_define_method(rb_mAmatchStringMethods, "hamming_similar", rb_str_hamming_similar, 1);
|
1587
1604
|
|
1588
|
-
/* Pair Distance Metric */
|
1605
|
+
/* Pair Distance Metric / Dice Coefficient */
|
1589
1606
|
rb_cPairDistance = rb_define_class_under(rb_mAmatch, "PairDistance", rb_cObject);
|
1590
1607
|
rb_define_alloc_func(rb_cPairDistance, rb_PairDistance_s_allocate);
|
1591
1608
|
rb_define_method(rb_cPairDistance, "initialize", rb_PairDistance_initialize, 1);
|
@@ -1593,7 +1610,7 @@ void Init_amatch()
|
|
1593
1610
|
rb_define_method(rb_cPairDistance, "pattern=", rb_PairDistance_pattern_set, 1);
|
1594
1611
|
rb_define_method(rb_cPairDistance, "match", rb_PairDistance_match, -1);
|
1595
1612
|
rb_define_alias(rb_cPairDistance, "similar", "match");
|
1596
|
-
rb_define_method(
|
1613
|
+
rb_define_method(rb_mAmatchStringMethods, "pair_distance_similar", rb_str_pair_distance_similar, -1);
|
1597
1614
|
|
1598
1615
|
/* Longest Common Subsequence */
|
1599
1616
|
rb_cLongestSubsequence = rb_define_class_under(rb_mAmatch, "LongestSubsequence", rb_cObject);
|
@@ -1603,7 +1620,7 @@ void Init_amatch()
|
|
1603
1620
|
rb_define_method(rb_cLongestSubsequence, "pattern=", rb_General_pattern_set, 1);
|
1604
1621
|
rb_define_method(rb_cLongestSubsequence, "match", rb_LongestSubsequence_match, 1);
|
1605
1622
|
rb_define_method(rb_cLongestSubsequence, "similar", rb_LongestSubsequence_similar, 1);
|
1606
|
-
rb_define_method(
|
1623
|
+
rb_define_method(rb_mAmatchStringMethods, "longest_subsequence_similar", rb_str_longest_subsequence_similar, 1);
|
1607
1624
|
|
1608
1625
|
/* Longest Common Substring */
|
1609
1626
|
rb_cLongestSubstring = rb_define_class_under(rb_mAmatch, "LongestSubstring", rb_cObject);
|
@@ -1613,7 +1630,7 @@ void Init_amatch()
|
|
1613
1630
|
rb_define_method(rb_cLongestSubstring, "pattern=", rb_General_pattern_set, 1);
|
1614
1631
|
rb_define_method(rb_cLongestSubstring, "match", rb_LongestSubstring_match, 1);
|
1615
1632
|
rb_define_method(rb_cLongestSubstring, "similar", rb_LongestSubstring_similar, 1);
|
1616
|
-
rb_define_method(
|
1633
|
+
rb_define_method(rb_mAmatchStringMethods, "longest_substring_similar", rb_str_longest_substring_similar, 1);
|
1617
1634
|
|
1618
1635
|
/* Jaro */
|
1619
1636
|
rb_cJaro = rb_define_class_under(rb_mAmatch, "Jaro", rb_cObject);
|
@@ -1625,7 +1642,7 @@ void Init_amatch()
|
|
1625
1642
|
rb_define_method(rb_cJaro, "ignore_case=", rb_Jaro_ignore_case_set, 1);
|
1626
1643
|
rb_define_method(rb_cJaro, "match", rb_Jaro_match, 1);
|
1627
1644
|
rb_define_alias(rb_cJaro, "similar", "match");
|
1628
|
-
rb_define_method(
|
1645
|
+
rb_define_method(rb_mAmatchStringMethods, "jaro_similar", rb_str_jaro_similar, 1);
|
1629
1646
|
|
1630
1647
|
/* Jaro-Winkler */
|
1631
1648
|
rb_cJaroWinkler = rb_define_class_under(rb_mAmatch, "JaroWinkler", rb_cObject);
|
@@ -1639,7 +1656,7 @@ void Init_amatch()
|
|
1639
1656
|
rb_define_method(rb_cJaroWinkler, "scaling_factor=", rb_JaroWinkler_scaling_factor_set, 1);
|
1640
1657
|
rb_define_method(rb_cJaroWinkler, "match", rb_JaroWinkler_match, 1);
|
1641
1658
|
rb_define_alias(rb_cJaroWinkler, "similar", "match");
|
1642
|
-
rb_define_method(
|
1659
|
+
rb_define_method(rb_mAmatchStringMethods, "jarowinkler_similar", rb_str_jarowinkler_similar, 1);
|
1643
1660
|
|
1644
1661
|
id_split = rb_intern("split");
|
1645
1662
|
id_to_f = rb_intern("to_f");
|
data/ext/extconf.rb
CHANGED
data/ext/pair.c
CHANGED
data/lib/amatch.rb
ADDED
data/lib/amatch/rude.rb
ADDED
data/lib/amatch/version.rb
CHANGED
data/tests/test_jaro.rb
CHANGED
data/tests/test_levenshtein.rb
CHANGED
@@ -11,6 +11,10 @@ class TestLevenshtein < Test::Unit::TestCase
|
|
11
11
|
@long = Levenshtein.new('A' * 160)
|
12
12
|
end
|
13
13
|
|
14
|
+
def test_version
|
15
|
+
assert_kind_of String, Amatch::VERSION
|
16
|
+
end
|
17
|
+
|
14
18
|
def test_match
|
15
19
|
assert_equal 4, @simple.match('')
|
16
20
|
assert_equal 0, @simple.match('test')
|
data/tests/test_pair_distance.rb
CHANGED
@@ -15,6 +15,10 @@ class TestPairDistance < Test::Unit::TestCase
|
|
15
15
|
@long = PairDistance.new('A' * 160)
|
16
16
|
end
|
17
17
|
|
18
|
+
def test_alternative_constant
|
19
|
+
assert_equal PairDistance, DiceCoefficient
|
20
|
+
end
|
21
|
+
|
18
22
|
def test_empty
|
19
23
|
assert_in_delta 1, @empty.match(''), D
|
20
24
|
assert_in_delta 0, @empty.match('not empty'), D
|
@@ -55,6 +59,8 @@ class TestPairDistance < Test::Unit::TestCase
|
|
55
59
|
assert_in_delta 0.6, @single.match('test aaa bbb'), D
|
56
60
|
assert_in_delta 0.6, @single.match('bbb aaa test'), D
|
57
61
|
assert_in_delta 0.8571428, @single.pattern.pair_distance_similar('atest'), D
|
62
|
+
assert_in_delta 1.0, @france.pattern.pair_distance_similar('of france, republic', /[, ]+/), D
|
63
|
+
assert_in_delta 0.9230769, @france.pattern.pair_distance_similar('of france, republik', /[, ]+/), D
|
58
64
|
end
|
59
65
|
|
60
66
|
def test_csv
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: amatch
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.9
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,22 +9,22 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2011-
|
12
|
+
date: 2011-11-15 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: gem_hadar
|
16
|
-
requirement: &
|
16
|
+
requirement: &2152298700 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ~>
|
20
20
|
- !ruby/object:Gem::Version
|
21
|
-
version: 0.1.
|
21
|
+
version: 0.1.1
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *2152298700
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: test-unit
|
27
|
-
requirement: &
|
27
|
+
requirement: &2152312400 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ~>
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '2.3'
|
33
33
|
type: :development
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *2152312400
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: tins
|
38
|
-
requirement: &
|
38
|
+
requirement: &2152310440 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ~>
|
@@ -43,7 +43,7 @@ dependencies:
|
|
43
43
|
version: '0.3'
|
44
44
|
type: :runtime
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *2152310440
|
47
47
|
description: ! 'Amatch is a library for approximate string matching and searching
|
48
48
|
in strings.
|
49
49
|
|
@@ -59,8 +59,11 @@ extensions:
|
|
59
59
|
- ext/extconf.rb
|
60
60
|
extra_rdoc_files:
|
61
61
|
- README.rdoc
|
62
|
+
- lib/amatch/polite.rb
|
63
|
+
- lib/amatch/rude.rb
|
62
64
|
- lib/amatch/version.rb
|
63
|
-
-
|
65
|
+
- lib/amatch.rb
|
66
|
+
- ext/amatch_ext.c
|
64
67
|
- ext/pair.c
|
65
68
|
files:
|
66
69
|
- .gitignore
|
@@ -73,13 +76,16 @@ files:
|
|
73
76
|
- VERSION
|
74
77
|
- amatch.gemspec
|
75
78
|
- bin/agrep.rb
|
76
|
-
- ext/
|
79
|
+
- ext/amatch_ext.c
|
77
80
|
- ext/common.h
|
78
81
|
- ext/extconf.rb
|
79
82
|
- ext/pair.c
|
80
83
|
- ext/pair.h
|
81
84
|
- install.rb
|
85
|
+
- lib/amatch.rb
|
82
86
|
- lib/amatch/.keep
|
87
|
+
- lib/amatch/polite.rb
|
88
|
+
- lib/amatch/rude.rb
|
83
89
|
- lib/amatch/version.rb
|
84
90
|
- tests/test_hamming.rb
|
85
91
|
- tests/test_jaro.rb
|
@@ -119,11 +125,11 @@ signing_key:
|
|
119
125
|
specification_version: 3
|
120
126
|
summary: Approximate String Matching library
|
121
127
|
test_files:
|
122
|
-
- tests/
|
128
|
+
- tests/test_hamming.rb
|
123
129
|
- tests/test_jaro.rb
|
130
|
+
- tests/test_jaro_winkler.rb
|
131
|
+
- tests/test_levenshtein.rb
|
124
132
|
- tests/test_longest_subsequence.rb
|
125
133
|
- tests/test_longest_substring.rb
|
126
|
-
- tests/test_hamming.rb
|
127
134
|
- tests/test_pair_distance.rb
|
128
|
-
- tests/
|
129
|
-
- tests/test_jaro_winkler.rb
|
135
|
+
- tests/test_sellers.rb
|