str_metrics 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: f773ff65c7ef2fb3bc5f396cf10e13e7d98d3037981de45a846f8b9fc6328caa
4
+ data.tar.gz: c6b3838b89a063c4e57cf70e06a7e9226bee9ecb84a99d0f2773775dbb56cc6e
5
+ SHA512:
6
+ metadata.gz: 22df7813421a8850d277065d8595aadf5a30c0fc96ebfb90bfaac05e3906030c5874dd4a8da38e1a0af67277f22ca2c46940515d509ed6bd63c0d332e6620b3b
7
+ data.tar.gz: 7addd087a16eb72ed3428a9b90ce51c0e0ed5865bc2d8aebe7b899f2adc5fdbbcad15c73b60af3701213129e9f7fed6e69a834637f7e32c14f478f8caaa7e589
data/Cargo.toml ADDED
@@ -0,0 +1,15 @@
1
+ [package]
2
+ name = "str_metrics"
3
+ version = "0.1.0"
4
+ authors = ["Anirban Mukhopadhyay <anirban.mukhop@gmail.com>"]
5
+ edition = "2018"
6
+
7
+ [lib]
8
+ crate-type = ["cdylib"]
9
+
10
+ [dependencies]
11
+ unicode-segmentation = "^1.6"
12
+ libc = "^0.2"
13
+ itertools = "^0.8"
14
+
15
+ # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2020 Anirban Mukhopadhyay
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,170 @@
1
+ # StrMetrics
2
+
3
+ [![checks](https://github.com/anirbanmu/str_metrics/workflows/checks/badge.svg)](https://github.com/anirbanmu/str_metrics/actions?query=workflow%3Achecks)
4
+
5
+ Ruby gem (native extension in Rust) providing implementations of various string metrics. Currently supported metrics are: Sørensen–Dice, Levenshtein, Damerau–Levenshtein, Jaro & Jaro–Winkler. Strings that are UTF-8 encodable (convertible to a UTF-8 representation) are supported. All string comparisons are done at the grapheme cluster level as described by [Unicode Standard Annex #29](https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries); results may therefore differ from those of other gems that calculate string metrics.
6
+
7
+ ## Getting Started
8
+ ### Prerequisites
9
+
10
+ Install Rust (tested with version `>= 1.38.0`) with:
11
+
12
+ ```sh
13
+ curl https://sh.rustup.rs -sSf | sh
14
+ ```
15
+
16
+ ### Installation
17
+
18
+ #### With [`bundler`](https://bundler.io/)
19
+
20
+ Add this line to your application's Gemfile:
21
+
22
+ ```ruby
23
+ gem 'str_metrics'
24
+ ```
25
+
26
+ And then execute:
27
+
28
+ $ bundle install
29
+
30
+ #### Without `bundler`
31
+
32
+ $ gem install str_metrics
33
+
34
+ ## Usage
35
+
36
+ To use the metrics provided by this gem, make sure `str_metrics` is required:
37
+
38
+ ```ruby
39
+ require 'str_metrics'
40
+ ```
41
+
42
+ Each metric is shown below with an example and a description of its optional parameters.
43
+
44
+ ### Sørensen–Dice
45
+
46
+ ```ruby
47
+ StrMetrics::SorensenDice.coefficient('abc', 'bcd', ignore_case: false)
48
+ => 0.5
49
+ ```
50
+ Options:
51
+
52
+ Keyword | Type | Default | Description
53
+ --- | --- | --- | ---
54
+ `ignore_case` | boolean | `false` | Case insensitive comparison?
55
+
56
+ ### Levenshtein
57
+
58
+ ```ruby
59
+ StrMetrics::Levenshtein.distance('abc', 'acb', ignore_case: false)
60
+ => 2
61
+ ```
62
+ Options:
63
+
64
+ Keyword | Type | Default | Description
65
+ --- | --- | --- | ---
66
+ `ignore_case` | boolean | `false` | Case insensitive comparison?
67
+
68
+ ### Damerau–Levenshtein
69
+
70
+ ```ruby
71
+ StrMetrics::DamerauLevenshtein.distance('abc', 'acb', ignore_case: false)
72
+ => 1
73
+ ```
74
+ Options:
75
+
76
+ Keyword | Type | Default | Description
77
+ --- | --- | --- | ---
78
+ `ignore_case` | boolean | `false` | Case insensitive comparison?
79
+
80
+ ### Jaro
81
+
82
+ ```ruby
83
+ StrMetrics::Jaro.similarity('abc', 'aac', ignore_case: false)
84
+ => 0.7777777777777777
85
+ ```
86
+ Options:
87
+
88
+ Keyword | Type | Default | Description
89
+ --- | --- | --- | ---
90
+ `ignore_case` | boolean | `false` | Case insensitive comparison?
91
+
92
+ ### Jaro–Winkler
93
+
94
+ ```ruby
95
+ StrMetrics::JaroWinkler.similarity('abc', 'aac', ignore_case: false, prefix_scaling_factor: 0.1, prefix_scaling_bonus_threshold: 0.7)
96
+ => 0.7999999999999999
97
+
98
+ StrMetrics::JaroWinkler.distance('abc', 'aac', ignore_case: false, prefix_scaling_factor: 0.1, prefix_scaling_bonus_threshold: 0.7)
99
+ => 0.20000000000000007
100
+ ```
101
+ Options:
102
+
103
+ Keyword | Type | Default | Description
104
+ --- | --- | --- | ---
105
+ `ignore_case` | boolean | `false` | Case insensitive comparison?
106
+ `prefix_scaling_factor` | decimal | `0.1` | Constant scaling factor for how much to weight common prefixes. Should not exceed 0.25.
107
+ `prefix_scaling_bonus_threshold` | decimal | `0.7` | Prefix bonus weighting will only be applied if the Jaro similarity is greater than the given value.
108
+
109
+ ## Motivation
110
+
111
+ The main motivation was to have a central gem which can provide a variety of string metric calculations. Secondary motivation was to experiment with writing a native extension in Rust (instead of C).
112
+
113
+ ## Development
114
+
115
+ ### Getting started
116
+
117
+ ```bash
118
+ gem install bundler
119
+ git clone https://github.com/anirbanmu/str_metrics.git
120
+ cd ./str_metrics
121
+ bundle install
122
+ ```
123
+
124
+ ### Building (for native component)
125
+
126
+ ```bash
127
+ rake rust_build
128
+ ```
129
+
130
+ ### Testing (will build native component before running tests)
131
+ ```bash
132
+ rake spec
133
+ ```
134
+
135
+ ### Local installation
136
+ ```bash
137
+ rake install
138
+ ```
139
+
140
+ ### Deploying a new version
141
+ To deploy a new version of the gem to rubygems:
142
+
143
+ 1. Bump version in [version.rb](lib/str_metrics/version.rb) according to [SemVer](https://semver.org/).
144
+ 2. Get your code merged to master.
145
+ 3. After a `git pull` on master:
146
+
147
+ ```bash
148
+ rake build && rake release
149
+ ```
150
+
151
+ ## Authors
152
+ - [Anirban Mukhopadhyay](https://github.com/anirbanmu)
153
+
154
+ See all repo contributors [here](https://github.com/anirbanmu/str_metrics/contributors).
155
+
156
+ ## Versioning
157
+
158
+ [SemVer](https://semver.org/) is employed. See [tags](https://github.com/anirbanmu/str_metrics/tags) for released versions.
159
+
160
+ ## Contributing
161
+
162
+ Bug reports and pull requests are welcome on GitHub at https://github.com/anirbanmu/str_metrics.
163
+
164
+ ## Code of Conduct
165
+
166
+ Everyone interacting in this project's codebase, issue trackers, etc. is expected to follow the [code of conduct](CODE_OF_CONDUCT.md).
167
+
168
+ ## License
169
+
170
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
data/extconf.rb ADDED
@@ -0,0 +1,14 @@
1
+ # frozen_string_literal: true
2
+
3
+ abort 'Rust compiler required (https://www.rust-lang.org/)' if `which rustc`.empty?
4
+
5
+ File.open('Makefile', 'wb') do |f|
6
+ f.puts(<<~MKCONTENT)
7
+ all:
8
+ \tcargo rustc --release
9
+ \tmv ./target/release/libstr_metrics.so ./lib/str_metrics
10
+ clean:
11
+ install:
12
+ \trm -r target
13
+ MKCONTENT
14
+ end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
+ module StrMetrics
4
+ VERSION = '0.1.0'
5
+ end
@@ -0,0 +1,96 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'ffi'
4
+ require 'str_metrics/version'
5
+
6
+ # Namespace for gem
7
+ module StrMetrics
8
+ # Interface with Rust functions (not meant for public usage)
9
+ module Native
10
+ extend FFI::Library
11
+
12
+ ffi_lib File.expand_path('./str_metrics/libstr_metrics.so', __dir__)
13
+
14
+ attach_function :sorensen_dice_coefficient, %i[string string char], :double
15
+ attach_function :levenshtein_distance, %i[string string char], :int64
16
+ attach_function :damerau_levenshtein_distance, %i[string string char], :int64
17
+ attach_function :jaro_similarity, %i[string string char], :double
18
+ attach_function :jaro_winkler_similarity, %i[string string char int double double], :double
19
+ attach_function :jaro_winkler_distance, %i[string string char int double double], :double
20
+ end
21
+
22
+ private_constant :Native
23
+
24
+ refine String do
25
+ def to_utf8
26
+ encoding == Encoding::UTF_8 ? self : encode('UTF-8')
27
+ end
28
+ end
29
+
30
+ using self # activates refinement
31
+
32
+ # Namespace for Sorensen-Dice
33
+ module SorensenDice
34
+ def self.coefficient(a, b, ignore_case: false)
35
+ Native.sorensen_dice_coefficient(a&.to_utf8, b&.to_utf8, ignore_case ? 1 : 0)
36
+ end
37
+ end
38
+
39
+ # Namespace for Levenshtein
40
+ module Levenshtein
41
+ def self.distance(a, b, ignore_case: false)
42
+ Native.levenshtein_distance(a&.to_utf8, b&.to_utf8, ignore_case ? 1 : 0)
43
+ end
44
+ end
45
+
46
+ # Namespace for Damerau-Levenshtein
47
+ module DamerauLevenshtein
48
+ def self.distance(a, b, ignore_case: false)
49
+ Native.damerau_levenshtein_distance(a&.to_utf8, b&.to_utf8, ignore_case ? 1 : 0)
50
+ end
51
+ end
52
+
53
+ # Namespace for Jaro
54
+ module Jaro
55
+ def self.similarity(a, b, ignore_case: false)
56
+ Native.jaro_similarity(a&.to_utf8, b&.to_utf8, ignore_case ? 1 : 0)
57
+ end
58
+ end
59
+
60
+ # Namespace for Jaro-Winkler
61
+ module JaroWinkler
62
+ def self.similarity(
63
+ a,
64
+ b,
65
+ ignore_case: false,
66
+ prefix_scaling_factor: 0.1,
67
+ prefix_scaling_bonus_threshold: 0.7
68
+ )
69
+ Native.jaro_winkler_similarity(
70
+ a&.to_utf8,
71
+ b&.to_utf8,
72
+ ignore_case ? 1 : 0,
73
+ 4,
74
+ prefix_scaling_factor,
75
+ prefix_scaling_bonus_threshold
76
+ )
77
+ end
78
+
79
+ def self.distance(
80
+ a,
81
+ b,
82
+ ignore_case: false,
83
+ prefix_scaling_factor: 0.1,
84
+ prefix_scaling_bonus_threshold: 0.7
85
+ )
86
+ Native.jaro_winkler_distance(
87
+ a&.to_utf8,
88
+ b&.to_utf8,
89
+ ignore_case ? 1 : 0,
90
+ 4,
91
+ prefix_scaling_factor,
92
+ prefix_scaling_bonus_threshold
93
+ )
94
+ end
95
+ end
96
+ end
data/src/lib.rs ADDED
@@ -0,0 +1,167 @@
1
+ mod metrics;
2
+
3
+ use libc::{c_char, c_double};
4
+ use std::ffi::CStr;
5
+
6
+ fn cstr_from_raw(s: &*const c_char) -> &CStr {
7
+ unsafe { CStr::from_ptr(*s) }
8
+ }
9
+
10
+ #[no_mangle]
11
+ pub extern "C" fn sorensen_dice_coefficient(
12
+ a: *const c_char,
13
+ b: *const c_char,
14
+ ignore_case: c_char,
15
+ ) -> c_double {
16
+ if a.is_null() || b.is_null() {
17
+ return 0.0;
18
+ }
19
+
20
+ let a_c_str = cstr_from_raw(&a);
21
+ let b_c_str = cstr_from_raw(&b);
22
+
23
+ let a_str = match a_c_str.to_str() {
24
+ Err(_e) => return 0.0,
25
+ Ok(s) => s,
26
+ };
27
+
28
+ let b_str = match b_c_str.to_str() {
29
+ Err(_e) => return 0.0,
30
+ Ok(s) => s,
31
+ };
32
+
33
+ metrics::sorensen_dice::coefficient(a_str, b_str, ignore_case == 1)
34
+ }
35
+
36
+ #[no_mangle]
37
+ pub extern "C" fn jaro_similarity(
38
+ a: *const c_char,
39
+ b: *const c_char,
40
+ ignore_case: c_char,
41
+ ) -> c_double {
42
+ if a.is_null() || b.is_null() {
43
+ return 0.0;
44
+ }
45
+
46
+ let a_c_str = cstr_from_raw(&a);
47
+ let b_c_str = cstr_from_raw(&b);
48
+
49
+ let a_str = match a_c_str.to_str() {
50
+ Err(_e) => return 0.0,
51
+ Ok(s) => s,
52
+ };
53
+
54
+ let b_str = match b_c_str.to_str() {
55
+ Err(_e) => return 0.0,
56
+ Ok(s) => s,
57
+ };
58
+
59
+ metrics::jaro::similarity(a_str, b_str, ignore_case == 1).value
60
+ }
61
+
62
+ #[no_mangle]
63
+ pub extern "C" fn jaro_winkler_similarity(
64
+ a: *const c_char,
65
+ b: *const c_char,
66
+ ignore_case: c_char,
67
+ prefix_length: u32,
68
+ prefix_scaling_factor: c_double,
69
+ prefix_scaling_bonus_threshold: c_double,
70
+ ) -> c_double {
71
+ if a.is_null() || b.is_null() {
72
+ return 0.0;
73
+ }
74
+
75
+ let a_c_str = cstr_from_raw(&a);
76
+ let b_c_str = cstr_from_raw(&b);
77
+
78
+ let a_str = match a_c_str.to_str() {
79
+ Err(_e) => return 0.0,
80
+ Ok(s) => s,
81
+ };
82
+
83
+ let b_str = match b_c_str.to_str() {
84
+ Err(_e) => return 0.0,
85
+ Ok(s) => s,
86
+ };
87
+
88
+ metrics::jaro_winkler::similarity(
89
+ a_str,
90
+ b_str,
91
+ ignore_case == 1,
92
+ prefix_length,
93
+ prefix_scaling_factor,
94
+ prefix_scaling_bonus_threshold,
95
+ )
96
+ }
97
+
98
+ #[no_mangle]
99
+ pub extern "C" fn jaro_winkler_distance(
100
+ a: *const c_char,
101
+ b: *const c_char,
102
+ ignore_case: c_char,
103
+ prefix_length: u32,
104
+ prefix_scaling_factor: c_double,
105
+ prefix_scaling_bonus_threshold: c_double,
106
+ ) -> c_double {
107
+ 1.0 - jaro_winkler_similarity(
108
+ a,
109
+ b,
110
+ ignore_case,
111
+ prefix_length,
112
+ prefix_scaling_factor,
113
+ prefix_scaling_bonus_threshold,
114
+ )
115
+ }
116
+
117
+ #[no_mangle]
118
+ pub extern "C" fn levenshtein_distance(
119
+ a: *const c_char,
120
+ b: *const c_char,
121
+ ignore_case: c_char,
122
+ ) -> i64 {
123
+ if a.is_null() || b.is_null() {
124
+ return std::i64::MAX;
125
+ }
126
+
127
+ let a_c_str = cstr_from_raw(&a);
128
+ let b_c_str = cstr_from_raw(&b);
129
+
130
+ let a_str = match a_c_str.to_str() {
131
+ Err(_e) => return std::i64::MAX,
132
+ Ok(s) => s,
133
+ };
134
+
135
+ let b_str = match b_c_str.to_str() {
136
+ Err(_e) => return std::i64::MAX,
137
+ Ok(s) => s,
138
+ };
139
+
140
+ metrics::levenshtein::distance(a_str, b_str, ignore_case == 1)
141
+ }
142
+
143
+ #[no_mangle]
144
+ pub extern "C" fn damerau_levenshtein_distance(
145
+ a: *const c_char,
146
+ b: *const c_char,
147
+ ignore_case: c_char,
148
+ ) -> i64 {
149
+ if a.is_null() || b.is_null() {
150
+ return std::i64::MAX;
151
+ }
152
+
153
+ let a_c_str = cstr_from_raw(&a);
154
+ let b_c_str = cstr_from_raw(&b);
155
+
156
+ let a_str = match a_c_str.to_str() {
157
+ Err(_e) => return std::i64::MAX,
158
+ Ok(s) => s,
159
+ };
160
+
161
+ let b_str = match b_c_str.to_str() {
162
+ Err(_e) => return std::i64::MAX,
163
+ Ok(s) => s,
164
+ };
165
+
166
+ metrics::damerau_levenshtein::distance(a_str, b_str, ignore_case == 1)
167
+ }
@@ -0,0 +1,71 @@
1
+ use crate::metrics::utils::graphemes;
2
+ use crate::metrics::utils::Array2D;
3
+
4
+ use std::cmp;
5
+ use std::collections::HashMap;
6
+ use std::hash::Hash;
7
+
8
+ pub fn distance(a: &str, b: &str, ignore_case: bool) -> i64 {
9
+ if ignore_case {
10
+ return distance_impl(&graphemes(&a.to_lowercase()), &graphemes(&b.to_lowercase()));
11
+ }
12
+
13
+ distance_impl(&graphemes(a), &graphemes(b))
14
+ }
15
+
16
+ // From https://en.wikipedia.org/wiki/Damerau-Levenshtein_distance
17
+ fn distance_impl<T: Hash + Eq>(a: &[T], b: &[T]) -> i64 {
18
+ let lens = [a.len(), b.len()];
19
+ if lens[0] == 0 {
20
+ return lens[1] as i64;
21
+ }
22
+ if lens[1] == 0 {
23
+ return lens[0] as i64;
24
+ }
25
+
26
+ let rows = lens[0] + 2;
27
+ let columns = lens[1] + 2;
28
+
29
+ let max_dist = (lens[0] + lens[1]) as i64;
30
+ let mut dist_matrix = Array2D::new(rows, columns);
31
+
32
+ dist_matrix[(0, 0)] = max_dist;
33
+ for i in 1..rows {
34
+ dist_matrix[(i, 0)] = max_dist;
35
+ dist_matrix[(i, 1)] = (i - 1) as i64;
36
+ }
37
+ for j in 1..columns {
38
+ dist_matrix[(0, j)] = max_dist;
39
+ dist_matrix[(1, j)] = (j - 1) as i64;
40
+ }
41
+
42
+ let mut da: HashMap<&T, usize> = HashMap::new();
43
+
44
+ for i in 1..=lens[0] {
45
+ let mut db = 0;
46
+ for j in 1..=lens[1] {
47
+ let k = da.entry(&b[j - 1]).or_insert(0);
48
+ let l = db;
49
+ let cost = if a[i - 1] == b[j - 1] {
50
+ db = j;
51
+ 0
52
+ } else {
53
+ 1
54
+ };
55
+
56
+ dist_matrix[(i + 1, j + 1)] = cmp::min(
57
+ dist_matrix[(i, j)] + cost,
58
+ cmp::min(
59
+ dist_matrix[(i + 1, j)] + 1,
60
+ cmp::min(
61
+ dist_matrix[(i, j + 1)] + 1,
62
+ dist_matrix[(*k, l)] + (i - *k - 1) as i64 + 1 + (j - l - 1) as i64,
63
+ ),
64
+ ),
65
+ );
66
+ }
67
+ da.insert(&a[i - 1], i);
68
+ }
69
+
70
+ dist_matrix[(lens[0] + 1, lens[1] + 1)]
71
+ }
@@ -0,0 +1,93 @@
1
+ use crate::metrics::utils::graphemes;
2
+ use std::cmp;
3
+
4
+ pub struct JaroSimilarityResult {
5
+ pub value: f64,
6
+ pub max_prefix_length: i64,
7
+ }
8
+
9
+ pub fn similarity(a: &str, b: &str, ignore_case: bool) -> JaroSimilarityResult {
10
+ if ignore_case {
11
+ return similarity_impl(&graphemes(&a.to_lowercase()), &graphemes(&b.to_lowercase()));
12
+ }
13
+
14
+ similarity_impl(&graphemes(a), &graphemes(b))
15
+ }
16
+
17
+ fn similarity_impl<T: Eq>(a: &[T], b: &[T]) -> JaroSimilarityResult {
18
+ let mut graphemes = [a, b];
19
+ if graphemes[0].len() > graphemes[1].len() {
20
+ graphemes.swap(0, 1);
21
+ }
22
+
23
+ // Grapheme counts of the two inputs after the swap above: index 0 is the shorter (or equal-length) one, index 1 the longer.
24
+ let lens = [graphemes[0].len(), graphemes[1].len()];
25
+
26
+ let max_length = cmp::max(lens[0], lens[1]);
27
+ let matching_dist = if max_length < 2 {
28
+ 0
29
+ } else {
30
+ (max_length / 2) - 1
31
+ };
32
+
33
+ let mut matching_indices = [
34
+ Vec::with_capacity(max_length),
35
+ Vec::with_capacity(max_length),
36
+ ];
37
+
38
+ // Find matches
39
+ let mut last_matched_prefix_index = -1;
40
+ {
41
+ let mut b_matched = vec![false; lens[1]];
42
+ for (i, grapheme) in graphemes[0].iter().enumerate() {
43
+ let start = cmp::max(0 as i64, i as i64 - matching_dist as i64) as usize;
44
+ let end = cmp::min(lens[1], i + matching_dist + 1);
45
+
46
+ // Keep track of prefix match
47
+ // Safe to index graphemes[1] at i: after the swap above, graphemes[0].len() <= graphemes[1].len()
48
+ if grapheme == &graphemes[1][i]
49
+ && ((last_matched_prefix_index == -1 && i == 0)
50
+ || last_matched_prefix_index == i as i64 - 1)
51
+ {
52
+ last_matched_prefix_index = i as i64;
53
+ }
54
+
55
+ for (j, matched) in b_matched.iter_mut().enumerate().take(end).skip(start) {
56
+ if grapheme == &graphemes[1][j] && !*matched {
57
+ *matched = true;
58
+ matching_indices[0].push(i);
59
+ matching_indices[1].push(j);
60
+ break;
61
+ }
62
+ }
63
+ }
64
+ }
65
+
66
+ let matches = matching_indices[0].len();
67
+ if matches == 0 {
68
+ return JaroSimilarityResult {
69
+ value: 0.0,
70
+ max_prefix_length: 0,
71
+ };
72
+ }
73
+
74
+ // Count half-transpositions: with both index lists in ascending order, each paired position whose graphemes differ adds 0.5
75
+ matching_indices[1].sort_unstable();
76
+ let transpositions = matching_indices[0]
77
+ .iter()
78
+ .zip(matching_indices[1].iter())
79
+ .fold(0.0, |acc, (i, j)| {
80
+ if graphemes[0][*i] == graphemes[1][*j] {
81
+ acc
82
+ } else {
83
+ acc + 0.5
84
+ }
85
+ });
86
+
87
+ let m = matches as f64;
88
+ let t = transpositions;
89
+ JaroSimilarityResult {
90
+ value: ((m / lens[0] as f64) + (m / lens[1] as f64) + ((m - t) / m)) / 3.0,
91
+ max_prefix_length: last_matched_prefix_index + 1,
92
+ }
93
+ }
@@ -0,0 +1,20 @@
1
+ use crate::metrics::jaro;
2
+ use std::cmp;
3
+
4
+ pub fn similarity(
5
+ a: &str,
6
+ b: &str,
7
+ ignore_case: bool,
8
+ prefix_length: u32,
9
+ prefix_scaling_factor: f64,
10
+ prefix_scaling_bonus_threshold: f64,
11
+ ) -> f64 {
12
+ let jaro_similarity = jaro::similarity(a, b, ignore_case);
13
+ let common_prefix_len = cmp::min(prefix_length as i64, jaro_similarity.max_prefix_length);
14
+
15
+ if jaro_similarity.value > prefix_scaling_bonus_threshold {
16
+ return jaro_similarity.value
17
+ + common_prefix_len as f64 * prefix_scaling_factor * (1.0 - jaro_similarity.value);
18
+ }
19
+ jaro_similarity.value
20
+ }
@@ -0,0 +1,46 @@
1
+ use crate::metrics::utils::graphemes;
2
+ use crate::metrics::utils::Array2D;
3
+ use std::cmp;
4
+
5
+ pub fn distance(a: &str, b: &str, ignore_case: bool) -> i64 {
6
+ if ignore_case {
7
+ return distance_impl(&graphemes(&a.to_lowercase()), &graphemes(&b.to_lowercase()));
8
+ }
9
+
10
+ distance_impl(&graphemes(a), &graphemes(b))
11
+ }
12
+
13
+ fn distance_impl<T: Eq>(a: &[T], b: &[T]) -> i64 {
14
+ let lens = [a.len(), b.len()];
15
+ if lens[0] == 0 {
16
+ return lens[1] as i64;
17
+ }
18
+ if lens[1] == 0 {
19
+ return lens[0] as i64;
20
+ }
21
+
22
+ let rows = lens[0] + 1;
23
+ let columns = lens[1] + 1;
24
+
25
+ let mut dist_matrix = Array2D::new(rows, columns);
26
+
27
+ for i in 0..rows {
28
+ dist_matrix[(i, 0)] = i as i64;
29
+ }
30
+ for j in 0..columns {
31
+ dist_matrix[(0, j)] = j as i64;
32
+ }
33
+
34
+ for i in 1..rows {
35
+ for j in 1..columns {
36
+ let cost = if a[i - 1] == b[j - 1] { 0 } else { 1 };
37
+
38
+ dist_matrix[(i, j)] = cmp::min(
39
+ cmp::min(dist_matrix[(i - 1, j)] + 1, dist_matrix[(i, j - 1)] + 1),
40
+ dist_matrix[(i - 1, j - 1)] + cost,
41
+ );
42
+ }
43
+ }
44
+
45
+ dist_matrix[(rows - 1, columns - 1)]
46
+ }
@@ -0,0 +1,6 @@
1
+ pub mod damerau_levenshtein;
2
+ pub mod jaro;
3
+ pub mod jaro_winkler;
4
+ pub mod levenshtein;
5
+ pub mod sorensen_dice;
6
+ mod utils;
@@ -0,0 +1,47 @@
1
+ use crate::metrics::utils::generate_bigrams;
2
+ use std::collections::hash_map::Entry::{Occupied, Vacant};
3
+ use std::collections::HashMap;
4
+
5
+ pub fn coefficient(a: &str, b: &str, ignore_case: bool) -> f64 {
6
+ if ignore_case {
7
+ return coefficient_impl(&a.to_lowercase(), &b.to_lowercase());
8
+ }
9
+ coefficient_impl(a, b)
10
+ }
11
+
12
+ fn coefficient_impl(a: &str, b: &str) -> f64 {
13
+ let a_bigrams = generate_bigrams(&a);
14
+ let mut b_bigrams_hash: HashMap<&str, i64> = HashMap::new();
15
+
16
+ let mut total_bigrams = a_bigrams.len();
17
+
18
+ {
19
+ let b_bigrams = generate_bigrams(&b);
20
+ for s in &b_bigrams {
21
+ let counter = b_bigrams_hash.entry(s).or_insert(0);
22
+ *counter += 1;
23
+ }
24
+
25
+ total_bigrams += b_bigrams.len();
26
+ }
27
+
28
+ if total_bigrams == 0 {
29
+ return 0.0;
30
+ }
31
+
32
+ let mut intersections = 0;
33
+ for bigram in &a_bigrams {
34
+ match b_bigrams_hash.entry(bigram) {
35
+ Vacant(_) => {}
36
+ Occupied(entry) => {
37
+ let counter = entry.get();
38
+ if counter > &0 {
39
+ *entry.into_mut() = entry.get() - 1;
40
+ intersections += 1;
41
+ }
42
+ }
43
+ }
44
+ }
45
+
46
+ 2.0 * intersections as f64 / total_bigrams as f64
47
+ }
@@ -0,0 +1,30 @@
1
+ use std::ops::Index;
2
+ use std::ops::IndexMut;
3
+
4
+ pub struct Array2D<T> {
5
+ arr: Vec<T>,
6
+ cols: usize,
7
+ }
8
+
9
+ impl<T: Clone + Default> Array2D<T> {
10
+ pub fn new(rows: usize, columns: usize) -> Array2D<T> {
11
+ Array2D {
12
+ arr: vec![Default::default(); rows * columns],
13
+ cols: columns,
14
+ }
15
+ }
16
+ }
17
+
18
+ impl<T> Index<(usize, usize)> for Array2D<T> {
19
+ type Output = T;
20
+
21
+ fn index(&self, (y, x): (usize, usize)) -> &Self::Output {
22
+ &self.arr[y * self.cols + x]
23
+ }
24
+ }
25
+
26
+ impl<T> IndexMut<(usize, usize)> for Array2D<T> {
27
+ fn index_mut(&mut self, (y, x): (usize, usize)) -> &mut Self::Output {
28
+ &mut self.arr[y * self.cols + x]
29
+ }
30
+ }
@@ -0,0 +1,19 @@
1
+ extern crate itertools;
2
+ extern crate unicode_segmentation;
3
+
4
+ use itertools::Itertools;
5
+ use unicode_segmentation::UnicodeSegmentation;
6
+
7
+ mod array_2d;
8
+ pub use array_2d::Array2D;
9
+
10
+ pub fn graphemes(s: &str) -> Vec<&str> {
11
+ UnicodeSegmentation::graphemes(s, true).collect::<Vec<&str>>()
12
+ }
13
+
14
+ pub fn generate_bigrams(s: &str) -> Vec<&str> {
15
+ UnicodeSegmentation::grapheme_indices(s, true)
16
+ .tuple_windows()
17
+ .map(|(a, b)| &s[a.0..b.0 + b.1.len()])
18
+ .collect::<Vec<&str>>()
19
+ }
metadata ADDED
@@ -0,0 +1,167 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: str_metrics
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Anirban Mukhopadhyay
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2020-03-13 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: ffi
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: pry
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rake
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '12.0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '12.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rspec
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: rubocop
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: rubocop-performance
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: rubocop-rspec
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ description: 'Ruby gem (native extension in Rust) providing implementations of various
112
+ string metrics. Current metrics supported are: Sørensen–Dice, Levenshtein, Damerau–Levenshtein,
113
+ Jaro & Jaro–Winkler. Strings that are UTF-8 encodable (convertible to UTF-8 representation)
114
+ are supported. All comparison of strings is done at the grapheme cluster level as
115
+ described by [Unicode Standard Annex #29](https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries);
116
+ this may be different from many gems that calculate string metrics.'
117
+ email:
118
+ - anirban.mukhop@gmail.com
119
+ executables: []
120
+ extensions:
121
+ - extconf.rb
122
+ extra_rdoc_files: []
123
+ files:
124
+ - Cargo.toml
125
+ - LICENSE
126
+ - README.md
127
+ - extconf.rb
128
+ - lib/str_metrics.rb
129
+ - lib/str_metrics/version.rb
130
+ - src/lib.rs
131
+ - src/metrics/damerau_levenshtein.rs
132
+ - src/metrics/jaro.rs
133
+ - src/metrics/jaro_winkler.rs
134
+ - src/metrics/levenshtein.rs
135
+ - src/metrics/mod.rs
136
+ - src/metrics/sorensen_dice.rs
137
+ - src/metrics/utils/array_2d.rs
138
+ - src/metrics/utils/mod.rs
139
+ homepage: https://github.com/anirbanmu/str_metrics
140
+ licenses:
141
+ - MIT
142
+ metadata:
143
+ allowed_push_host: https://rubygems.org
144
+ homepage_uri: https://github.com/anirbanmu/str_metrics
145
+ bug_tracker_uri: https://github.com/anirbanmu/str_metrics/issues
146
+ source_code_uri: https://github.com/anirbanmu/str_metrics
147
+ changelog_uri: https://github.com/anirbanmu/str_metrics/blob/v0.1.0/CHANGELOG.md
148
+ post_install_message:
149
+ rdoc_options: []
150
+ require_paths:
151
+ - lib
152
+ required_ruby_version: !ruby/object:Gem::Requirement
153
+ requirements:
154
+ - - ">="
155
+ - !ruby/object:Gem::Version
156
+ version: 2.3.0
157
+ required_rubygems_version: !ruby/object:Gem::Requirement
158
+ requirements:
159
+ - - ">="
160
+ - !ruby/object:Gem::Version
161
+ version: '0'
162
+ requirements: []
163
+ rubygems_version: 3.0.6
164
+ signing_key:
165
+ specification_version: 4
166
+ summary: Ruby gem providing native implementations of various string metrics
167
+ test_files: []