str_metrics 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Cargo.toml +15 -0
- data/LICENSE +21 -0
- data/README.md +170 -0
- data/extconf.rb +14 -0
- data/lib/str_metrics/version.rb +5 -0
- data/lib/str_metrics.rb +96 -0
- data/src/lib.rs +167 -0
- data/src/metrics/damerau_levenshtein.rs +71 -0
- data/src/metrics/jaro.rs +93 -0
- data/src/metrics/jaro_winkler.rs +20 -0
- data/src/metrics/levenshtein.rs +46 -0
- data/src/metrics/mod.rs +6 -0
- data/src/metrics/sorensen_dice.rs +47 -0
- data/src/metrics/utils/array_2d.rs +30 -0
- data/src/metrics/utils/mod.rs +19 -0
- metadata +167 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: f773ff65c7ef2fb3bc5f396cf10e13e7d98d3037981de45a846f8b9fc6328caa
|
4
|
+
data.tar.gz: c6b3838b89a063c4e57cf70e06a7e9226bee9ecb84a99d0f2773775dbb56cc6e
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 22df7813421a8850d277065d8595aadf5a30c0fc96ebfb90bfaac05e3906030c5874dd4a8da38e1a0af67277f22ca2c46940515d509ed6bd63c0d332e6620b3b
|
7
|
+
data.tar.gz: 7addd087a16eb72ed3428a9b90ce51c0e0ed5865bc2d8aebe7b899f2adc5fdbbcad15c73b60af3701213129e9f7fed6e69a834637f7e32c14f478f8caaa7e589
|
data/Cargo.toml
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
[package]
|
2
|
+
name = "str_metrics"
|
3
|
+
version = "0.1.0"
|
4
|
+
authors = ["Anirban Mukhopadhyay <anirban.mukhop@gmail.com>"]
|
5
|
+
edition = "2018"
|
6
|
+
|
7
|
+
[lib]
|
8
|
+
crate-type = ["cdylib"]
|
9
|
+
|
10
|
+
[dependencies]
|
11
|
+
unicode-segmentation = "^1.6"
|
12
|
+
libc = "^0.2"
|
13
|
+
itertools = "^0.8"
|
14
|
+
|
15
|
+
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
data/LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2020 Anirban Mukhopadhyay
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,170 @@
|
|
1
|
+
# StrMetrics
|
2
|
+
|
3
|
+
[Checks workflow status](https://github.com/anirbanmu/str_metrics/actions?query=workflow%3Achecks)
|
4
|
+
|
5
|
+
Ruby gem (native extension in Rust) providing implementations of various string metrics. Current metrics supported are: Sørensen–Dice, Levenshtein, Damerau–Levenshtein, Jaro & Jaro–Winkler. Strings that are UTF-8 encodable (convertible to UTF-8 representation) are supported. All comparison of strings is done at the grapheme cluster level as described by [Unicode Standard Annex #29](https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries); this may be different from many gems that calculate string metrics.
|
6
|
+
|
7
|
+
## Getting Started
|
8
|
+
### Prerequisites
|
9
|
+
|
10
|
+
Install Rust (tested with version `>= 1.38.0`) with:
|
11
|
+
|
12
|
+
```sh
|
13
|
+
curl https://sh.rustup.rs -sSf | sh
|
14
|
+
```
|
15
|
+
|
16
|
+
### Installation
|
17
|
+
|
18
|
+
#### With [`bundler`](https://bundler.io/)
|
19
|
+
|
20
|
+
Add this line to your application's Gemfile:
|
21
|
+
|
22
|
+
```ruby
|
23
|
+
gem 'str_metrics'
|
24
|
+
```
|
25
|
+
|
26
|
+
And then execute:
|
27
|
+
|
28
|
+
$ bundle install
|
29
|
+
|
30
|
+
#### Without `bundler`
|
31
|
+
|
32
|
+
$ gem install str_metrics
|
33
|
+
|
34
|
+
## Usage
|
35
|
+
|
36
|
+
All you need to do to use the metrics provided in this gem is to make sure `str_metrics` is required like:
|
37
|
+
|
38
|
+
```ruby
|
39
|
+
require 'str_metrics'
|
40
|
+
```
|
41
|
+
|
42
|
+
Each metric is shown below with an example & meanings of optional parameters.
|
43
|
+
|
44
|
+
### Sørensen–Dice
|
45
|
+
|
46
|
+
```ruby
|
47
|
+
StrMetrics::SorensenDice.coefficient('abc', 'bcd', ignore_case: false)
|
48
|
+
=> 0.5
|
49
|
+
```
|
50
|
+
Options:
|
51
|
+
|
52
|
+
Keyword | Type | Default | Description
|
53
|
+
--- | --- | --- | ---
|
54
|
+
`ignore_case` | boolean | `false` | Case insensitive comparison?
|
55
|
+
|
56
|
+
### Levenshtein
|
57
|
+
|
58
|
+
```ruby
|
59
|
+
StrMetrics::Levenshtein.distance('abc', 'acb', ignore_case: false)
|
60
|
+
=> 2
|
61
|
+
```
|
62
|
+
Options:
|
63
|
+
|
64
|
+
Keyword | Type | Default | Description
|
65
|
+
--- | --- | --- | ---
|
66
|
+
`ignore_case` | boolean | `false` | Case insensitive comparison?
|
67
|
+
|
68
|
+
### Damerau–Levenshtein
|
69
|
+
|
70
|
+
```ruby
|
71
|
+
StrMetrics::DamerauLevenshtein.distance('abc', 'acb', ignore_case: false)
|
72
|
+
=> 1
|
73
|
+
```
|
74
|
+
Options:
|
75
|
+
|
76
|
+
Keyword | Type | Default | Description
|
77
|
+
--- | --- | --- | ---
|
78
|
+
`ignore_case` | boolean | `false` | Case insensitive comparison?
|
79
|
+
|
80
|
+
### Jaro
|
81
|
+
|
82
|
+
```ruby
|
83
|
+
StrMetrics::Jaro.similarity('abc', 'aac', ignore_case: false)
|
84
|
+
=> 0.7777777777777777
|
85
|
+
```
|
86
|
+
Options:
|
87
|
+
|
88
|
+
Keyword | Type | Default | Description
|
89
|
+
--- | --- | --- | ---
|
90
|
+
`ignore_case` | boolean | `false` | Case insensitive comparison?
|
91
|
+
|
92
|
+
### Jaro–Winkler
|
93
|
+
|
94
|
+
```ruby
|
95
|
+
StrMetrics::JaroWinkler.similarity('abc', 'aac', ignore_case: false, prefix_scaling_factor: 0.1, prefix_scaling_bonus_threshold: 0.7)
|
96
|
+
=> 0.7999999999999999
|
97
|
+
|
98
|
+
StrMetrics::JaroWinkler.distance('abc', 'aac', ignore_case: false, prefix_scaling_factor: 0.1, prefix_scaling_bonus_threshold: 0.7)
|
99
|
+
=> 0.20000000000000007
|
100
|
+
```
|
101
|
+
Options:
|
102
|
+
|
103
|
+
Keyword | Type | Default | Description
|
104
|
+
--- | --- | --- | ---
|
105
|
+
`ignore_case` | boolean | `false` | Case insensitive comparison?
|
106
|
+
`prefix_scaling_factor` | decimal | `0.1` | Constant scaling factor for how much to weight common prefixes. Should not exceed 0.25.
|
107
|
+
`prefix_scaling_bonus_threshold` | decimal | `0.7` | Prefix bonus weighting will only be applied if the Jaro similarity is greater than the given value.
|
108
|
+
|
109
|
+
## Motivation
|
110
|
+
|
111
|
+
The main motivation was to have a central gem which can provide a variety of string metric calculations. Secondary motivation was to experiment with writing a native extension in Rust (instead of C).
|
112
|
+
|
113
|
+
## Development
|
114
|
+
|
115
|
+
### Getting started
|
116
|
+
|
117
|
+
```bash
|
118
|
+
gem install bundler
|
119
|
+
git clone https://github.com/anirbanmu/str_metrics.git
|
120
|
+
cd ./str_metrics
|
121
|
+
bundle install
|
122
|
+
```
|
123
|
+
|
124
|
+
### Building (for native component)
|
125
|
+
|
126
|
+
```bash
|
127
|
+
rake rust_build
|
128
|
+
```
|
129
|
+
|
130
|
+
### Testing (will build native component before running tests)
|
131
|
+
```bash
|
132
|
+
rake spec
|
133
|
+
```
|
134
|
+
|
135
|
+
### Local installation
|
136
|
+
```bash
|
137
|
+
rake install
|
138
|
+
```
|
139
|
+
|
140
|
+
### Deploying a new version
|
141
|
+
To deploy a new version of the gem to rubygems:
|
142
|
+
|
143
|
+
1. Bump version in [version.rb](lib/str_metrics/version.rb) according to [SemVer](https://semver.org/).
|
144
|
+
2. Get your code merged to master
|
145
|
+
3. After a `git pull` on master:
|
146
|
+
|
147
|
+
```bash
|
148
|
+
rake build && rake release
|
149
|
+
```
|
150
|
+
|
151
|
+
## Authors
|
152
|
+
- [Anirban Mukhopadhyay](https://github.com/anirbanmu)
|
153
|
+
|
154
|
+
See all repo contributors [here](https://github.com/anirbanmu/str_metrics/contributors).
|
155
|
+
|
156
|
+
## Versioning
|
157
|
+
|
158
|
+
[SemVer](https://semver.org/) is employed. See [tags](https://github.com/anirbanmu/str_metrics/tags) for released versions.
|
159
|
+
|
160
|
+
## Contributing
|
161
|
+
|
162
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/anirbanmu/str_metrics.
|
163
|
+
|
164
|
+
## Code of Conduct
|
165
|
+
|
166
|
+
Everyone interacting in this project's codebase, issue trackers, etc. is expected to follow the [code of conduct](CODE_OF_CONDUCT.md).
|
167
|
+
|
168
|
+
## License
|
169
|
+
|
170
|
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
data/extconf.rb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# The native library is built with the Rust toolchain, so stop right away
# when `rustc` cannot be located on the PATH.
rustc_location = `which rustc`
abort 'Rust compiler required (https://www.rust-lang.org/)' if rustc_location.empty?

# Minimal Makefile:
#   all     - release build of the crate, then move the shared library into
#             ./lib/str_metrics
#   clean   - intentionally a no-op
#   install - removes Cargo's ./target build directory
makefile_body = <<~MKCONTENT
  all:
  \tcargo rustc --release
  \tmv ./target/release/libstr_metrics.so ./lib/str_metrics
  clean:
  install:
  \trm -r target
MKCONTENT

File.open('Makefile', 'wb') { |makefile| makefile.puts(makefile_body) }
|
data/lib/str_metrics.rb
ADDED
@@ -0,0 +1,96 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'ffi'
|
4
|
+
require 'str_metrics/version'
|
5
|
+
|
6
|
+
# Namespace for gem
|
7
|
+
module StrMetrics
  # Bindings to the functions exported by the Rust shared library.
  # Internal only: the constant is hidden with private_constant below.
  module Native
    extend FFI::Library

    ffi_lib File.expand_path('./str_metrics/libstr_metrics.so', __dir__)

    # exported symbol => [argument types, return type]
    {
      sorensen_dice_coefficient: [%i[string string char], :double],
      levenshtein_distance: [%i[string string char], :int64],
      damerau_levenshtein_distance: [%i[string string char], :int64],
      jaro_similarity: [%i[string string char], :double],
      jaro_winkler_similarity: [%i[string string char int double double], :double],
      jaro_winkler_distance: [%i[string string char int double double], :double]
    }.each do |symbol, (argument_types, return_type)|
      attach_function symbol, argument_types, return_type
    end
  end

  private_constant :Native

  # Adds String#to_utf8 as a refinement so the helper does not leak outside
  # this module.
  refine String do
    # Returns the receiver unchanged when it is already UTF-8, otherwise a
    # copy transcoded to UTF-8.
    def to_utf8
      return self if encoding == Encoding::UTF_8

      encode('UTF-8')
    end
  end

  using self # activates refinement

  # Sorensen-Dice coefficient
  module SorensenDice
    class << self
      # nil arguments are passed through to the native function as-is.
      def coefficient(a, b, ignore_case: false)
        case_flag = ignore_case ? 1 : 0
        Native.sorensen_dice_coefficient(a&.to_utf8, b&.to_utf8, case_flag)
      end
    end
  end

  # Levenshtein distance
  module Levenshtein
    class << self
      # nil arguments are passed through to the native function as-is.
      def distance(a, b, ignore_case: false)
        case_flag = ignore_case ? 1 : 0
        Native.levenshtein_distance(a&.to_utf8, b&.to_utf8, case_flag)
      end
    end
  end

  # Damerau-Levenshtein distance
  module DamerauLevenshtein
    class << self
      # nil arguments are passed through to the native function as-is.
      def distance(a, b, ignore_case: false)
        case_flag = ignore_case ? 1 : 0
        Native.damerau_levenshtein_distance(a&.to_utf8, b&.to_utf8, case_flag)
      end
    end
  end

  # Jaro similarity
  module Jaro
    class << self
      # nil arguments are passed through to the native function as-is.
      def similarity(a, b, ignore_case: false)
        case_flag = ignore_case ? 1 : 0
        Native.jaro_similarity(a&.to_utf8, b&.to_utf8, case_flag)
      end
    end
  end

  # Jaro-Winkler similarity & distance
  module JaroWinkler
    class << self
      # The fourth native argument (prefix length) is always passed as 4.
      def similarity(a, b, ignore_case: false, prefix_scaling_factor: 0.1, prefix_scaling_bonus_threshold: 0.7)
        case_flag = ignore_case ? 1 : 0
        Native.jaro_winkler_similarity(
          a&.to_utf8, b&.to_utf8, case_flag, 4, prefix_scaling_factor, prefix_scaling_bonus_threshold
        )
      end

      # The fourth native argument (prefix length) is always passed as 4.
      def distance(a, b, ignore_case: false, prefix_scaling_factor: 0.1, prefix_scaling_bonus_threshold: 0.7)
        case_flag = ignore_case ? 1 : 0
        Native.jaro_winkler_distance(
          a&.to_utf8, b&.to_utf8, case_flag, 4, prefix_scaling_factor, prefix_scaling_bonus_threshold
        )
      end
    end
  end
end
|
data/src/lib.rs
ADDED
@@ -0,0 +1,167 @@
|
|
1
|
+
mod metrics;
|
2
|
+
|
3
|
+
use libc::{c_char, c_double};
|
4
|
+
use std::ffi::CStr;
|
5
|
+
|
6
|
+
/// Borrows the NUL-terminated C string behind `s` as a `CStr`.
///
/// The pointer is dereferenced without validation; the exported functions in
/// this file check for null before calling this helper.
fn cstr_from_raw(s: &*const c_char) -> &CStr {
    let raw: *const c_char = *s;
    // Relies on the caller: `raw` must be non-null and NUL-terminated.
    unsafe { CStr::from_ptr(raw) }
}
|
9
|
+
|
10
|
+
#[no_mangle]
|
11
|
+
pub extern "C" fn sorensen_dice_coefficient(
|
12
|
+
a: *const c_char,
|
13
|
+
b: *const c_char,
|
14
|
+
ignore_case: c_char,
|
15
|
+
) -> c_double {
|
16
|
+
if a.is_null() || b.is_null() {
|
17
|
+
return 0.0;
|
18
|
+
}
|
19
|
+
|
20
|
+
let a_c_str = cstr_from_raw(&a);
|
21
|
+
let b_c_str = cstr_from_raw(&b);
|
22
|
+
|
23
|
+
let a_str = match a_c_str.to_str() {
|
24
|
+
Err(_e) => return 0.0,
|
25
|
+
Ok(s) => s,
|
26
|
+
};
|
27
|
+
|
28
|
+
let b_str = match b_c_str.to_str() {
|
29
|
+
Err(_e) => return 0.0,
|
30
|
+
Ok(s) => s,
|
31
|
+
};
|
32
|
+
|
33
|
+
metrics::sorensen_dice::coefficient(a_str, b_str, ignore_case == 1)
|
34
|
+
}
|
35
|
+
|
36
|
+
#[no_mangle]
|
37
|
+
pub extern "C" fn jaro_similarity(
|
38
|
+
a: *const c_char,
|
39
|
+
b: *const c_char,
|
40
|
+
ignore_case: c_char,
|
41
|
+
) -> c_double {
|
42
|
+
if a.is_null() || b.is_null() {
|
43
|
+
return 0.0;
|
44
|
+
}
|
45
|
+
|
46
|
+
let a_c_str = cstr_from_raw(&a);
|
47
|
+
let b_c_str = cstr_from_raw(&b);
|
48
|
+
|
49
|
+
let a_str = match a_c_str.to_str() {
|
50
|
+
Err(_e) => return 0.0,
|
51
|
+
Ok(s) => s,
|
52
|
+
};
|
53
|
+
|
54
|
+
let b_str = match b_c_str.to_str() {
|
55
|
+
Err(_e) => return 0.0,
|
56
|
+
Ok(s) => s,
|
57
|
+
};
|
58
|
+
|
59
|
+
metrics::jaro::similarity(a_str, b_str, ignore_case == 1).value
|
60
|
+
}
|
61
|
+
|
62
|
+
#[no_mangle]
|
63
|
+
pub extern "C" fn jaro_winkler_similarity(
|
64
|
+
a: *const c_char,
|
65
|
+
b: *const c_char,
|
66
|
+
ignore_case: c_char,
|
67
|
+
prefix_length: u32,
|
68
|
+
prefix_scaling_factor: c_double,
|
69
|
+
prefix_scaling_bonus_threshold: c_double,
|
70
|
+
) -> c_double {
|
71
|
+
if a.is_null() || b.is_null() {
|
72
|
+
return 0.0;
|
73
|
+
}
|
74
|
+
|
75
|
+
let a_c_str = cstr_from_raw(&a);
|
76
|
+
let b_c_str = cstr_from_raw(&b);
|
77
|
+
|
78
|
+
let a_str = match a_c_str.to_str() {
|
79
|
+
Err(_e) => return 0.0,
|
80
|
+
Ok(s) => s,
|
81
|
+
};
|
82
|
+
|
83
|
+
let b_str = match b_c_str.to_str() {
|
84
|
+
Err(_e) => return 0.0,
|
85
|
+
Ok(s) => s,
|
86
|
+
};
|
87
|
+
|
88
|
+
metrics::jaro_winkler::similarity(
|
89
|
+
a_str,
|
90
|
+
b_str,
|
91
|
+
ignore_case == 1,
|
92
|
+
prefix_length,
|
93
|
+
prefix_scaling_factor,
|
94
|
+
prefix_scaling_bonus_threshold,
|
95
|
+
)
|
96
|
+
}
|
97
|
+
|
98
|
+
#[no_mangle]
|
99
|
+
pub extern "C" fn jaro_winkler_distance(
|
100
|
+
a: *const c_char,
|
101
|
+
b: *const c_char,
|
102
|
+
ignore_case: c_char,
|
103
|
+
prefix_length: u32,
|
104
|
+
prefix_scaling_factor: c_double,
|
105
|
+
prefix_scaling_bonus_threshold: c_double,
|
106
|
+
) -> c_double {
|
107
|
+
1.0 - jaro_winkler_similarity(
|
108
|
+
a,
|
109
|
+
b,
|
110
|
+
ignore_case,
|
111
|
+
prefix_length,
|
112
|
+
prefix_scaling_factor,
|
113
|
+
prefix_scaling_bonus_threshold,
|
114
|
+
)
|
115
|
+
}
|
116
|
+
|
117
|
+
#[no_mangle]
|
118
|
+
pub extern "C" fn levenshtein_distance(
|
119
|
+
a: *const c_char,
|
120
|
+
b: *const c_char,
|
121
|
+
ignore_case: c_char,
|
122
|
+
) -> i64 {
|
123
|
+
if a.is_null() || b.is_null() {
|
124
|
+
return std::i64::MAX;
|
125
|
+
}
|
126
|
+
|
127
|
+
let a_c_str = cstr_from_raw(&a);
|
128
|
+
let b_c_str = cstr_from_raw(&b);
|
129
|
+
|
130
|
+
let a_str = match a_c_str.to_str() {
|
131
|
+
Err(_e) => return std::i64::MAX,
|
132
|
+
Ok(s) => s,
|
133
|
+
};
|
134
|
+
|
135
|
+
let b_str = match b_c_str.to_str() {
|
136
|
+
Err(_e) => return std::i64::MAX,
|
137
|
+
Ok(s) => s,
|
138
|
+
};
|
139
|
+
|
140
|
+
metrics::levenshtein::distance(a_str, b_str, ignore_case == 1)
|
141
|
+
}
|
142
|
+
|
143
|
+
#[no_mangle]
|
144
|
+
pub extern "C" fn damerau_levenshtein_distance(
|
145
|
+
a: *const c_char,
|
146
|
+
b: *const c_char,
|
147
|
+
ignore_case: c_char,
|
148
|
+
) -> i64 {
|
149
|
+
if a.is_null() || b.is_null() {
|
150
|
+
return std::i64::MAX;
|
151
|
+
}
|
152
|
+
|
153
|
+
let a_c_str = cstr_from_raw(&a);
|
154
|
+
let b_c_str = cstr_from_raw(&b);
|
155
|
+
|
156
|
+
let a_str = match a_c_str.to_str() {
|
157
|
+
Err(_e) => return std::i64::MAX,
|
158
|
+
Ok(s) => s,
|
159
|
+
};
|
160
|
+
|
161
|
+
let b_str = match b_c_str.to_str() {
|
162
|
+
Err(_e) => return std::i64::MAX,
|
163
|
+
Ok(s) => s,
|
164
|
+
};
|
165
|
+
|
166
|
+
metrics::damerau_levenshtein::distance(a_str, b_str, ignore_case == 1)
|
167
|
+
}
|
data/src/metrics/damerau_levenshtein.rs
ADDED
@@ -0,0 +1,71 @@
|
|
1
|
+
use crate::metrics::utils::graphemes;
|
2
|
+
use crate::metrics::utils::Array2D;
|
3
|
+
|
4
|
+
use std::cmp;
|
5
|
+
use std::collections::HashMap;
|
6
|
+
use std::hash::Hash;
|
7
|
+
|
8
|
+
pub fn distance(a: &str, b: &str, ignore_case: bool) -> i64 {
|
9
|
+
if ignore_case {
|
10
|
+
return distance_impl(&graphemes(&a.to_lowercase()), &graphemes(&b.to_lowercase()));
|
11
|
+
}
|
12
|
+
|
13
|
+
distance_impl(&graphemes(a), &graphemes(b))
|
14
|
+
}
|
15
|
+
|
16
|
+
// From https://en.wikipedia.org/wiki/Damerau-Levenshtein_distance
|
17
|
+
fn distance_impl<T: Hash + Eq>(a: &[T], b: &[T]) -> i64 {
|
18
|
+
let lens = [a.len(), b.len()];
|
19
|
+
if lens[0] == 0 {
|
20
|
+
return lens[1] as i64;
|
21
|
+
}
|
22
|
+
if lens[1] == 0 {
|
23
|
+
return lens[0] as i64;
|
24
|
+
}
|
25
|
+
|
26
|
+
let rows = lens[0] + 2;
|
27
|
+
let columns = lens[1] + 2;
|
28
|
+
|
29
|
+
let max_dist = (lens[0] + lens[1]) as i64;
|
30
|
+
let mut dist_matrix = Array2D::new(rows, columns);
|
31
|
+
|
32
|
+
dist_matrix[(0, 0)] = max_dist;
|
33
|
+
for i in 1..rows {
|
34
|
+
dist_matrix[(i, 0)] = max_dist;
|
35
|
+
dist_matrix[(i, 1)] = (i - 1) as i64;
|
36
|
+
}
|
37
|
+
for j in 1..columns {
|
38
|
+
dist_matrix[(0, j)] = max_dist;
|
39
|
+
dist_matrix[(1, j)] = (j - 1) as i64;
|
40
|
+
}
|
41
|
+
|
42
|
+
let mut da: HashMap<&T, usize> = HashMap::new();
|
43
|
+
|
44
|
+
for i in 1..=lens[0] {
|
45
|
+
let mut db = 0;
|
46
|
+
for j in 1..=lens[1] {
|
47
|
+
let k = da.entry(&b[j - 1]).or_insert(0);
|
48
|
+
let l = db;
|
49
|
+
let cost = if a[i - 1] == b[j - 1] {
|
50
|
+
db = j;
|
51
|
+
0
|
52
|
+
} else {
|
53
|
+
1
|
54
|
+
};
|
55
|
+
|
56
|
+
dist_matrix[(i + 1, j + 1)] = cmp::min(
|
57
|
+
dist_matrix[(i, j)] + cost,
|
58
|
+
cmp::min(
|
59
|
+
dist_matrix[(i + 1, j)] + 1,
|
60
|
+
cmp::min(
|
61
|
+
dist_matrix[(i, j + 1)] + 1,
|
62
|
+
dist_matrix[(*k, l)] + (i - *k - 1) as i64 + 1 + (j - l - 1) as i64,
|
63
|
+
),
|
64
|
+
),
|
65
|
+
);
|
66
|
+
}
|
67
|
+
da.insert(&a[i - 1], i);
|
68
|
+
}
|
69
|
+
|
70
|
+
dist_matrix[(lens[0] + 1, lens[1] + 1)]
|
71
|
+
}
|
data/src/metrics/jaro.rs
ADDED
@@ -0,0 +1,93 @@
|
|
1
|
+
use crate::metrics::utils::graphemes;
|
2
|
+
use std::cmp;
|
3
|
+
|
4
|
+
/// Result of a Jaro comparison: the score plus prefix information that the
/// Jaro–Winkler adjustment reads.
pub struct JaroSimilarityResult {
|
5
|
+
/// Jaro similarity score; 0.0 when the inputs share no matches.
pub value: f64,
|
6
|
+
/// Number of leading positions at which both inputs hold equal items
/// (0 when there are no matches at all).
pub max_prefix_length: i64,
|
7
|
+
}
|
8
|
+
|
9
|
+
pub fn similarity(a: &str, b: &str, ignore_case: bool) -> JaroSimilarityResult {
|
10
|
+
if ignore_case {
|
11
|
+
return similarity_impl(&graphemes(&a.to_lowercase()), &graphemes(&b.to_lowercase()));
|
12
|
+
}
|
13
|
+
|
14
|
+
similarity_impl(&graphemes(a), &graphemes(b))
|
15
|
+
}
|
16
|
+
|
17
|
+
fn similarity_impl<T: Eq>(a: &[T], b: &[T]) -> JaroSimilarityResult {
|
18
|
+
let mut graphemes = [a, b];
|
19
|
+
if graphemes[0].len() > graphemes[1].len() {
|
20
|
+
graphemes.swap(0, 1);
|
21
|
+
}
|
22
|
+
|
23
|
+
// let grapheme_iterators = [UnicodeSegmentation::graphemes(&case_handled[0][..], true), UnicodeSegmentation::graphemes(&case_handled[1][..], true)];
|
24
|
+
let lens = [graphemes[0].len(), graphemes[1].len()];
|
25
|
+
|
26
|
+
let max_length = cmp::max(lens[0], lens[1]);
|
27
|
+
let matching_dist = if max_length < 2 {
|
28
|
+
0
|
29
|
+
} else {
|
30
|
+
(max_length / 2) - 1
|
31
|
+
};
|
32
|
+
|
33
|
+
let mut matching_indices = [
|
34
|
+
Vec::with_capacity(max_length),
|
35
|
+
Vec::with_capacity(max_length),
|
36
|
+
];
|
37
|
+
|
38
|
+
// Find matches
|
39
|
+
let mut last_matched_prefix_index = -1;
|
40
|
+
{
|
41
|
+
let mut b_matched = vec![false; lens[1]];
|
42
|
+
for (i, grapheme) in graphemes[0].iter().enumerate() {
|
43
|
+
let start = cmp::max(0 as i64, i as i64 - matching_dist as i64) as usize;
|
44
|
+
let end = cmp::min(lens[1], i + matching_dist + 1);
|
45
|
+
|
46
|
+
// Keep track of prefix match
|
47
|
+
// Safe to access i in b since a.len < b.len
|
48
|
+
if grapheme == &graphemes[1][i]
|
49
|
+
&& ((last_matched_prefix_index == -1 && i == 0)
|
50
|
+
|| last_matched_prefix_index == i as i64 - 1)
|
51
|
+
{
|
52
|
+
last_matched_prefix_index = i as i64;
|
53
|
+
}
|
54
|
+
|
55
|
+
for (j, matched) in b_matched.iter_mut().enumerate().take(end).skip(start) {
|
56
|
+
if grapheme == &graphemes[1][j] && !*matched {
|
57
|
+
*matched = true;
|
58
|
+
matching_indices[0].push(i);
|
59
|
+
matching_indices[1].push(j);
|
60
|
+
break;
|
61
|
+
}
|
62
|
+
}
|
63
|
+
}
|
64
|
+
}
|
65
|
+
|
66
|
+
let matches = matching_indices[0].len();
|
67
|
+
if matches == 0 {
|
68
|
+
return JaroSimilarityResult {
|
69
|
+
value: 0.0,
|
70
|
+
max_prefix_length: 0,
|
71
|
+
};
|
72
|
+
}
|
73
|
+
|
74
|
+
// Find transpositions / 2 in matches
|
75
|
+
matching_indices[1].sort_unstable();
|
76
|
+
let transpositions = matching_indices[0]
|
77
|
+
.iter()
|
78
|
+
.zip(matching_indices[1].iter())
|
79
|
+
.fold(0.0, |acc, (i, j)| {
|
80
|
+
if graphemes[0][*i] == graphemes[1][*j] {
|
81
|
+
acc
|
82
|
+
} else {
|
83
|
+
acc + 0.5
|
84
|
+
}
|
85
|
+
});
|
86
|
+
|
87
|
+
let m = matches as f64;
|
88
|
+
let t = transpositions;
|
89
|
+
JaroSimilarityResult {
|
90
|
+
value: ((m / lens[0] as f64) + (m / lens[1] as f64) + ((m - t) / m)) / 3.0,
|
91
|
+
max_prefix_length: last_matched_prefix_index + 1,
|
92
|
+
}
|
93
|
+
}
|
data/src/metrics/jaro_winkler.rs
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
use crate::metrics::jaro;
|
2
|
+
use std::cmp;
|
3
|
+
|
4
|
+
pub fn similarity(
|
5
|
+
a: &str,
|
6
|
+
b: &str,
|
7
|
+
ignore_case: bool,
|
8
|
+
prefix_length: u32,
|
9
|
+
prefix_scaling_factor: f64,
|
10
|
+
prefix_scaling_bonus_threshold: f64,
|
11
|
+
) -> f64 {
|
12
|
+
let jaro_similarity = jaro::similarity(a, b, ignore_case);
|
13
|
+
let common_prefix_len = cmp::min(prefix_length as i64, jaro_similarity.max_prefix_length);
|
14
|
+
|
15
|
+
if jaro_similarity.value > prefix_scaling_bonus_threshold {
|
16
|
+
return jaro_similarity.value
|
17
|
+
+ common_prefix_len as f64 * prefix_scaling_factor * (1.0 - jaro_similarity.value);
|
18
|
+
}
|
19
|
+
jaro_similarity.value
|
20
|
+
}
|
data/src/metrics/levenshtein.rs
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
use crate::metrics::utils::graphemes;
|
2
|
+
use crate::metrics::utils::Array2D;
|
3
|
+
use std::cmp;
|
4
|
+
|
5
|
+
pub fn distance(a: &str, b: &str, ignore_case: bool) -> i64 {
|
6
|
+
if ignore_case {
|
7
|
+
return distance_impl(&graphemes(&a.to_lowercase()), &graphemes(&b.to_lowercase()));
|
8
|
+
}
|
9
|
+
|
10
|
+
distance_impl(&graphemes(a), &graphemes(b))
|
11
|
+
}
|
12
|
+
|
13
|
+
fn distance_impl<T: Eq>(a: &[T], b: &[T]) -> i64 {
|
14
|
+
let lens = [a.len(), b.len()];
|
15
|
+
if lens[0] == 0 {
|
16
|
+
return lens[1] as i64;
|
17
|
+
}
|
18
|
+
if lens[1] == 0 {
|
19
|
+
return lens[0] as i64;
|
20
|
+
}
|
21
|
+
|
22
|
+
let rows = lens[0] + 1;
|
23
|
+
let columns = lens[1] + 1;
|
24
|
+
|
25
|
+
let mut dist_matrix = Array2D::new(rows, columns);
|
26
|
+
|
27
|
+
for i in 0..rows {
|
28
|
+
dist_matrix[(i, 0)] = i as i64;
|
29
|
+
}
|
30
|
+
for j in 0..columns {
|
31
|
+
dist_matrix[(0, j)] = j as i64;
|
32
|
+
}
|
33
|
+
|
34
|
+
for i in 1..rows {
|
35
|
+
for j in 1..columns {
|
36
|
+
let cost = if a[i - 1] == b[j - 1] { 0 } else { 1 };
|
37
|
+
|
38
|
+
dist_matrix[(i, j)] = cmp::min(
|
39
|
+
cmp::min(dist_matrix[(i - 1, j)] + 1, dist_matrix[(i, j - 1)] + 1),
|
40
|
+
dist_matrix[(i - 1, j - 1)] + cost,
|
41
|
+
);
|
42
|
+
}
|
43
|
+
}
|
44
|
+
|
45
|
+
dist_matrix[(rows - 1, columns - 1)]
|
46
|
+
}
|
data/src/metrics/sorensen_dice.rs
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
use crate::metrics::utils::generate_bigrams;
|
2
|
+
use std::collections::hash_map::Entry::{Occupied, Vacant};
|
3
|
+
use std::collections::HashMap;
|
4
|
+
|
5
|
+
pub fn coefficient(a: &str, b: &str, ignore_case: bool) -> f64 {
|
6
|
+
if ignore_case {
|
7
|
+
return coefficient_impl(&a.to_lowercase(), &b.to_lowercase());
|
8
|
+
}
|
9
|
+
coefficient_impl(a, b)
|
10
|
+
}
|
11
|
+
|
12
|
+
fn coefficient_impl(a: &str, b: &str) -> f64 {
|
13
|
+
let a_bigrams = generate_bigrams(&a);
|
14
|
+
let mut b_bigrams_hash: HashMap<&str, i64> = HashMap::new();
|
15
|
+
|
16
|
+
let mut total_bigrams = a_bigrams.len();
|
17
|
+
|
18
|
+
{
|
19
|
+
let b_bigrams = generate_bigrams(&b);
|
20
|
+
for s in &b_bigrams {
|
21
|
+
let counter = b_bigrams_hash.entry(s).or_insert(0);
|
22
|
+
*counter += 1;
|
23
|
+
}
|
24
|
+
|
25
|
+
total_bigrams += b_bigrams.len();
|
26
|
+
}
|
27
|
+
|
28
|
+
if total_bigrams == 0 {
|
29
|
+
return 0.0;
|
30
|
+
}
|
31
|
+
|
32
|
+
let mut intersections = 0;
|
33
|
+
for bigram in &a_bigrams {
|
34
|
+
match b_bigrams_hash.entry(bigram) {
|
35
|
+
Vacant(_) => {}
|
36
|
+
Occupied(entry) => {
|
37
|
+
let counter = entry.get();
|
38
|
+
if counter > &0 {
|
39
|
+
*entry.into_mut() = entry.get() - 1;
|
40
|
+
intersections += 1;
|
41
|
+
}
|
42
|
+
}
|
43
|
+
}
|
44
|
+
}
|
45
|
+
|
46
|
+
2.0 * intersections as f64 / total_bigrams as f64
|
47
|
+
}
|
data/src/metrics/utils/array_2d.rs
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
use std::ops::Index;
|
2
|
+
use std::ops::IndexMut;
|
3
|
+
|
4
|
+
/// Two-dimensional array stored row by row in one flat `Vec`, indexed with
/// `(row, column)` tuples.
pub struct Array2D<T> {
    arr: Vec<T>,
    cols: usize,
}

impl<T: Clone + Default> Array2D<T> {
    /// Creates a `rows` x `columns` array with every cell set to `T::default()`.
    pub fn new(rows: usize, columns: usize) -> Array2D<T> {
        let cell_count = rows * columns;
        Array2D {
            arr: vec![T::default(); cell_count],
            cols: columns,
        }
    }
}

impl<T> Array2D<T> {
    /// Position of cell `(y, x)` in the flat storage. `x` is not checked
    /// against the column count.
    fn flat_index(&self, y: usize, x: usize) -> usize {
        y * self.cols + x
    }
}

impl<T> Index<(usize, usize)> for Array2D<T> {
    type Output = T;

    fn index(&self, (y, x): (usize, usize)) -> &Self::Output {
        let at = self.flat_index(y, x);
        &self.arr[at]
    }
}

impl<T> IndexMut<(usize, usize)> for Array2D<T> {
    fn index_mut(&mut self, (y, x): (usize, usize)) -> &mut Self::Output {
        let at = self.flat_index(y, x);
        &mut self.arr[at]
    }
}
|
data/src/metrics/utils/mod.rs
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
extern crate itertools;
|
2
|
+
extern crate unicode_segmentation;
|
3
|
+
|
4
|
+
use itertools::Itertools;
|
5
|
+
use unicode_segmentation::UnicodeSegmentation;
|
6
|
+
|
7
|
+
mod array_2d;
|
8
|
+
pub use array_2d::Array2D;
|
9
|
+
|
10
|
+
pub fn graphemes(s: &str) -> Vec<&str> {
|
11
|
+
UnicodeSegmentation::graphemes(s, true).collect::<Vec<&str>>()
|
12
|
+
}
|
13
|
+
|
14
|
+
pub fn generate_bigrams(s: &str) -> Vec<&str> {
|
15
|
+
UnicodeSegmentation::grapheme_indices(s, true)
|
16
|
+
.tuple_windows()
|
17
|
+
.map(|(a, b)| &s[a.0..b.0 + b.1.len()])
|
18
|
+
.collect::<Vec<&str>>()
|
19
|
+
}
|
metadata
ADDED
@@ -0,0 +1,167 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: str_metrics
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Anirban Mukhopadhyay
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2020-03-13 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: ffi
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: pry
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rake
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '12.0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '12.0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: rspec
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: rubocop
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: rubocop-performance
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - ">="
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: rubocop-rspec
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ">="
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
type: :development
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ">="
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0'
|
111
|
+
description: 'Ruby gem (native extension in Rust) providing implementations of various
|
112
|
+
string metrics. Current metrics supported are: Sørensen–Dice, Levenshtein, Damerau–Levenshtein,
|
113
|
+
Jaro & Jaro–Winkler. Strings that are UTF-8 encodable (convertible to UTF-8 representation)
|
114
|
+
are supported. All comparison of strings is done at the grapheme cluster level as
|
115
|
+
described by [Unicode Standard Annex #29](https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries);
|
116
|
+
this may be different from many gems that calculate string metrics.'
|
117
|
+
email:
|
118
|
+
- anirban.mukhop@gmail.com
|
119
|
+
executables: []
|
120
|
+
extensions:
|
121
|
+
- extconf.rb
|
122
|
+
extra_rdoc_files: []
|
123
|
+
files:
|
124
|
+
- Cargo.toml
|
125
|
+
- LICENSE
|
126
|
+
- README.md
|
127
|
+
- extconf.rb
|
128
|
+
- lib/str_metrics.rb
|
129
|
+
- lib/str_metrics/version.rb
|
130
|
+
- src/lib.rs
|
131
|
+
- src/metrics/damerau_levenshtein.rs
|
132
|
+
- src/metrics/jaro.rs
|
133
|
+
- src/metrics/jaro_winkler.rs
|
134
|
+
- src/metrics/levenshtein.rs
|
135
|
+
- src/metrics/mod.rs
|
136
|
+
- src/metrics/sorensen_dice.rs
|
137
|
+
- src/metrics/utils/array_2d.rs
|
138
|
+
- src/metrics/utils/mod.rs
|
139
|
+
homepage: https://github.com/anirbanmu/str_metrics
|
140
|
+
licenses:
|
141
|
+
- MIT
|
142
|
+
metadata:
|
143
|
+
allowed_push_host: https://rubygems.org
|
144
|
+
homepage_uri: https://github.com/anirbanmu/str_metrics
|
145
|
+
bug_tracker_uri: https://github.com/anirbanmu/str_metrics/issues
|
146
|
+
source_code_uri: https://github.com/anirbanmu/str_metrics
|
147
|
+
changelog_uri: https://github.com/anirbanmu/str_metrics/blob/v0.1.0/CHANGELOG.md
|
148
|
+
post_install_message:
|
149
|
+
rdoc_options: []
|
150
|
+
require_paths:
|
151
|
+
- lib
|
152
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
153
|
+
requirements:
|
154
|
+
- - ">="
|
155
|
+
- !ruby/object:Gem::Version
|
156
|
+
version: 2.3.0
|
157
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
158
|
+
requirements:
|
159
|
+
- - ">="
|
160
|
+
- !ruby/object:Gem::Version
|
161
|
+
version: '0'
|
162
|
+
requirements: []
|
163
|
+
rubygems_version: 3.0.6
|
164
|
+
signing_key:
|
165
|
+
specification_version: 4
|
166
|
+
summary: Ruby gem providing native implementations of various string metrics
|
167
|
+
test_files: []
|