str_metrics 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/Cargo.toml +15 -0
- data/LICENSE +21 -0
- data/README.md +170 -0
- data/extconf.rb +14 -0
- data/lib/str_metrics/version.rb +5 -0
- data/lib/str_metrics.rb +96 -0
- data/src/lib.rs +167 -0
- data/src/metrics/damerau_levenshtein.rs +71 -0
- data/src/metrics/jaro.rs +93 -0
- data/src/metrics/jaro_winkler.rs +20 -0
- data/src/metrics/levenshtein.rs +46 -0
- data/src/metrics/mod.rs +6 -0
- data/src/metrics/sorensen_dice.rs +47 -0
- data/src/metrics/utils/array_2d.rs +30 -0
- data/src/metrics/utils/mod.rs +19 -0
- metadata +167 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: f773ff65c7ef2fb3bc5f396cf10e13e7d98d3037981de45a846f8b9fc6328caa
|
4
|
+
data.tar.gz: c6b3838b89a063c4e57cf70e06a7e9226bee9ecb84a99d0f2773775dbb56cc6e
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 22df7813421a8850d277065d8595aadf5a30c0fc96ebfb90bfaac05e3906030c5874dd4a8da38e1a0af67277f22ca2c46940515d509ed6bd63c0d332e6620b3b
|
7
|
+
data.tar.gz: 7addd087a16eb72ed3428a9b90ce51c0e0ed5865bc2d8aebe7b899f2adc5fdbbcad15c73b60af3701213129e9f7fed6e69a834637f7e32c14f478f8caaa7e589
|
data/Cargo.toml
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
[package]
|
2
|
+
name = "str_metrics"
|
3
|
+
version = "0.1.0"
|
4
|
+
authors = ["Anirban Mukhopadhyay <anirban.mukhop@gmail.com>"]
|
5
|
+
edition = "2018"
|
6
|
+
|
7
|
+
[lib]
|
8
|
+
crate-type = ["cdylib"]
|
9
|
+
|
10
|
+
[dependencies]
|
11
|
+
unicode-segmentation = "^1.6"
|
12
|
+
libc = "^0.2"
|
13
|
+
itertools = "^0.8"
|
14
|
+
|
15
|
+
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
data/LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2020 Anirban Mukhopadhyay
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,170 @@
|
|
1
|
+
# StrMetrics
|
2
|
+
|
3
|
+
[![checks](https://github.com/anirbanmu/str_metrics/workflows/checks/badge.svg)](https://github.com/anirbanmu/str_metrics/actions?query=workflow%3Achecks)
|
4
|
+
|
5
|
+
Ruby gem (native extension in Rust) providing implementations of various string metrics. Current metrics supported are: Sørensen–Dice, Levenshtein, Damerau–Levenshtein, Jaro & Jaro–Winkler. Strings that are UTF-8 encodable (convertible to UTF-8 representation) are supported. All comparison of strings is done at the grapheme cluster level as described by [Unicode Standard Annex #29](https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries); this may be different from many gems that calculate string metrics.
|
6
|
+
|
7
|
+
## Getting Started
|
8
|
+
### Prerequisites
|
9
|
+
|
10
|
+
Install Rust (tested with version `>= 1.38.0`) with:
|
11
|
+
|
12
|
+
```sh
|
13
|
+
curl https://sh.rustup.rs -sSf | sh
|
14
|
+
```
|
15
|
+
|
16
|
+
### Installation
|
17
|
+
|
18
|
+
#### With [`bundler`](https://bundler.io/)
|
19
|
+
|
20
|
+
Add this line to your application's Gemfile:
|
21
|
+
|
22
|
+
```ruby
|
23
|
+
gem 'str_metrics'
|
24
|
+
```
|
25
|
+
|
26
|
+
And then execute:
|
27
|
+
|
28
|
+
$ bundle install
|
29
|
+
|
30
|
+
#### Without `bundler`
|
31
|
+
|
32
|
+
$ gem install str_metrics
|
33
|
+
|
34
|
+
## Usage
|
35
|
+
|
36
|
+
All you need to do to use the metrics provided in this gem is to make sure `str_metrics` is required like:
|
37
|
+
|
38
|
+
```ruby
|
39
|
+
require 'str_metrics'
|
40
|
+
```
|
41
|
+
|
42
|
+
Each metric is shown below with an example & meanings of optional parameters.
|
43
|
+
|
44
|
+
### Sørensen–Dice
|
45
|
+
|
46
|
+
```ruby
|
47
|
+
StrMetrics::SorensenDice.coefficient('abc', 'bcd', ignore_case: false)
|
48
|
+
=> 0.5
|
49
|
+
```
|
50
|
+
Options:
|
51
|
+
|
52
|
+
Keyword | Type | Default | Description
|
53
|
+
--- | --- | --- | ---
|
54
|
+
`ignore_case` | boolean | `false` | Case insensitive comparison?
|
55
|
+
|
56
|
+
### Levenshtein
|
57
|
+
|
58
|
+
```ruby
|
59
|
+
StrMetrics::Levenshtein.distance('abc', 'acb', ignore_case: false)
|
60
|
+
=> 2
|
61
|
+
```
|
62
|
+
Options:
|
63
|
+
|
64
|
+
Keyword | Type | Default | Description
|
65
|
+
--- | --- | --- | ---
|
66
|
+
`ignore_case` | boolean | `false` | Case insensitive comparison?
|
67
|
+
|
68
|
+
### Damerau–Levenshtein
|
69
|
+
|
70
|
+
```ruby
|
71
|
+
StrMetrics::DamerauLevenshtein.distance('abc', 'acb', ignore_case: false)
|
72
|
+
=> 1
|
73
|
+
```
|
74
|
+
Options:
|
75
|
+
|
76
|
+
Keyword | Type | Default | Description
|
77
|
+
--- | --- | --- | ---
|
78
|
+
`ignore_case` | boolean | `false` | Case insensitive comparison?
|
79
|
+
|
80
|
+
### Jaro
|
81
|
+
|
82
|
+
```ruby
|
83
|
+
StrMetrics::Jaro.similarity('abc', 'aac', ignore_case: false)
|
84
|
+
=> 0.7777777777777777
|
85
|
+
```
|
86
|
+
Options:
|
87
|
+
|
88
|
+
Keyword | Type | Default | Description
|
89
|
+
--- | --- | --- | ---
|
90
|
+
`ignore_case` | boolean | `false` | Case insensitive comparison?
|
91
|
+
|
92
|
+
### Jaro–Winkler
|
93
|
+
|
94
|
+
```ruby
|
95
|
+
StrMetrics::JaroWinkler.similarity('abc', 'aac', ignore_case: false, prefix_scaling_factor: 0.1, prefix_scaling_bonus_threshold: 0.7)
|
96
|
+
=> 0.7999999999999999
|
97
|
+
|
98
|
+
StrMetrics::JaroWinkler.distance('abc', 'aac', ignore_case: false, prefix_scaling_factor: 0.1, prefix_scaling_bonus_threshold: 0.7)
|
99
|
+
=> 0.20000000000000007
|
100
|
+
```
|
101
|
+
Options:
|
102
|
+
|
103
|
+
Keyword | Type | Default | Description
|
104
|
+
--- | --- | --- | ---
|
105
|
+
`ignore_case` | boolean | `false` | Case insensitive comparison?
|
106
|
+
`prefix_scaling_factor` | decimal | `0.1` | Constant scaling factor for how much to weight common prefixes. Should not exceed 0.25.
|
107
|
+
`prefix_scaling_bonus_threshold` | decimal | `0.7` | Prefix bonus weighting will only be applied if the Jaro similarity is greater than the given value.
|
108
|
+
|
109
|
+
## Motivation
|
110
|
+
|
111
|
+
The main motivation was to have a central gem which can provide a variety of string metric calculations. Secondary motivation was to experiment with writing a native extension in Rust (instead of C).
|
112
|
+
|
113
|
+
## Development
|
114
|
+
|
115
|
+
### Getting started
|
116
|
+
|
117
|
+
```bash
|
118
|
+
gem install bundler
|
119
|
+
git clone https://github.com/anirbanmu/str_metrics.git
|
120
|
+
cd ./str_metrics
|
121
|
+
bundle install
|
122
|
+
```
|
123
|
+
|
124
|
+
### Building (for native component)
|
125
|
+
|
126
|
+
```bash
|
127
|
+
rake rust_build
|
128
|
+
```
|
129
|
+
|
130
|
+
### Testing (will build native component before running tests)
|
131
|
+
```bash
|
132
|
+
rake spec
|
133
|
+
```
|
134
|
+
|
135
|
+
### Local installation
|
136
|
+
```bash
|
137
|
+
rake install
|
138
|
+
```
|
139
|
+
|
140
|
+
### Deploying a new version
|
141
|
+
To deploy a new version of the gem to rubygems:
|
142
|
+
|
143
|
+
1. Bump version in [version.rb](lib/str_metrics/version.rb) according to [SemVer](https://semver.org/).
|
144
|
+
2. Get your code merged to master
|
145
|
+
3. After a `git pull` on master:
|
146
|
+
|
147
|
+
```bash
|
148
|
+
rake build && rake release
|
149
|
+
```
|
150
|
+
|
151
|
+
## Authors
|
152
|
+
- [Anirban Mukhopadhyay](https://github.com/anirbanmu)
|
153
|
+
|
154
|
+
See all repo contributors [here](https://github.com/anirbanmu/str_metrics/contributors).
|
155
|
+
|
156
|
+
## Versioning
|
157
|
+
|
158
|
+
[SemVer](https://semver.org/) is employed. See [tags](https://github.com/anirbanmu/str_metrics/tags) for released versions.
|
159
|
+
|
160
|
+
## Contributing
|
161
|
+
|
162
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/anirbanmu/str_metrics.
|
163
|
+
|
164
|
+
## Code of Conduct
|
165
|
+
|
166
|
+
Everyone interacting in this project's codebase, issue trackers, etc. is expected to follow the [code of conduct](CODE_OF_CONDUCT.md).
|
167
|
+
|
168
|
+
## License
|
169
|
+
|
170
|
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
data/extconf.rb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Writes the Makefile that RubyGems runs to build the Rust shared library with
# Cargo and place it under lib/str_metrics.
abort 'Rust compiler required (https://www.rust-lang.org/)' if `which rustc`.empty?

require 'rbconfig'

# Cargo names a `cdylib` artifact after the host platform: `.dylib` on macOS,
# `.so` on other Unix-like systems. The Ruby side always loads
# `lib/str_metrics/libstr_metrics.so`, so the artifact is moved to that fixed
# name regardless of the extension Cargo produced.
cargo_artifact = RbConfig::CONFIG['host_os'] =~ /darwin/ ? 'libstr_metrics.dylib' : 'libstr_metrics.so'

File.open('Makefile', 'wb') do |f|
  f.puts(<<~MKCONTENT)
    all:
    \tcargo rustc --release
    \tmv ./target/release/#{cargo_artifact} ./lib/str_metrics/libstr_metrics.so
    clean:
    install:
    \trm -rf target
  MKCONTENT
end
|
data/lib/str_metrics.rb
ADDED
@@ -0,0 +1,96 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'ffi'
|
4
|
+
require 'str_metrics/version'
|
5
|
+
|
6
|
+
# Top-level namespace of the gem; each supported metric lives in its own nested module.
module StrMetrics
  # FFI bindings to the Rust shared library. Internal only (hidden below via
  # private_constant).
  module Native
    extend FFI::Library

    ffi_lib File.expand_path('./str_metrics/libstr_metrics.so', __dir__)

    # Argument lists shared by the native entry points: two C strings plus a
    # one-byte case flag, optionally followed by the Jaro-Winkler tuning values
    # (prefix length, scaling factor, bonus threshold).
    basic_args = %i[string string char]
    jaro_winkler_args = %i[string string char int double double]

    attach_function :sorensen_dice_coefficient, basic_args, :double
    attach_function :levenshtein_distance, basic_args, :int64
    attach_function :damerau_levenshtein_distance, basic_args, :int64
    attach_function :jaro_similarity, basic_args, :double
    attach_function :jaro_winkler_similarity, jaro_winkler_args, :double
    attach_function :jaro_winkler_distance, jaro_winkler_args, :double
  end

  private_constant :Native

  refine String do
    # Returns the receiver itself when it is already UTF-8, otherwise the
    # result of transcoding it to UTF-8.
    def to_utf8
      return self if encoding == Encoding::UTF_8

      encode('UTF-8')
    end
  end

  using self # activates the String#to_utf8 refinement for the code below

  # Sorensen-Dice coefficient
  module SorensenDice
    # Coefficient for strings +a+ and +b+; +ignore_case+ requests a
    # case-insensitive comparison.
    def self.coefficient(a, b, ignore_case: false)
      case_flag = ignore_case ? 1 : 0
      Native.sorensen_dice_coefficient(a&.to_utf8, b&.to_utf8, case_flag)
    end
  end

  # Levenshtein distance
  module Levenshtein
    # Distance between +a+ and +b+; +ignore_case+ requests a case-insensitive
    # comparison.
    def self.distance(a, b, ignore_case: false)
      case_flag = ignore_case ? 1 : 0
      Native.levenshtein_distance(a&.to_utf8, b&.to_utf8, case_flag)
    end
  end

  # Damerau-Levenshtein distance
  module DamerauLevenshtein
    # Distance between +a+ and +b+; +ignore_case+ requests a case-insensitive
    # comparison.
    def self.distance(a, b, ignore_case: false)
      case_flag = ignore_case ? 1 : 0
      Native.damerau_levenshtein_distance(a&.to_utf8, b&.to_utf8, case_flag)
    end
  end

  # Jaro similarity
  module Jaro
    # Similarity of +a+ and +b+; +ignore_case+ requests a case-insensitive
    # comparison.
    def self.similarity(a, b, ignore_case: false)
      case_flag = ignore_case ? 1 : 0
      Native.jaro_similarity(a&.to_utf8, b&.to_utf8, case_flag)
    end
  end

  # Jaro-Winkler similarity and distance
  module JaroWinkler
    # Similarity of +a+ and +b+. The native call always receives a prefix
    # length of 4; the scaling factor and bonus threshold are forwarded as given.
    def self.similarity(a, b, ignore_case: false, prefix_scaling_factor: 0.1, prefix_scaling_bonus_threshold: 0.7)
      case_flag = ignore_case ? 1 : 0
      Native.jaro_winkler_similarity(
        a&.to_utf8, b&.to_utf8, case_flag, 4, prefix_scaling_factor, prefix_scaling_bonus_threshold
      )
    end

    # Distance between +a+ and +b+, taking the same options as .similarity and
    # likewise passing a fixed prefix length of 4 to the native call.
    def self.distance(a, b, ignore_case: false, prefix_scaling_factor: 0.1, prefix_scaling_bonus_threshold: 0.7)
      case_flag = ignore_case ? 1 : 0
      Native.jaro_winkler_distance(
        a&.to_utf8, b&.to_utf8, case_flag, 4, prefix_scaling_factor, prefix_scaling_bonus_threshold
      )
    end
  end
end
|
data/src/lib.rs
ADDED
@@ -0,0 +1,167 @@
|
|
1
|
+
mod metrics;
|
2
|
+
|
3
|
+
use libc::{c_char, c_double};
|
4
|
+
use std::ffi::CStr;
|
5
|
+
|
6
|
+
/// Borrows the C string behind the raw pointer `s` as a `CStr`.
///
/// The returned reference is tied to the borrow of the pointer variable.
/// No validation happens here: callers in this file check for null before
/// calling, and the pointer must reference a NUL-terminated string.
fn cstr_from_raw<'p>(s: &'p *const c_char) -> &'p CStr {
    let raw: *const c_char = *s;
    unsafe { CStr::from_ptr(raw) }
}
|
9
|
+
|
10
|
+
#[no_mangle]
|
11
|
+
pub extern "C" fn sorensen_dice_coefficient(
|
12
|
+
a: *const c_char,
|
13
|
+
b: *const c_char,
|
14
|
+
ignore_case: c_char,
|
15
|
+
) -> c_double {
|
16
|
+
if a.is_null() || b.is_null() {
|
17
|
+
return 0.0;
|
18
|
+
}
|
19
|
+
|
20
|
+
let a_c_str = cstr_from_raw(&a);
|
21
|
+
let b_c_str = cstr_from_raw(&b);
|
22
|
+
|
23
|
+
let a_str = match a_c_str.to_str() {
|
24
|
+
Err(_e) => return 0.0,
|
25
|
+
Ok(s) => s,
|
26
|
+
};
|
27
|
+
|
28
|
+
let b_str = match b_c_str.to_str() {
|
29
|
+
Err(_e) => return 0.0,
|
30
|
+
Ok(s) => s,
|
31
|
+
};
|
32
|
+
|
33
|
+
metrics::sorensen_dice::coefficient(a_str, b_str, ignore_case == 1)
|
34
|
+
}
|
35
|
+
|
36
|
+
#[no_mangle]
|
37
|
+
pub extern "C" fn jaro_similarity(
|
38
|
+
a: *const c_char,
|
39
|
+
b: *const c_char,
|
40
|
+
ignore_case: c_char,
|
41
|
+
) -> c_double {
|
42
|
+
if a.is_null() || b.is_null() {
|
43
|
+
return 0.0;
|
44
|
+
}
|
45
|
+
|
46
|
+
let a_c_str = cstr_from_raw(&a);
|
47
|
+
let b_c_str = cstr_from_raw(&b);
|
48
|
+
|
49
|
+
let a_str = match a_c_str.to_str() {
|
50
|
+
Err(_e) => return 0.0,
|
51
|
+
Ok(s) => s,
|
52
|
+
};
|
53
|
+
|
54
|
+
let b_str = match b_c_str.to_str() {
|
55
|
+
Err(_e) => return 0.0,
|
56
|
+
Ok(s) => s,
|
57
|
+
};
|
58
|
+
|
59
|
+
metrics::jaro::similarity(a_str, b_str, ignore_case == 1).value
|
60
|
+
}
|
61
|
+
|
62
|
+
#[no_mangle]
|
63
|
+
pub extern "C" fn jaro_winkler_similarity(
|
64
|
+
a: *const c_char,
|
65
|
+
b: *const c_char,
|
66
|
+
ignore_case: c_char,
|
67
|
+
prefix_length: u32,
|
68
|
+
prefix_scaling_factor: c_double,
|
69
|
+
prefix_scaling_bonus_threshold: c_double,
|
70
|
+
) -> c_double {
|
71
|
+
if a.is_null() || b.is_null() {
|
72
|
+
return 0.0;
|
73
|
+
}
|
74
|
+
|
75
|
+
let a_c_str = cstr_from_raw(&a);
|
76
|
+
let b_c_str = cstr_from_raw(&b);
|
77
|
+
|
78
|
+
let a_str = match a_c_str.to_str() {
|
79
|
+
Err(_e) => return 0.0,
|
80
|
+
Ok(s) => s,
|
81
|
+
};
|
82
|
+
|
83
|
+
let b_str = match b_c_str.to_str() {
|
84
|
+
Err(_e) => return 0.0,
|
85
|
+
Ok(s) => s,
|
86
|
+
};
|
87
|
+
|
88
|
+
metrics::jaro_winkler::similarity(
|
89
|
+
a_str,
|
90
|
+
b_str,
|
91
|
+
ignore_case == 1,
|
92
|
+
prefix_length,
|
93
|
+
prefix_scaling_factor,
|
94
|
+
prefix_scaling_bonus_threshold,
|
95
|
+
)
|
96
|
+
}
|
97
|
+
|
98
|
+
#[no_mangle]
|
99
|
+
pub extern "C" fn jaro_winkler_distance(
|
100
|
+
a: *const c_char,
|
101
|
+
b: *const c_char,
|
102
|
+
ignore_case: c_char,
|
103
|
+
prefix_length: u32,
|
104
|
+
prefix_scaling_factor: c_double,
|
105
|
+
prefix_scaling_bonus_threshold: c_double,
|
106
|
+
) -> c_double {
|
107
|
+
1.0 - jaro_winkler_similarity(
|
108
|
+
a,
|
109
|
+
b,
|
110
|
+
ignore_case,
|
111
|
+
prefix_length,
|
112
|
+
prefix_scaling_factor,
|
113
|
+
prefix_scaling_bonus_threshold,
|
114
|
+
)
|
115
|
+
}
|
116
|
+
|
117
|
+
#[no_mangle]
|
118
|
+
pub extern "C" fn levenshtein_distance(
|
119
|
+
a: *const c_char,
|
120
|
+
b: *const c_char,
|
121
|
+
ignore_case: c_char,
|
122
|
+
) -> i64 {
|
123
|
+
if a.is_null() || b.is_null() {
|
124
|
+
return std::i64::MAX;
|
125
|
+
}
|
126
|
+
|
127
|
+
let a_c_str = cstr_from_raw(&a);
|
128
|
+
let b_c_str = cstr_from_raw(&b);
|
129
|
+
|
130
|
+
let a_str = match a_c_str.to_str() {
|
131
|
+
Err(_e) => return std::i64::MAX,
|
132
|
+
Ok(s) => s,
|
133
|
+
};
|
134
|
+
|
135
|
+
let b_str = match b_c_str.to_str() {
|
136
|
+
Err(_e) => return std::i64::MAX,
|
137
|
+
Ok(s) => s,
|
138
|
+
};
|
139
|
+
|
140
|
+
metrics::levenshtein::distance(a_str, b_str, ignore_case == 1)
|
141
|
+
}
|
142
|
+
|
143
|
+
#[no_mangle]
|
144
|
+
pub extern "C" fn damerau_levenshtein_distance(
|
145
|
+
a: *const c_char,
|
146
|
+
b: *const c_char,
|
147
|
+
ignore_case: c_char,
|
148
|
+
) -> i64 {
|
149
|
+
if a.is_null() || b.is_null() {
|
150
|
+
return std::i64::MAX;
|
151
|
+
}
|
152
|
+
|
153
|
+
let a_c_str = cstr_from_raw(&a);
|
154
|
+
let b_c_str = cstr_from_raw(&b);
|
155
|
+
|
156
|
+
let a_str = match a_c_str.to_str() {
|
157
|
+
Err(_e) => return std::i64::MAX,
|
158
|
+
Ok(s) => s,
|
159
|
+
};
|
160
|
+
|
161
|
+
let b_str = match b_c_str.to_str() {
|
162
|
+
Err(_e) => return std::i64::MAX,
|
163
|
+
Ok(s) => s,
|
164
|
+
};
|
165
|
+
|
166
|
+
metrics::damerau_levenshtein::distance(a_str, b_str, ignore_case == 1)
|
167
|
+
}
|
@@ -0,0 +1,71 @@
|
|
1
|
+
use crate::metrics::utils::graphemes;
|
2
|
+
use crate::metrics::utils::Array2D;
|
3
|
+
|
4
|
+
use std::cmp;
|
5
|
+
use std::collections::HashMap;
|
6
|
+
use std::hash::Hash;
|
7
|
+
|
8
|
+
pub fn distance(a: &str, b: &str, ignore_case: bool) -> i64 {
|
9
|
+
if ignore_case {
|
10
|
+
return distance_impl(&graphemes(&a.to_lowercase()), &graphemes(&b.to_lowercase()));
|
11
|
+
}
|
12
|
+
|
13
|
+
distance_impl(&graphemes(a), &graphemes(b))
|
14
|
+
}
|
15
|
+
|
16
|
+
// From https://en.wikipedia.org/wiki/Damerau-Levenshtein_distance
|
17
|
+
fn distance_impl<T: Hash + Eq>(a: &[T], b: &[T]) -> i64 {
|
18
|
+
let lens = [a.len(), b.len()];
|
19
|
+
if lens[0] == 0 {
|
20
|
+
return lens[1] as i64;
|
21
|
+
}
|
22
|
+
if lens[1] == 0 {
|
23
|
+
return lens[0] as i64;
|
24
|
+
}
|
25
|
+
|
26
|
+
let rows = lens[0] + 2;
|
27
|
+
let columns = lens[1] + 2;
|
28
|
+
|
29
|
+
let max_dist = (lens[0] + lens[1]) as i64;
|
30
|
+
let mut dist_matrix = Array2D::new(rows, columns);
|
31
|
+
|
32
|
+
dist_matrix[(0, 0)] = max_dist;
|
33
|
+
for i in 1..rows {
|
34
|
+
dist_matrix[(i, 0)] = max_dist;
|
35
|
+
dist_matrix[(i, 1)] = (i - 1) as i64;
|
36
|
+
}
|
37
|
+
for j in 1..columns {
|
38
|
+
dist_matrix[(0, j)] = max_dist;
|
39
|
+
dist_matrix[(1, j)] = (j - 1) as i64;
|
40
|
+
}
|
41
|
+
|
42
|
+
let mut da: HashMap<&T, usize> = HashMap::new();
|
43
|
+
|
44
|
+
for i in 1..=lens[0] {
|
45
|
+
let mut db = 0;
|
46
|
+
for j in 1..=lens[1] {
|
47
|
+
let k = da.entry(&b[j - 1]).or_insert(0);
|
48
|
+
let l = db;
|
49
|
+
let cost = if a[i - 1] == b[j - 1] {
|
50
|
+
db = j;
|
51
|
+
0
|
52
|
+
} else {
|
53
|
+
1
|
54
|
+
};
|
55
|
+
|
56
|
+
dist_matrix[(i + 1, j + 1)] = cmp::min(
|
57
|
+
dist_matrix[(i, j)] + cost,
|
58
|
+
cmp::min(
|
59
|
+
dist_matrix[(i + 1, j)] + 1,
|
60
|
+
cmp::min(
|
61
|
+
dist_matrix[(i, j + 1)] + 1,
|
62
|
+
dist_matrix[(*k, l)] + (i - *k - 1) as i64 + 1 + (j - l - 1) as i64,
|
63
|
+
),
|
64
|
+
),
|
65
|
+
);
|
66
|
+
}
|
67
|
+
da.insert(&a[i - 1], i);
|
68
|
+
}
|
69
|
+
|
70
|
+
dist_matrix[(lens[0] + 1, lens[1] + 1)]
|
71
|
+
}
|
data/src/metrics/jaro.rs
ADDED
@@ -0,0 +1,93 @@
|
|
1
|
+
use crate::metrics::utils::graphemes;
|
2
|
+
use std::cmp;
|
3
|
+
|
4
|
+
/// Outcome of a Jaro similarity computation.
pub struct JaroSimilarityResult {
    /// The Jaro similarity score.
    pub value: f64,
    /// Number of leading positions at which both inputs hold equal elements.
    pub max_prefix_length: i64,
}
|
8
|
+
|
9
|
+
pub fn similarity(a: &str, b: &str, ignore_case: bool) -> JaroSimilarityResult {
|
10
|
+
if ignore_case {
|
11
|
+
return similarity_impl(&graphemes(&a.to_lowercase()), &graphemes(&b.to_lowercase()));
|
12
|
+
}
|
13
|
+
|
14
|
+
similarity_impl(&graphemes(a), &graphemes(b))
|
15
|
+
}
|
16
|
+
|
17
|
+
fn similarity_impl<T: Eq>(a: &[T], b: &[T]) -> JaroSimilarityResult {
|
18
|
+
let mut graphemes = [a, b];
|
19
|
+
if graphemes[0].len() > graphemes[1].len() {
|
20
|
+
graphemes.swap(0, 1);
|
21
|
+
}
|
22
|
+
|
23
|
+
// let grapheme_iterators = [UnicodeSegmentation::graphemes(&case_handled[0][..], true), UnicodeSegmentation::graphemes(&case_handled[1][..], true)];
|
24
|
+
let lens = [graphemes[0].len(), graphemes[1].len()];
|
25
|
+
|
26
|
+
let max_length = cmp::max(lens[0], lens[1]);
|
27
|
+
let matching_dist = if max_length < 2 {
|
28
|
+
0
|
29
|
+
} else {
|
30
|
+
(max_length / 2) - 1
|
31
|
+
};
|
32
|
+
|
33
|
+
let mut matching_indices = [
|
34
|
+
Vec::with_capacity(max_length),
|
35
|
+
Vec::with_capacity(max_length),
|
36
|
+
];
|
37
|
+
|
38
|
+
// Find matches
|
39
|
+
let mut last_matched_prefix_index = -1;
|
40
|
+
{
|
41
|
+
let mut b_matched = vec![false; lens[1]];
|
42
|
+
for (i, grapheme) in graphemes[0].iter().enumerate() {
|
43
|
+
let start = cmp::max(0 as i64, i as i64 - matching_dist as i64) as usize;
|
44
|
+
let end = cmp::min(lens[1], i + matching_dist + 1);
|
45
|
+
|
46
|
+
// Keep track of prefix match
|
47
|
+
// Safe to access i in b since a.len < b.len
|
48
|
+
if grapheme == &graphemes[1][i]
|
49
|
+
&& ((last_matched_prefix_index == -1 && i == 0)
|
50
|
+
|| last_matched_prefix_index == i as i64 - 1)
|
51
|
+
{
|
52
|
+
last_matched_prefix_index = i as i64;
|
53
|
+
}
|
54
|
+
|
55
|
+
for (j, matched) in b_matched.iter_mut().enumerate().take(end).skip(start) {
|
56
|
+
if grapheme == &graphemes[1][j] && !*matched {
|
57
|
+
*matched = true;
|
58
|
+
matching_indices[0].push(i);
|
59
|
+
matching_indices[1].push(j);
|
60
|
+
break;
|
61
|
+
}
|
62
|
+
}
|
63
|
+
}
|
64
|
+
}
|
65
|
+
|
66
|
+
let matches = matching_indices[0].len();
|
67
|
+
if matches == 0 {
|
68
|
+
return JaroSimilarityResult {
|
69
|
+
value: 0.0,
|
70
|
+
max_prefix_length: 0,
|
71
|
+
};
|
72
|
+
}
|
73
|
+
|
74
|
+
// Find transpositions / 2 in matches
|
75
|
+
matching_indices[1].sort_unstable();
|
76
|
+
let transpositions = matching_indices[0]
|
77
|
+
.iter()
|
78
|
+
.zip(matching_indices[1].iter())
|
79
|
+
.fold(0.0, |acc, (i, j)| {
|
80
|
+
if graphemes[0][*i] == graphemes[1][*j] {
|
81
|
+
acc
|
82
|
+
} else {
|
83
|
+
acc + 0.5
|
84
|
+
}
|
85
|
+
});
|
86
|
+
|
87
|
+
let m = matches as f64;
|
88
|
+
let t = transpositions;
|
89
|
+
JaroSimilarityResult {
|
90
|
+
value: ((m / lens[0] as f64) + (m / lens[1] as f64) + ((m - t) / m)) / 3.0,
|
91
|
+
max_prefix_length: last_matched_prefix_index + 1,
|
92
|
+
}
|
93
|
+
}
|
@@ -0,0 +1,20 @@
|
|
1
|
+
use crate::metrics::jaro;
|
2
|
+
use std::cmp;
|
3
|
+
|
4
|
+
pub fn similarity(
|
5
|
+
a: &str,
|
6
|
+
b: &str,
|
7
|
+
ignore_case: bool,
|
8
|
+
prefix_length: u32,
|
9
|
+
prefix_scaling_factor: f64,
|
10
|
+
prefix_scaling_bonus_threshold: f64,
|
11
|
+
) -> f64 {
|
12
|
+
let jaro_similarity = jaro::similarity(a, b, ignore_case);
|
13
|
+
let common_prefix_len = cmp::min(prefix_length as i64, jaro_similarity.max_prefix_length);
|
14
|
+
|
15
|
+
if jaro_similarity.value > prefix_scaling_bonus_threshold {
|
16
|
+
return jaro_similarity.value
|
17
|
+
+ common_prefix_len as f64 * prefix_scaling_factor * (1.0 - jaro_similarity.value);
|
18
|
+
}
|
19
|
+
jaro_similarity.value
|
20
|
+
}
|
@@ -0,0 +1,46 @@
|
|
1
|
+
use crate::metrics::utils::graphemes;
|
2
|
+
use crate::metrics::utils::Array2D;
|
3
|
+
use std::cmp;
|
4
|
+
|
5
|
+
pub fn distance(a: &str, b: &str, ignore_case: bool) -> i64 {
|
6
|
+
if ignore_case {
|
7
|
+
return distance_impl(&graphemes(&a.to_lowercase()), &graphemes(&b.to_lowercase()));
|
8
|
+
}
|
9
|
+
|
10
|
+
distance_impl(&graphemes(a), &graphemes(b))
|
11
|
+
}
|
12
|
+
|
13
|
+
fn distance_impl<T: Eq>(a: &[T], b: &[T]) -> i64 {
|
14
|
+
let lens = [a.len(), b.len()];
|
15
|
+
if lens[0] == 0 {
|
16
|
+
return lens[1] as i64;
|
17
|
+
}
|
18
|
+
if lens[1] == 0 {
|
19
|
+
return lens[0] as i64;
|
20
|
+
}
|
21
|
+
|
22
|
+
let rows = lens[0] + 1;
|
23
|
+
let columns = lens[1] + 1;
|
24
|
+
|
25
|
+
let mut dist_matrix = Array2D::new(rows, columns);
|
26
|
+
|
27
|
+
for i in 0..rows {
|
28
|
+
dist_matrix[(i, 0)] = i as i64;
|
29
|
+
}
|
30
|
+
for j in 0..columns {
|
31
|
+
dist_matrix[(0, j)] = j as i64;
|
32
|
+
}
|
33
|
+
|
34
|
+
for i in 1..rows {
|
35
|
+
for j in 1..columns {
|
36
|
+
let cost = if a[i - 1] == b[j - 1] { 0 } else { 1 };
|
37
|
+
|
38
|
+
dist_matrix[(i, j)] = cmp::min(
|
39
|
+
cmp::min(dist_matrix[(i - 1, j)] + 1, dist_matrix[(i, j - 1)] + 1),
|
40
|
+
dist_matrix[(i - 1, j - 1)] + cost,
|
41
|
+
);
|
42
|
+
}
|
43
|
+
}
|
44
|
+
|
45
|
+
dist_matrix[(rows - 1, columns - 1)]
|
46
|
+
}
|
data/src/metrics/mod.rs
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
use crate::metrics::utils::generate_bigrams;
|
2
|
+
use std::collections::hash_map::Entry::{Occupied, Vacant};
|
3
|
+
use std::collections::HashMap;
|
4
|
+
|
5
|
+
pub fn coefficient(a: &str, b: &str, ignore_case: bool) -> f64 {
|
6
|
+
if ignore_case {
|
7
|
+
return coefficient_impl(&a.to_lowercase(), &b.to_lowercase());
|
8
|
+
}
|
9
|
+
coefficient_impl(a, b)
|
10
|
+
}
|
11
|
+
|
12
|
+
fn coefficient_impl(a: &str, b: &str) -> f64 {
|
13
|
+
let a_bigrams = generate_bigrams(&a);
|
14
|
+
let mut b_bigrams_hash: HashMap<&str, i64> = HashMap::new();
|
15
|
+
|
16
|
+
let mut total_bigrams = a_bigrams.len();
|
17
|
+
|
18
|
+
{
|
19
|
+
let b_bigrams = generate_bigrams(&b);
|
20
|
+
for s in &b_bigrams {
|
21
|
+
let counter = b_bigrams_hash.entry(s).or_insert(0);
|
22
|
+
*counter += 1;
|
23
|
+
}
|
24
|
+
|
25
|
+
total_bigrams += b_bigrams.len();
|
26
|
+
}
|
27
|
+
|
28
|
+
if total_bigrams == 0 {
|
29
|
+
return 0.0;
|
30
|
+
}
|
31
|
+
|
32
|
+
let mut intersections = 0;
|
33
|
+
for bigram in &a_bigrams {
|
34
|
+
match b_bigrams_hash.entry(bigram) {
|
35
|
+
Vacant(_) => {}
|
36
|
+
Occupied(entry) => {
|
37
|
+
let counter = entry.get();
|
38
|
+
if counter > &0 {
|
39
|
+
*entry.into_mut() = entry.get() - 1;
|
40
|
+
intersections += 1;
|
41
|
+
}
|
42
|
+
}
|
43
|
+
}
|
44
|
+
}
|
45
|
+
|
46
|
+
2.0 * intersections as f64 / total_bigrams as f64
|
47
|
+
}
|
@@ -0,0 +1,30 @@
|
|
1
|
+
use std::ops::Index;
|
2
|
+
use std::ops::IndexMut;
|
3
|
+
|
4
|
+
/// Dense two-dimensional grid stored row by row in a single flat `Vec`.
///
/// Cells are addressed with a `(row, column)` tuple through `Index` /
/// `IndexMut`.
pub struct Array2D<T> {
    arr: Vec<T>,
    cols: usize,
}

impl<T: Clone + Default> Array2D<T> {
    /// Creates a `rows` x `columns` grid with every cell set to `T::default()`.
    pub fn new(rows: usize, columns: usize) -> Array2D<T> {
        let cell_count = rows * columns;
        Array2D {
            arr: vec![T::default(); cell_count],
            cols: columns,
        }
    }
}

impl<T> Array2D<T> {
    /// Position of cell (row `y`, column `x`) inside the flat storage.
    fn flat_offset(&self, y: usize, x: usize) -> usize {
        y * self.cols + x
    }
}

impl<T> Index<(usize, usize)> for Array2D<T> {
    type Output = T;

    /// Shared access to the cell at row `y`, column `x`.
    fn index(&self, (y, x): (usize, usize)) -> &Self::Output {
        let offset = self.flat_offset(y, x);
        &self.arr[offset]
    }
}

impl<T> IndexMut<(usize, usize)> for Array2D<T> {
    /// Mutable access to the cell at row `y`, column `x`.
    fn index_mut(&mut self, (y, x): (usize, usize)) -> &mut Self::Output {
        let offset = self.flat_offset(y, x);
        &mut self.arr[offset]
    }
}
|
@@ -0,0 +1,19 @@
|
|
1
|
+
extern crate itertools;
|
2
|
+
extern crate unicode_segmentation;
|
3
|
+
|
4
|
+
use itertools::Itertools;
|
5
|
+
use unicode_segmentation::UnicodeSegmentation;
|
6
|
+
|
7
|
+
mod array_2d;
|
8
|
+
pub use array_2d::Array2D;
|
9
|
+
|
10
|
+
pub fn graphemes(s: &str) -> Vec<&str> {
|
11
|
+
UnicodeSegmentation::graphemes(s, true).collect::<Vec<&str>>()
|
12
|
+
}
|
13
|
+
|
14
|
+
pub fn generate_bigrams(s: &str) -> Vec<&str> {
|
15
|
+
UnicodeSegmentation::grapheme_indices(s, true)
|
16
|
+
.tuple_windows()
|
17
|
+
.map(|(a, b)| &s[a.0..b.0 + b.1.len()])
|
18
|
+
.collect::<Vec<&str>>()
|
19
|
+
}
|
metadata
ADDED
@@ -0,0 +1,167 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: str_metrics
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Anirban Mukhopadhyay
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2020-03-13 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: ffi
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: pry
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rake
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '12.0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '12.0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: rspec
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: rubocop
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: rubocop-performance
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - ">="
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: rubocop-rspec
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ">="
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
type: :development
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ">="
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0'
|
111
|
+
description: 'Ruby gem (native extension in Rust) providing implementations of various
|
112
|
+
string metrics. Current metrics supported are: Sørensen–Dice, Levenshtein, Damerau–Levenshtein,
|
113
|
+
Jaro & Jaro–Winkler. Strings that are UTF-8 encodable (convertible to UTF-8 representation)
|
114
|
+
are supported. All comparison of strings is done at the grapheme cluster level as
|
115
|
+
described by [Unicode Standard Annex #29](https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries);
|
116
|
+
this may be different from many gems that calculate string metrics.'
|
117
|
+
email:
|
118
|
+
- anirban.mukhop@gmail.com
|
119
|
+
executables: []
|
120
|
+
extensions:
|
121
|
+
- extconf.rb
|
122
|
+
extra_rdoc_files: []
|
123
|
+
files:
|
124
|
+
- Cargo.toml
|
125
|
+
- LICENSE
|
126
|
+
- README.md
|
127
|
+
- extconf.rb
|
128
|
+
- lib/str_metrics.rb
|
129
|
+
- lib/str_metrics/version.rb
|
130
|
+
- src/lib.rs
|
131
|
+
- src/metrics/damerau_levenshtein.rs
|
132
|
+
- src/metrics/jaro.rs
|
133
|
+
- src/metrics/jaro_winkler.rs
|
134
|
+
- src/metrics/levenshtein.rs
|
135
|
+
- src/metrics/mod.rs
|
136
|
+
- src/metrics/sorensen_dice.rs
|
137
|
+
- src/metrics/utils/array_2d.rs
|
138
|
+
- src/metrics/utils/mod.rs
|
139
|
+
homepage: https://github.com/anirbanmu/str_metrics
|
140
|
+
licenses:
|
141
|
+
- MIT
|
142
|
+
metadata:
|
143
|
+
allowed_push_host: https://rubygems.org
|
144
|
+
homepage_uri: https://github.com/anirbanmu/str_metrics
|
145
|
+
bug_tracker_uri: https://github.com/anirbanmu/str_metrics/issues
|
146
|
+
source_code_uri: https://github.com/anirbanmu/str_metrics
|
147
|
+
changelog_uri: https://github.com/anirbanmu/str_metrics/blob/v0.1.0/CHANGELOG.md
|
148
|
+
post_install_message:
|
149
|
+
rdoc_options: []
|
150
|
+
require_paths:
|
151
|
+
- lib
|
152
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
153
|
+
requirements:
|
154
|
+
- - ">="
|
155
|
+
- !ruby/object:Gem::Version
|
156
|
+
version: 2.3.0
|
157
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
158
|
+
requirements:
|
159
|
+
- - ">="
|
160
|
+
- !ruby/object:Gem::Version
|
161
|
+
version: '0'
|
162
|
+
requirements: []
|
163
|
+
rubygems_version: 3.0.6
|
164
|
+
signing_key:
|
165
|
+
specification_version: 4
|
166
|
+
summary: Ruby gem providing native implementations of various string metrics
|
167
|
+
test_files: []
|