mini-levenshtein 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.github/workflows/ruby.yml +25 -0
- data/.gitignore +2 -0
- data/.rspec +2 -0
- data/.ruby-version +1 -0
- data/Gemfile +10 -0
- data/Gemfile.lock +52 -0
- data/README.md +15 -0
- data/Rakefile +24 -0
- data/ext/mini_levenshtein/extconf.rb +10 -0
- data/ext/mini_levenshtein/levenshtein.c +1056 -0
- data/ext/mini_levenshtein/levenshtein.h +146 -0
- data/ext/mini_levenshtein/mini_levenshtein.c +32 -0
- data/lib/mini-levenshtein/version.rb +5 -0
- data/lib/mini-levenshtein.rb +33 -0
- data/mini-levenshtein.gemspec +32 -0
- metadata +63 -0
@@ -0,0 +1,146 @@
|
|
1
|
+
/*
|
2
|
+
* This file has been altered to better fit fuzzywuzzy.
|
3
|
+
* To see all changes made, please diff this file with
|
4
|
+
* <https://github.com/Tmplt/python-Levenshtein/blob/master/Levenshtein.c>
|
5
|
+
*
|
6
|
+
* Summary:
|
7
|
+
* - stripped all python-related code and data types;
|
8
|
+
*/
|
9
|
+
|
10
|
+
/* @(#) $Id: Levenshtein.h,v 1.22 2005/01/13 20:02:56 yeti Exp $ */
|
11
|
+
#ifndef LEVENSHTEIN_H
|
12
|
+
#define LEVENSHTEIN_H
|
13
|
+
|
14
|
+
#ifndef size_t
|
15
|
+
#include <stdlib.h>
|
16
|
+
#endif
|
17
|
+
|
18
|
+
/* A bit dirty. */
|
19
|
+
#ifndef _LEV_STATIC_PY
|
20
|
+
#define _LEV_STATIC_PY /* */
|
21
|
+
#endif
|
22
|
+
|
23
|
+
/* In C, this is just wchar_t and unsigned char, in Python, lev_wchar can
|
24
|
+
* be anything. If you really want to cheat, define wchar_t to any integer
|
25
|
+
* type you like before including Levenshtein.h and recompile it. */
|
26
|
+
#ifndef lev_wchar
|
27
|
+
#ifndef wchar_t
|
28
|
+
#include <wchar.h>
|
29
|
+
#endif
|
30
|
+
#define lev_wchar wchar_t
|
31
|
+
#endif
|
32
|
+
typedef char lev_byte;
|
33
|
+
|
34
|
+
/* Edit operation type
|
35
|
+
* DON'T CHANGE! Used as array indices, and the bits are occasionally used
|
36
|
+
* as flags */
|
37
|
+
typedef enum
|
38
|
+
{
|
39
|
+
LEV_EDIT_KEEP = 0,
|
40
|
+
LEV_EDIT_REPLACE = 1,
|
41
|
+
LEV_EDIT_INSERT = 2,
|
42
|
+
LEV_EDIT_DELETE = 3,
|
43
|
+
LEV_EDIT_LAST /* sometimes returned when an error occurs */
|
44
|
+
} LevEditType;
|
45
|
+
|
46
|
+
/* Error codes returned by editop check functions */
|
47
|
+
typedef enum
|
48
|
+
{
|
49
|
+
LEV_EDIT_ERR_OK = 0,
|
50
|
+
LEV_EDIT_ERR_TYPE, /* nonexistent edit type */
|
51
|
+
LEV_EDIT_ERR_OUT, /* edit out of string bounds */
|
52
|
+
LEV_EDIT_ERR_ORDER, /* ops are not ordered */
|
53
|
+
LEV_EDIT_ERR_BLOCK, /* inconsistent block boundaries (block ops) */
|
54
|
+
LEV_EDIT_ERR_SPAN, /* sequence is not a full transformation (block ops) */
|
55
|
+
LEV_EDIT_ERR_LAST
|
56
|
+
} LevEditOpError;
|
57
|
+
|
58
|
+
/* string averaging method (UNUSED yet) */
|
59
|
+
typedef enum
|
60
|
+
{
|
61
|
+
LEV_AVG_HEAD = 0, /* take operations from the head */
|
62
|
+
LEV_AVG_TAIL, /* take operations from the tail */
|
63
|
+
LEV_AVG_SPREAD, /* take a equidistantly distributed subset */
|
64
|
+
LEV_AVG_BLOCK, /* take a random continuous block */
|
65
|
+
LEV_AVG_RANDOM, /* take a random subset */
|
66
|
+
LEV_AVG_LAST
|
67
|
+
} LevAveragingType;
|
68
|
+
|
69
|
+
/* Edit operation (atomic).
|
70
|
+
* This is the `native' atomic edit operation. It differs from the difflib
|
71
|
+
* one because it represents a change of one character, not a block. And
|
72
|
+
* we usually don't care about LEV_EDIT_KEEP, though the functions can handle
|
73
|
+
* them. The positions are interpreted as at the left edge of a character.
|
74
|
+
*/
|
75
|
+
typedef struct
|
76
|
+
{
|
77
|
+
LevEditType type; /* editing operation type */
|
78
|
+
size_t spos; /* source block position */
|
79
|
+
size_t dpos; /* destination position */
|
80
|
+
} LevEditOp;
|
81
|
+
|
82
|
+
/* Edit operation (difflib-compatible).
|
83
|
+
* This is not `native', but conversion functions exist. These fields exactly
|
84
|
+
* correspond to the codeops() tuples fields (and this method is also the
|
85
|
+
* source of the silly OpCode name). Sequences must span over complete
|
86
|
+
* strings, subsequences are simply edit sequences with more (or larger)
|
87
|
+
* LEV_EDIT_KEEP blocks.
|
88
|
+
*/
|
89
|
+
typedef struct
|
90
|
+
{
|
91
|
+
LevEditType type; /* editing operation type */
|
92
|
+
size_t sbeg, send; /* source block begin, end */
|
93
|
+
size_t dbeg, dend; /* destination block begin, end */
|
94
|
+
} LevOpCode;
|
95
|
+
|
96
|
+
/* Matching block (difflib-compatible). */
|
97
|
+
typedef struct
|
98
|
+
{
|
99
|
+
size_t spos;
|
100
|
+
size_t dpos;
|
101
|
+
size_t len;
|
102
|
+
} LevMatchingBlock;
|
103
|
+
|
104
|
+
size_t
|
105
|
+
lev_edit_distance(size_t len1,
|
106
|
+
const lev_byte *string1,
|
107
|
+
size_t len2,
|
108
|
+
const lev_byte *string2,
|
109
|
+
int xcost);
|
110
|
+
|
111
|
+
size_t
|
112
|
+
lev_u_edit_distance(size_t len1,
|
113
|
+
const lev_wchar *string1,
|
114
|
+
size_t len2,
|
115
|
+
const lev_wchar *string2,
|
116
|
+
int xcost);
|
117
|
+
|
118
|
+
LevEditOp *
|
119
|
+
lev_editops_find(size_t len1,
|
120
|
+
const lev_byte *string1,
|
121
|
+
size_t len2,
|
122
|
+
const lev_byte *string2,
|
123
|
+
size_t *n);
|
124
|
+
|
125
|
+
LevOpCode *
|
126
|
+
lev_editops_to_opcodes(size_t n,
|
127
|
+
const LevEditOp *ops,
|
128
|
+
size_t *nb,
|
129
|
+
size_t len1,
|
130
|
+
size_t len2);
|
131
|
+
|
132
|
+
LevMatchingBlock *
|
133
|
+
lev_opcodes_matching_blocks(size_t len1,
|
134
|
+
__attribute__((unused)) size_t len2,
|
135
|
+
size_t nb,
|
136
|
+
const LevOpCode *bops,
|
137
|
+
size_t *nmblocks);
|
138
|
+
|
139
|
+
LevMatchingBlock *
|
140
|
+
lev_editops_matching_blocks(size_t len1,
|
141
|
+
size_t len2,
|
142
|
+
size_t n,
|
143
|
+
const LevEditOp *ops,
|
144
|
+
size_t *nmblocks);
|
145
|
+
|
146
|
+
#endif
|
@@ -0,0 +1,32 @@
|
|
1
|
+
#include <string.h>
|
2
|
+
#include <math.h>
|
3
|
+
#include <wchar.h>
|
4
|
+
#include <stdlib.h>
|
5
|
+
|
6
|
+
#include "ruby.h"
|
7
|
+
#include "levenshtein.h"
|
8
|
+
|
9
|
+
VALUE MiniLevenshteinInternal = Qnil;
|
10
|
+
|
11
|
+
VALUE method_internal_distance(VALUE self, VALUE s1, VALUE s2, VALUE xcost);
|
12
|
+
|
13
|
+
void Init_mini_levenshtein()
|
14
|
+
{
|
15
|
+
MiniLevenshteinInternal = rb_define_module("MiniLevenshteinInternal");
|
16
|
+
rb_define_method(MiniLevenshteinInternal, "internal_distance", method_internal_distance, 3);
|
17
|
+
}
|
18
|
+
|
19
|
+
/*
 * MiniLevenshteinInternal#internal_distance(s1, s2, xcost) -> Integer
 *
 * Ruby binding for lev_edit_distance(). The strings are handed over as raw
 * byte buffers (RSTRING_PTR / RSTRING_LEN), so the result is a byte-level
 * edit distance, not a character-level one.
 *
 * s1, s2 - Ruby Strings. Objects responding to #to_str are converted;
 *          anything else raises TypeError.
 * xcost  - Integer forwarded unchanged as the `xcost` argument of
 *          lev_edit_distance(). Non-integers raise TypeError.
 *
 * Returns the distance as a Ruby Integer.
 */
VALUE method_internal_distance(VALUE self, VALUE s1, VALUE s2, VALUE xcost)
{
    /* Coerce and type-check every argument BEFORE touching string internals:
     * RSTRING_LEN on a non-String VALUE reads garbage. All conversions that
     * may call back into Ruby happen here, before raw pointers are taken. */
    StringValue(s1);
    StringValue(s2);
    int cost = NUM2INT(xcost);

    const lev_byte *str1 = RSTRING_PTR(s1);
    const lev_byte *str2 = RSTRING_PTR(s2);
    size_t len1 = (size_t)RSTRING_LEN(s1);
    size_t len2 = (size_t)RSTRING_LEN(s2);

    size_t distance = lev_edit_distance(len1, str1, len2, str2, cost);

    /* Keep the String objects alive while their buffers were in use. */
    RB_GC_GUARD(s1);
    RB_GC_GUARD(s2);

    /* NOTE(review): upstream python-Levenshtein signals allocation failure by
     * returning (size_t)-1; confirm levenshtein.c keeps that convention. A
     * genuine distance can never be this large, so the check is safe. */
    if (distance == (size_t)-1) {
        rb_raise(rb_eNoMemError, "failed to allocate memory for edit distance");
    }

    /* Convert the full size_t instead of narrowing through long and int. */
    return SIZET2NUM(distance);
}
|
@@ -0,0 +1,33 @@
|
|
1
|
+
require 'mini-levenshtein/mini_levenshtein'
|
2
|
+
|
3
|
+
# Levenshtein distance and similarity ratio backed by the C extension.
#
# The extension method `internal_distance` compares the raw bytes of the two
# strings, so every length used here is a byte length as well.
module MiniLevenshtein
  extend MiniLevenshteinInternal

  class << self
    # Edit distance between two strings, computed over their bytes.
    #
    # string1, string2 - String instances.
    #
    # Returns an Integer.
    # Raises TypeError if either argument is not a String.
    def distance(string1, string2)
      validate_string!(string1)
      validate_string!(string2)

      # Third argument is the extension's `xcost` flag, disabled here.
      internal_distance(string1, string2, 0)
    end

    # Similarity of two strings as a Float; 1.0 means identical.
    #
    # The distance is measured in bytes, so the normalising length must be
    # measured in bytes too. Using String#length (characters) made the ratio
    # leave the 0..1 range, and even go negative, for multibyte input.
    #
    # string1, string2 - String instances.
    #
    # Returns a Float.
    # Raises TypeError if either argument is not a String.
    def ratio(string1, string2)
      validate_string!(string1)
      validate_string!(string2)

      lensum = string1.bytesize + string2.bytesize
      return 1.0 if lensum.zero? # two empty strings are identical

      # xcost enabled (presumably a substitution then costs 2, which bounds
      # the distance by lensum -- confirm in levenshtein.c).
      distance = internal_distance(string1, string2, 1)

      (lensum - distance) / lensum.to_f
    end

    private

    # Raises the TypeError Ruby's implicit conversion would produce when
    # +string+ is not a String.
    def validate_string!(string)
      return if string.is_a?(String)

      raise TypeError, "no implicit conversion of #{string.class} to String"
    end
  end
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
lib = File.expand_path('lib', __dir__)
|
4
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
5
|
+
require 'mini-levenshtein/version'
|
6
|
+
|
7
|
+
# Gem specification for mini-levenshtein: identity, metadata links, packaged
# files and the native extension to build.
Gem::Specification.new do |spec|
  spec.name    = 'mini-levenshtein'
  spec.version = MiniLevenshtein::VERSION
  spec.authors = ['Delon Newman']
  spec.email   = 'contact@delonnewman.name'

  spec.summary     = 'Simple, fast, levenshtein distance'
  spec.description = spec.summary
  spec.homepage    = 'https://github.com/delonnewman/mini-levenshtein'
  spec.license     = 'MIT'

  # Without metadata support the push host cannot be restricted, so refuse
  # to continue rather than risk publishing to the wrong host.
  unless spec.respond_to?(:metadata)
    raise 'RubyGems 2.0 or newer is required to protect against public gem pushes.'
  end

  spec.metadata.merge!(
    'allowed_push_host' => 'https://rubygems.org',
    'homepage_uri'      => spec.homepage,
    'source_code_uri'   => spec.homepage,
    'changelog_uri'     => "#{spec.homepage}#changelog",
    'documentation_uri' => "https://www.rubydoc.info/gems/#{spec.name}"
  )

  # Package every git-tracked file except test, spec and feature directories.
  tracked_files = `git ls-files -z`.split("\x0")
  spec.files = tracked_files.reject { |path| path.match(%r{^(test|spec|features)/}) }

  spec.extensions    = ['ext/mini_levenshtein/extconf.rb']
  spec.require_paths = ['lib']
end
|
metadata
ADDED
@@ -0,0 +1,63 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: mini-levenshtein
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Delon Newman
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2022-04-29 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: Simple, fast, levenshtein distance
|
14
|
+
email: contact@delonnewman.name
|
15
|
+
executables: []
|
16
|
+
extensions:
|
17
|
+
- ext/mini_levenshtein/extconf.rb
|
18
|
+
extra_rdoc_files: []
|
19
|
+
files:
|
20
|
+
- ".github/workflows/ruby.yml"
|
21
|
+
- ".gitignore"
|
22
|
+
- ".rspec"
|
23
|
+
- ".ruby-version"
|
24
|
+
- Gemfile
|
25
|
+
- Gemfile.lock
|
26
|
+
- README.md
|
27
|
+
- Rakefile
|
28
|
+
- ext/mini_levenshtein/extconf.rb
|
29
|
+
- ext/mini_levenshtein/levenshtein.c
|
30
|
+
- ext/mini_levenshtein/levenshtein.h
|
31
|
+
- ext/mini_levenshtein/mini_levenshtein.c
|
32
|
+
- lib/mini-levenshtein.rb
|
33
|
+
- lib/mini-levenshtein/version.rb
|
34
|
+
- mini-levenshtein.gemspec
|
35
|
+
homepage: https://github.com/delonnewman/mini-levenshtein
|
36
|
+
licenses:
|
37
|
+
- MIT
|
38
|
+
metadata:
|
39
|
+
allowed_push_host: https://rubygems.org
|
40
|
+
homepage_uri: https://github.com/delonnewman/mini-levenshtein
|
41
|
+
source_code_uri: https://github.com/delonnewman/mini-levenshtein
|
42
|
+
changelog_uri: https://github.com/delonnewman/mini-levenshtein#changelog
|
43
|
+
documentation_uri: https://www.rubydoc.info/gems/mini-levenshtein
|
44
|
+
post_install_message:
|
45
|
+
rdoc_options: []
|
46
|
+
require_paths:
|
47
|
+
- lib
|
48
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
49
|
+
requirements:
|
50
|
+
- - ">="
|
51
|
+
- !ruby/object:Gem::Version
|
52
|
+
version: '0'
|
53
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
54
|
+
requirements:
|
55
|
+
- - ">="
|
56
|
+
- !ruby/object:Gem::Version
|
57
|
+
version: '0'
|
58
|
+
requirements: []
|
59
|
+
rubygems_version: 3.2.3
|
60
|
+
signing_key:
|
61
|
+
specification_version: 4
|
62
|
+
summary: Simple, fast, levenshtein distance
|
63
|
+
test_files: []
|