mini-levenshtein 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.github/workflows/ruby.yml +25 -0
- data/.gitignore +2 -0
- data/.rspec +2 -0
- data/.ruby-version +1 -0
- data/Gemfile +10 -0
- data/Gemfile.lock +52 -0
- data/README.md +15 -0
- data/Rakefile +24 -0
- data/ext/mini_levenshtein/extconf.rb +10 -0
- data/ext/mini_levenshtein/levenshtein.c +1056 -0
- data/ext/mini_levenshtein/levenshtein.h +146 -0
- data/ext/mini_levenshtein/mini_levenshtein.c +32 -0
- data/lib/mini-levenshtein/version.rb +5 -0
- data/lib/mini-levenshtein.rb +33 -0
- data/mini-levenshtein.gemspec +32 -0
- metadata +63 -0
@@ -0,0 +1,146 @@
|
|
1
|
+
/*
|
2
|
+
* This file has been altered to better fit fuzzywuzzy.
|
3
|
+
* To see all changes made, please diff this file with
|
4
|
+
* <https://github.com/Tmplt/python-Levenshtein/blob/master/Levenshtein.c>
|
5
|
+
*
|
6
|
+
* Summary:
|
7
|
+
* - stripped all python-related code and data types;
|
8
|
+
*/
|
9
|
+
|
10
|
+
/* @(#) $Id: Levenshtein.h,v 1.22 2005/01/13 20:02:56 yeti Exp $ */
|
11
|
+
#ifndef LEVENSHTEIN_H
|
12
|
+
#define LEVENSHTEIN_H
|
13
|
+
|
14
|
+
#ifndef size_t
|
15
|
+
#include <stdlib.h>
|
16
|
+
#endif
|
17
|
+
|
18
|
+
/* A bit dirty. */
|
19
|
+
#ifndef _LEV_STATIC_PY
|
20
|
+
#define _LEV_STATIC_PY /* */
|
21
|
+
#endif
|
22
|
+
|
23
|
+
/* In C, this is just wchar_t and unsigned char, in Python, lev_wchar can
|
24
|
+
* be anything. If you really want to cheat, define wchar_t to any integer
|
25
|
+
* type you like before including Levenshtein.h and recompile it. */
|
26
|
+
#ifndef lev_wchar
|
27
|
+
#ifndef wchar_t
|
28
|
+
#include <wchar.h>
|
29
|
+
#endif
|
30
|
+
#define lev_wchar wchar_t
|
31
|
+
#endif
|
32
|
+
/* Element type of the byte strings accepted by lev_edit_distance() and
 * lev_editops_find(). */
typedef char lev_byte;
|
33
|
+
|
34
|
+
/* Edit operation type.
 * DON'T CHANGE the numeric values: they are used as array indices and
 * their bits are occasionally used as flags. */
typedef enum {
    LEV_EDIT_KEEP    = 0,
    LEV_EDIT_REPLACE = 1,
    LEV_EDIT_INSERT  = 2,
    LEV_EDIT_DELETE  = 3,
    LEV_EDIT_LAST    = 4  /* sometimes returned when an error occurs */
} LevEditType;
|
45
|
+
|
46
|
+
/* Status codes reported by the edit-operation checking functions. */
typedef enum {
    LEV_EDIT_ERR_OK    = 0,
    LEV_EDIT_ERR_TYPE  = 1,  /* nonexistent edit type */
    LEV_EDIT_ERR_OUT   = 2,  /* edit out of string bounds */
    LEV_EDIT_ERR_ORDER = 3,  /* ops are not ordered */
    LEV_EDIT_ERR_BLOCK = 4,  /* inconsistent block boundaries (block ops) */
    LEV_EDIT_ERR_SPAN  = 5,  /* sequence is not a full transformation (block ops) */
    LEV_EDIT_ERR_LAST  = 6
} LevEditOpError;
|
57
|
+
|
58
|
+
/* String averaging method (not used yet). */
typedef enum {
    LEV_AVG_HEAD   = 0,  /* take operations from the head */
    LEV_AVG_TAIL   = 1,  /* take operations from the tail */
    LEV_AVG_SPREAD = 2,  /* take an equidistantly distributed subset */
    LEV_AVG_BLOCK  = 3,  /* take a random continuous block */
    LEV_AVG_RANDOM = 4,  /* take a random subset */
    LEV_AVG_LAST   = 5
} LevAveragingType;
|
68
|
+
|
69
|
+
/* Edit operation (atomic).
|
70
|
+
* This is the `native' atomic edit operation. It differs from the difflib
|
71
|
+
* one's because it represents a change of one character, not a block. And
|
72
|
+
* we usually don't care about LEV_EDIT_KEEP, though the functions can handle
|
73
|
+
* them. The positions are interpreted as at the left edge of a character.
|
74
|
+
*/
|
75
|
+
typedef struct
|
76
|
+
{
|
77
|
+
LevEditType type; /* editing operation type */
|
78
|
+
size_t spos; /* source block position */
|
79
|
+
size_t dpos; /* destination position */
|
80
|
+
} LevEditOp;
|
81
|
+
|
82
|
+
/* Edit operation (difflib-compatible).
|
83
|
+
* This is not `native', but conversion functions exist. These fields exactly
|
84
|
+
* correspond to the codeops() tuples fields (and this method is also the
|
85
|
+
* source of the silly OpCode name). Sequences must span over complete
|
86
|
+
* strings, subsequences are simply edit sequences with more (or larger)
|
87
|
+
* LEV_EDIT_KEEP blocks.
|
88
|
+
*/
|
89
|
+
typedef struct
|
90
|
+
{
|
91
|
+
LevEditType type; /* editing operation type */
|
92
|
+
size_t sbeg, send; /* source block begin, end */
|
93
|
+
size_t dbeg, dend; /* destination block begin, end */
|
94
|
+
} LevOpCode;
|
95
|
+
|
96
|
+
/* Matching block (difflib-compatible).
 * NOTE(review): field meanings are inferred from the sibling LevEditOp
 * struct (spos/dpos = source/destination position); len is presumably the
 * length of the matching run -- confirm against levenshtein.c. */
typedef struct {
    size_t spos;
    size_t dpos;
    size_t len;
} LevMatchingBlock;
|
103
|
+
|
104
|
+
size_t
|
105
|
+
lev_edit_distance(size_t len1,
|
106
|
+
const lev_byte *string1,
|
107
|
+
size_t len2,
|
108
|
+
const lev_byte *string2,
|
109
|
+
int xcost);
|
110
|
+
|
111
|
+
size_t
|
112
|
+
lev_u_edit_distance(size_t len1,
|
113
|
+
const lev_wchar *string1,
|
114
|
+
size_t len2,
|
115
|
+
const lev_wchar *string2,
|
116
|
+
int xcost);
|
117
|
+
|
118
|
+
LevEditOp *
|
119
|
+
lev_editops_find(size_t len1,
|
120
|
+
const lev_byte *string1,
|
121
|
+
size_t len2,
|
122
|
+
const lev_byte *string2,
|
123
|
+
size_t *n);
|
124
|
+
|
125
|
+
LevOpCode *
|
126
|
+
lev_editops_to_opcodes(size_t n,
|
127
|
+
const LevEditOp *ops,
|
128
|
+
size_t *nb,
|
129
|
+
size_t len1,
|
130
|
+
size_t len2);
|
131
|
+
|
132
|
+
LevMatchingBlock *
|
133
|
+
lev_opcodes_matching_blocks(size_t len1,
|
134
|
+
__attribute__((unused)) size_t len2,
|
135
|
+
size_t nb,
|
136
|
+
const LevOpCode *bops,
|
137
|
+
size_t *nmblocks);
|
138
|
+
|
139
|
+
LevMatchingBlock *
|
140
|
+
lev_editops_matching_blocks(size_t len1,
|
141
|
+
size_t len2,
|
142
|
+
size_t n,
|
143
|
+
const LevEditOp *ops,
|
144
|
+
size_t *nmblocks);
|
145
|
+
|
146
|
+
#endif
|
@@ -0,0 +1,32 @@
|
|
1
|
+
#include <string.h>
|
2
|
+
#include <math.h>
|
3
|
+
#include <wchar.h>
|
4
|
+
#include <stdlib.h>
|
5
|
+
|
6
|
+
#include "ruby.h"
|
7
|
+
#include "levenshtein.h"
|
8
|
+
|
9
|
+
VALUE MiniLevenshteinInternal = Qnil;
|
10
|
+
|
11
|
+
VALUE method_internal_distance(VALUE self, VALUE s1, VALUE s2, VALUE xcost);
|
12
|
+
|
13
|
+
void Init_mini_levenshtein()
|
14
|
+
{
|
15
|
+
MiniLevenshteinInternal = rb_define_module("MiniLevenshteinInternal");
|
16
|
+
rb_define_method(MiniLevenshteinInternal, "internal_distance", method_internal_distance, 3);
|
17
|
+
}
|
18
|
+
|
19
|
+
/*
 * Ruby-callable wrapper around lev_edit_distance().
 *
 * self   receiver (unused)
 * s1,s2  the strings to compare; they are handed to lev_edit_distance() as
 *        raw bytes with their byte lengths, so the result is byte-based
 * xcost  Integer forwarded unchanged as the xcost argument
 *
 * Returns the value of lev_edit_distance() as a Ruby Integer.
 * Raises TypeError if s1/s2 cannot be converted to String or xcost is not
 * an integer, and RangeError if xcost does not fit in a C int.
 */
VALUE method_internal_distance(VALUE self, VALUE s1, VALUE s2, VALUE xcost)
{
    (void)self;

    /* Type-check (and, if needed, convert via to_str) BEFORE touching the
     * string internals: RSTRING_LEN/RSTRING_PTR on a non-String VALUE is
     * undefined behaviour, and this method is reachable directly from Ruby
     * without the wrapper's validation. */
    StringValue(s1);
    StringValue(s2);

    /* NUM2INT validates type and range; FIX2INT would silently decode
     * whatever bits it is given. Done before the pointers are taken so no
     * Ruby-level conversion code runs while raw pointers are held. */
    int cost = NUM2INT(xcost);

    const lev_byte *str1 = RSTRING_PTR(s1);
    const lev_byte *str2 = RSTRING_PTR(s2);
    size_t len1 = (size_t)RSTRING_LEN(s1);
    size_t len2 = (size_t)RSTRING_LEN(s2);

    /* NOTE(review): upstream implementations of lev_edit_distance()
     * presumably report failure as (size_t)-1; converting through long keeps
     * that visible to Ruby as -1, as before -- confirm in levenshtein.c. */
    long distance = (long)lev_edit_distance(len1, str1, len2, str2, cost);

    /* LONG2NUM keeps the full width of the value; INT2NUM would narrow the
     * long to int first. */
    return LONG2NUM(distance);
}
|
@@ -0,0 +1,33 @@
|
|
1
|
+
require 'mini-levenshtein/mini_levenshtein'
|
2
|
+
|
3
|
+
# Public Ruby API on top of the C extension's +internal_distance+ method,
# which is mixed in from MiniLevenshteinInternal.
#
# The extension compares the raw bytes of the strings, so all results are
# byte-based rather than character-based.
module MiniLevenshtein
  extend MiniLevenshteinInternal

  class << self
    # Edit distance between two strings (calls the extension with xcost 0).
    #
    # @param string1 [String]
    # @param string2 [String]
    # @return [Integer]
    # @raise [TypeError] if either argument is not a String
    def distance(string1, string2)
      validate_string!(string1)
      validate_string!(string2)

      internal_distance(string1, string2, 0)
    end

    # Similarity ratio of two strings: 1.0 for identical (or both empty)
    # strings, otherwise (lensum - distance) / lensum, with the distance
    # computed using xcost 1.
    # NOTE(review): xcost 1 presumably makes a substitution cost 2, which is
    # what keeps the distance <= lensum; confirm in levenshtein.c.
    #
    # @param string1 [String]
    # @param string2 [String]
    # @return [Float]
    # @raise [TypeError] if either argument is not a String
    def ratio(string1, string2)
      validate_string!(string1)
      validate_string!(string2)

      # The extension measures the distance in bytes, so the total length
      # must be in bytes too. Using String#length (characters) made the
      # ratio drop below 0 for multibyte strings.
      lensum = string1.bytesize + string2.bytesize
      return 1.0 if lensum.zero?

      distance = internal_distance(string1, string2, 1)
      return 1.0 if distance.zero?

      (lensum - distance) / lensum.to_f
    end

    private

    # Raises TypeError unless +string+ is a String.
    def validate_string!(string)
      return if string.is_a?(String)

      raise TypeError, "no implicit conversion of #{string.class} to String"
    end
  end
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
lib = File.expand_path('lib', __dir__)
|
4
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
5
|
+
require 'mini-levenshtein/version'
|
6
|
+
|
7
|
+
# Gem definition for mini-levenshtein: identity, metadata links, packaged
# files and the native extension to build.
Gem::Specification.new do |spec|
  spec.name    = 'mini-levenshtein'
  spec.version = MiniLevenshtein::VERSION
  spec.authors = ['Delon Newman']
  spec.email   = 'contact@delonnewman.name'

  spec.summary     = 'Simple, fast, levenshtein distance'
  spec.description = spec.summary
  spec.homepage    = 'https://github.com/delonnewman/mini-levenshtein'
  spec.license     = 'MIT'

  # Without metadata support the push host cannot be restricted, so refuse
  # to continue.
  unless spec.respond_to?(:metadata)
    raise 'RubyGems 2.0 or newer is required to protect against public gem pushes.'
  end

  spec.metadata.merge!(
    'allowed_push_host' => 'https://rubygems.org',
    'homepage_uri' => spec.homepage,
    'source_code_uri' => spec.homepage,
    'changelog_uri' => "#{spec.homepage}#changelog",
    'documentation_uri' => "https://www.rubydoc.info/gems/#{spec.name}"
  )

  # Package every git-tracked file except tests, specs and features.
  tracked_files = `git ls-files -z`.split("\x0")
  spec.files = tracked_files.reject { |path| path.match(%r{^(test|spec|features)/}) }

  spec.extensions    = ['ext/mini_levenshtein/extconf.rb']
  spec.require_paths = ['lib']
end
|
metadata
ADDED
@@ -0,0 +1,63 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: mini-levenshtein
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Delon Newman
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2022-04-29 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: Simple, fast, levenshtein distance
|
14
|
+
email: contact@delonnewman.name
|
15
|
+
executables: []
|
16
|
+
extensions:
|
17
|
+
- ext/mini_levenshtein/extconf.rb
|
18
|
+
extra_rdoc_files: []
|
19
|
+
files:
|
20
|
+
- ".github/workflows/ruby.yml"
|
21
|
+
- ".gitignore"
|
22
|
+
- ".rspec"
|
23
|
+
- ".ruby-version"
|
24
|
+
- Gemfile
|
25
|
+
- Gemfile.lock
|
26
|
+
- README.md
|
27
|
+
- Rakefile
|
28
|
+
- ext/mini_levenshtein/extconf.rb
|
29
|
+
- ext/mini_levenshtein/levenshtein.c
|
30
|
+
- ext/mini_levenshtein/levenshtein.h
|
31
|
+
- ext/mini_levenshtein/mini_levenshtein.c
|
32
|
+
- lib/mini-levenshtein.rb
|
33
|
+
- lib/mini-levenshtein/version.rb
|
34
|
+
- mini-levenshtein.gemspec
|
35
|
+
homepage: https://github.com/delonnewman/mini-levenshtein
|
36
|
+
licenses:
|
37
|
+
- MIT
|
38
|
+
metadata:
|
39
|
+
allowed_push_host: https://rubygems.org
|
40
|
+
homepage_uri: https://github.com/delonnewman/mini-levenshtein
|
41
|
+
source_code_uri: https://github.com/delonnewman/mini-levenshtein
|
42
|
+
changelog_uri: https://github.com/delonnewman/mini-levenshtein#changelog
|
43
|
+
documentation_uri: https://www.rubydoc.info/gems/mini-levenshtein
|
44
|
+
post_install_message:
|
45
|
+
rdoc_options: []
|
46
|
+
require_paths:
|
47
|
+
- lib
|
48
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
49
|
+
requirements:
|
50
|
+
- - ">="
|
51
|
+
- !ruby/object:Gem::Version
|
52
|
+
version: '0'
|
53
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
54
|
+
requirements:
|
55
|
+
- - ">="
|
56
|
+
- !ruby/object:Gem::Version
|
57
|
+
version: '0'
|
58
|
+
requirements: []
|
59
|
+
rubygems_version: 3.2.3
|
60
|
+
signing_key:
|
61
|
+
specification_version: 4
|
62
|
+
summary: Simple, fast, levenshtein distance
|
63
|
+
test_files: []
|