bmatch 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +34 -0
- data/.rspec +2 -0
- data/Gemfile +2 -0
- data/LICENSE +22 -0
- data/README.md +33 -0
- data/bmatch.gemspec +24 -0
- data/ext/Makefile +5 -0
- data/ext/build.sh +2 -0
- data/ext/extconf.rb +23 -0
- data/ext/similarities.go +115 -0
- data/lib/bmatch/similarities.rb +7 -0
- data/lib/bmatch/version.rb +3 -0
- data/spec/bmatch/similarities_spec.rb +34 -0
- data/spec/spec_helper.rb +30 -0
- metadata +101 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 731552d40e900fde0658bcd649f5d5fa95d903a7
|
4
|
+
data.tar.gz: adcb836f48404f2a7631740155bc0e9d2ad4cdf7
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 7e71ed9e202edb6d14c6d123fbf1b62c0976ea333c2ade13e0ef16a0fc5eedde824f1167fcf03e5b99406817140b1e82c65d7ea0d2128a8c4257dedf26613caa
|
7
|
+
data.tar.gz: 88a5715d6643d0fdac36e61d818f68289c37c7b44fb128ede9007ec6fb19a983638c6beec7b3c9b22e58d17e3a27244d6350b9ad3971d0fa033de59b164cec97
|
data/.gitignore
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
*.gem
|
2
|
+
*.rbc
|
3
|
+
/.config
|
4
|
+
/coverage/
|
5
|
+
/InstalledFiles
|
6
|
+
/pkg/
|
7
|
+
/spec/reports/
|
8
|
+
/test/tmp/
|
9
|
+
/test/version_tmp/
|
10
|
+
/tmp/
|
11
|
+
|
12
|
+
## Documentation cache and generated files:
|
13
|
+
/.yardoc/
|
14
|
+
/_yardoc/
|
15
|
+
/doc/
|
16
|
+
/rdoc/
|
17
|
+
|
18
|
+
## Environment normalisation:
|
19
|
+
/.bundle/
|
20
|
+
/vendor/bundle
|
21
|
+
/lib/bundler/man/
|
22
|
+
|
23
|
+
# for a library or gem, you might want to ignore these files since the code is
|
24
|
+
# intended to run in multiple environments; otherwise, check them in:
|
25
|
+
Gemfile.lock
|
26
|
+
.ruby-version
|
27
|
+
# .ruby-gemset
|
28
|
+
spec/examples.txt
|
29
|
+
|
30
|
+
# unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
|
31
|
+
.rvmrc
|
32
|
+
|
33
|
+
ext/*.so
|
34
|
+
ext/*.h
|
data/.rspec
ADDED
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2015 Udo Groebner
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|
22
|
+
|
data/README.md
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
# bmatch
|
2
|
+
|
3
|
+
[Build Status](https://semaphoreci.com/udo-groebner/bmatch)
|
5
|
+
[Gem Version](http://badge.fury.io/rb/bmatch)
|
7
|
+
|
8
|
+
Approximate string matching library (with native bindings)
|
9
|
+
|
10
|
+
Inspired by
|
11
|
+
* http://blog.paracode.com/2015/08/28/ruby-and-go-sitting-in-a-tree/
|
12
|
+
* https://github.com/flori/amatch
|
13
|
+
* https://blog.jcoglan.com/2012/07/29/your-first-ruby-native-extension-c/
|
14
|
+
* http://yorickpeterse.com/articles/hacking-extconf-rb/
|
15
|
+
|
16
|
+
Levenshtein implementation taken from: https://github.com/arbovm/levenshtein
|
17
|
+
|
18
|
+
## Why
|
19
|
+
|
20
|
+
Because amatch unfortunately produces segfaults. I hope golang-to-C doesn't do that.
|
21
|
+
|
22
|
+
## Usage
|
23
|
+
|
24
|
+
```ruby
|
25
|
+
require 'bmatch/similarities'
|
26
|
+
Similarities.levenshtein_distance("aa", "aü")
|
27
|
+
Similarities.dice_similarity("night", "nacht")
|
28
|
+
```
|
29
|
+
|
30
|
+
## Contributing
|
31
|
+
|
32
|
+
Just send me a github pull request.
|
33
|
+
|
data/bmatch.gemspec
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require File.expand_path('../lib/bmatch/version', __FILE__)
|
3
|
+
|
4
|
+
# Gem specification for bmatch: string similarity functions implemented in Go
# and loaded from Ruby through FFI.
Gem::Specification.new do |s|
  s.name        = "bmatch"
  s.version     = Bmatch::VERSION
  s.authors     = ["Udo Groebner"]
  s.email       = "udo.groebner@crealytics.com"
  s.homepage    = "https://github.com/udl/bmatch"
  s.licenses    = ["MIT"]
  s.summary     = "Library for string similarities."
  s.description = "Provides string similarity functions like levenshtein distance or dice's coefficient as native extensions."

  # `rubygems_version` records which RubyGems built the package; the
  # constraint on the installing RubyGems is `required_rubygems_version`.
  s.required_rubygems_version = ">= 1.3"
  s.required_ruby_version     = ">= 1.9.3"
  s.platform                  = Gem::Platform::RUBY

  # ext/extconf.rb compiles the Go sources into the shared library at install time.
  s.extensions = ['ext/extconf.rb']

  # Package every file tracked by git (requires building from a git checkout).
  s.files        = `git ls-files`.split("\n")
  s.require_path = 'lib'

  s.add_dependency 'ffi', '~> 1.9'
  s.add_development_dependency 'pry', '~> 0.10'
  s.add_development_dependency 'rspec', '~> 3.3'
end
|
data/ext/build.sh
ADDED
data/ext/extconf.rb
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
require 'mkmf'

# The shared library is compiled from Go, so a Go toolchain is mandatory.
# find_executable only reports whether the command is in $PATH (it returns
# nil when it is not), so the installation has to be stopped explicitly.
abort "bmatch: the 'go' executable was not found in $PATH" unless find_executable('go')

# Create a dummy extension file. Without this RubyGems would abort the
# installation process. On Linux this would result in the file "bmatch.so"
# being created in the current working directory.
#
# Normally the generated Makefile would take care of this but since we
# don't generate one we'll have to do this manually.
#
File.open(File.join(Dir.pwd, 'bmatch.' + RbConfig::CONFIG['DLEXT']), "w") {}

# Build the Go sources next to this file into similarities.so. The argument
# list form of system bypasses the shell, and its return value lets a failed
# build stop the installation instead of leaving a gem that cannot load its
# library.
Dir.chdir(File.dirname(__FILE__)) do
  built = system('go', 'build', '-buildmode=c-shared', '-o', 'similarities.so', 'similarities.go')
  abort "bmatch: 'go build' failed, similarities.so was not created" unless built
end

# This is normally set by calling create_makefile() but we don't need that
# method since we'll provide a dummy Makefile. Without setting this value
# RubyGems will abort the installation.
$makefile_created = true
|
data/ext/similarities.go
ADDED
@@ -0,0 +1,115 @@
|
|
1
|
+
package main
|
2
|
+
|
3
|
+
import (
|
4
|
+
"C"
|
5
|
+
)
|
6
|
+
|
7
|
+
type BiGram struct{ a, b rune }
|
8
|
+
|
9
|
+
func CreateBiGrams(s string) map[BiGram]bool {
|
10
|
+
result := make(map[BiGram]bool)
|
11
|
+
end := rune(0)
|
12
|
+
for i, c := range s {
|
13
|
+
if i > 0 {
|
14
|
+
result[BiGram{end, c}] = true
|
15
|
+
}
|
16
|
+
end = c
|
17
|
+
}
|
18
|
+
return result
|
19
|
+
}
|
20
|
+
|
21
|
+
//export dice_similarity
|
22
|
+
func dice_similarity(s1in, s2in *C.char) float32 {
|
23
|
+
s1 := C.GoString(s1in)
|
24
|
+
s2 := C.GoString(s2in)
|
25
|
+
bigram1, bigram2 := CreateBiGrams(s1), CreateBiGrams(s2)
|
26
|
+
// ruby only supports float32
|
27
|
+
var numIntersects float32
|
28
|
+
for first, _ := range bigram1 {
|
29
|
+
if bigram2[first] {
|
30
|
+
numIntersects += 1
|
31
|
+
}
|
32
|
+
}
|
33
|
+
return 2.0 * numIntersects / float32(len(bigram1)+len(bigram2))
|
34
|
+
}
|
35
|
+
|
36
|
+
// main is intentionally empty: this file is package main and is built with
// -buildmode=c-shared (see ext/extconf.rb), so only the exported functions
// are meant to be called.
func main() {}
|
37
|
+
|
38
|
+
// Everything below is taken, with slight adjustments so that it compiles as an exported function,
|
39
|
+
// from https://github.com/arbovm/levenshtein/blob/master/levenshtein.go
|
40
|
+
|
41
|
+
//Copyright (c) 2015, Arbo von Monkiewitsch All rights reserved.
|
42
|
+
|
43
|
+
//Redistribution and use in source and binary forms, with or without
|
44
|
+
//modification, are permitted provided that the following conditions are
|
45
|
+
//met:
|
46
|
+
|
47
|
+
//1. Redistributions of source code must retain the above copyright
|
48
|
+
//notice, this list of conditions and the following disclaimer.
|
49
|
+
|
50
|
+
//2. Redistributions in binary form must reproduce the above copyright
|
51
|
+
//notice, this list of conditions and the following disclaimer in the
|
52
|
+
//documentation and/or other materials provided with the distribution.
|
53
|
+
|
54
|
+
//3. Neither the name of the copyright holder nor the names of its
|
55
|
+
//contributors may be used to endorse or promote products derived from
|
56
|
+
//this software without specific prior written permission.
|
57
|
+
|
58
|
+
//THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
59
|
+
//"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
60
|
+
//LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
61
|
+
//A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
62
|
+
//HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
63
|
+
//SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
64
|
+
//LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
65
|
+
//DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
66
|
+
//THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
67
|
+
//(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
68
|
+
//OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
69
|
+
|
70
|
+
// Calculate the Levenshtein distance between two strings
|
71
|
+
|
72
|
+
//export levenshtein_distance
|
73
|
+
func levenshtein_distance(s1in, s2in *C.char) int {
|
74
|
+
s1 := C.GoString(s1in)
|
75
|
+
s2 := C.GoString(s2in)
|
76
|
+
var cost, lastdiag, olddiag int
|
77
|
+
len_s1 := len([]rune(s1))
|
78
|
+
len_s2 := len([]rune(s2))
|
79
|
+
|
80
|
+
column := make([]int, len_s1+1)
|
81
|
+
|
82
|
+
for y := 1; y <= len_s1; y++ {
|
83
|
+
column[y] = y
|
84
|
+
}
|
85
|
+
|
86
|
+
for x := 1; x <= len_s2; x++ {
|
87
|
+
column[0] = x
|
88
|
+
lastdiag = x - 1
|
89
|
+
for y := 1; y <= len_s1; y++ {
|
90
|
+
olddiag = column[y]
|
91
|
+
cost = 0
|
92
|
+
if s1[y-1] != s2[x-1] {
|
93
|
+
cost = 1
|
94
|
+
}
|
95
|
+
column[y] = min(
|
96
|
+
column[y]+1,
|
97
|
+
column[y-1]+1,
|
98
|
+
lastdiag+cost)
|
99
|
+
lastdiag = olddiag
|
100
|
+
}
|
101
|
+
}
|
102
|
+
return column[len_s1]
|
103
|
+
}
|
104
|
+
// min returns the smallest of its three integer arguments.
func min(a, b, c int) int {
	smallest := c
	if a < b && a < c {
		smallest = a
	} else if b <= a && b < c {
		smallest = b
	}
	return smallest
}
|
@@ -0,0 +1,7 @@
|
|
1
|
+
require 'ffi'
|
2
|
+
# FFI bindings for the string similarity functions exported by the Go shared
# library ext/similarities.so (built by ext/extconf.rb).
module Similarities
  extend FFI::Library

  # The library lives in the gem's ext directory, three levels up from this
  # file's path (lib/bmatch/similarities.rb).
  ffi_lib File.expand_path('../../../ext/similarities.so', __FILE__)

  # Both functions take two strings.
  two_strings = [:string, :string]

  # dice_similarity(a, b) -> Float
  attach_function :dice_similarity, two_strings, :float

  # levenshtein_distance(a, b) -> Integer
  attach_function :levenshtein_distance, two_strings, :int
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
require 'bmatch/similarities'
|
3
|
+
# Specs for the FFI-backed similarity functions.
#
# Float results are checked with a tolerance and integers with `eq`: the `be`
# matcher asserts object identity, which Ruby does not guarantee for Float
# values on every supported interpreter, and the native function returns a
# 32-bit float that is only approximately equal to the exact ratio.
RSpec.describe Similarities do
  tolerance = 1e-6

  context "#dice_similarity" do
    it "has a similarity of 1 for the same string" do
      expect(Similarities.dice_similarity("aa", "aa")).to be_within(tolerance).of(1.0)
    end

    it "has a similarity of 1/4 for 'night' and 'nacht'" do
      expect(Similarities.dice_similarity("night", "nacht")).to be_within(tolerance).of(0.25)
    end

    it "also works for utf-8" do
      # One shared bigram ("ch") out of 6 + 5 bigrams in total.
      expect(Similarities.dice_similarity("münchen", "munich")).to be_within(tolerance).of(2.0 / 11)
    end
  end

  context "#levenshtein_distance" do
    it "has a distance of 0 for the same string" do
      expect(Similarities.levenshtein_distance("aa", "aa")).to eq 0
    end

    it "has a distance of 1 for a one-off string" do
      expect(Similarities.levenshtein_distance("aa", "ab")).to eq 1
    end

    it "also works for utf-8" do
      expect(Similarities.levenshtein_distance("aa", "aü")).to eq 1
    end
  end
end
|
34
|
+
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
|
2
|
+
# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
|
3
|
+
# Global RSpec settings for the bmatch test suite.
RSpec.configure do |c|
  # Include chained clauses in the descriptions of custom matchers.
  c.expect_with(:rspec) do |expect_config|
    expect_config.include_chain_clauses_in_custom_matcher_descriptions = true
  end

  # Verify partial doubles.
  c.mock_with(:rspec) do |mock_config|
    mock_config.verify_partial_doubles = true
  end

  # Filter on :focus, but run everything when the filter matches nothing.
  c.filter_run :focus
  c.run_all_when_everything_filtered = true

  # File used to persist example status between runs.
  c.example_status_persistence_file_path = "spec/examples.txt"

  c.disable_monkey_patching!

  c.warnings = true

  # Use the 'doc' formatter by default when a single spec file is run.
  c.default_formatter = 'doc' if c.files_to_run.one?

  c.profile_examples = 10

  # Random example order; Kernel's random generator is seeded with the same
  # seed RSpec is configured with.
  c.order = :random
  Kernel.srand c.seed
end
|
metadata
ADDED
@@ -0,0 +1,101 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: bmatch
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.2
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Udo Groebner
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2015-10-02 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: ffi
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.9'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.9'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: pry
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0.10'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0.10'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rspec
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '3.3'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '3.3'
|
55
|
+
description: Provides string similarity functions like levenshtein distance or dice's
|
56
|
+
coefficient as native extensions.
|
57
|
+
email: udo.groebner@crealytics.com
|
58
|
+
executables: []
|
59
|
+
extensions:
|
60
|
+
- ext/extconf.rb
|
61
|
+
extra_rdoc_files: []
|
62
|
+
files:
|
63
|
+
- ".gitignore"
|
64
|
+
- ".rspec"
|
65
|
+
- Gemfile
|
66
|
+
- LICENSE
|
67
|
+
- README.md
|
68
|
+
- bmatch.gemspec
|
69
|
+
- ext/Makefile
|
70
|
+
- ext/build.sh
|
71
|
+
- ext/extconf.rb
|
72
|
+
- ext/similarities.go
|
73
|
+
- lib/bmatch/similarities.rb
|
74
|
+
- lib/bmatch/version.rb
|
75
|
+
- spec/bmatch/similarities_spec.rb
|
76
|
+
- spec/spec_helper.rb
|
77
|
+
homepage: https://github.com/udl/bmatch
|
78
|
+
licenses:
|
79
|
+
- MIT
|
80
|
+
metadata: {}
|
81
|
+
post_install_message:
|
82
|
+
rdoc_options: []
|
83
|
+
require_paths:
|
84
|
+
- lib
|
85
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: 1.9.3
|
90
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
91
|
+
requirements:
|
92
|
+
- - ">="
|
93
|
+
- !ruby/object:Gem::Version
|
94
|
+
version: '0'
|
95
|
+
requirements: []
|
96
|
+
rubyforge_project:
|
97
|
+
rubygems_version: 2.4.5.1
|
98
|
+
signing_key:
|
99
|
+
specification_version: 4
|
100
|
+
summary: Library for string similarities.
|
101
|
+
test_files: []
|