ngramdistance-ffi 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +4 -0
- data/CHANGELOG.markdown +3 -0
- data/Gemfile +8 -0
- data/README.markdown +21 -0
- data/Rakefile +19 -0
- data/VERSION +1 -0
- data/ext/ngramdistance/.gitignore +3 -0
- data/ext/ngramdistance/extconf.rb +2 -0
- data/ext/ngramdistance/ngramdistance.c +140 -0
- data/ext/ngramdistance/ngramdistance.h +1 -0
- data/lib/ngramdistance.rb +12 -0
- data/ngramdistance-ffi.gemspec +64 -0
- data/spec/ngramdistance_spec.rb +19 -0
- data/spec/spec_helper.rb +3 -0
- metadata +112 -0
data/.gitignore
ADDED
data/CHANGELOG.markdown
ADDED
data/Gemfile
ADDED
data/README.markdown
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
ngramdistance-ffi
|
2
|
+
===============
|
3
|
+
|
4
|
+
Converted to FFI by Bali for Ruby portability.
|
5
|
+
|
6
|
+
This gem was originally based on ngram distance.
|
7
|
+
|
8
|
+
Tested on:
|
9
|
+
|
10
|
+
* MRI 1.9.2
|
11
|
+
|
12
|
+
Known Issues
|
13
|
+
============
|
14
|
+
* The C extension uses `char*` strings, and so Unicode strings will give incorrect distances.
|
15
|
+
|
16
|
+
Including in Gemfile
|
17
|
+
====================
|
18
|
+
|
19
|
+
gem 'ngramdistance-ffi', :require => 'ngramdistance'
|
20
|
+
|
21
|
+
- end -
|
data/Rakefile
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
# Packaging tasks. Jeweler generates ngramdistance-ffi.gemspec from the values
# below, so they must match what is published: the gem name and homepage must
# not contain whitespace, and the contact details mirror the released gemspec.
begin
  require 'jeweler'

  Jeweler::Tasks.new do |gemspec|
    gemspec.name        = "ngramdistance-ffi"
    gemspec.summary     = "An FFI version of the ngramdistance gem."
    gemspec.description = "Provides a fast, cross-Ruby implementation of the ngramdistance distance algorithm."
    gemspec.email       = "mailbali@gmail.com"
    gemspec.homepage    = "https://github.com/pecbali/ngramdistance-ffi"
    gemspec.authors     = ["Bali"]

    gemspec.add_dependency "ffi"
    gemspec.add_development_dependency "rspec"
    gemspec.add_development_dependency "jeweler"
  end

  Jeweler::GemcutterTasks.new
rescue LoadError
  # Jeweler is only needed for packaging; keep the Rakefile loadable without it.
  puts "Jeweler not available. Install it with: gem install jeweler"
end
|
19
|
+
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
1.0.0
|
@@ -0,0 +1,140 @@
|
|
1
|
+
#include <string.h>
|
2
|
+
#include <stdlib.h>
|
3
|
+
#include <stdio.h>
|
4
|
+
#ifdef NGRAM_CASE_INSENSITIVE
|
5
|
+
#include <ctype.h>
|
6
|
+
#define eq(x, y) (tolower(x) == tolower(y))
|
7
|
+
#else
|
8
|
+
#define eq(x, y) ((x) == (y))
|
9
|
+
#endif
|
10
|
+
|
11
|
+
#ifndef max
|
12
|
+
#define max( a, b ) ( ((a) > (b)) ? (a) : (b) )
|
13
|
+
#endif
|
14
|
+
|
15
|
+
#ifndef min
|
16
|
+
#define min( a, b ) ( ((a) < (b)) ? (a) : (b) )
|
17
|
+
#endif
|
18
|
+
|
19
|
+
|
20
|
+
/*
 * Return a newly allocated, NUL-terminated copy of the `len` bytes of `str`
 * that start at offset `begin`.
 *
 * Returns NULL when `str` is NULL or empty, when the requested range does not
 * lie entirely inside the string, or when the allocation fails. The caller
 * owns the returned buffer and must free() it.
 */
char* substring(const char* str, size_t begin, size_t len)
{
    if (str == NULL)
        return NULL;

    /* Measure once instead of on every comparison. */
    const size_t total = strlen(str);

    /* Phrased as a subtraction so that begin + len can never wrap around. */
    if (total == 0 || begin > total || len > total - begin)
        return NULL;

    /* Plain ISO C copy; strndup() is POSIX and not declared under -std=c11. */
    char *copy = malloc(len + 1);
    if (copy == NULL)
        return NULL;

    memcpy(copy, str + begin, len);
    copy[len] = '\0';
    return copy;
}
|
27
|
+
|
28
|
+
|
29
|
+
/* Compare two characters; case is ignored when NGRAM_CASE_INSENSITIVE is defined. */
static int ngram_char_eq(char a, char b)
{
#ifdef NGRAM_CASE_INSENSITIVE
    /* <ctype.h> functions require a value representable as unsigned char. */
    return tolower((unsigned char)a) == tolower((unsigned char)b);
#else
    return a == b;
#endif
}

/* Smallest of three floats. */
static float ngram_min3(float a, float b, float c)
{
    float m = (a < b) ? a : b;
    return (m < c) ? m : c;
}

/*
 * N-gram similarity of two C strings.
 *
 * The result is 1.0f minus the n-gram edit distance divided by the length of
 * the longer string, so identical strings score 1.0f and lower scores mean
 * less similar strings.
 *
 * source, target  NUL-terminated byte strings; NULL is treated as "".
 * n               n-gram size; values below 1 are treated as 1.
 *
 * Special cases:
 *   - both strings empty            -> 1.0f
 *   - exactly one string empty      -> 0.0f
 *   - either string shorter than n  -> fraction of positions that hold equal
 *                                      characters, relative to the longer length
 *   - memory allocation failure     -> 0.0f
 */
float ngramdistance (const char *source, const char *target,int n) {
    /* Treat NULL as the empty string rather than crashing in strlen(). */
    if (source == NULL) source = "";
    if (target == NULL) target = "";

    const size_t sl = strlen(source);
    const size_t tl = strlen(target);

    if (sl == 0 || tl == 0)
        return (sl == tl) ? 1.0f : 0.0f;

    /* An n-gram size below 1 would divide by zero further down. */
    const size_t gram = (n < 1) ? 1 : (size_t)n;
    const size_t longest = (sl > tl) ? sl : tl;

    if (sl < gram || tl < gram) {
        /* Too short to form a single n-gram: count position-wise matches. */
        const size_t shortest = (sl < tl) ? sl : tl;
        size_t matches = 0;
        for (size_t i = 0; i < shortest; i++) {
            if (ngram_char_eq(source[i], target[i]))
                matches++;
        }
        return (float)matches / (float)longest;
    }

    const size_t pad = gram - 1;   /* number of prefix (NUL) characters */
    float similarity = 0.0f;

    /* calloc zero-fills, which already provides the NUL prefix of sa. */
    char *sa = calloc(sl + pad + 1, sizeof *sa);   /* prefix + source          */
    char *t_j = calloc(gram + 1, sizeof *t_j);     /* n-gram of target ending at j */
    float *p = calloc(sl + 1, sizeof *p);          /* previous cost row        */
    float *d = calloc(sl + 1, sizeof *d);          /* current cost row         */

    if (sa == NULL || t_j == NULL || p == NULL || d == NULL)
        goto done;

    memcpy(sa + pad, source, sl);

    for (size_t i = 0; i <= sl; i++)
        p[i] = (float)i;

    for (size_t j = 1; j <= tl; j++) {
        /* Build the n-gram of target that ends at position j in a reused
         * buffer; early n-grams are left-padded with the NUL prefix. */
        if (j < gram) {
            const size_t lead = gram - j;
            memset(t_j, 0, lead);
            memcpy(t_j + lead, target, j);
        } else {
            memcpy(t_j, target + (j - gram), gram);
        }

        d[0] = (float)j;

        for (size_t i = 1; i <= sl; i++) {
            const char *window = sa + (i - 1);   /* n-gram of source ending at i */
            int cost = 0;
            int tn = (int)gram;

            for (size_t k = 0; k < gram; k++) {
                if (!ngram_char_eq(window[k], t_j[k])) {
                    cost++;
                } else if (ngram_char_eq(window[k], 0)) {
                    tn--;   /* discount matches on the prefix */
                }
            }

            /* tn stays >= 1: a window holds at most n-1 prefix characters. */
            const float ec = (float)cost / (float)tn;

            /* minimum of: left cell + 1, upper cell + 1, diagonal cell + ec */
            d[i] = ngram_min3(d[i - 1] + 1, p[i] + 1, p[i - 1] + ec);
        }

        /* The row just computed becomes the previous row. */
        float *swap = p;
        p = d;
        d = swap;
    }

    /* After the final swap p holds the most recent costs. */
    similarity = 1.0f - (p[sl] / (float)longest);

done:
    free(p);
    free(d);
    free(t_j);
    free(sa);
    return similarity;
}
|
128
|
+
|
129
|
+
|
130
|
+
#ifdef TEST
|
131
|
+
#include <stdio.h>
|
132
|
+
#include "ngramdistance.h"
|
133
|
+
|
134
|
+
/*
 * Command-line driver, built only when TEST is defined: prints the 3-gram
 * score of the first two arguments. Returns -1 when fewer than two words
 * are supplied.
 */
int main (int argc, char **argv) {
    if (argc >= 3) {
        const char *first = argv[1];
        const char *second = argv[2];
        float distance = ngramdistance(first, second, 3);

        printf("%s vs %s: %f\n", first, second, distance);
        return 0;
    }

    return -1;
}
|
140
|
+
#endif
|
@@ -0,0 +1 @@
|
|
1
|
+
#ifndef NGRAMDISTANCE_H
#define NGRAMDISTANCE_H

/*
 * N-gram similarity of `source` and `target` using n-grams of size `n`.
 * The return type is float, matching the definition in ngramdistance.c and
 * the :float return type used by the Ruby FFI binding.
 */
float ngramdistance(const char *source, const char *target, int n);

#endif /* NGRAMDISTANCE_H */
|
@@ -0,0 +1,12 @@
|
|
1
|
+
require 'ffi'
|
2
|
+
|
3
|
+
# FFI binding for the native ngramdistance library built under ext/.
module NGramDistance
  extend FFI::Library

  # The compiled library's extension differs per platform, so every known
  # suffix is offered to FFI as an alternative, in this order.
  base_path = File.dirname(__FILE__) + "/../ext/ngramdistance/ngramdistance"
  suffixes = ['.bundle', '.so', '.dylib', '']
  ffi_lib(suffixes.collect { |suffix| base_path + suffix })

  # NGramDistance.distance(source, target, n) calls the C function
  # ngramdistance(const char *, const char *, int) and returns its float result.
  attach_function :distance, :ngramdistance, [:string, :string, :int], :float
end
|
@@ -0,0 +1,64 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
Gem::Specification.new do |s|
|
7
|
+
s.name = %q{ngramdistance-ffi}
|
8
|
+
s.version = "1.0.0"
|
9
|
+
|
10
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
|
+
s.authors = ["Bali"]
|
12
|
+
s.date = %q{2012-06-18}
|
13
|
+
s.description = %q{Provides a fast, cross-Ruby implementation of the ngramdistance distance algorithm.}
|
14
|
+
s.email = %q{mailbali@gmail.com}
|
15
|
+
s.extensions = ["ext/ngramdistance/extconf.rb"]
|
16
|
+
s.extra_rdoc_files = [
|
17
|
+
"README.markdown"
|
18
|
+
]
|
19
|
+
s.files = [
|
20
|
+
".gitignore",
|
21
|
+
"CHANGELOG.markdown",
|
22
|
+
"Gemfile",
|
23
|
+
"README.markdown",
|
24
|
+
"Rakefile",
|
25
|
+
"VERSION",
|
26
|
+
"ext/ngramdistance/.gitignore",
|
27
|
+
"ext/ngramdistance/extconf.rb",
|
28
|
+
"ext/ngramdistance/ngramdistance.c",
|
29
|
+
"ext/ngramdistance/ngramdistance.h",
|
30
|
+
"ngramdistance-ffi.gemspec",
|
31
|
+
"lib/ngramdistance.rb",
|
32
|
+
"spec/ngramdistance_spec.rb",
|
33
|
+
"spec/spec_helper.rb"
|
34
|
+
]
|
35
|
+
s.homepage = %q{https://github.com/pecbali/ngramdistance-ffi}
|
36
|
+
s.rdoc_options = ["--charset=UTF-8"]
|
37
|
+
s.require_paths = ["lib"]
|
38
|
+
s.rubygems_version = %q{1.3.7}
|
39
|
+
s.summary = %q{An FFI version of the ngramdistance gem.}
|
40
|
+
s.test_files = [
|
41
|
+
"spec/ngramdistance_spec.rb",
|
42
|
+
"spec/spec_helper.rb"
|
43
|
+
]
|
44
|
+
|
45
|
+
if s.respond_to? :specification_version then
|
46
|
+
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
47
|
+
s.specification_version = 3
|
48
|
+
|
49
|
+
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
50
|
+
s.add_runtime_dependency(%q<ffi>, [">= 0"])
|
51
|
+
s.add_development_dependency(%q<rspec>, [">= 0"])
|
52
|
+
s.add_development_dependency(%q<jeweler>, [">= 0"])
|
53
|
+
else
|
54
|
+
s.add_dependency(%q<ffi>, [">= 0"])
|
55
|
+
s.add_dependency(%q<rspec>, [">= 0"])
|
56
|
+
s.add_dependency(%q<jeweler>, [">= 0"])
|
57
|
+
end
|
58
|
+
else
|
59
|
+
s.add_dependency(%q<ffi>, [">= 0"])
|
60
|
+
s.add_dependency(%q<rspec>, [">= 0"])
|
61
|
+
s.add_dependency(%q<jeweler>, [">= 0"])
|
62
|
+
end
|
63
|
+
end
|
64
|
+
|
@@ -0,0 +1,19 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe NGramDistance do
  # Each row: [first word, second word, expected score using 3-grams].
  fixtures = [
    ["university", "univearsitty", 0.750000],
    ["university", "university", 1.0],
    ["hello", "jello", 0.633333],
    ["hello", "heloll", 0.666667],
    ["hello", "saint", 0.0000],
    ["hello", "", 0.0000]
  ]

  fixtures.each do |w1, w2, d|
    it "should calculate a distance of #{d} between #{w1} and #{w2}" do
      # Checked in both argument orders, each within the same tolerance.
      NGramDistance.distance(w1, w2, 3).should be_within(0.05).of(d)
      NGramDistance.distance(w2, w1, 3).should be_within(0.05).of(d)
    end
  end
end
|
data/spec/spec_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,112 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: ngramdistance-ffi
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Bali
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-06-18 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: ffi
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: rspec
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
type: :development
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: jeweler
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
54
|
+
type: :development
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
description: Provides a fast, cross-Ruby implementation of the ngramdistance distance
|
63
|
+
algorithm.
|
64
|
+
email: mailbali@gmail.com
|
65
|
+
executables: []
|
66
|
+
extensions:
|
67
|
+
- ext/ngramdistance/extconf.rb
|
68
|
+
extra_rdoc_files:
|
69
|
+
- README.markdown
|
70
|
+
files:
|
71
|
+
- .gitignore
|
72
|
+
- CHANGELOG.markdown
|
73
|
+
- Gemfile
|
74
|
+
- README.markdown
|
75
|
+
- Rakefile
|
76
|
+
- VERSION
|
77
|
+
- ext/ngramdistance/.gitignore
|
78
|
+
- ext/ngramdistance/extconf.rb
|
79
|
+
- ext/ngramdistance/ngramdistance.c
|
80
|
+
- ext/ngramdistance/ngramdistance.h
|
81
|
+
- ngramdistance-ffi.gemspec
|
82
|
+
- lib/ngramdistance.rb
|
83
|
+
- spec/ngramdistance_spec.rb
|
84
|
+
- spec/spec_helper.rb
|
85
|
+
homepage: https://github.com/pecbali/ngramdistance-ffi
|
86
|
+
licenses: []
|
87
|
+
post_install_message:
|
88
|
+
rdoc_options:
|
89
|
+
- --charset=UTF-8
|
90
|
+
require_paths:
|
91
|
+
- lib
|
92
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
93
|
+
none: false
|
94
|
+
requirements:
|
95
|
+
- - ! '>='
|
96
|
+
- !ruby/object:Gem::Version
|
97
|
+
version: '0'
|
98
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
99
|
+
none: false
|
100
|
+
requirements:
|
101
|
+
- - ! '>='
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
requirements: []
|
105
|
+
rubyforge_project:
|
106
|
+
rubygems_version: 1.8.24
|
107
|
+
signing_key:
|
108
|
+
specification_version: 3
|
109
|
+
summary: An FFI version of the ngramdistance gem.
|
110
|
+
test_files:
|
111
|
+
- spec/ngramdistance_spec.rb
|
112
|
+
- spec/spec_helper.rb
|