dimus-taxamatch_rb 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.gitignore +6 -0
- data/LICENSE +20 -0
- data/README.rdoc +7 -0
- data/Rakefile +50 -0
- data/VERSION +1 -0
- data/features/step_definitions/common_steps.rb +163 -0
- data/features/step_definitions/taxamatch_rb.rb +92 -0
- data/features/support/common.rb +29 -0
- data/features/support/env.rb +14 -0
- data/features/support/matchers.rb +11 -0
- data/features/taxamatch_rb.feature +33 -0
- data/lib/taxamatch_rb/damerau_levenshtein_mod.rb +136 -0
- data/lib/taxamatch_rb/normalizer.rb +47 -0
- data/lib/taxamatch_rb/parser.rb +83 -0
- data/lib/taxamatch_rb/phonetizer.rb +74 -0
- data/lib/taxamatch_rb.rb +444 -0
- data/spec/damerau_levenshtein_mod_test.txt +58 -0
- data/spec/spec.opts +1 -0
- data/spec/spec_helper.rb +24 -0
- data/spec/taxamatch_rb_spec.rb +50 -0
- data/taxamatch_rb.gemspec +65 -0
- metadata +96 -0
data/.document
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2009 Dmitry Mozzherin
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.rdoc
ADDED
data/Rakefile
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
# Build script for the taxamatch_rb gem: packaging (Jeweler), specs, rcov
# coverage and RDoc generation.
require 'rubygems'
require 'rake'
require 'yaml'

begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "taxamatch_rb"
    gem.summary = %Q{TODO}
    gem.email = "dmozzherin@eol.org"
    gem.homepage = "http://github.com/dimus/taxamatch_rb"
    gem.authors = ["Dmitry Mozzherin"]
    gem.add_dependency('RubyInline')
    gem.add_dependency('dimus-biodiversity')
    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
  end

rescue LoadError
  # Packaging tasks are optional; the remaining tasks still work without Jeweler.
  puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
end

require 'spec/rake/spectask'

# rake spec -- runs every *_spec.rb under spec/.
Spec::Rake::SpecTask.new(:spec) do |spec|
  spec.libs << 'lib' << 'spec'
  spec.spec_files = FileList['spec/**/*_spec.rb']
end

# rake rcov -- same specs, run under rcov.
Spec::Rake::SpecTask.new(:rcov) do |spec|
  spec.libs << 'lib' << 'spec'
  spec.pattern = 'spec/**/*_spec.rb'
  spec.rcov = true
end


task :default => :spec

require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
  # The version may live in a structured VERSION.yml or in a plain VERSION
  # file holding just the version string; support both so the RDoc title
  # carries the version either way.
  version =
    if File.exist?('VERSION.yml')
      config = YAML.load(File.read('VERSION.yml'))
      "#{config[:major]}.#{config[:minor]}.#{config[:patch]}"
    elsif File.exist?('VERSION')
      File.read('VERSION').strip
    else
      ""
    end

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "taxamatch_rb #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
|
50
|
+
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.1.1
|
@@ -0,0 +1,163 @@
|
|
1
|
+
# Generic Cucumber step definitions for driving generators, executables and
# rake tasks, and for asserting on the files and output they produce.
#
# Steps that shell out capture the command's stdout AND stderr into one file
# whose path is kept in @stdout. The stream is merged with `2>&1`: redirecting
# both descriptors to the same path separately (`> f 2> f`) opens the file
# twice, and the two writers overwrite each other's output.

Given /^this project is active project folder/ do
  @active_project_folder = File.expand_path(File.dirname(__FILE__) + "/../..")
end

Given /^env variable \$([\w_]+) set to "(.*)"/ do |env_var, value|
  ENV[env_var] = value
end

Given /"(.*)" folder is deleted/ do |folder|
  in_project_folder { FileUtils.rm_rf folder }
end

When /^I invoke "(.*)" generator with arguments "(.*)"$/ do |generator, arguments|
  @stdout = StringIO.new
  in_project_folder do
    if Object.const_defined?("APP_ROOT")
      APP_ROOT.replace(FileUtils.pwd)
    else
      APP_ROOT = FileUtils.pwd
    end
    run_generator(generator, arguments.split(' '), SOURCES, :stdout => @stdout)
  end
  # Persist the captured generator output so it can be inspected after the run.
  File.open(File.join(@tmp_root, "generator.out"), "w") do |f|
    @stdout.rewind
    f << @stdout.read
  end
end

When /^I run executable "(.*)" with arguments "(.*)"/ do |executable, arguments|
  @stdout = File.expand_path(File.join(@tmp_root, "executable.out"))
  in_project_folder do
    system "#{executable} #{arguments} > #{@stdout} 2>&1"
  end
end

When /^I run project executable "(.*)" with arguments "(.*)"/ do |executable, arguments|
  @stdout = File.expand_path(File.join(@tmp_root, "executable.out"))
  in_project_folder do
    system "ruby #{executable} #{arguments} > #{@stdout} 2>&1"
  end
end

When /^I run local executable "(.*)" with arguments "(.*)"/ do |executable, arguments|
  @stdout = File.expand_path(File.join(@tmp_root, "executable.out"))
  executable = File.expand_path(File.join(File.dirname(__FILE__), "/../../bin", executable))
  in_project_folder do
    system "ruby #{executable} #{arguments} > #{@stdout} 2>&1"
  end
end

When /^I invoke task "rake (.*)"/ do |task|
  @stdout = File.expand_path(File.join(@tmp_root, "tests.out"))
  in_project_folder do
    system "rake #{task} --trace > #{@stdout} 2>&1"
  end
end

Then /^folder "(.*)" (is|is not) created/ do |folder, is|
  in_project_folder do
    File.exist?(folder).should(is == 'is' ? be_true : be_false)
  end
end

Then /^file "(.*)" (is|is not) created/ do |file, is|
  in_project_folder do
    File.exist?(file).should(is == 'is' ? be_true : be_false)
  end
end

Then /^file with name matching "(.*)" is created/ do |pattern|
  in_project_folder do
    Dir[pattern].should_not be_empty
  end
end

Then /^file "(.*)" contents (does|does not) match \/(.*)\// do |file, does, regex|
  in_project_folder do
    actual_output = File.read(file)
    if does == 'does'
      actual_output.should(match(/#{regex}/))
    else
      actual_output.should_not(match(/#{regex}/))
    end
  end
end

Then /gem file "(.*)" and generated file "(.*)" should be the same/ do |gem_file, project_file|
  File.exist?(gem_file).should be_true
  File.exist?(project_file).should be_true
  gem_file_contents = File.read(File.dirname(__FILE__) + "/../../#{gem_file}")
  project_file_contents = File.read(File.join(@active_project_folder, project_file))
  project_file_contents.should == gem_file_contents
end

Then /^(does|does not) invoke generator "(.*)"$/ do |does_invoke, generator|
  actual_output = File.read(@stdout)
  if does_invoke == "does"
    actual_output.should(match(/dependency\s+#{generator}/))
  else
    actual_output.should_not(match(/dependency\s+#{generator}/))
  end
end

Then /help options "(.*)" and "(.*)" are displayed/ do |opt1, opt2|
  actual_output = File.read(@stdout)
  actual_output.should match(/#{opt1}/)
  actual_output.should match(/#{opt2}/)
end

Then /^I should see$/ do |text|
  actual_output = File.read(@stdout)
  actual_output.should contain(text)
end

Then /^I should not see$/ do |text|
  actual_output = File.read(@stdout)
  actual_output.should_not contain(text)
end

Then /^I should see exactly$/ do |text|
  actual_output = File.read(@stdout)
  actual_output.should == text
end

Then /^I should see all (\d+) tests pass/ do |expected_test_count|
  expected = %r{^#{expected_test_count} tests, \d+ assertions, 0 failures, 0 errors}
  actual_output = File.read(@stdout)
  actual_output.should match(expected)
end

Then /^I should see all (\d+) examples pass/ do |expected_test_count|
  expected = %r{^#{expected_test_count} examples?, 0 failures}
  actual_output = File.read(@stdout)
  actual_output.should match(expected)
end

Then /^yaml file "(.*)" contains (\{.*\})/ do |file, yaml|
  in_project_folder do
    # NOTE(review): this evaluates text taken from the feature file as Ruby
    # code. Acceptable only because feature files are trusted project
    # sources; never route external input through this step.
    yaml = eval yaml
    YAML.load(File.read(file)).should == yaml
  end
end

Then /^Rakefile can display tasks successfully/ do
  @stdout = File.expand_path(File.join(@tmp_root, "rakefile.out"))
  in_project_folder do
    system "rake -T > #{@stdout} 2>&1"
  end
  actual_output = File.read(@stdout)
  actual_output.should match(/^rake\s+\w+\s+#\s.*/)
end

Then /^task "rake (.*)" is executed successfully/ do |task|
  @stdout.should_not be_nil
  actual_output = File.read(@stdout)
  actual_output.should_not match(/^Don't know how to build task '#{task}'/)
  actual_output.should_not match(/Error/i)
end

Then /^gem spec key "(.*)" contains \/(.*)\// do |key, regex|
  in_project_folder do
    gem_file = Dir["pkg/*.gem"].first
    gem_spec = Gem::Specification.from_yaml(`gem spec #{gem_file}`)
    spec_value = gem_spec.send(key.to_sym)
    spec_value.to_s.should match(/#{regex}/)
  end
end
|
@@ -0,0 +1,92 @@
|
|
1
|
+
# Step definitions for features/taxamatch_rb.feature.
#
# Each section keeps its scenario state in file-level locals that the step
# blocks close over; a Given step stores the input, the When step runs the
# code under test, and the Then step checks the stored result.

str1 = str2 = block_size = max_distance = distance = nil

###############
#DAMERAU LEVENSHTEIN MOD
###############

Given /^strings "([^\"]*)" and "([^\"]*)", transposition block size "([^\"]*)", and a maximum allowed distance "([^\"]*)"$/ do |a,b,c,d|
  str1 = a
  str2 = b
  block_size = c.to_i
  max_distance = d.to_i
end

When /^I run "([^\"]*)" instance function "([^\"]*)"$/ do |class_name, method_name|
  # Resolve the class by constant lookup instead of eval, and actually call
  # the method named in the step rather than a hard-coded one.
  matcher = Object.const_get(class_name).new
  distance = matcher.send(method_name, str1, str2, block_size, max_distance)
end

Then /^I should receive edit distance "([^\"]*)"$/ do |arg1|
  distance.should == arg1.to_i
end

#############
#PARSER
#############

sci_name = result = nil
parser = Parser.new

Given /^a name "([^\"]*)"$/ do |arg1|
  sci_name = arg1
end

When /^I run a Parser function parse$/ do
  result = parser.parse(sci_name)
end

Then /^I should receive "([^\"]*)" as genus epithet, "([^\"]*)" as species epithet, "([^\"]*)" and "([^\"]*)" as species authors, "([^\"]*)" as a species year$/ do |gen_val, sp_val, au_val1, au_val2, yr_val|
  result[:genus][:epitheton].should == gen_val
  result[:species][:epitheton].should == sp_val
  result[:species][:authors].include?(au_val1).should be_true
  result[:species][:authors].include?(au_val2).should be_true
  result[:species][:years].include?(yr_val).should be_true
end

#############
# NORMALIZER
#############

string = normalized_string = nil

Given /^a string "([^\"]*)"$/ do |arg1|
  string = arg1
end

When /^I run a Normalizer function normalize$/ do
  normalized_string = Normalizer.normalize(string)
end

Then /^I should receive "([^\"]*)" as a normalized form of the string$/ do |arg1|
  normalized_string.should == arg1
end

######
# PHONETIZER
#####

word = phonetized_word = nil

Given /^a word "([^\"]*)"$/ do |arg1|
  word = arg1
end

When /^I run a Phonetizer function near_match$/ do
  phonetized_word = Phonetizer.near_match(word)
end

Then /^I should receive "([^\"]*)" as a phonetic form of the word$/ do |arg1|
  phonetized_word.should == arg1
end


When /^I run a Phonetizer function near_match with an option normalize_ending$/ do
  phonetized_word = Phonetizer.near_match(word,true)
end

Then /^I should receive "([^\"]*)" as a normalized phonetic form of the word$/ do |arg1|
  phonetized_word.should == arg1
end
|
92
|
+
|
@@ -0,0 +1,29 @@
|
|
1
|
+
require 'fileutils'

# Directory and project helpers meant to be mixed into the Cucumber World.
#
# The methods read instance variables set elsewhere in the test run:
# @tmp_root, @home_path, @active_project_folder, @project_name and @lib_path.
module CommonHelpers
  # Runs the block with the working directory switched to @tmp_root.
  def in_tmp_folder(&block)
    FileUtils.chdir(@tmp_root, &block)
  end

  # Runs the block inside the active project folder, falling back to
  # @tmp_root when no active project folder has been set.
  def in_project_folder(&block)
    project_folder = @active_project_folder || @tmp_root
    FileUtils.chdir(project_folder, &block)
  end

  # Runs the block with the working directory switched to @home_path.
  def in_home_folder(&block)
    FileUtils.chdir(@home_path, &block)
  end

  # Rewrites <project_name>/Rakefile (relative to the current directory) so
  # that its first line prepends @lib_path to the load path, followed by the
  # file's previous contents.
  def force_local_lib_override(project_name = @project_name)
    rakefile_path = File.join(project_name, 'Rakefile')
    rakefile = File.read(rakefile_path)
    File.open(rakefile_path, "w+") do |f|
      f << "$:.unshift('#{@lib_path}')\n"
      f << rakefile
    end
  end

  # Records project_name and points @active_project_folder at the folder of
  # that name under @tmp_root. Returns project_name.
  def setup_active_project_folder(project_name)
    @active_project_folder = File.join(@tmp_root, project_name)
    @project_name = project_name
  end
end
|
28
|
+
|
29
|
+
World(CommonHelpers)
|
@@ -0,0 +1,14 @@
|
|
1
|
+
# Cucumber environment: loads the library under test and gives every scenario
# a fresh scratch directory with an isolated HOME.
require 'fileutils'
require File.dirname(__FILE__) + "/../../lib/taxamatch_rb"

gem 'cucumber'
require 'cucumber'
gem 'rspec'
require 'spec'

Before do
  # Expand to an absolute path so the helpers that chdir into @tmp_root keep
  # working regardless of the current working directory.
  @tmp_root = File.expand_path(File.dirname(__FILE__) + "/../../tmp")
  @home_path = File.join(@tmp_root, "home")
  # Start each scenario from an empty tmp tree.
  FileUtils.rm_rf @tmp_root
  FileUtils.mkdir_p @home_path
  ENV['HOME'] = @home_path
end
|
@@ -0,0 +1,11 @@
|
|
1
|
+
# Custom matchers mixed into the Cucumber World.
module Matchers
  # Matcher that succeeds when the actual value's #index finds +expected+
  # (for strings: when the actual text includes the expected text).
  def contain(expected)
    wanted = expected.inspect
    simple_matcher("contain #{wanted}") do |actual, result|
      shown = actual.inspect
      result.failure_message = "expected #{shown} to contain #{wanted}"
      result.negative_failure_message = "expected #{shown} not to contain #{wanted}"
      actual.index(expected)
    end
  end
end
|
10
|
+
|
11
|
+
World(Matchers)
|
@@ -0,0 +1,33 @@
|
|
1
|
+
Feature: Find if two scientific names are lexical variants of each other
|
2
|
+
|
3
|
+
As a Biodiversity Informatician
|
4
|
+
I want to be able to compare scientific names to determine if they are variants of the same name.
|
5
|
+
And I want to be able to combine names that are the same into lexical groups, so they appear together in names list
|
6
|
+
So I want to implement Tony Rees and Barbara Boehmer taxamatch algorithms http://bit.ly/boWyG
|
7
|
+
|
8
|
+
|
9
|
+
Scenario: find edit distance between two unicode (utf8) strings
|
10
|
+
Given strings "Sjostedt" and "Sojstedt", transposition block size "1", and a maximum allowed distance "4"
|
11
|
+
When I run "DamerauLevenshteinMod" instance function "distance"
|
12
|
+
Then I should receive edit distance "1"
|
13
|
+
|
14
|
+
Scenario: find parts of a name in unicode
|
15
|
+
Given a name "Arthopyrenia hyalospora (Banker) D. Hall 1988 hyalosporis Kutz 1999"
|
16
|
+
When I run a Parser function parse
|
17
|
+
Then I should receive "Arthopyrenia" as genus epithet, "hyalospora" as species epithet, "Banker" and "D. Hall" as species authors, "1988" as a species year
|
18
|
+
|
19
|
+
Scenario: normalize a string into ASCII upcase
|
20
|
+
Given a string "Choriozopella trägårdhi"
|
21
|
+
When I run a Normalizer function normalize
|
22
|
+
Then I should receive "CHORIOZOPELLA TRAGARDHI" as a normalized form of the string
|
23
|
+
|
24
|
+
Scenario: create phonetic version of a word
|
25
|
+
Given a word "bifasciata"
|
26
|
+
When I run a Phonetizer function near_match
|
27
|
+
Then I should receive "BIFASATA" as a phonetic form of the word
|
28
|
+
|
29
|
+
Scenario: create phonetic version of a species epithet normalizing ending
|
30
|
+
Given a word "bifasciatum"
|
31
|
+
When I run a Phonetizer function near_match with an option normalize_ending
|
32
|
+
Then I should receive "BIFASATA" as a normalized phonetic form of the word
|
33
|
+
|
@@ -0,0 +1,136 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require 'rubygems'
|
3
|
+
require 'inline'
|
4
|
+
require 'time'
|
5
|
+
|
6
|
+
# Modified Damerau-Levenshtein edit distance with block transpositions,
# implemented in C through RubyInline.
#
# Besides insertion, deletion and substitution, two adjacent blocks of up to
# block_size characters that swapped places are scored as one transposition.
class DamerauLevenshteinMod
  # Edit distance between two UTF-8 strings.
  #
  # str1, str2   - strings to compare; they are unpacked into code points.
  # block_size   - largest block length considered for a transposition.
  # max_distance - largest distance of interest.
  #
  # Returns the distance, or nil when it exceeds max_distance.
  def distance(str1, str2, block_size=2, max_distance=10)
    res = distance_utf(str1.unpack("U*"), str2.unpack("U*"), block_size, max_distance)
    (res > max_distance) ? nil : res
  end

  # distance_utf(s, t, block_size, max_distance) takes two arrays of integer
  # code points. It stops early once every cell of a column exceeds
  # max_distance and then returns that column's minimum, which is only
  # guaranteed to be greater than max_distance.
  inline do |builder|
    builder.c "
    static VALUE distance_utf(VALUE _s, VALUE _t, long block_size, long max_distance){
      long min, i, i1, j, j1, k, sl, tl, max_block, cost, *d, distance, ins, subs, transp, block, swap, current_distance;
      long stop_execution = 0;

      Check_Type(_s, T_ARRAY);
      Check_Type(_t, T_ARRAY);

      VALUE *sv = RARRAY_PTR(_s);
      VALUE *tv = RARRAY_PTR(_t);

      sl = RARRAY_LEN(_s);
      tl = RARRAY_LEN(_t);

      if (sl == 0) return LONG2NUM(tl);
      if (tl == 0) return LONG2NUM(sl);

      long s[sl];
      long t[tl];

      for (i=0; i < sl; i++) s[i] = NUM2LONG(sv[i]);
      for (i=0; i < tl; i++) t[i] = NUM2LONG(tv[i]);

      //a transposition needs two whole blocks inside each string
      max_block = block_size;
      if (max_block > sl / 2) max_block = sl / 2;
      if (max_block > tl / 2) max_block = tl / 2;

      sl++;
      tl++;

      //one-dimensional representation of a 2 dimensional array (len(s)+1) * (len(t)+1),
      //cell (i, j) lives at d[j*sl + i]; ALLOC_N raises NoMemoryError on failure
      d = ALLOC_N(long, sl * tl);

      //populate the 'vertical' border: distance from an empty prefix of s
      for(j = 0; j < tl; j++){
        d[j*sl] = j;
      }

      current_distance = 0;

      //fill up array with scores, one column (prefix of s) at a time
      for(i = 1; i<sl; i++){
        d[i] = i;
        //no cell can exceed sl + tl, so this is a safe starting maximum
        current_distance = sl + tl;
        for(j = 1; j<tl; j++){

          cost = 1;
          if(s[i-1] == t[j-1]) cost = 0;

          //deletion, insertion and substitution are always candidates
          min = d[j*sl + i - 1] + 1;
          ins = d[(j-1)*sl + i] + 1;
          if (ins < min) min = ins;
          subs = d[(j-1)*sl + i - 1] + cost;
          if (subs < min) min = subs;

          //look for the largest pair of swapped blocks ending at (i, j)
          for (block = max_block; block >= 1; block--){
            //both blocks must lie inside the prefixes compared so far
            if (i < block * 2 || j < block * 2) continue;
            i1 = i - (block * 2);
            j1 = j - (block * 2);
            swap = 1;
            for (k = 0; k < block; k++) {
              //first block of s equals second block of t and vice versa
              if (s[i1 + k] != t[j1 + block + k] || t[j1 + k] != s[i1 + block + k]){
                swap = 0;
                break;
              }
            }
            if (swap == 1){
              transp = d[j1 * sl + i1] + cost + block - 1;
              if (transp < min) min = transp;
              break;
            }
          }

          d[j*sl+i]=min;
          if (current_distance > min) current_distance = min;
        }
        //every cell of this column is already over the limit: give up
        if (current_distance > max_distance) {
          stop_execution = 1;
          break;
        }
      }
      distance=d[sl * tl - 1];
      if (stop_execution == 1) distance = current_distance;

      xfree(d);
      return LONG2NUM(distance);
    }
   "
  end
end
|
111
|
+
|
112
|
+
# When run directly: time 100000 comparisons through the string API and
# through the raw code-point API, then print one sample distance.
if $0 == __FILE__
  matcher = DamerauLevenshteinMod.new
  first_name = 'Cedarinia scabra Sjöstedt 1921'
  second_name = 'Cedarinia scabra Söjstedt 1921'
  s = first_name.unpack('U*')
  t = second_name.unpack('U*')

  # String API: the strings are unpacked on every call.
  started_at = Time.now
  100000.times do
    matcher.distance(first_name, second_name, 1, 10)
  end
  puts "with unpack time: #{Time.now - started_at} sec"

  # Code-point API: arrays unpacked once, outside the loop.
  started_at = Time.now
  100000.times do
    matcher.distance_utf(s, t, 1, 10)
  end
  puts "utf time: #{Time.now - started_at} sec"

  puts matcher.distance('sub', 'usb', 1, 10)
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
# Converts names to an ASCII form so that spelling variants that differ only
# in diacritics or ligatures compare equal.
module Normalizer
  # Pairs of [characters, ASCII replacement]. Every character of the first
  # string is replaced by the second string.
  # NOTE(review): the eszett is mapped to "B" rather than "ss"; this matches
  # the existing behaviour and is kept as is -- confirm it is intended.
  TRANSLITERATIONS = [
    ["ÀÂÅÃÄÁẤẠ", "A"],
    ["ÉÈÊË", "E"],
    ["ÍÌÎÏ", "I"],
    ["ÓÒÔØÕÖỚỔ", "O"],
    ["ÚÙÛÜ", "U"],
    ["Ý", "Y"],
    ["Æ", "AE"],
    ["ČÇ", "C"],
    ["ŠŞ", "S"],
    ["Đ", "D"],
    ["Ž", "Z"],
    ["Ñ", "N"],
    ["Œ", "OE"],
    ["ß", "B"],
    ["Ķ", "K"],
    ["áàâåãäăãắảạậầằ", "a"],
    ["éèêëĕěếệểễềẻ", "e"],
    ["íìîïǐĭīĩỉï", "i"],
    ["óòôøõöŏỏỗộơọỡốơồờớổ", "o"],
    ["úùûüůưừựủứụ", "u"],
    ["žź", "z"],
    ["ýÿỹ", "y"],
    ["đ", "d"],
    ["æ", "ae"],
    ["čćç", "c"],
    ["ñńň", "n"],
    ["œ", "oe"],
    ["śšş", "s"],
    ["ř", "r"],
    ["ğ", "g"],
    ["Ř", "R"]
  ].freeze

  # Single-character lookup table derived from TRANSLITERATIONS. The first
  # listed replacement wins if a character appears more than once.
  ASCII_MAP = TRANSLITERATIONS.each_with_object({}) do |(chars, ascii), map|
    chars.each_char { |char| map[char] ||= ascii }
  end.freeze

  # Returns +string+ with known accented characters and ligatures replaced
  # by ASCII and the result upcased.
  def self.normalize(string)
    utf8_to_ascii(string).upcase
  end

  # Like normalize, but additionally drops every character that is not an
  # uppercase ASCII letter, a period or a hyphen.
  def self.normalize_word(word)
    normalize(word).gsub(/[^A-Z\.\-]/, '')
  end

  # Returns a copy of +string+ in which every character listed in
  # TRANSLITERATIONS is replaced by its ASCII counterpart. Characters that
  # are not listed (including unlisted non-ASCII ones) are left untouched.
  # Done in one pass over the string instead of one pass per letter group.
  def self.utf8_to_ascii(string)
    string.gsub(/[^[:ascii:]]/) { |char| ASCII_MAP.fetch(char, char) }
  end
end
|