phonetics 1.0.0 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c1ad0894f34019de351475721cfba624c2873944993e82db0e72ee7dd6aef374
4
- data.tar.gz: 4590f9791a8409e9ef380949cc675c75507864050e72f616d4b3ac99f4dc4439
3
+ metadata.gz: b9f2be3c1987040125914d8a028218b495e6cd8c4c004820b4f9ffbcc9995eb8
4
+ data.tar.gz: 9b787ce78e582bfff9f515ab42b1f2d43de2b4224cb5d1a31831ddc8d1b5a672
5
5
  SHA512:
6
- metadata.gz: 60af744c78500372afb02dd22fdc1066cc7d2e83ab218eed798ff7d96f101cbc112eb0b19c6a694ec83d79e891c5ea41e0d8cda26ad5aab209aa87aed3ba9f2e
7
- data.tar.gz: 971566027a0b60cfad064f7c54c871a94830dae6c3210c77c306b31dbe0410c2687415c87686caecf9dd065812b3027b7bc6c48b69ec7fee3b01c56bfeff2644
6
+ metadata.gz: 4cd0298205c7e28785cdee86b95233ff9547317f4aa1bdee33f9fc3ca93fae636a6f1caf2b7a25061c02cf553aca9142d059f2eae19cb20ffb22b34ac3c2679e
7
+ data.tar.gz: e25d349bb3b6430f0b5001902ec9613974d60d880ccf85af4abb3ba191c73e7fb1edb3fd8c1b8af9cec39173d707a51c71d00a375de88490d7fd01092e402795
@@ -6,7 +6,16 @@ PATH
6
6
  GEM
7
7
  remote: https://rubygems.org/
8
8
  specs:
9
+ byebug (11.0.1)
10
+ coderay (1.1.2)
9
11
  diff-lcs (1.3)
12
+ method_source (0.9.2)
13
+ pry (0.12.2)
14
+ coderay (~> 1.1.0)
15
+ method_source (~> 0.9.0)
16
+ pry-byebug (3.7.0)
17
+ byebug (~> 11.0)
18
+ pry (~> 0.10)
10
19
  rake (12.3.2)
11
20
  rspec (3.8.0)
12
21
  rspec-core (~> 3.8.0)
@@ -28,6 +37,7 @@ PLATFORMS
28
37
  DEPENDENCIES
29
38
  bundler (~> 1.16)
30
39
  phonetics!
40
+ pry-byebug
31
41
  rake
32
42
  rspec (~> 3.0)
33
43
 
data/VERSION CHANGED
@@ -1 +1 @@
1
- 1.0.0
1
+ 1.1.0
@@ -0,0 +1,119 @@
1
+ require_relative '../phonetics'
2
+
3
+ # Using the Damerau version of the Levenshtein algorithm, with phonetic feature
4
+ # count used instead of a binary edit distance calculation
5
+ #
6
+ # This implementation is almost entirely taken from the damerau-levenshtein gem
7
+ # (https://github.com/GlobalNamesArchitecture/damerau-levenshtein/tree/master/ext/damerau_levenshtein).
8
+ # The implementation is modified based on "Using Phonologically Weighted
9
+ # Levenshtein Distances for the Prediction of Microscopic Intelligibility" by
10
+ # Lionel Fontan, Isabelle Ferrané, Jérôme Farinas, Julien Pinquier, Xavier
11
+ # Aumont, 2016
12
+ # https://hal.archives-ouvertes.fr/hal-01474904/document
13
module Phonetics
  # Phonetically weighted edit distance between two IPA strings.
  #
  # A dynamic-programming matrix is filled where rows follow the characters of
  # the second string and columns follow the characters of the first string.
  # Insertions and deletions always cost 1.0. A substitution costs whatever
  # Phonetics.distance_map[from][to] returns, and 1.0 when the map has no entry
  # for that pair.
  class Levenshtein
    # Builds and fills the distance matrix for the two strings.
    #
    # @param ipa_str1 [String] first string (indexes the matrix columns)
    # @param ipa_str2 [String] second string (indexes the matrix rows)
    def initialize(ipa_str1, ipa_str2)
      @str1 = ipa_str1
      @str2 = ipa_str2
      @len1 = ipa_str1.size
      @len2 = ipa_str2.size
      prepare_matrix
      set_edit_distances(ipa_str1, ipa_str2)
    end

    # The total weighted distance between the two strings.
    #
    # The answer is the bottom-right matrix cell. Reading it directly (rather
    # than backtracking through the matrix first) also makes two empty strings
    # yield 0 instead of raising on an empty backtrace.
    #
    # @return [Numeric] 0 for identical strings, otherwise the summed edit costs
    def distance
      @matrix[@len2][@len1]
    end

    # Convenience wrapper: distance between +str1+ and +str2+.
    #
    # @param str1 [String]
    # @param str2 [String]
    # @return [Numeric]
    def self.distance(str1, str2)
      new(str1, str2).distance
    end

    private

    # Backtracks from the bottom-right cell to the origin and returns one hash
    # per visited cell ({ distance:, type: }), ordered from the start of the
    # strings to the end.
    #
    # NOTE(review): the :ins and :del candidates are ranked by the raw value of
    # the neighbouring cell while the :subst candidate already includes the
    # substitution cost, so the reported :type values look unreliable once
    # substitution costs are weighted. Confirm before relying on them.
    def walk
      res = []
      cell = [@len2, @len1]
      while cell != [0, 0]
        cell, char = char_data(cell)
        res.unshift char
      end
      res
    end

    # Fills every cell outside the first row and first column.
    def set_edit_distances(str1, str2)
      (1..@len2).each do |i|
        (1..@len1).each do |j|
          if str2[i - 1] == str1[j - 1]
            no_change(i, j)
          else
            @matrix[i][j] = [del(i, j) + 1.0, ins(i, j) + 1.0, subst(i, j)].min
          end
        end
      end
    end

    # Describes +cell+ and picks the cell the backtrace moves to next.
    #
    # @return [Array] [previous_cell, { distance:, type: }]
    def char_data(cell)
      char = { distance: @matrix[cell[0]][cell[1]] }
      (previous_value, _priority), type, previous_cell = find_previous(cell)
      char[:type] = previous_value == char[:distance] ? :same : type
      [previous_cell, char]
    end

    # Collects the neighbours a backtrace may step to from +cell+. Only
    # neighbours that exist are evaluated, so nothing is read through a
    # wrapped negative index. The integer beside each value breaks ties:
    # substitution first, then insertion, then deletion.
    def find_previous(cell)
      i, j = cell
      candidates = []
      candidates << [[ins(i, j), 1], :ins, [i, j - 1]] if j.positive?
      candidates << [[del(i, j), 2], :del, [i - 1, j]] if i.positive?
      candidates << [[subst(i, j), 0], :subst, [i - 1, j - 1]] if i.positive? && j.positive?
      select_cell(candidates)
    end

    # Cheapest candidate whose target cell lies inside the matrix.
    def select_cell(candidates)
      candidates.
        select { |e| e[-1][0] >= 0 && e[-1][1] >= 0 }.
        min_by(&:first)
    end

    # TODO: Score the edit distance lower if sonorant sounds are found in sequence.
    # Value of the cell directly above (i, j).
    def del(i, j)
      @matrix[i - 1][j]
    end

    # Value of the cell directly to the left of (i, j).
    def ins(i, j)
      @matrix[i][j - 1]
    end

    # This is where we implement the modifications to Damerau-Levenshtein according to
    # https://hal.archives-ouvertes.fr/hal-01474904/document
    #
    # Cell (i, j) compares @str2[i - 1] with @str1[j - 1] (the matrix has an
    # extra leading row and column), so the score must be looked up for that
    # same pair of characters.
    def subst(i, j)
      map = Phonetics.distance_map[@str1[j - 1]]
      score = map[@str2[i - 1]] if map
      score ||= 1.0
      @matrix[i - 1][j - 1] + score
    end

    # Matching characters cost nothing: carry the diagonal value forward.
    def no_change(i, j)
      @matrix[i][j] = @matrix[i - 1][j - 1]
    end

    # Creates the matrix with row 0 set to 0..@len1 and column 0 set to
    # 0..@len2; every other cell starts as nil and is filled later.
    def prepare_matrix
      @matrix = [(0..@len1).to_a]
      (1..@len2).each { |row| @matrix << [row, *Array.new(@len1)] }
    end
  end
end
@@ -16,7 +16,8 @@ Gem::Specification.new do |spec|
16
16
  end
17
17
  spec.require_paths = ["lib"]
18
18
 
19
- spec.add_development_dependency "bundler", "~> 1.16"
20
- spec.add_development_dependency "rake"
21
- spec.add_development_dependency "rspec", "~> 3.0"
19
+ spec.add_development_dependency 'bundler', '~> 1.16'
20
+ spec.add_development_dependency 'rake'
21
+ spec.add_development_dependency "pry-byebug"
22
+ spec.add_development_dependency 'rspec', '~> 3.0'
22
23
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: phonetics
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jack Danger
@@ -38,6 +38,20 @@ dependencies:
38
38
  - - ">="
39
39
  - !ruby/object:Gem::Version
40
40
  version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: pry-byebug
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
41
55
  - !ruby/object:Gem::Dependency
42
56
  name: rspec
43
57
  requirement: !ruby/object:Gem::Requirement
@@ -69,8 +83,8 @@ files:
69
83
  - README.md
70
84
  - Rakefile
71
85
  - VERSION
72
- - lib/phonetic_levenshtein.rb
73
86
  - lib/phonetics.rb
87
+ - lib/phonetics/levenshtein.rb
74
88
  - lib/phonetics/version.rb
75
89
  - phonetics.gemspec
76
90
  homepage: https://github.com/JackDanger/phonetics
@@ -1,103 +0,0 @@
1
- require_relative './phonetics'
2
-
3
- # Using the Damerau version of the Levenshtein algorithm, with phonetic feature
4
- # count used instead of a binary edit distance calculation
5
- #
6
- # This implementation is almost entirely taken from the damerau-levenshtein gem
7
- # (https://github.com/GlobalNamesArchitecture/damerau-levenshtein/tree/master/ext/damerau_levenshtein).
8
- # The implementation is modified based on "Using Phonologically Weighted
9
- # Levenshtein Distances for the Prediction of Microscopic Intelligibility" by
10
- # Lionel Fontan, Isabelle Ferrané, Jérôme Farinas, Julien Pinquier, Xavier
11
- # Aumont, 2016
12
- # https://hal.archives-ouvertes.fr/hal-01474904/document
13
# Phonetically weighted edit distance between two IPA strings.
#
# A dynamic-programming matrix is filled where rows follow the characters of
# the second string and columns follow the characters of the first string.
# Insertions and deletions always cost 1.0. A substitution costs whatever
# Phonetics.distance_map[from][to] returns, and 1.0 when the map has no entry
# for that pair.
class PhoneticLevenshtein
  # Builds and fills the distance matrix for the two strings.
  #
  # @param ipa_str1 [String] first string (indexes the matrix columns)
  # @param ipa_str2 [String] second string (indexes the matrix rows)
  def initialize(ipa_str1, ipa_str2)
    @str1 = ipa_str1
    @str2 = ipa_str2
    @len1 = ipa_str1.size
    @len2 = ipa_str2.size
    prepare_matrix
    set_edit_distances(ipa_str1, ipa_str2)
  end

  # The total weighted distance between the two strings.
  #
  # The answer is the bottom-right matrix cell. Reading it directly (rather
  # than backtracking through the matrix first) also makes two empty strings
  # yield 0 instead of raising on an empty backtrace.
  #
  # @return [Numeric] 0 for identical strings, otherwise the summed edit costs
  def distance
    @matrix[@len2][@len1]
  end

  # Convenience wrapper: distance between +str1+ and +str2+.
  #
  # @param str1 [String]
  # @param str2 [String]
  # @return [Numeric]
  def self.distance(str1, str2)
    new(str1, str2).distance
  end

  private

  # Backtracks from the bottom-right cell to the origin and returns one hash
  # per visited cell ({ distance:, type: }), ordered from the start of the
  # strings to the end.
  #
  # NOTE(review): the :ins and :del candidates are ranked by the raw value of
  # the neighbouring cell while the :subst candidate already includes the
  # substitution cost, so the reported :type values look unreliable once
  # substitution costs are weighted. Confirm before relying on them.
  def walk
    res = []
    cell = [@len2, @len1]
    while cell != [0, 0]
      cell, char = char_data(cell)
      res.unshift char
    end
    res
  end

  # Fills every cell outside the first row and first column.
  def set_edit_distances(str1, str2)
    (1..@len2).each do |i|
      (1..@len1).each do |j|
        if str2[i - 1] == str1[j - 1]
          no_change(i, j)
        else
          @matrix[i][j] = [del(i, j) + 1.0, ins(i, j) + 1.0, subst(i, j)].min
        end
      end
    end
  end

  # Describes +cell+ and picks the cell the backtrace moves to next.
  #
  # @return [Array] [previous_cell, { distance:, type: }]
  def char_data(cell)
    char = { distance: @matrix[cell[0]][cell[1]] }
    (previous_value, _priority), type, previous_cell = find_previous(cell)
    char[:type] = previous_value == char[:distance] ? :same : type
    [previous_cell, char]
  end

  # Collects the neighbours a backtrace may step to from +cell+. Only
  # neighbours that exist are evaluated, so nothing is read through a
  # wrapped negative index. The integer beside each value breaks ties:
  # substitution first, then insertion, then deletion.
  def find_previous(cell)
    i, j = cell
    candidates = []
    candidates << [[ins(i, j), 1], :ins, [i, j - 1]] if j.positive?
    candidates << [[del(i, j), 2], :del, [i - 1, j]] if i.positive?
    candidates << [[subst(i, j), 0], :subst, [i - 1, j - 1]] if i.positive? && j.positive?
    select_cell(candidates)
  end

  # Cheapest candidate whose target cell lies inside the matrix.
  def select_cell(candidates)
    candidates.
      select { |e| e[-1][0] >= 0 && e[-1][1] >= 0 }.
      min_by(&:first)
  end

  # TODO: Score the edit distance lower if sonorant sounds are found in sequence.
  # Value of the cell directly above (i, j).
  def del(i, j)
    @matrix[i - 1][j]
  end

  # Value of the cell directly to the left of (i, j).
  def ins(i, j)
    @matrix[i][j - 1]
  end

  # This is where we implement the modifications to Damerau-Levenshtein according to
  # https://hal.archives-ouvertes.fr/hal-01474904/document
  #
  # Cell (i, j) compares @str2[i - 1] with @str1[j - 1] (the matrix has an
  # extra leading row and column), so the score must be looked up for that
  # same pair of characters.
  def subst(i, j)
    map = Phonetics.distance_map[@str1[j - 1]]
    score = map[@str2[i - 1]] if map
    score ||= 1.0
    @matrix[i - 1][j - 1] + score
  end

  # Matching characters cost nothing: carry the diagonal value forward.
  def no_change(i, j)
    @matrix[i][j] = @matrix[i - 1][j - 1]
  end

  # Creates the matrix with row 0 set to 0..@len1 and column 0 set to
  # 0..@len2; every other cell starts as nil and is filled later.
  def prepare_matrix
    @matrix = [(0..@len1).to_a]
    (1..@len2).each { |row| @matrix << [row, *Array.new(@len1)] }
  end
end