phonetics 1.0.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c1ad0894f34019de351475721cfba624c2873944993e82db0e72ee7dd6aef374
4
- data.tar.gz: 4590f9791a8409e9ef380949cc675c75507864050e72f616d4b3ac99f4dc4439
3
+ metadata.gz: b9f2be3c1987040125914d8a028218b495e6cd8c4c004820b4f9ffbcc9995eb8
4
+ data.tar.gz: 9b787ce78e582bfff9f515ab42b1f2d43de2b4224cb5d1a31831ddc8d1b5a672
5
5
  SHA512:
6
- metadata.gz: 60af744c78500372afb02dd22fdc1066cc7d2e83ab218eed798ff7d96f101cbc112eb0b19c6a694ec83d79e891c5ea41e0d8cda26ad5aab209aa87aed3ba9f2e
7
- data.tar.gz: 971566027a0b60cfad064f7c54c871a94830dae6c3210c77c306b31dbe0410c2687415c87686caecf9dd065812b3027b7bc6c48b69ec7fee3b01c56bfeff2644
6
+ metadata.gz: 4cd0298205c7e28785cdee86b95233ff9547317f4aa1bdee33f9fc3ca93fae636a6f1caf2b7a25061c02cf553aca9142d059f2eae19cb20ffb22b34ac3c2679e
7
+ data.tar.gz: e25d349bb3b6430f0b5001902ec9613974d60d880ccf85af4abb3ba191c73e7fb1edb3fd8c1b8af9cec39173d707a51c71d00a375de88490d7fd01092e402795
@@ -6,7 +6,16 @@ PATH
6
6
  GEM
7
7
  remote: https://rubygems.org/
8
8
  specs:
9
+ byebug (11.0.1)
10
+ coderay (1.1.2)
9
11
  diff-lcs (1.3)
12
+ method_source (0.9.2)
13
+ pry (0.12.2)
14
+ coderay (~> 1.1.0)
15
+ method_source (~> 0.9.0)
16
+ pry-byebug (3.7.0)
17
+ byebug (~> 11.0)
18
+ pry (~> 0.10)
10
19
  rake (12.3.2)
11
20
  rspec (3.8.0)
12
21
  rspec-core (~> 3.8.0)
@@ -28,6 +37,7 @@ PLATFORMS
28
37
  DEPENDENCIES
29
38
  bundler (~> 1.16)
30
39
  phonetics!
40
+ pry-byebug
31
41
  rake
32
42
  rspec (~> 3.0)
33
43
 
data/VERSION CHANGED
@@ -1 +1 @@
1
- 1.0.0
1
+ 1.1.0
@@ -0,0 +1,119 @@
1
+ require_relative '../phonetics'
2
+
3
+ # Using the Damerau version of the Levenshtein algorithm, with phonetic feature
4
+ # count used instead of a binary edit distance calculation
5
+ #
6
+ # This implementation is almost entirely taken from the damerau-levenshtein gem
7
+ # (https://github.com/GlobalNamesArchitecture/damerau-levenshtein/tree/master/ext/damerau_levenshtein).
8
+ # The implementation is modified based on "Using Phonologically Weighted
9
+ # Levenshtein Distances for the Prediction of Microscopic Intelligibility" by
10
+ # Lionel Fontan, Isabelle Ferrané, Jérôme Farinas, Julien Pinquier, Xavier
11
+ # Aumont, 2016
12
+ # https://hal.archives-ouvertes.fr/hal-01474904/document
13
module Phonetics
  # Phonetically weighted edit distance between two IPA strings.
  #
  # The distance is computed with a dynamic-programming matrix in which
  # rows follow the characters of the second string and columns follow the
  # characters of the first string. Cell [i][j] holds the distance between
  # the first i characters of the second string and the first j characters
  # of the first string.
  #
  # Costs:
  # * insertion / deletion: 1.0
  # * substitution: the value found in Phonetics.distance_map for the pair
  #   (character of the first string, character of the second string), or
  #   1.0 when the map has no entry for that pair.
  #
  # Characters are addressed with String#[]; no transposition step is
  # implemented in this class.
  class Levenshtein
    # Builds and fills the distance matrix for the two strings.
    #
    # @param ipa_str1 [String] first string (matrix columns)
    # @param ipa_str2 [String] second string (matrix rows)
    def initialize(ipa_str1, ipa_str2)
      @str1 = ipa_str1
      @str2 = ipa_str2
      @len1 = ipa_str1.size
      @len2 = ipa_str2.size
      prepare_matrix
      set_edit_distances(ipa_str1, ipa_str2)
    end

    # @return [Numeric] the weighted distance between the two strings; this
    #   is the bottom-right matrix cell. Reading it directly (instead of
    #   backtracking through the matrix) also yields 0 for two empty strings,
    #   where the backtrace is empty.
    def distance
      @matrix[@len2][@len1]
    end

    # Convenience wrapper around +new(str1, str2).distance+.
    #
    # @param str1 [String]
    # @param str2 [String]
    # @return [Numeric]
    def self.distance(str1, str2)
      new(str1, str2).distance
    end

    private

    # Backtracks from the bottom-right cell to [0, 0].
    #
    # @return [Array<Hash>] one hash per visited cell, ordered from the
    #   start of the strings to the end, each with :distance and :type keys.
    #
    # NOTE(review): the value compared in #char_data for a substitution
    # already includes the substitution cost, so the :type labels look
    # unreliable; confirm before depending on them.
    def walk
      res = []
      cell = [@len2, @len1]
      while cell != [0, 0]
        cell, char = char_data(cell)
        res.unshift char
      end
      res
    end

    # Fills every cell that is not on the first row or first column.
    def set_edit_distances(str1, str2)
      (1..@len2).each do |i|
        (1..@len1).each do |j|
          if str2[i - 1] == str1[j - 1]
            no_change(i, j)
          else
            @matrix[i][j] = [del(i, j) + 1.0, ins(i, j) + 1.0, subst(i, j)].min
          end
        end
      end
    end

    # Describes one cell of the backtrace and picks the cell to visit next.
    #
    # @param cell [Array(Integer, Integer)] [row, column]
    # @return [Array] the next cell and a hash with :distance and :type
    def char_data(cell)
      char = { distance: @matrix[cell[0]][cell[1]] }
      (previous_value, _priority), type, previous_cell = find_previous(cell)
      char[:type] = previous_value == char[:distance] ? :same : type
      [previous_cell, char]
    end

    # Lists the three neighbouring cells as
    # [[value, tie-break priority], operation, [row, column]] and returns
    # the best one that lies inside the matrix.
    def find_previous(cell)
      row, col = cell
      candidates = [
        [[ins(row, col), 1], :ins, [row, col - 1]],
        [[del(row, col), 2], :del, [row - 1, col]],
        [[subst(row, col), 0], :subst, [row - 1, col - 1]],
      ]
      select_cell(candidates)
    end

    # Drops candidates whose target cell has a negative coordinate, then
    # returns the one with the smallest [value, priority] pair.
    def select_cell(candidates)
      candidates.
        select { |_cost, _type, (row, col)| row >= 0 && col >= 0 }.
        min_by(&:first)
    end

    # TODO: Score the edit distance lower if sonorant sounds are found in sequence.
    # Value of the cell directly above (i, j).
    def del(i, j)
      @matrix[i - 1][j]
    end

    # Value of the cell directly to the left of (i, j).
    def ins(i, j)
      @matrix[i][j - 1]
    end

    # This is where we implement the modifications to Damerau-Levenshtein according to
    # https://hal.archives-ouvertes.fr/hal-01474904/document
    #
    # Returns the diagonal neighbour's value plus the substitution cost for
    # the characters that cell (i, j) stands for. The matrix is 1-based
    # relative to the strings, so those characters are @str1[j - 1] and
    # @str2[i - 1].
    def subst(i, j)
      map = Phonetics.distance_map[@str1[j - 1]]
      score = map[@str2[i - 1]] if map
      score ||= 1.0
      @matrix[i - 1][j - 1] + score
    end

    # Matching characters cost nothing: copy the diagonal neighbour.
    def no_change(i, j)
      @matrix[i][j] = @matrix[i - 1][j - 1]
    end

    # Creates the (len2 + 1) x (len1 + 1) matrix. The first row is
    # 0..len1, the first column is 0..len2, and every other cell starts
    # as nil until #set_edit_distances fills it.
    def prepare_matrix
      @matrix = [(0..@len1).to_a]
      (1..@len2).each { |i| @matrix << [i, *Array.new(@len1)] }
    end
  end
end
@@ -16,7 +16,8 @@ Gem::Specification.new do |spec|
16
16
  end
17
17
  spec.require_paths = ["lib"]
18
18
 
19
- spec.add_development_dependency "bundler", "~> 1.16"
20
- spec.add_development_dependency "rake"
21
- spec.add_development_dependency "rspec", "~> 3.0"
19
+ spec.add_development_dependency 'bundler', '~> 1.16'
20
+ spec.add_development_dependency 'rake'
21
+ spec.add_development_dependency "pry-byebug"
22
+ spec.add_development_dependency 'rspec', '~> 3.0'
22
23
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: phonetics
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jack Danger
@@ -38,6 +38,20 @@ dependencies:
38
38
  - - ">="
39
39
  - !ruby/object:Gem::Version
40
40
  version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: pry-byebug
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
41
55
  - !ruby/object:Gem::Dependency
42
56
  name: rspec
43
57
  requirement: !ruby/object:Gem::Requirement
@@ -69,8 +83,8 @@ files:
69
83
  - README.md
70
84
  - Rakefile
71
85
  - VERSION
72
- - lib/phonetic_levenshtein.rb
73
86
  - lib/phonetics.rb
87
+ - lib/phonetics/levenshtein.rb
74
88
  - lib/phonetics/version.rb
75
89
  - phonetics.gemspec
76
90
  homepage: https://github.com/JackDanger/phonetics
@@ -1,103 +0,0 @@
1
- require_relative './phonetics'
2
-
3
- # Using the Damerau version of the Levenshtein algorithm, with phonetic feature
4
- # count used instead of a binary edit distance calculation
5
- #
6
- # This implementation is almost entirely taken from the damerau-levenshtein gem
7
- # (https://github.com/GlobalNamesArchitecture/damerau-levenshtein/tree/master/ext/damerau_levenshtein).
8
- # The implementation is modified based on "Using Phonologically Weighted
9
- # Levenshtein Distances for the Prediction of Microscopic Intelligibility" by
10
- # Lionel Fontan, Isabelle Ferrané, Jérôme Farinas, Julien Pinquier, Xavier
11
- # Aumont, 2016
12
- # https://hal.archives-ouvertes.fr/hal-01474904/document
13
# Phonetically weighted edit distance between two IPA strings.
#
# Rows of the internal matrix follow the characters of the second string
# and columns follow the characters of the first string; cell [i][j] holds
# the distance between the first i characters of the second string and the
# first j characters of the first string.
#
# Insertions and deletions cost 1.0. A substitution costs the value found
# in Phonetics.distance_map for the pair (character of the first string,
# character of the second string), or 1.0 when the map has no entry.
class PhoneticLevenshtein
  # Builds and fills the distance matrix for the two strings.
  #
  # @param ipa_str1 [String] first string (matrix columns)
  # @param ipa_str2 [String] second string (matrix rows)
  def initialize(ipa_str1, ipa_str2)
    @str1 = ipa_str1
    @str2 = ipa_str2
    @len1 = ipa_str1.size
    @len2 = ipa_str2.size
    prepare_matrix
    set_edit_distances(ipa_str1, ipa_str2)
  end

  # @return [Numeric] the weighted distance, i.e. the bottom-right matrix
  #   cell. Reading it directly also yields 0 for two empty strings, where
  #   the backtrace is empty.
  def distance
    @matrix[@len2][@len1]
  end

  # Convenience wrapper around +new(str1, str2).distance+.
  #
  # @param str1 [String]
  # @param str2 [String]
  # @return [Numeric]
  def self.distance(str1, str2)
    new(str1, str2).distance
  end

  private

  # Backtracks from the bottom-right cell to [0, 0].
  #
  # @return [Array<Hash>] one hash per visited cell, ordered from the start
  #   of the strings to the end, each with :distance and :type keys.
  #
  # NOTE(review): the value compared in #char_data for a substitution
  # already includes the substitution cost, so the :type labels look
  # unreliable; confirm before depending on them.
  def walk
    res = []
    cell = [@len2, @len1]
    while cell != [0, 0]
      cell, char = char_data(cell)
      res.unshift char
    end
    res
  end

  # Fills every cell that is not on the first row or first column.
  def set_edit_distances(str1, str2)
    (1..@len2).each do |i|
      (1..@len1).each do |j|
        if str2[i - 1] == str1[j - 1]
          no_change(i, j)
        else
          @matrix[i][j] = [del(i, j) + 1.0, ins(i, j) + 1.0, subst(i, j)].min
        end
      end
    end
  end

  # Describes one cell of the backtrace and picks the cell to visit next.
  #
  # @param cell [Array(Integer, Integer)] [row, column]
  # @return [Array] the next cell and a hash with :distance and :type
  def char_data(cell)
    char = { distance: @matrix[cell[0]][cell[1]] }
    (previous_value, _priority), type, previous_cell = find_previous(cell)
    char[:type] = previous_value == char[:distance] ? :same : type
    [previous_cell, char]
  end

  # Lists the three neighbouring cells as
  # [[value, tie-break priority], operation, [row, column]] and returns the
  # best one that lies inside the matrix.
  def find_previous(cell)
    row, col = cell
    candidates = [
      [[ins(row, col), 1], :ins, [row, col - 1]],
      [[del(row, col), 2], :del, [row - 1, col]],
      [[subst(row, col), 0], :subst, [row - 1, col - 1]],
    ]
    select_cell(candidates)
  end

  # Drops candidates whose target cell has a negative coordinate, then
  # returns the one with the smallest [value, priority] pair.
  def select_cell(candidates)
    candidates.
      select { |_cost, _type, (row, col)| row >= 0 && col >= 0 }.
      min_by(&:first)
  end

  # TODO: Score the edit distance lower if sonorant sounds are found in sequence.
  # Value of the cell directly above (i, j).
  def del(i, j)
    @matrix[i - 1][j]
  end

  # Value of the cell directly to the left of (i, j).
  def ins(i, j)
    @matrix[i][j - 1]
  end

  # This is where we implement the modifications to Damerau-Levenshtein according to
  # https://hal.archives-ouvertes.fr/hal-01474904/document
  #
  # Returns the diagonal neighbour's value plus the substitution cost for
  # the characters that cell (i, j) stands for. The matrix is 1-based
  # relative to the strings, so those characters are @str1[j - 1] and
  # @str2[i - 1].
  def subst(i, j)
    map = Phonetics.distance_map[@str1[j - 1]]
    score = map[@str2[i - 1]] if map
    score ||= 1.0
    @matrix[i - 1][j - 1] + score
  end

  # Matching characters cost nothing: copy the diagonal neighbour.
  def no_change(i, j)
    @matrix[i][j] = @matrix[i - 1][j - 1]
  end

  # Creates the (len2 + 1) x (len1 + 1) matrix. The first row is 0..len1,
  # the first column is 0..len2, and every other cell starts as nil until
  # #set_edit_distances fills it.
  def prepare_matrix
    @matrix = [(0..@len1).to_a]
    (1..@len2).each { |i| @matrix << [i, *Array.new(@len1)] }
  end
end