phonetics 2.0.1 → 3.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -16,12 +16,6 @@ module Phonetics
16
16
  module Levenshtein
17
17
  extend ::PhoneticsLevenshteinCBinding
18
18
 
19
- def inspect_bytes(str)
20
- puts "Rubyland str: #{str.inspect}"
21
- puts "Rubyland bytes: #{str.bytes.inspect}"
22
- testing_codepoints(str)
23
- end
24
-
25
19
  def self.distance(str1, str2, verbose = false)
26
20
  internal_phonetic_distance(str1, str2, verbose)
27
21
  end
@@ -0,0 +1,151 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'open-uri'
4
+ require 'json'
5
+
6
module Phonetics
  # Looks up IPA transcriptions of words from a JSON file cached on disk
  # (downloaded from TranscriptionsURL when the file is missing) and exposes
  # them both as a word => IPA lookup and as a character trie keyed by IPA.
  module Transcriptions
    extend self
    # Path of the on-disk JSON cache.
    Transcriptions = File.join(__dir__, '..', 'common_ipa_transcriptions.json')
    # Where the JSON cache is fetched from when it is not on disk.
    TranscriptionsURL = 'https://jackdanger.com/common_ipa_transcriptions.json'

    # Source names are tried in this order; the first matching source wins.
    SourcesByPreference = [/wiktionary/, /cmu/, /phonemicchart.com/].freeze

    # Returns the IPA transcription of +key+ from the most preferred source
    # that has one, or nil when the word is unknown, has no 'ipa' data, or
    # none of its sources match SourcesByPreference.
    def [](key)
      ipa_by_source = transcriptions.dig(key, 'ipa')
      return unless ipa_by_source

      SourcesByPreference.each do |preferred_source|
        source = ipa_by_source.keys.find { |name| name.match?(preferred_source) }
        return ipa_by_source[source] if source
      end
      nil
    end

    # The parsed transcription data, memoized. Downloads the JSON file first
    # when it is not present on disk.
    def transcriptions
      @transcriptions ||= begin
        download! unless File.exist?(Transcriptions)
        load_from_disk!
      end
    end

    # Parses the JSON file on disk, replacing any memoized data.
    def load_from_disk!
      @transcriptions = JSON.parse(File.read(Transcriptions))
    end

    # Fetches the transcription JSON and writes it to the on-disk cache.
    # Returns the number of bytes written.
    def download!
      # Fetch before touching the cache file: opening the file first would
      # leave an empty (unparseable) cache behind if the download failed.
      # The block form closes the remote handle once it has been read.
      body = URI.open(TranscriptionsURL, &:read)
      File.write(Transcriptions, body)
    end

    # Builds (once) a trie over every IPA transcription of every word.
    #
    # Each node is a Hash. String keys are the next IPA character and lead to
    # child nodes; Symbol keys carry metadata:
    #
    #   :depth    - number of characters consumed to reach the node
    #   :path     - the IPA prefix spelled by the walk (absent on the root)
    #   :terminal - Array of { word:, rarity: } hashes for the words that
    #               have a transcription equal to :path
    #
    # So an entry such as
    #
    #   "cent": { "rarity": 10.0, "ipa": { "cmu": "sɛnt" } }
    #
    # contributes
    #
    #   { depth: 0,
    #     "s" => { path: "s", depth: 1,
    #       "ɛ" => { path: "sɛ", depth: 2,
    #         "n" => { path: "sɛn", depth: 3,
    #           "t" => { path: "sɛnt", depth: 4,
    #                    terminal: [{ word: "cent", rarity: 10.0 }] } } } } }
    #
    # Words sharing a prefix share nodes; every transcription source of a
    # word gets its own path. Only the root Hash is frozen.
    def trie
      @trie ||= begin
        base_trie = {}
        transcriptions.each do |key, entry|
          entry_data = {
            word: key,
            rarity: entry['rarity'],
          }
          # `|| {}` also covers an explicit null 'ipa', matching #[].
          (entry['ipa'] || {}).each_value do |transcription|
            construct_trie(base_trie, transcription, entry_data)
          end
        end
        base_trie.freeze
      end
    end

    # Follows +ipa+ character by character through the trie. Returns the node
    # reached, or nil as soon as the path leaves the trie.
    def walk(ipa)
      node = trie
      ipa.each_char do |char|
        node = node[char]
        return nil unless node
      end
      node
    end

    # Transcribes a space-separated phrase word by word and concatenates the
    # results. Words without a known transcription contribute nothing.
    def transcription_for(phrase)
      phrase.downcase.split.map { |word| self[word] }.join
    end

    private

    # Given a portion of an existing trie (modified in place), the remainder
    # of a char string, and an entry, walk or construct the trie nodes needed
    # to place the entry in the :terminal list of the final node.
    # +depth+ is the depth of +subtrie+ itself. Returns +subtrie+.
    def construct_trie(subtrie, chars_remaining, entry_data, depth = 0)
      node = subtrie
      node[:depth] ||= depth
      chars_remaining.each_char.with_index(1) do |char, offset|
        child = (node[char] ||= {})
        child[:path] ||= node[:path].to_s + char
        child[:depth] ||= depth + offset
        node = child
      end
      terminal = (node[:terminal] ||= [])
      terminal << entry_data unless terminal.include?(entry_data)
      subtrie
    end
  end
end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: phonetics
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.0.1
4
+ version: 3.0.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jack Danger
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-10-02 00:00:00.000000000 Z
11
+ date: 2019-12-27 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -129,10 +129,13 @@ files:
129
129
  - ext/c_levenshtein/phonemes.h
130
130
  - ext/c_levenshtein/phonetic_cost.c
131
131
  - ext/c_levenshtein/phonetic_cost.h
132
+ - lib/common_ipa_transcriptions.json
132
133
  - lib/phonetics.rb
133
134
  - lib/phonetics/code_generator.rb
135
+ - lib/phonetics/distances.rb
134
136
  - lib/phonetics/levenshtein.rb
135
137
  - lib/phonetics/ruby_levenshtein.rb
138
+ - lib/phonetics/transcriptions.rb
136
139
  - lib/phonetics/version.rb
137
140
  - phonetics.gemspec
138
141
  homepage: https://github.com/JackDanger/phonetics