phonetics 2.0.1 → 3.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -16,12 +16,6 @@ module Phonetics
16
16
  module Levenshtein
17
17
  extend ::PhoneticsLevenshteinCBinding
18
18
 
19
- def inspect_bytes(str)
20
- puts "Rubyland str: #{str.inspect}"
21
- puts "Rubyland bytes: #{str.bytes.inspect}"
22
- testing_codepoints(str)
23
- end
24
-
25
19
  def self.distance(str1, str2, verbose = false)
26
20
  internal_phonetic_distance(str1, str2, verbose)
27
21
  end
@@ -0,0 +1,151 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'open-uri'
4
+ require 'json'
5
+
6
+ module Phonetics
7
+ module Transcriptions
8
+ extend self
9
+ Transcriptions = File.join(__dir__, '..', 'common_ipa_transcriptions.json')
10
+ TranscriptionsURL = 'https://jackdanger.com/common_ipa_transcriptions.json'
11
+
12
+ SourcesByPreference = [/wiktionary/, /cmu/, /phonemicchart.com/].freeze
13
+
14
+ def [](key)
15
+ entry = transcriptions[key]
16
+ return unless entry
17
+ return unless entry['ipa']
18
+
19
+ SourcesByPreference.each do |preferred_source|
20
+ entry['ipa'].keys.each do |source|
21
+ return entry['ipa'][source] if source =~ preferred_source
22
+ end
23
+ end
24
+ nil
25
+ end
26
+
27
+ def transcriptions
28
+ @transcriptions ||= begin
29
+ download! unless File.exist?(Transcriptions)
30
+ load_from_disk!
31
+ end
32
+ end
33
+
34
+ # Lazily loaded from JSON file on disk
35
+ def load_from_disk!
36
+ @transcriptions = JSON.parse(File.read(Transcriptions))
37
+ end
38
+
39
+ def download!
40
+ File.open(Transcriptions, 'w') { |f| f.write(URI.open(TranscriptionsURL).read) }
41
+ end
42
+
43
+ def trie
44
+ # Let's turn this:
45
+ #
46
+ # "century": {
47
+ # "rarity": 462.0,
48
+ # "ipa": {
49
+ # "cmu": "sɛntʃɝɪ",
50
+ # "phonemicchart.com": "sentʃərɪ",
51
+ # "wiktionary": "sɛntʃəɹi",
52
+ # "wiktionary2": "sɛntʃɹi",
53
+ # "wiktionary3": "sɛntʃʊɹi"
54
+ # },
55
+ # "alt_display": "CENTURY"
56
+ # }
57
+ #
58
+ # into this:
59
+ #
60
+ # "s": {
61
+ # "e": {
62
+ # "n": {
63
+ # "t": {
64
+ # "ʃ": {
65
+ # "ʊ": {
66
+ # "ɹ": {
67
+ # "i": {
68
+ # "terminal": [Term('century')],
69
+ # },
70
+ # },
71
+ # },
72
+ # "ə": {
73
+ # "r": {
74
+ # "ɪ": {
75
+ # "terminal": [Term('century')],
76
+ # },
77
+ # },
78
+ # },
79
+ # "ɹ": {
80
+ # "i": {
81
+ # "terminal": [Term('century')],
82
+ # },
83
+ # },
84
+ # "ɝ": {
85
+ # "ɪ": {
86
+ # "terminal": [Term('century')],
87
+ # },
88
+ # },
89
+ # },
90
+ # },
91
+ # },
92
+ # },
93
+ # "ɛ": {
94
+ # "n": {
95
+ # "t": {
96
+ # "ʃ": {
97
+ # "ɝ": {
98
+ # "ɪ": {
99
+ # "terminal": [Term('century')],
100
+ # },
101
+ # },
102
+ # },
103
+ # },
104
+ # },
105
+ # },
106
+ # },
107
+ #
108
+ @trie ||= begin
109
+ base_trie = {}
110
+ transcriptions.each do |key, entry|
111
+ entry_data = {
112
+ word: key,
113
+ rarity: entry['rarity'],
114
+ }
115
+ entry.fetch('ipa', []).each do |_source, transcription|
116
+ base_trie = construct_trie(base_trie, transcription, entry_data)
117
+ end
118
+ end
119
+ base_trie.freeze
120
+ end
121
+ end
122
+
123
+ def walk(ipa)
124
+ ipa.each_char.reduce(trie) { |acc, char| acc[char] }
125
+ end
126
+
127
+ def transcription_for(phrase)
128
+ phrase.downcase.split(' ').map { |word| self[word] }.join
129
+ end
130
+
131
+ private
132
+
133
+ # Given a portion of an existing trie (to be modified), the remainder of a
134
+ # char string, and an entry, walk or construct the appropriate trie nodes
135
+ # necessary to place the entry in a leaf.
136
+ def construct_trie(subtrie, chars_remaining, entry_data, depth = 0)
137
+ subtrie[:depth] ||= depth
138
+ if chars_remaining.empty?
139
+ # Base condition met
140
+ subtrie[:terminal] ||= []
141
+ subtrie[:terminal] << entry_data unless subtrie[:terminal].include?(entry_data)
142
+ else
143
+ next_char = chars_remaining[0]
144
+ subtrie[next_char] ||= {}
145
+ subtrie[next_char][:path] ||= subtrie[:path].to_s + next_char
146
+ subtrie[next_char] = construct_trie(subtrie[next_char], chars_remaining[1..-1], entry_data, depth + 1)
147
+ end
148
+ subtrie
149
+ end
150
+ end
151
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: phonetics
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.0.1
4
+ version: 3.0.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jack Danger
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2019-10-02 00:00:00.000000000 Z
11
+ date: 2019-12-27 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -129,10 +129,13 @@ files:
129
129
  - ext/c_levenshtein/phonemes.h
130
130
  - ext/c_levenshtein/phonetic_cost.c
131
131
  - ext/c_levenshtein/phonetic_cost.h
132
+ - lib/common_ipa_transcriptions.json
132
133
  - lib/phonetics.rb
133
134
  - lib/phonetics/code_generator.rb
135
+ - lib/phonetics/distances.rb
134
136
  - lib/phonetics/levenshtein.rb
135
137
  - lib/phonetics/ruby_levenshtein.rb
138
+ - lib/phonetics/transcriptions.rb
136
139
  - lib/phonetics/version.rb
137
140
  - phonetics.gemspec
138
141
  homepage: https://github.com/JackDanger/phonetics