translit_kit 0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/MIT-LICENSE +20 -0
- data/README.md +86 -0
- data/Rakefile +29 -0
- data/lib/hebrewword.rb +60 -0
- data/lib/permuter.rb +97 -0
- data/lib/phoneme_maps.rb +80 -0
- data/lib/phoneme_maps/long.json +41 -0
- data/lib/phoneme_maps/short.json +39 -0
- data/lib/phoneme_maps/single.json +40 -0
- data/lib/phonemizer.rb +170 -0
- data/lib/readme.md +120 -0
- data/lib/translit_kit.rb +2 -0
- data/lib/translit_kit/version.rb +3 -0
- data/lib/transliterator.rb +115 -0
- data/test/dummy/README.rdoc +28 -0
- data/test/dummy/Rakefile +6 -0
- data/test/dummy/app/assets/javascripts/application.js +13 -0
- data/test/dummy/app/assets/stylesheets/application.css +15 -0
- data/test/dummy/app/controllers/application_controller.rb +5 -0
- data/test/dummy/app/helpers/application_helper.rb +2 -0
- data/test/dummy/app/views/layouts/application.html.erb +14 -0
- data/test/dummy/bin/bundle +3 -0
- data/test/dummy/bin/rails +4 -0
- data/test/dummy/bin/rake +4 -0
- data/test/dummy/bin/setup +34 -0
- data/test/dummy/bin/update +29 -0
- data/test/dummy/config.ru +4 -0
- data/test/dummy/config/application.rb +15 -0
- data/test/dummy/config/boot.rb +3 -0
- data/test/dummy/config/cable.yml +9 -0
- data/test/dummy/config/database.yml +25 -0
- data/test/dummy/config/environment.rb +5 -0
- data/test/dummy/config/environments/development.rb +54 -0
- data/test/dummy/config/environments/production.rb +86 -0
- data/test/dummy/config/environments/test.rb +42 -0
- data/test/dummy/config/initializers/application_controller_renderer.rb +6 -0
- data/test/dummy/config/initializers/assets.rb +11 -0
- data/test/dummy/config/initializers/backtrace_silencers.rb +7 -0
- data/test/dummy/config/initializers/cookies_serializer.rb +5 -0
- data/test/dummy/config/initializers/filter_parameter_logging.rb +4 -0
- data/test/dummy/config/initializers/inflections.rb +16 -0
- data/test/dummy/config/initializers/mime_types.rb +4 -0
- data/test/dummy/config/initializers/new_framework_defaults.rb +23 -0
- data/test/dummy/config/initializers/session_store.rb +3 -0
- data/test/dummy/config/initializers/wrap_parameters.rb +14 -0
- data/test/dummy/config/locales/en.yml +23 -0
- data/test/dummy/config/puma.rb +47 -0
- data/test/dummy/config/routes.rb +3 -0
- data/test/dummy/config/secrets.yml +22 -0
- data/test/dummy/config/spring.rb +6 -0
- data/test/dummy/db/test.sqlite3 +0 -0
- data/test/dummy/log/test.log +85939 -0
- data/test/dummy/public/404.html +67 -0
- data/test/dummy/public/422.html +67 -0
- data/test/dummy/public/500.html +66 -0
- data/test/dummy/public/favicon.ico +0 -0
- data/test/hebrewword_test.rb +45 -0
- data/test/permuter_test.rb +53 -0
- data/test/phoneme_maps_test.rb +29 -0
- data/test/phonemizer_test.rb +209 -0
- data/test/test_helper.rb +29 -0
- data/test/transliterator_test.rb +75 -0
- metadata +155 -0
@@ -0,0 +1,40 @@
|
|
1
|
+
{
|
2
|
+
"א": [""],
|
3
|
+
"ב": ["v"],
|
4
|
+
"בּ": ["b"],
|
5
|
+
"ג": ["g"],
|
6
|
+
"ד": ["d"],
|
7
|
+
"ה": ["h"],
|
8
|
+
"ו": ["v"],
|
9
|
+
"ז": ["z"],
|
10
|
+
"ח": ["ch"],
|
11
|
+
"חַ": ["ach"],
|
12
|
+
"ט": ["t"],
|
13
|
+
"י": ["y"],
|
14
|
+
"כ": ["ch"],
|
15
|
+
"כּ": ["k"],
|
16
|
+
"ל": ["l"],
|
17
|
+
"מ": ["m"],
|
18
|
+
"נ": ["n"],
|
19
|
+
"ס": ["s"],
|
20
|
+
"ע": ["a"],
|
21
|
+
"פ": ["f"],
|
22
|
+
"פּ": ["p"],
|
23
|
+
"צ": ["tz"],
|
24
|
+
"ק": ["k"],
|
25
|
+
"ר": ["r"],
|
26
|
+
"שׁ": ["sh"],
|
27
|
+
"ש": ["s"],
|
28
|
+
"ת": ["s"],
|
29
|
+
"תּ": ["t"],
|
30
|
+
"ָ": ["o"],
|
31
|
+
"ַ": ["a"],
|
32
|
+
"ֵ": ["ei"],
|
33
|
+
"ֶ": ["e"],
|
34
|
+
"ִ": ["i"],
|
35
|
+
"ֹ": ["o"],
|
36
|
+
"וֹ": ["o"],
|
37
|
+
"וּ": ["u"],
|
38
|
+
"ֻ": ["u"],
|
39
|
+
"ְ": ["e"]
|
40
|
+
}
|
data/lib/phonemizer.rb
ADDED
@@ -0,0 +1,170 @@
|
|
1
|
+
=begin
|
2
|
+
Phonemizer.rb
|
3
|
+
|
4
|
+
Takes a raw Hebrew word (with nekudos)
|
5
|
+
and returns an array of phonemes.
|
6
|
+
|
7
|
+
Behavior:
|
8
|
+
	* Letters and nekudos are separated.
|
9
|
+
* Strips spaces
|
10
|
+
* Normalizes CHATAF nekudos
|
11
|
+
* Normalizes final letters
|
12
|
+
* The DAGESH is joined to its letter
|
13
|
+
* The SHIN's dot is attached to the SHIN
|
14
|
+
* MALEI nekudos are stripped of their extra YUD
|
15
|
+
|
16
|
+
=end
|
17
|
+
|
18
|
+
|
19
|
+
require 'permuter'
|
20
|
+
require 'phoneme_maps'
|
21
|
+
|
22
|
+
# Constants

# Whitespace characters that the Phonemizer skips.
# Code point 160 is the NO-BREAK SPACE (U+00A0); code point 32 is the ordinary SPACE (U+0020).
ENGLISH_SPACE = [160].pack("U")
HEBREW_SPACE  = [32].pack("U")

# Edge-case combining marks
DAGESH   = "ּ"
SHIN_DOT = "ׁ"

# Nekudos that have special cases
CHOLOM  = "ֹ"
PATACH  = "ַ"
CHIRIK  = "ִ"
TZEIREI = "ֵ"

# Letters that have special cases
SIN           = "ש"
VAV           = "ו"
CHES          = "ח"
YUD           = "י"
SHIN_WITH_DOT = "שׁ"

# Character classes

# Any Hebrew letter, final forms included
LETTER = /[אבגדהוזחטיכלמנסעפקרשתםןץףךצ]/

# Only the final (sofit) forms
FINAL_LETTER = /[םןךףץ]/

# The CHATAF nekudos (an Array, not a Regexp)
CHATAF = ['ֲ', 'ֳ', 'ֱ']

# Letters that keep a DAGESH attached to them
DAGESH_WHITELIST = /[בוכפת]/
|
51
|
+
|
52
|
+
|
53
|
+
# Breaks a Hebrew string into its discrete phonemes
|
54
|
+
class Phonemizer

  # Stores the word to be phonemized.
  #
  # word - the raw Hebrew String, including its nekudos
  def initialize word
    @hebword = word
  end

  # Returns the unedited Hebrew string
  def raw
    @hebword
  end

  # Breaks the word down into its discrete phonemes.
  #
  # No arguments; returns an Array of Strings in which:
  #   * whitespace (ENGLISH_SPACE / HEBREW_SPACE) is dropped
  #   * final letters and CHATAF nekudos are replaced by their standard forms
  #   * a DAGESH is appended to the closest preceding letter when that letter
  #     is in DAGESH_WHITELIST, and is silently dropped otherwise
  #   * a SHIN_DOT turns the closest preceding SIN into SHIN_WITH_DOT
  #   * the VAV of a CHOLOM MALEI and the YUD of a CHIRIK MALEI / TZEIREI MALEI
  #     are skipped
  #   * a PATACH that ends the word directly after a CHES is appended to that CHES
  #   * every other character is passed through unchanged, one entry each
  #
  # Raises a RuntimeError if a DAGESH has no preceding letter, or if a SHIN_DOT
  # has no preceding SIN.
  #
  # This function depends heavily on the workings of Hebrew grammar, so the
  # order of the branches matters. Watch the test suite when editing!
  def phonemes
    @completed = []

    # Split once so that look-ahead and the end-of-word check don't re-scan the string
    chars = @hebword.chars
    last_position = chars.length - 1

    # For each raw character :
    chars.each_with_index do |char, i|

      # Skip whitespace
      if char == ENGLISH_SPACE || char == HEBREW_SPACE
        next

      # If it's a final letter, normalize it to its standard (non-final) form
      elsif char =~ FINAL_LETTER
        @completed << normalize_final_letter(char)

      # If it's a CHATAF, normalize it to its standard form
      elsif CHATAF.include? char
        @completed << deCHATAFize(char)

      # If it's a SHIN_DOT, find the previous SIN and replace it with SHIN_WITH_DOT
      elsif char == SHIN_DOT
        sin_index = @completed.rindex(SIN)
        # Without this guard a stray dot would surface as an opaque TypeError
        if sin_index.nil? then raise "Orphaned SHIN_DOT: SHIN_DOT at position #{i} is not preceded by a SIN.(Word: \"#{@hebword}\")"; end
        @completed[sin_index] = SHIN_WITH_DOT

      # If it's a DAGESH:
      #   1. Find the previous letter
      #   2. Check if it's on the list of DAGESH-compatible letters
      #   3. If it is, attach the DAGESH to that letter
      #   4. If it's not, the DAGESH is dropped
      elsif char == DAGESH
        previous_letter = previous_letter_index(i, @completed)
        if previous_letter.nil? then raise "Orphaned DAGESH: DAGESH at position #{i} is not preceded by a letter.(Word: \"#{@hebword}\")"; end
        if DAGESH_WHITELIST =~ @completed[previous_letter]
          @completed[previous_letter] += DAGESH
        end

      # Skip the VAV of a CHOLOM MALEI, otherwise add it
      elsif char == VAV
        next if chars[i + 1] == CHOLOM
        @completed << VAV

      # Skip the YUD of a CHIRIK MALEI and TZEIREI MALEI, otherwise add it
      elsif char == YUD
        next if @completed.last == CHIRIK || @completed.last == TZEIREI
        @completed << YUD

      # Append a PATACH to a final CHES:
      # it must be a PATACH, preceded by a CHES, at the end of the word
      elsif char == PATACH && @completed.last == CHES && i == last_position
        @completed[-1] += PATACH

      # Otherwise, pass the letter or nekuda unchanged
      else
        @completed << char
      end

    end # end loop

    @completed
  end

  private

  # Normalize final letters to standard forms
  # Raises a `RuntimeError` if the character is not a final letter
  def normalize_final_letter char
    case char
    when "ם" then return "מ"
    when "ן" then return "נ"
    when "ץ" then return "צ"
    when "ף" then return "פ"
    when "ך" then return "כ"
    else
      raise "#{char} is not a final letter\nSuggested test snippet: #{FINAL_LETTER} =~ #{char}\n"
    end
  end

  # Normalize CHATAF nekudos to standard forms
  # Raises a `RuntimeError` if the character is not one of ['ֲ','ֳ','ֱ']
  def deCHATAFize chataf
    case chataf
    when "ֲ" then return "ַ"
    when "ֳ" then return "ָ"
    when "ֱ" then return "ֶ"
    end
    raise "#{chataf} is not a CHATAF\n\tSuggested test snippet: ['ֲ','ֳ','ֱ'].include?(#{chataf})"
  end

  # Return the index of the closest entry of `array`, at or before
  # `current_loc`, that contains a letter.
  #   * Positions past the end of `array` are simply skipped
  #   * If an entry contains a letter -> return its index
  #   * If no entry contains a letter -> nil
  def previous_letter_index current_loc, array
    current_loc.downto(0) do |i|
      return i if array[i] =~ LETTER
    end
    nil
  end
end
|
data/lib/readme.md
ADDED
@@ -0,0 +1,120 @@
|
|
1
|
+
# How Transliteration Works
|
2
|
+
|
3
|
+
LectureLab uses a pile of helper classes to ease mass-editing strings
|
4
|
+
|
5
|
+
## The HebrewWord class
|
6
|
+
|
7
|
+
`HebrewWord` takes a Hebrew word (with _nikkud_) and a _phoneme list_, which maps Hebrew phonemes (letters with optional modifiers) onto English characters.
|
8
|
+
(If phonemes are not supplied, it loads a default set. See the implementation)
|
9
|
+
|
10
|
+
Example:
|
11
|
+
```ruby
|
12
|
+
@phonemes = {"ב" => ["v"], "בּ" => ["b","bb"]}
|
13
|
+
h = new HebrewWord "בָּעוֹמֶר", @phonemes
|
14
|
+
h.transliterate
|
15
|
+
# => ...
|
16
|
+
```
|
17
|
+
|
18
|
+
Let's see the implementation:
|
19
|
+
```ruby
|
20
|
+
def transliterate list_name = nil
|
21
|
+
Transliterator.new(@hebword, list_name).transliterate
|
22
|
+
end
|
23
|
+
```
|
24
|
+
|
25
|
+
`HebrewWord` delegates the actual work to the `Transliterator` class.
|
26
|
+
|
27
|
+
## The Transliterator class
|
28
|
+
|
29
|
+
|
30
|
+
```ruby
|
31
|
+
class Transliterator
|
32
|
+
def transliterate
|
33
|
+
@permuter.permutations
|
34
|
+
end
|
35
|
+
...
|
36
|
+
```
|
37
|
+
|
38
|
+
In the initializer:
|
39
|
+
```ruby
|
40
|
+
@permuter = Permuter.new
|
41
|
+
```
|
42
|
+
|
43
|
+
So `Transliterator` delegates the actual permuting to the _Permuter_ class
|
44
|
+
|
45
|
+
## The Permuter class
|
46
|
+
|
47
|
+
The `Permuter` class is a general purpose object for generating combinations:
|
48
|
+
```ruby
|
49
|
+
p = Permuter.new
|
50
|
+
3.times { p.add_array [1,2,3] }
|
51
|
+
|
52
|
+
|
53
|
+
p.permutations
|
54
|
+
# => [1,1,1]
|
55
|
+
[1,1,2]
|
56
|
+
[1,1,3]
|
57
|
+
     [1,2,1]
|
58
|
+
...
|
59
|
+
```
|
60
|
+
|
61
|
+
In our case, the arrays are the possible English letters for every Hebrew phoneme:
|
62
|
+
|
63
|
+
```ruby
|
64
|
+
def setup_permuter
|
65
|
+
heb_letters.each do |heb_letter|
|
66
|
+
@permuter.add_array @possible_english_letters[heb_letter]
|
67
|
+
end
|
68
|
+
end
|
69
|
+
```
|
70
|
+
Suppose that:
|
71
|
+
|
72
|
+
```ruby
|
73
|
+
@possible_english_letters = {"ב" => ["v"], "בּ" => ["b","bb"]}
|
74
|
+
@possible_english_letters["בּ"]
|
75
|
+
# => ["b","bb"]
|
76
|
+
```
|
77
|
+
|
78
|
+
If the word contains the letter _'בּ'_, permutations will be generated containing both _'b'_ and _bb_.
|
79
|
+
|
80
|
+
###### And how does Permuter work?
|
81
|
+
`Permuter` uses a basic recursive strategy to generate the permutations.
|
82
|
+
|
83
|
+
From the implementation
|
84
|
+
```ruby
|
85
|
+
private
|
86
|
+
# permute (indices)
|
87
|
+
# Recursively generate every permutation of the arrays (Courtesy of Ari Fordsham)
|
88
|
+
#
|
89
|
+
# The classic recursive permutation algorithm:
|
90
|
+
# Imagine picking a combination lock: [0][0][0]
|
91
|
+
# Each cylinder is the index to one of the arrays
|
92
|
+
# On each recursion, we add another cylinder [0], [0][0], [0][0][0]
|
93
|
+
# When we have enough cylinders, we generate the permutation (base case)
|
94
|
+
# and iterate to the next value by dropping a cylinder, [0][0]
|
95
|
+
# iterating the loop in else, and recursing again [0][0][1]
|
96
|
+
# Simple and elegant
|
97
|
+
def permute indices
|
98
|
+
# Base case
|
99
|
+
if indices.length == @arrays.length
|
100
|
+
build_permutation indices
|
101
|
+
else
|
102
|
+
@arrays[indices.length].each_with_index do |item,i|
|
103
|
+
permute indices.dup << i
|
104
|
+
end
|
105
|
+
end
|
106
|
+
end
|
107
|
+
```
|
108
|
+
|
109
|
+
`Permuter` now returns to `Transliterator`, which returns to `HebrewWord`, which returns to the user.
|
110
|
+
|
111
|
+
## Summary
|
112
|
+
|
113
|
+
1. `HebrewWord` is given a string of Hebrew text, with all the necessary vowelisations.
|
114
|
+
2. `HebrewWord` passes the string into `Transliterator`
|
115
|
+
3. `Transliterator` passes the string into `Phonemizer`
|
116
|
+
4. `Phonemizer` digests the string into usable phonemes for mapping, and hands them back to `Transliterator`
|
117
|
+
5. `Transliterator` loads the phoneme map from `PhonemeMaps`, and uses the map and string to configure the `Permuter`
|
118
|
+
6. `Permuter` generates the transliterations, and hands them back to `Transliterator`
|
119
|
+
7. `Transliterator` returns to `HebrewWord`
|
120
|
+
8. `HebrewWord` returns to the user
|
data/lib/translit_kit.rb
ADDED
@@ -0,0 +1,115 @@
|
|
1
|
+
=begin
|
2
|
+
Transliterator.rb
|
3
|
+
|
4
|
+
|
5
|
+
=end
|
6
|
+
|
7
|
+
require 'permuter'
|
8
|
+
require 'phoneme_maps'
|
9
|
+
require 'phonemizer'
|
10
|
+
|
11
|
+
class Transliterator < String

  # Initializer
  # Expects a Unicode Hebrew word (with NIKUD)
  # and an optional phoneme-map name; options: [:long, :short, :single] (default is :short)
  #
  # NOTE(review): `super` is never called, so the inherited String content
  # appears to stay empty; the word is only reachable through `raw` / `to_s`.
  # Confirm whether subclassing String is intentional.
  def initialize string, map_name = nil
    @hebword = string
    @phoneme_map = fetch_phoneme_map map_name
    setup_permuter
  end

  # Get the raw Hebrew text of the word (including NIKUD)
  def raw
    @hebword
  end

  # Alias of `raw`
  def to_s
    raw
  end

  # Returns the NAME of the currently selected phoneme map (e.g. `:short`),
  # not the map itself
  def phoneme_map
    @list_name
  end

  # Selects the phoneme map by name. Passing `nil` keeps the current map.
  def phoneme_map= name
    @phoneme_map = fetch_phoneme_map name
  end

  # Returns a `String` of format:
  # `hebrew_text`: Permutations: `x` single | `y` short | `z` long
  #
  # Counting the permutations requires switching maps, so the map that was
  # selected beforehand is restored afterwards; inspecting an object must not
  # change what a later `transliterate` call returns.
  def inspect
    saved_map, saved_name = @phoneme_map, @list_name
    single, short, long = [:single, :short, :long].map { |name| transliterate(name).length }
    "#{@hebword}: Permutations: #{single} single | #{short} short | #{long} long"
  ensure
    @phoneme_map, @list_name = saved_map, saved_name
  end

  # Returns the phonemes of the word, as produced by `Phonemizer`
  def phonemes
    Phonemizer.new(@hebword).phonemes
  end

  # Return an `Array` of all possible transliterations of the word
  # as defined by the optional `list_name` argument. options: [:long, :short, :single]
  # When `list_name` is omitted, the currently selected map is used
  # (`:short` unless another one was chosen earlier).
  # Passing a `list_name` also makes it the selected map for later calls.
  def transliterate list_name = nil
    self.phoneme_map = list_name
    setup_permuter
    generate_permutations
  end

  private

  # #fetch_phoneme_map(list_name)
  # Returns the appropriate `phoneme_map` for transliteration
  #
  # If a name is supplied, use that
  #   options: [:long, :short, :single] (default is :short)
  #
  # Following init, if no list is supplied, the one selected in init is used.
  #
  # On init:
  #   >> name -> use name
  #   >> nil  -> use :short
  #
  # After init
  #   >> name -> use name
  #   >> nil  -> use what we've already got
  #
  # Side effect: records the name of a freshly loaded map in `@list_name`.
  def fetch_phoneme_map list_name = nil
    if list_name.nil?
      return @phoneme_map if defined?(@phoneme_map)
      list_name = :short
    end

    map = PhonemeMaps.new.load list_name
    @list_name = list_name
    map
  end

  # Get all permutations for `@hebword`, dropping those that
  # double a character at the start or at the end
  #   i.e. "avrohom"  -> keep
  #        "avrohomm" -> reject
  def generate_permutations
    @permuter.permutations.select do |pr|
      # Empty permutations are never kept
      next false if pr.empty?

      # A single character cannot be doubled. Without this guard the
      # "last two" comparison would wrap around and compare it with itself.
      next true if pr.length == 1

      pr[0] != pr[1] &&   # compare first 2 chars
      pr[-1] != pr[-2]    # compare last 2 chars
    end
  end

  # Configures the versatile Permuter for permuting the word
  # Raises a `RuntimeError` if a phoneme has no entry in the selected map
  def setup_permuter
    @permuter = Permuter.new

    # Get the letters of the word
    heb_letters = self.phonemes

    # For each letter, add the array
    # of possible english letters to the permuter
    heb_letters.each do |heb_letter|
      en_letters = @phoneme_map[heb_letter]
      if en_letters.nil? then raise "Couldn't find phoneme_map entry for letter ( #{heb_letter.chars} ) in list `#{@list_name}`\nSuggested test snippet: #{@list_name == ":custom" ? @list_name : "require \'phoneme_maps\';PhonemeMaps.new.short"}['#{heb_letter}'].nil?\n" end
      @permuter.add_array en_letters
    end
  end
end
|