translit_kit 0.9
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/MIT-LICENSE +20 -0
- data/README.md +86 -0
- data/Rakefile +29 -0
- data/lib/hebrewword.rb +60 -0
- data/lib/permuter.rb +97 -0
- data/lib/phoneme_maps.rb +80 -0
- data/lib/phoneme_maps/long.json +41 -0
- data/lib/phoneme_maps/short.json +39 -0
- data/lib/phoneme_maps/single.json +40 -0
- data/lib/phonemizer.rb +170 -0
- data/lib/readme.md +120 -0
- data/lib/translit_kit.rb +2 -0
- data/lib/translit_kit/version.rb +3 -0
- data/lib/transliterator.rb +115 -0
- data/test/dummy/README.rdoc +28 -0
- data/test/dummy/Rakefile +6 -0
- data/test/dummy/app/assets/javascripts/application.js +13 -0
- data/test/dummy/app/assets/stylesheets/application.css +15 -0
- data/test/dummy/app/controllers/application_controller.rb +5 -0
- data/test/dummy/app/helpers/application_helper.rb +2 -0
- data/test/dummy/app/views/layouts/application.html.erb +14 -0
- data/test/dummy/bin/bundle +3 -0
- data/test/dummy/bin/rails +4 -0
- data/test/dummy/bin/rake +4 -0
- data/test/dummy/bin/setup +34 -0
- data/test/dummy/bin/update +29 -0
- data/test/dummy/config.ru +4 -0
- data/test/dummy/config/application.rb +15 -0
- data/test/dummy/config/boot.rb +3 -0
- data/test/dummy/config/cable.yml +9 -0
- data/test/dummy/config/database.yml +25 -0
- data/test/dummy/config/environment.rb +5 -0
- data/test/dummy/config/environments/development.rb +54 -0
- data/test/dummy/config/environments/production.rb +86 -0
- data/test/dummy/config/environments/test.rb +42 -0
- data/test/dummy/config/initializers/application_controller_renderer.rb +6 -0
- data/test/dummy/config/initializers/assets.rb +11 -0
- data/test/dummy/config/initializers/backtrace_silencers.rb +7 -0
- data/test/dummy/config/initializers/cookies_serializer.rb +5 -0
- data/test/dummy/config/initializers/filter_parameter_logging.rb +4 -0
- data/test/dummy/config/initializers/inflections.rb +16 -0
- data/test/dummy/config/initializers/mime_types.rb +4 -0
- data/test/dummy/config/initializers/new_framework_defaults.rb +23 -0
- data/test/dummy/config/initializers/session_store.rb +3 -0
- data/test/dummy/config/initializers/wrap_parameters.rb +14 -0
- data/test/dummy/config/locales/en.yml +23 -0
- data/test/dummy/config/puma.rb +47 -0
- data/test/dummy/config/routes.rb +3 -0
- data/test/dummy/config/secrets.yml +22 -0
- data/test/dummy/config/spring.rb +6 -0
- data/test/dummy/db/test.sqlite3 +0 -0
- data/test/dummy/log/test.log +85939 -0
- data/test/dummy/public/404.html +67 -0
- data/test/dummy/public/422.html +67 -0
- data/test/dummy/public/500.html +66 -0
- data/test/dummy/public/favicon.ico +0 -0
- data/test/hebrewword_test.rb +45 -0
- data/test/permuter_test.rb +53 -0
- data/test/phoneme_maps_test.rb +29 -0
- data/test/phonemizer_test.rb +209 -0
- data/test/test_helper.rb +29 -0
- data/test/transliterator_test.rb +75 -0
- metadata +155 -0
@@ -0,0 +1,40 @@
|
|
1
|
+
{
|
2
|
+
"א": [""],
|
3
|
+
"ב": ["v"],
|
4
|
+
"בּ": ["b"],
|
5
|
+
"ג": ["g"],
|
6
|
+
"ד": ["d"],
|
7
|
+
"ה": ["h"],
|
8
|
+
"ו": ["v"],
|
9
|
+
"ז": ["z"],
|
10
|
+
"ח": ["ch"],
|
11
|
+
"חַ": ["ach"],
|
12
|
+
"ט": ["t"],
|
13
|
+
"י": ["y"],
|
14
|
+
"כ": ["ch"],
|
15
|
+
"כּ": ["k"],
|
16
|
+
"ל": ["l"],
|
17
|
+
"מ": ["m"],
|
18
|
+
"נ": ["n"],
|
19
|
+
"ס": ["s"],
|
20
|
+
"ע": ["a"],
|
21
|
+
"פ": ["f"],
|
22
|
+
"פּ": ["p"],
|
23
|
+
"צ": ["tz"],
|
24
|
+
"ק": ["k"],
|
25
|
+
"ר": ["r"],
|
26
|
+
"שׁ": ["sh"],
|
27
|
+
"ש": ["s"],
|
28
|
+
"ת": ["s"],
|
29
|
+
"תּ": ["t"],
|
30
|
+
"ָ": ["o"],
|
31
|
+
"ַ": ["a"],
|
32
|
+
"ֵ": ["ei"],
|
33
|
+
"ֶ": ["e"],
|
34
|
+
"ִ": ["i"],
|
35
|
+
"ֹ": ["o"],
|
36
|
+
"וֹ": ["o"],
|
37
|
+
"וּ": ["u"],
|
38
|
+
"ֻ": ["u"],
|
39
|
+
"ְ": ["e"]
|
40
|
+
}
|
data/lib/phonemizer.rb
ADDED
@@ -0,0 +1,170 @@
|
|
1
|
+
=begin
|
2
|
+
Phonemizer.rb
|
3
|
+
|
4
|
+
Takes a raw Hebrew word (with nekudos)
|
5
|
+
and returns an array of phonemes.
|
6
|
+
|
7
|
+
Behavior:
|
8
|
+
* Letters and nekudos are separated.
|
9
|
+
* Strips spaces
|
10
|
+
* Normalizes CHATAF nekudos
|
11
|
+
* Normalizes final letters
|
12
|
+
* The DAGESH is joined to its letter
|
13
|
+
* The SHIN's dot is attached to the SHIN
|
14
|
+
* MALEI nekudos are stripped of their extra YUD
|
15
|
+
|
16
|
+
=end
|
17
|
+
|
18
|
+
|
19
|
+
require 'permuter'
|
20
|
+
require 'phoneme_maps'
|
21
|
+
|
22
|
+
# Constants
#
# Everything below is frozen so that no caller can corrupt these shared
# values by mutating them in place.

# Whitespace characters that the Phonemizer skips.
# NOTE: the names are historical. Code point 160 is the Unicode NO-BREAK SPACE
# and code point 32 is the ordinary ASCII space; neither is specific to
# English or Hebrew text.
ENGLISH_SPACE = [160].pack("U").freeze
HEBREW_SPACE  = [32].pack("U").freeze

# Edge-case characters (combining marks that get fused onto a letter)
DAGESH   = "ּ".freeze # U+05BC
SHIN_DOT = "ׁ".freeze # U+05C1

# Nekudos that have special cases
CHOLOM  = "ֹ".freeze # U+05B9
PATACH  = "ַ".freeze # U+05B7
CHIRIK  = "ִ".freeze # U+05B4
TZEIREI = "ֵ".freeze # U+05B5

# Letters that have special cases
SIN  = "ש".freeze
VAV  = "ו".freeze
CHES = "ח".freeze
YUD  = "י".freeze
SHIN_WITH_DOT = "שׁ".freeze

# Character classes

# Any Hebrew letter, final forms included
LETTER = /[אבגדהוזחטיכלמנסעפקרשתםןץףךצ]/.freeze
# The five word-final letter forms
FINAL_LETTER = /[םןךףץ]/.freeze
# The CHATAF (reduced) nekudos. This is an Array, not a Regexp.
CHATAF = ['ֲ', 'ֳ', 'ֱ'].map(&:freeze).freeze
# Letters whose phoneme changes when they carry a DAGESH
DAGESH_WHITELIST = /[בוכפת]/.freeze
|
51
|
+
|
52
|
+
|
53
|
+
# Breaks a Hebrew string into its discrete phonemes.
#
# Each element of the result is either a letter (possibly fused with a
# DAGESH, a SHIN dot or a word-final PATACH) or a single nekuda.
class Phonemizer

  # word - the raw Hebrew String, nekudos included.
  def initialize(word)
    @hebword = word
  end

  # Returns the unedited Hebrew string
  def raw
    @hebword
  end

  # Breaks the word down into its discrete phonemes.
  #
  # Example: AYIN, PATACH, KAF, DAGESH, VAV, DAGESH, final MEM becomes
  #   [AYIN, PATACH, KAF+DAGESH, VAV+DAGESH, MEM]
  #
  # No arguments; returns a new Array of Strings on every call.
  # Raises a `RuntimeError` when a DAGESH or a SHIN dot has no letter
  # before it to attach to.
  #
  # This function depends heavily on the workings of Hebrew grammar,
  # so it gets a bit complicated. The order of the branches matters:
  # each character is handled by the FIRST rule that matches it.
  # This thing was a hornet's nest full of bugs, so watch that test suite when editing!
  def phonemes
    completed = []
    chars = @hebword.chars
    last_index = chars.length - 1

    chars.each_with_index do |char, i|
      if char == ENGLISH_SPACE || char == HEBREW_SPACE
        # Skip whitespace
        next
      elsif char =~ FINAL_LETTER
        # Final letters are normalized to their standard form
        completed << normalize_final_letter(char)
      elsif CHATAF.include?(char)
        # CHATAF nekudos are normalized to their plain form
        completed << deCHATAFize(char)
      elsif char == SHIN_DOT
        attach_shin_dot(completed, i)
      elsif char == DAGESH
        attach_dagesh(completed, i)
      elsif char == VAV
        # Skip the VAV of a CHOLOM MALEI, otherwise add it
        completed << char unless chars[i + 1] == CHOLOM
      elsif char == YUD
        # Skip the YUD of a CHIRIK MALEI and TZEIREI MALEI, otherwise add it
        completed << char unless completed.last == CHIRIK || completed.last == TZEIREI
      elsif char == PATACH && completed.last == CHES && i == last_index
        # A PATACH under a CHES at the very end of the word is fused onto the CHES
        completed[-1] += PATACH
      else
        # Otherwise, pass the letter or nekuda unchanged
        completed << char
      end
    end

    completed
  end

  private

  # Replace the most recent plain SIN/SHIN letter with the dotted SHIN.
  # Raises a `RuntimeError` if no such letter has been seen yet (previously this
  # surfaced as an opaque `TypeError` from indexing the array with nil).
  def attach_shin_dot(completed, position)
    sin_index = completed.rindex(SIN)
    if sin_index.nil?
      raise "Orphaned SHIN_DOT: SHIN_DOT at position #{position} is not preceded by a SHIN.(Word: \"#{@hebword}\")"
    end
    # dup so that callers never receive the shared constant itself
    completed[sin_index] = SHIN_WITH_DOT.dup
  end

  # Fuse a DAGESH onto the closest preceding letter.
  #   * No preceding letter -> raises a `RuntimeError`
  #   * Letter is on DAGESH_WHITELIST -> the DAGESH is appended to it
  #   * Any other letter -> the DAGESH is dropped (it is NOT added as its own phoneme)
  def attach_dagesh(completed, position)
    letter_index = previous_letter_index(position, completed)
    if letter_index.nil?
      raise "Orphaned DAGESH: DAGESH at position #{position} is not preceded by a letter.(Word: \"#{@hebword}\")"
    end
    completed[letter_index] += DAGESH if DAGESH_WHITELIST =~ completed[letter_index]
  end

  # Normalize final letters to standard forms
  # Raises a `RuntimeError` if the character is not a final letter
  def normalize_final_letter(char)
    case char
    when "ם" then "מ"
    when "ן" then "נ"
    when "ץ" then "צ"
    when "ף" then "פ"
    when "ך" then "כ"
    else
      raise "#{char} is not a final letter\nSuggested test snippet: #{FINAL_LETTER} =~ #{char}\n"
    end
  end

  # Normalize CHATAF nekudos to standard forms
  # Raises a `RuntimeError` if the character is not one of the three CHATAF nekudos
  def deCHATAFize(chataf)
    case chataf
    when "\u05B2" then "\u05B7" # CHATAF PATACH -> PATACH
    when "\u05B3" then "\u05B8" # CHATAF KAMATZ -> KAMATZ
    when "\u05B1" then "\u05B6" # CHATAF SEGOL  -> SEGOL
    else
      raise "#{chataf} is not a CHATAF\n\tSuggested test snippet: ['\u05B2','\u05B3','\u05B1'].include?(#{chataf})"
    end
  end

  # Return the index of the closest letter at or before `current_loc` in `array`
  #   * The search starts at `current_loc` itself and walks back towards 0
  #   * `current_loc` may lie beyond the end of `array`; it is clamped to the last element
  #   * If no element is a letter -> nil
  def previous_letter_index(current_loc, array)
    start = [current_loc, array.length - 1].min
    start.downto(0) do |i|
      return i if LETTER =~ array[i]
    end
    nil
  end
end
|
data/lib/readme.md
ADDED
@@ -0,0 +1,120 @@
|
|
1
|
+
# How Transliteration Works
|
2
|
+
|
3
|
+
LectureLab uses a pile of helper classes to ease mass-editing strings
|
4
|
+
|
5
|
+
## The HebrewWord class
|
6
|
+
|
7
|
+
`HebrewWord` takes a Hebrew word (with _nikkud_) and a _phoneme list_, which maps Hebrew phonemes (letters with optional modifiers) onto English characters.
|
8
|
+
(If phonemes are not supplied, it loads a default set. See the implementation)
|
9
|
+
|
10
|
+
Example:
|
11
|
+
```ruby
|
12
|
+
@phonemes = {"ב" => ["v"], "בּ" => ["b","bb"]}
|
13
|
+
h = HebrewWord.new "בָּעוֹמֶר", @phonemes
|
14
|
+
h.transliterate
|
15
|
+
# => ...
|
16
|
+
```
|
17
|
+
|
18
|
+
Let's see the implementation:
|
19
|
+
```ruby
|
20
|
+
def transliterate list_name = nil
|
21
|
+
Transliterator.new(@hebword, list_name).transliterate
|
22
|
+
end
|
23
|
+
```
|
24
|
+
|
25
|
+
`HebrewWord` delegates the actual work to the `Transliterator` class.
|
26
|
+
|
27
|
+
## The Transliterator class
|
28
|
+
|
29
|
+
|
30
|
+
```ruby
|
31
|
+
class Transliterator
|
32
|
+
def transliterate
|
33
|
+
@permuter.permutations
|
34
|
+
end
|
35
|
+
...
|
36
|
+
```
|
37
|
+
|
38
|
+
In the initializer:
|
39
|
+
```ruby
|
40
|
+
@permuter = Permuter.new
|
41
|
+
```
|
42
|
+
|
43
|
+
So `Transliterator` delegates the actual permuting to the _Permuter_ class
|
44
|
+
|
45
|
+
## The Permuter class
|
46
|
+
|
47
|
+
The `Permuter` class is a general purpose object for generating combinations:
|
48
|
+
```ruby
|
49
|
+
p = Permuter.new
|
50
|
+
3.times { p.add_array [1,2,3] }
|
51
|
+
|
52
|
+
|
53
|
+
p.permutations
|
54
|
+
# => [1,1,1]
|
55
|
+
[1,1,2]
|
56
|
+
[1,1,3]
|
57
|
+
[1,2,3]
|
58
|
+
...
|
59
|
+
```
|
60
|
+
|
61
|
+
In our case, the arrays are the possible English letters for every Hebrew phoneme:
|
62
|
+
|
63
|
+
```ruby
|
64
|
+
def setup_permuter
|
65
|
+
heb_letters.each do |heb_letter|
|
66
|
+
@permuter.add_array @possible_english_letters[heb_letter]
|
67
|
+
end
|
68
|
+
end
|
69
|
+
```
|
70
|
+
Suppose that:
|
71
|
+
|
72
|
+
```ruby
|
73
|
+
@possible_english_letters = {"ב" => ["v"], "בּ" => ["b","bb"]}
|
74
|
+
@possible_english_letters["בּ"]
|
75
|
+
# => ["b","bb"]
|
76
|
+
```
|
77
|
+
|
78
|
+
If the word contains the letter _'בּ'_, permutations will be generated containing both _'b'_ and _'bb'_.
|
79
|
+
|
80
|
+
###### And how does Permuter work?
|
81
|
+
`Permuter` uses a basic recursive strategy to generate the permutations.
|
82
|
+
|
83
|
+
From the implementation
|
84
|
+
```ruby
|
85
|
+
private
|
86
|
+
# permute (indices)
|
87
|
+
# Recursively generate every permutation of the arrays (Courtesy of Ari Fordsham)
|
88
|
+
#
|
89
|
+
# The classic recursive permutation algorithm:
|
90
|
+
# Imagine picking a combination lock: [0][0][0]
|
91
|
+
# Each cylinder is the index to one of the arrays
|
92
|
+
# On each recursion, we add another cylinder [0], [0][0], [0][0][0]
|
93
|
+
# When we have enough cylinders, we generate the permutation (base case)
|
94
|
+
# and iterate to the next value by dropping a cylinder, [0][0]
|
95
|
+
# iterating the loop in else, and recursing again [0][0][1]
|
96
|
+
# Simple and elegant
|
97
|
+
def permute indices
|
98
|
+
# Base case
|
99
|
+
if indices.length == @arrays.length
|
100
|
+
build_permutation indices
|
101
|
+
else
|
102
|
+
@arrays[indices.length].each_with_index do |item,i|
|
103
|
+
permute indices.dup << i
|
104
|
+
end
|
105
|
+
end
|
106
|
+
end
|
107
|
+
```
|
108
|
+
|
109
|
+
`Permuter` now returns to `Transliterator`, which returns to `HebrewWord`, which returns to the user.
|
110
|
+
|
111
|
+
## Summary
|
112
|
+
|
113
|
+
1. `HebrewWord` is given a string of Hebrew text, with all the necessary vowelisations.
|
114
|
+
2. `HebrewWord` passes the string into `Transliterator`
|
115
|
+
3. `Transliterator` passes the string into `Phonemizer`
|
116
|
+
4. `Phonemizer` digests the string into usable phonemes for mapping, and hands them back to `Transliterator`
|
117
|
+
5. `Transliterator` loads the phoneme map from `PhonemeMaps`, and uses the map and string to configure the `Permuter`
|
118
|
+
6. `Permuter` generates the transliterations, and hands them back to `Transliterator`
|
119
|
+
7. `Transliterator` returns to `HebrewWord`
|
120
|
+
8. `HebrewWord` returns to the user
|
data/lib/translit_kit.rb
ADDED
@@ -0,0 +1,115 @@
|
|
1
|
+
=begin
|
2
|
+
Transliterator.rb
|
3
|
+
|
4
|
+
|
5
|
+
=end
|
6
|
+
|
7
|
+
require 'permuter'
|
8
|
+
require 'phoneme_maps'
|
9
|
+
require 'phonemizer'
|
10
|
+
|
11
|
+
# Produces every possible English transliteration of one Hebrew word by
# feeding the word's phonemes, and the English spellings a phoneme map
# offers for each of them, into a Permuter.
#
# NOTE(review): the class inherits from String but never calls `super`, so the
# underlying String content stays empty; the word is only reachable through
# `raw` / `to_s`.
class Transliterator < String

  # Initializer
  # Expects a Unicode Hebrew word (i.e. "עַקֵדָה")
  # and an optional phoneme-map name (:long, :short or :single; default :short).
  # The permuter is configured immediately, so a word containing a phoneme
  # that the map does not know raises a `RuntimeError` right here.
  def initialize(string, map_name = nil)
    @hebword = string
    @phoneme_map = fetch_phoneme_map map_name
    setup_permuter
  end

  # Get the raw Hebrew text of the word (including NIKUD)
  def raw
    @hebword
  end

  # Alias of `raw`
  def to_s
    raw
  end

  # Returns the NAME of the active phoneme map (e.g. `:short`),
  # not the map itself.
  def phoneme_map
    @list_name
  end

  # Select the phoneme map by name. Passing nil keeps the current map.
  def phoneme_map=(name)
    @phoneme_map = fetch_phoneme_map name
  end

  # Returns a `String` of format:
  # `hebrew_text`: Permutations: `x` single | `y` short | `z` long
  #
  # Counting the permutations switches the active map three times, so the
  # selection that was active beforehand is restored afterwards; inspecting
  # an object must not change what a later `transliterate` returns.
  def inspect
    saved_map = @phoneme_map
    saved_name = @list_name
    single, short, long = %i[single short long].map { |name| transliterate(name).length }
    "#{@hebword}: Permutations: #{single} single | #{short} short | #{long} long"
  ensure
    @phoneme_map = saved_map
    @list_name = saved_name
  end

  # Returns the word broken into phonemes (an Array), as produced by Phonemizer
  def phonemes
    Phonemizer.new(@hebword).phonemes
  end

  # Return an `Array` of all possible transliterations of the word
  # as defined by the optional `list_name` argument. options: [:long, :short, :single]
  # When `list_name` is omitted the currently selected map is used
  # (`:short` unless another one was chosen earlier).
  # A given `list_name` stays selected for subsequent calls.
  def transliterate(list_name = nil)
    self.phoneme_map = list_name
    setup_permuter
    generate_permutations
  end

  private

  # fetch_phoneme_map(list_name)
  # Returns the appropriate phoneme map for transliteration
  # and records its name in `@list_name`.
  #
  # On init:
  # >> name -> use name
  # >> nil -> use :short
  #
  # After init
  # >> name -> use name
  # >> nil -> use what we've already got
  def fetch_phoneme_map(list_name = nil)
    if list_name.nil?
      return @phoneme_map if defined?(@phoneme_map)
      list_name = :short
    end

    map = PhonemeMaps.new.load list_name
    @list_name = list_name
    map
  end

  # Get all permutations for `@hebword`, minus those that double a
  # character at the very start or the very end:
  # i.e. "avrohom" -> keep
  # "avrohomm" -> reject
  #
  # A one-character permutation cannot contain a doubled character, so it is
  # kept. (Comparing `pr[length - 1]` with `pr[length - 2]` used to wrap around
  # to the same character and reject every such permutation.)
  # Empty permutations are dropped, as before.
  def generate_permutations
    @permuter.permutations.select do |pr|
      next false if pr.length.zero?
      next true if pr.length == 1

      pr[0] != pr[1] &&   # compare first 2 chars
        pr[-1] != pr[-2]  # compare last 2 chars
    end
  end

  # Configures a fresh Permuter for the word: one array of candidate
  # English spellings per Hebrew phoneme, in word order.
  # Raises a `RuntimeError` when the active map has no entry for a phoneme.
  def setup_permuter
    @permuter = Permuter.new

    self.phonemes.each do |heb_letter|
      en_letters = @phoneme_map[heb_letter]
      if en_letters.nil?
        raise "Couldn't find phoneme_map entry for letter ( #{heb_letter.chars} ) in list `#{@list_name}`\nSuggested test snippet: #{@list_name == ":custom" ? @list_name : "require \'phoneme_maps\';PhonemeMaps.new.short"}['#{heb_letter}'].nil?\n"
      end
      @permuter.add_array en_letters
    end
  end
end
|