translit_kit 0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. checksums.yaml +7 -0
  2. data/MIT-LICENSE +20 -0
  3. data/README.md +86 -0
  4. data/Rakefile +29 -0
  5. data/lib/hebrewword.rb +60 -0
  6. data/lib/permuter.rb +97 -0
  7. data/lib/phoneme_maps.rb +80 -0
  8. data/lib/phoneme_maps/long.json +41 -0
  9. data/lib/phoneme_maps/short.json +39 -0
  10. data/lib/phoneme_maps/single.json +40 -0
  11. data/lib/phonemizer.rb +170 -0
  12. data/lib/readme.md +120 -0
  13. data/lib/translit_kit.rb +2 -0
  14. data/lib/translit_kit/version.rb +3 -0
  15. data/lib/transliterator.rb +115 -0
  16. data/test/dummy/README.rdoc +28 -0
  17. data/test/dummy/Rakefile +6 -0
  18. data/test/dummy/app/assets/javascripts/application.js +13 -0
  19. data/test/dummy/app/assets/stylesheets/application.css +15 -0
  20. data/test/dummy/app/controllers/application_controller.rb +5 -0
  21. data/test/dummy/app/helpers/application_helper.rb +2 -0
  22. data/test/dummy/app/views/layouts/application.html.erb +14 -0
  23. data/test/dummy/bin/bundle +3 -0
  24. data/test/dummy/bin/rails +4 -0
  25. data/test/dummy/bin/rake +4 -0
  26. data/test/dummy/bin/setup +34 -0
  27. data/test/dummy/bin/update +29 -0
  28. data/test/dummy/config.ru +4 -0
  29. data/test/dummy/config/application.rb +15 -0
  30. data/test/dummy/config/boot.rb +3 -0
  31. data/test/dummy/config/cable.yml +9 -0
  32. data/test/dummy/config/database.yml +25 -0
  33. data/test/dummy/config/environment.rb +5 -0
  34. data/test/dummy/config/environments/development.rb +54 -0
  35. data/test/dummy/config/environments/production.rb +86 -0
  36. data/test/dummy/config/environments/test.rb +42 -0
  37. data/test/dummy/config/initializers/application_controller_renderer.rb +6 -0
  38. data/test/dummy/config/initializers/assets.rb +11 -0
  39. data/test/dummy/config/initializers/backtrace_silencers.rb +7 -0
  40. data/test/dummy/config/initializers/cookies_serializer.rb +5 -0
  41. data/test/dummy/config/initializers/filter_parameter_logging.rb +4 -0
  42. data/test/dummy/config/initializers/inflections.rb +16 -0
  43. data/test/dummy/config/initializers/mime_types.rb +4 -0
  44. data/test/dummy/config/initializers/new_framework_defaults.rb +23 -0
  45. data/test/dummy/config/initializers/session_store.rb +3 -0
  46. data/test/dummy/config/initializers/wrap_parameters.rb +14 -0
  47. data/test/dummy/config/locales/en.yml +23 -0
  48. data/test/dummy/config/puma.rb +47 -0
  49. data/test/dummy/config/routes.rb +3 -0
  50. data/test/dummy/config/secrets.yml +22 -0
  51. data/test/dummy/config/spring.rb +6 -0
  52. data/test/dummy/db/test.sqlite3 +0 -0
  53. data/test/dummy/log/test.log +85939 -0
  54. data/test/dummy/public/404.html +67 -0
  55. data/test/dummy/public/422.html +67 -0
  56. data/test/dummy/public/500.html +66 -0
  57. data/test/dummy/public/favicon.ico +0 -0
  58. data/test/hebrewword_test.rb +45 -0
  59. data/test/permuter_test.rb +53 -0
  60. data/test/phoneme_maps_test.rb +29 -0
  61. data/test/phonemizer_test.rb +209 -0
  62. data/test/test_helper.rb +29 -0
  63. data/test/transliterator_test.rb +75 -0
  64. metadata +155 -0
@@ -0,0 +1,40 @@
1
+ {
2
+ "א": [""],
3
+ "ב": ["v"],
4
+ "בּ": ["b"],
5
+ "ג": ["g"],
6
+ "ד": ["d"],
7
+ "ה": ["h"],
8
+ "ו": ["v"],
9
+ "ז": ["z"],
10
+ "ח": ["ch"],
11
+ "חַ": ["ach"],
12
+ "ט": ["t"],
13
+ "י": ["y"],
14
+ "כ": ["ch"],
15
+ "כּ": ["k"],
16
+ "ל": ["l"],
17
+ "מ": ["m"],
18
+ "נ": ["n"],
19
+ "ס": ["s"],
20
+ "ע": ["a"],
21
+ "פ": ["f"],
22
+ "פּ": ["p"],
23
+ "צ": ["tz"],
24
+ "ק": ["k"],
25
+ "ר": ["r"],
26
+ "שׁ": ["sh"],
27
+ "ש": ["s"],
28
+ "ת": ["s"],
29
+ "תּ": ["t"],
30
+ "ָ": ["o"],
31
+ "ַ": ["a"],
32
+ "ֵ": ["ei"],
33
+ "ֶ": ["e"],
34
+ "ִ": ["i"],
35
+ "ֹ": ["o"],
36
+ "וֹ": ["o"],
37
+ "וּ": ["u"],
38
+ "ֻ": ["u"],
39
+ "ְ": ["e"]
40
+ }
data/lib/phonemizer.rb ADDED
@@ -0,0 +1,170 @@
1
+ =begin
2
+ Phonemizer.rb
3
+
4
+ Takes a raw Hebrew word (with nekudos)
5
+ and returns an array of phonemes.
6
+
7
+ Behavior:
8
+ * Letters and nekudos are separated.
9
+ * Strips spaces
10
+ * Normalizes CHATAF nekudos
11
+ * Normalizes final letters
12
+ * The DAGESH is joined to its letter
13
+ * The SHIN's dot is attached to the SHIN
14
+ * MALEI nekudos are stripped of their extra YUD
15
+
16
+ =end
17
+
18
+
19
+ require 'permuter'
20
+ require 'phoneme_maps'
21
+
22
# Constants

# Whitespace characters that are skipped while phonemizing.
# NOTE: despite the names, ENGLISH_SPACE is code point 160 (U+00A0, the
# no-break space) and HEBREW_SPACE is code point 32 (U+0020, the ordinary space).
ENGLISH_SPACE = [160].pack "U"
HEBREW_SPACE = [32].pack "U"

# Edge-case characters
DAGESH = "ּ"
SHIN_DOT = "ׁ"

# Nekudos that have special cases
CHOLOM = "ֹ"
PATACH = "ַ"
CHIRIK = "ִ"
TZEIREI = "ֵ"

# Letters that have special cases
SIN = "ש"
VAV = "ו"
CHES = "ח"
YUD = "י"
SHIN_WITH_DOT = "שׁ"


# Regexes
LETTER = /[אבגדהוזחטיכלמנסעפקרשתםןץףךצ]/
FINAL_LETTER = /[םןךףץ]/
CHATAF = ['ֲ','ֳ','ֱ']
DAGESH_WHITELIST = /[בוכפת]/


# Breaks a Hebrew string into its discrete phonemes
class Phonemizer

  # word - the raw Hebrew string (with nekudos) to break down
  def initialize word
    @hebword = word
  end

  # Returns the unedited Hebrew string
  def raw
    @hebword
  end

  # Breaks the word down into its discrete phonemes.
  #
  # Example: AYIN, PATACH, KAF, DAGESH, VAV, DAGESH, final MEM becomes
  #   [AYIN, PATACH, KAF+DAGESH, VAV+DAGESH, MEM]
  #
  # No arguments; returns an array of strings.
  # Raises a `RuntimeError` when a DAGESH has no preceding letter or a
  # SHIN_DOT has no preceding SIN.
  #
  # This function depends heavily on the workings of Hebrew grammar,
  # so it gets a bit complicated. If you have a more elegant solution, I'd gladly take it.
  # This thing was a hornet's nest full of bugs, so watch that test suite when editing!
  def phonemes
    @completed = []

    # For each raw character (the branches are mutually exclusive and
    # order-sensitive, so keep them as a single chain):
    @hebword.chars.each_with_index do |char, i|

      # Skip whitespace
      if char == ENGLISH_SPACE || char == HEBREW_SPACE
        next

      # A final letter is normalized to its standard form (final MEM -> MEM)
      elsif char =~ FINAL_LETTER
        @completed << normalize_final_letter(char)

      # A CHATAF is normalized to its standard nekuda
      elsif CHATAF.include? char
        @completed << deCHATAFize(char)

      # A SHIN_DOT turns the most recent bare SIN into SHIN_WITH_DOT
      elsif char == SHIN_DOT
        attach_shin_dot(i)

      # A DAGESH is joined to the previous letter when that letter is
      # DAGESH-compatible; otherwise the DAGESH is dropped
      elsif char == DAGESH
        attach_dagesh(i)

      # Skip the VAV of a CHOLOM MALEI, otherwise add it
      elsif char == VAV
        @completed << VAV unless @hebword[i + 1] == CHOLOM

      # Skip the YUD of a CHIRIK MALEI and TZEIREI MALEI, otherwise add it
      elsif char == YUD
        @completed << YUD unless @completed.last == CHIRIK || @completed.last == TZEIREI

      # Append a PATACH to a final CHES: it must be a PATACH, directly
      # preceded by a CHES, and be the last character of the word
      elsif char == PATACH && @completed.last == CHES && i == @hebword.length - 1
        @completed[@completed.length - 1] += PATACH

      # Otherwise, pass the letter or nekuda through unchanged
      else
        @completed << char
      end

    end

    @completed
  end

  private

  # Replaces the last bare SIN collected so far with SHIN_WITH_DOT.
  # `position` is only used for the error message.
  # Raises a `RuntimeError` if no SIN has been collected yet (previously this
  # surfaced as a TypeError from indexing the array with nil).
  def attach_shin_dot position
    sin_index = @completed.rindex(SIN)
    if sin_index.nil?
      raise "Orphaned SHIN_DOT: SHIN_DOT at position #{position} is not preceded by a SIN.(Word: \"#{@hebword}\")"
    end
    @completed[sin_index] = SHIN_WITH_DOT
  end

  # Joins a DAGESH to the closest previously collected letter, provided the
  # letter matches DAGESH_WHITELIST; for any other letter the DAGESH is discarded.
  # Raises a `RuntimeError` if no letter has been collected yet.
  def attach_dagesh position
    letter_index = previous_letter_index(position, @completed)
    if letter_index.nil?
      raise "Orphaned DAGESH: DAGESH at position #{position} is not preceded by a letter.(Word: \"#{@hebword}\")"
    end
    # `+=` builds a new string, so the shared letter constants are never mutated
    @completed[letter_index] += DAGESH if DAGESH_WHITELIST =~ @completed[letter_index]
  end

  # Normalize final letters to standard forms
  # Raises a `RuntimeError` if the character is not a final letter
  def normalize_final_letter char
    case char
    when "ם" then "מ"
    when "ן" then "נ"
    when "ץ" then "צ"
    when "ף" then "פ"
    when "ך" then "כ"
    else
      raise "#{char} is not a final letter\nSuggested test snippet: #{FINAL_LETTER} =~ #{char}\n"
    end
  end

  # Normalize CHATAF nekudos to standard forms
  # Raises a `RuntimeError` if the character is not one of ['ֲ','ֳ','ֱ']
  def deCHATAFize chataf
    case chataf
    when "ֲ" then "ַ"
    when "ֳ" then "ָ"
    when "ֱ" then "ֶ"
    else
      raise "#{chataf} is not a CHATAF\n\tSuggested test snippet: ['ֲ','ֳ','ֱ'].include?(#{chataf})"
    end
  end

  # Return the index of the closest entry of `array` that contains a letter,
  # scanning backwards from `current_loc` (inclusive) down to 0.
  # * Indices past the end of the array read as nil and never match
  # * If an entry contains a letter -> return its index
  # * If no entry contains a letter -> nil
  def previous_letter_index current_loc, array
    current_loc.downto(0) do |i|
      return i if array[i] =~ LETTER
    end
    nil
  end
end
data/lib/readme.md ADDED
@@ -0,0 +1,120 @@
1
+ # How Transliteration Works
2
+
3
+ LectureLab uses a pile of helper classes to ease mass-editing strings
4
+
5
+ ## The HebrewWord class
6
+
7
+ `HebrewWord` takes a Hebrew word (with _nikkud_) and a _phoneme list_, which maps Hebrew phonemes (letters with optional modifiers) onto English characters.
8
+ (If phonemes are not supplied, it loads a default set. See the implementation)
9
+
10
+ Example:
11
+ ```ruby
12
+ @phonemes = {"ב" => ["v"], "בּ" => ["b","bb"]}
13
+ h = HebrewWord.new "בָּעוֹמֶר", @phonemes
14
+ h.transliterate
15
+ # => ...
16
+ ```
17
+
18
+ Let's see the implementation:
19
+ ```ruby
20
+ def transliterate list_name = nil
21
+ Transliterator.new(@hebword, list_name).transliterate
22
+ end
23
+ ```
24
+
25
+ `HebrewWord` delegates the actual work to the `Transliterator` class.
26
+
27
+ ## The Transliterator class
28
+
29
+
30
+ ```ruby
31
+ class Transliterator
32
+ def transliterate
33
+ @permuter.permutations
34
+ end
35
+ ...
36
+ ```
37
+
38
+ In the initializer:
39
+ ```ruby
40
+ @permuter = Permuter.new
41
+ ```
42
+
43
+ So `Transliterator` delegates the actual permuting to the _Permuter_ class
44
+
45
+ ## The Permuter class
46
+
47
+ The `Permuter` class is a general purpose object for generating combinations:
48
+ ```ruby
49
+ p = Permuter.new
50
+ 3.times { p.add_array [1,2,3] }
51
+
52
+
53
+ p.permutations
54
+ # => [1,1,1]
55
+ [1,1,2]
56
+ [1,1,3]
57
+ [1,2,1]
58
+ ...
59
+ ```
60
+
61
+ In our case, the arrays are the possible English letters for every Hebrew phoneme:
62
+
63
+ ```ruby
64
+ def setup_permuter
65
+ heb_letters.each do |heb_letter|
66
+ @permuter.add_array @possible_english_letters[heb_letter]
67
+ end
68
+ end
69
+ ```
70
+ Suppose that:
71
+
72
+ ```ruby
73
+ @possible_english_letters = {"ב" => ["v"], "בּ" => ["b","bb"]}
74
+ @possible_english_letters["בּ"]
75
+ # => ["b","bb"]
76
+ ```
77
+
78
+ If the word contains the letter _'בּ'_, permutations will be generated containing both _'b'_ and _'bb'_.
79
+
80
+ ###### And how does Permuter work?
81
+ `Permuter` uses a basic recursive strategy to generate the permutations.
82
+
83
+ From the implementation
84
+ ```ruby
85
+ private
86
+ # permute (indices)
87
+ # Recursively generate every permutation of the arrays (Courtesy of Ari Fordsham)
88
+ #
89
+ # The classic recursive permutation algorithm:
90
+ # Imagine picking a combination lock: [0][0][0]
91
+ # Each cylinder is the index to one of the arrays
92
+ # On each recursion, we add another cylinder [0], [0][0], [0][0][0]
93
+ # When we have enough cylinders, we generate the permutation (base case)
94
+ # and iterate to the next value by dropping a cylinder, [0][0]
95
+ # iterating the loop in else, and recursing again [0][0][1]
96
+ # Simple and elegant
97
+ def permute indices
98
+ # Base case
99
+ if indices.length == @arrays.length
100
+ build_permutation indices
101
+ else
102
+ @arrays[indices.length].each_with_index do |item,i|
103
+ permute indices.dup << i
104
+ end
105
+ end
106
+ end
107
+ ```
108
+
109
+ `Permuter` now returns to `Transliterator`, which returns to `HebrewWord`, which returns to the user.
110
+
111
+ ## Summary
112
+
113
+ 1. `HebrewWord` is given a string of Hebrew text, with all the necessary vowelisations.
114
+ 2. `HebrewWord` passes the string into `Transliterator`
115
+ 3. `Transliterator` passes the string into `Phonemizer`
116
+ 4. `Phonemizer` digests the string into usable phonemes for mapping, and hands them back to `Transliterator`
117
+ 5. `Transliterator` loads the phoneme map from `PhonemeMaps`, and uses the map and string to configure the `Permuter`
118
+ 6. `Permuter` generates the transliterations, and hands them back to `Transliterator`
119
+ 7. `Transliterator` returns to `HebrewWord`
120
+ 8. `HebrewWord` returns to the user
@@ -0,0 +1,2 @@
1
# Namespace module for the translit_kit gem; this file only declares it
# and defines nothing inside it.
module TranslitKit; end
@@ -0,0 +1,3 @@
1
module TranslitKit
  # Version string of this release of the gem.
  VERSION = '0.9'
end
@@ -0,0 +1,115 @@
1
+ =begin
2
+ Transliterator.rb
3
+
4
+
5
+ =end
6
+
7
+ require 'permuter'
8
+ require 'phoneme_maps'
9
+ require 'phonemizer'
10
+
11
# Produces every English transliteration of one Hebrew word by feeding the
# word's phonemes (from `Phonemizer`) and a phoneme map (from `PhonemeMaps`)
# into a `Permuter`.
class Transliterator < String

  # Initializer
  # Expects a Unicode Hebrew word (i.e. "עַקֵדָה")
  # and an optional phoneme-map name (:long, :short or :single; default :short)
  #
  # Raises a `RuntimeError` if a phoneme of the word has no entry in the map.
  def initialize string, map_name = nil
    # Give the underlying String the word as its content; without this the
    # object is an empty string, so interpolation and `puts` print nothing.
    super(string)
    @hebword = string
    @phoneme_map = fetch_phoneme_map map_name
    setup_permuter
  end

  # Get the raw Hebrew text of the word (including NIKUD)
  def raw
    @hebword
  end

  # Alias of `raw`
  def to_s
    raw
  end

  # Returns the *name* of the phoneme map currently in use (e.g. :short),
  # not the map itself.
  def phoneme_map
    @list_name
  end

  # Selects the phoneme map by name; `nil` keeps the current map.
  def phoneme_map= name
    @phoneme_map = fetch_phoneme_map name
  end

  # Returns a `String` of format:
  # `hebrew_text`: Permutations: `x` single | `y` short | `z` long
  #
  # Counting requires switching maps, so the current map, its name and the
  # permuter are saved first and restored afterwards; inspecting an object
  # must not leave it switched to the :long map.
  def inspect
    saved_state = [@phoneme_map, @list_name, @permuter]
    single, short, long = [:single, :short, :long].map { |name| transliterate(name).length }
    "#{@hebword}: Permutations: #{single} single | #{short} short | #{long} long"
  ensure
    @phoneme_map, @list_name, @permuter = saved_state
  end

  # Returns the word's phonemes as produced by `Phonemizer`
  def phonemes
    Phonemizer.new(@hebword).phonemes
  end

  # Return an `Array` of all possible transliterations of the word
  # as defined in the optional `list_name` argument. options: [:long, :short, :single]
  # When `list_name` is omitted, the map already selected on this object is
  # used (that is :short unless another one was chosen earlier).
  # The chosen map stays selected for later calls.
  def transliterate list_name = nil
    self.phoneme_map = list_name
    setup_permuter
    generate_permutations
  end

  private

  # #fetch_phoneme_map(list_name)
  # Returns the appropriate `phoneme_map` for transliteration
  # and records its name in `@list_name`.
  #
  # If a name is supplied, use that
  # options: [:long, :short, :single] (default is :short)
  #
  # On init:
  # >> name -> use name
  # >> nil -> use :short
  #
  # After init
  # >> name -> use name
  # >> nil -> use what we've already got
  def fetch_phoneme_map list_name = nil
    if list_name.nil?
      # After init the ivar exists, so `nil` means "keep the current map"
      return @phoneme_map if defined?(@phoneme_map)
      list_name = :short
    end

    map = PhonemeMaps.new.load list_name
    @list_name = list_name
    map
  end

  # Get all permutations for `@hebword`, dropping those that start or end
  # with a doubled element
  #   i.e. "avrohom"  -> keep
  #        "avrohomm" -> reject
  #
  # Empty permutations are dropped, as before. One-element permutations are
  # kept: they cannot contain a doubled element. (Comparing `pr[length - 1]`
  # with `pr[length - 2]` on a one-element permutation compares the element
  # with itself via the negative index, which used to reject all of them.)
  def generate_permutations
    @permuter.permutations.select do |pr|
      case pr.length
      when 0 then false
      when 1 then true
      else
        pr[0] != pr[1] &&                          # compare first 2 elements
          pr[pr.length - 1] != pr[pr.length - 2]   # compare last 2 elements
      end
    end
  end

  # Configures a fresh Permuter for permuting the word:
  # one array of candidate English letters per Hebrew phoneme.
  #
  # Raises a `RuntimeError` if a phoneme has no entry in the current map.
  def setup_permuter
    @permuter = Permuter.new

    # For each phoneme of the word, add the array
    # of possible english letters to the permuter
    phonemes.each do |heb_letter|
      en_letters = @phoneme_map[heb_letter]
      if en_letters.nil? then raise "Couldn't find phoneme_map entry for letter ( #{heb_letter.chars} ) in list `#{@list_name}`\nSuggested test snippet: #{@list_name == ":custom" ? @list_name : "require \'phoneme_maps\';PhonemeMaps.new.short"}['#{heb_letter}'].nil?\n" end
      @permuter.add_array en_letters
    end
  end
end