translit_kit 0.9

Sign up to get free protection for your applications and to get access to all the features.
Files changed (64) hide show
  1. checksums.yaml +7 -0
  2. data/MIT-LICENSE +20 -0
  3. data/README.md +86 -0
  4. data/Rakefile +29 -0
  5. data/lib/hebrewword.rb +60 -0
  6. data/lib/permuter.rb +97 -0
  7. data/lib/phoneme_maps.rb +80 -0
  8. data/lib/phoneme_maps/long.json +41 -0
  9. data/lib/phoneme_maps/short.json +39 -0
  10. data/lib/phoneme_maps/single.json +40 -0
  11. data/lib/phonemizer.rb +170 -0
  12. data/lib/readme.md +120 -0
  13. data/lib/translit_kit.rb +2 -0
  14. data/lib/translit_kit/version.rb +3 -0
  15. data/lib/transliterator.rb +115 -0
  16. data/test/dummy/README.rdoc +28 -0
  17. data/test/dummy/Rakefile +6 -0
  18. data/test/dummy/app/assets/javascripts/application.js +13 -0
  19. data/test/dummy/app/assets/stylesheets/application.css +15 -0
  20. data/test/dummy/app/controllers/application_controller.rb +5 -0
  21. data/test/dummy/app/helpers/application_helper.rb +2 -0
  22. data/test/dummy/app/views/layouts/application.html.erb +14 -0
  23. data/test/dummy/bin/bundle +3 -0
  24. data/test/dummy/bin/rails +4 -0
  25. data/test/dummy/bin/rake +4 -0
  26. data/test/dummy/bin/setup +34 -0
  27. data/test/dummy/bin/update +29 -0
  28. data/test/dummy/config.ru +4 -0
  29. data/test/dummy/config/application.rb +15 -0
  30. data/test/dummy/config/boot.rb +3 -0
  31. data/test/dummy/config/cable.yml +9 -0
  32. data/test/dummy/config/database.yml +25 -0
  33. data/test/dummy/config/environment.rb +5 -0
  34. data/test/dummy/config/environments/development.rb +54 -0
  35. data/test/dummy/config/environments/production.rb +86 -0
  36. data/test/dummy/config/environments/test.rb +42 -0
  37. data/test/dummy/config/initializers/application_controller_renderer.rb +6 -0
  38. data/test/dummy/config/initializers/assets.rb +11 -0
  39. data/test/dummy/config/initializers/backtrace_silencers.rb +7 -0
  40. data/test/dummy/config/initializers/cookies_serializer.rb +5 -0
  41. data/test/dummy/config/initializers/filter_parameter_logging.rb +4 -0
  42. data/test/dummy/config/initializers/inflections.rb +16 -0
  43. data/test/dummy/config/initializers/mime_types.rb +4 -0
  44. data/test/dummy/config/initializers/new_framework_defaults.rb +23 -0
  45. data/test/dummy/config/initializers/session_store.rb +3 -0
  46. data/test/dummy/config/initializers/wrap_parameters.rb +14 -0
  47. data/test/dummy/config/locales/en.yml +23 -0
  48. data/test/dummy/config/puma.rb +47 -0
  49. data/test/dummy/config/routes.rb +3 -0
  50. data/test/dummy/config/secrets.yml +22 -0
  51. data/test/dummy/config/spring.rb +6 -0
  52. data/test/dummy/db/test.sqlite3 +0 -0
  53. data/test/dummy/log/test.log +85939 -0
  54. data/test/dummy/public/404.html +67 -0
  55. data/test/dummy/public/422.html +67 -0
  56. data/test/dummy/public/500.html +66 -0
  57. data/test/dummy/public/favicon.ico +0 -0
  58. data/test/hebrewword_test.rb +45 -0
  59. data/test/permuter_test.rb +53 -0
  60. data/test/phoneme_maps_test.rb +29 -0
  61. data/test/phonemizer_test.rb +209 -0
  62. data/test/test_helper.rb +29 -0
  63. data/test/transliterator_test.rb +75 -0
  64. metadata +155 -0
@@ -0,0 +1,40 @@
1
+ {
2
+ "א": [""],
3
+ "ב": ["v"],
4
+ "בּ": ["b"],
5
+ "ג": ["g"],
6
+ "ד": ["d"],
7
+ "ה": ["h"],
8
+ "ו": ["v"],
9
+ "ז": ["z"],
10
+ "ח": ["ch"],
11
+ "חַ": ["ach"],
12
+ "ט": ["t"],
13
+ "י": ["y"],
14
+ "כ": ["ch"],
15
+ "כּ": ["k"],
16
+ "ל": ["l"],
17
+ "מ": ["m"],
18
+ "נ": ["n"],
19
+ "ס": ["s"],
20
+ "ע": ["a"],
21
+ "פ": ["f"],
22
+ "פּ": ["p"],
23
+ "צ": ["tz"],
24
+ "ק": ["k"],
25
+ "ר": ["r"],
26
+ "שׁ": ["sh"],
27
+ "ש": ["s"],
28
+ "ת": ["s"],
29
+ "תּ": ["t"],
30
+ "ָ": ["o"],
31
+ "ַ": ["a"],
32
+ "ֵ": ["ei"],
33
+ "ֶ": ["e"],
34
+ "ִ": ["i"],
35
+ "ֹ": ["o"],
36
+ "וֹ": ["o"],
37
+ "וּ": ["u"],
38
+ "ֻ": ["u"],
39
+ "ְ": ["e"]
40
+ }
data/lib/phonemizer.rb ADDED
@@ -0,0 +1,170 @@
1
+ =begin
2
+ Phonemizer.rb
3
+
4
+ Takes a raw Hebrew word (with nekudos)
5
+ and returns an array of phonemes.
6
+
7
+ Behavior:
8
+ * Letters and nekudos are separated.
9
+ * Strips spaces
10
+ * Normalizes CHATAF nekudos
11
+ * Normalizes final letters
12
+ * The DAGESH is joined to its letter
13
+ * The SHIN's dot is attached to the SHIN
14
+ * MALEI nekudos are stripped of their extra YUD
15
+
16
+ =end
17
+
18
+
19
+ require 'permuter'
20
+ require 'phoneme_maps'
21
+
22
+ # Constants
23
+
24
+ # Two different space characters can appear in the text: U+00A0 (no-break space, named ENGLISH_SPACE here) and U+0020 (ordinary space, named HEBREW_SPACE here)
25
+ ENGLISH_SPACE = [160].pack "U"
26
+ HEBREW_SPACE = [32].pack "U"
27
+
28
+ # Edge-case characters
29
+ DAGESH = "ּ"
30
+ SHIN_DOT = "ׁ"
31
+
32
+ # Nekudos that have special cases
33
+ CHOLOM = "ֹ"
34
+ PATACH = "ַ"
35
+ CHIRIK = "ִ"
36
+ TZEIREI = "ֵ"
37
+
38
+ # Letters that have special cases
39
+ SIN = "ש"
40
+ VAV = "ו"
41
+ CHES = "ח"
42
+ YUD = "י"
43
+ SHIN_WITH_DOT = "שׁ"
44
+
45
+
46
+ # Regexes
47
+ LETTER = /[אבגדהוזחטיכלמנסעפקרשתםןץףךצ]/
48
+ FINAL_LETTER = /[םןךףץ]/
49
+ CHATAF = ['ֲ','ֳ','ֱ']
50
+ DAGESH_WHITELIST = /[בוכפת]/
51
+
52
+
53
+ # Breaks a Hebrew string into its discrete phonemes
54
+ class Phonemizer
55
+
56
+ def initialize word
57
+ @hebword = word
58
+ end
59
+
60
+ # Returns the unedited Hebrew string
61
+ def raw
62
+ @hebword
63
+ end
64
+
65
+ # Breaks the word down into its discrete phonemes
66
+ # “ם’’ ,“וּ“ ,“כּ“ ,“ע“] = "עַכּוּם]
67
+ #
68
+ # No arguments; returns an array
69
+ #
70
+ # This function depends heavily on the workings of Hebrew grammar,
71
+ # so it gets a bit complicated. If you have a more elegant solution, I'd gladly take it.
72
+ # This thing was a hornet's nest full of bugs, so watch that test suite when editing!
73
+ def phonemes
74
+ @completed = []
75
+
76
+ # For each raw character :
77
+ @hebword.chars.each_with_index do |char,i|
78
+
79
+ # Skip whitespace
80
+ if char == ENGLISH_SPACE || char == HEBREW_SPACE
81
+ next
82
+
83
+ # If it's a final letter, normalize it to its standard form (מ –> ם)
84
+ elsif char =~ FINAL_LETTER
85
+ @completed << normalize_final_letter(char)
86
+
87
+ # If it's a CHATAF, normalize it to its standard form
88
+ elsif CHATAF.include? char
89
+ @completed << deCHATAFize(char)
90
+
91
+ # If it's a SHIN_DOT, find the previous SIN and replace it with SHIN_WITH_DOT
92
+ elsif char == SHIN_DOT
93
+ @completed[@completed.rindex(SIN)] = SHIN_WITH_DOT
94
+
95
+ # If it's a DAGESH:
96
+ # 1. Find the previous letter
97
+ # 2. Check if it's on the list of DAGESH-compatible letters
98
+ # 3. If it is, add it
99
+ # 4. If it's not, implicitly fall through to the `else` case
100
+ elsif char == DAGESH
101
+ previous_letter = previous_letter_index(i, @completed)
102
+ if previous_letter.nil? then raise "Orphaned DAGESH: DAGESH at position #{i} is not preceded by a letter.(Word: \"#{@hebword}\")"; end
103
+ if DAGESH_WHITELIST =~ @completed[previous_letter]
104
+ @completed[previous_letter] += DAGESH
105
+ end
106
+
107
+ # Skip the VAV of a CHOLOM MALEI, otherwise add it
108
+ elsif char == VAV
109
+ @hebword[i + 1] == CHOLOM ? next : @completed << VAV
110
+
111
+ # Skip the YUD of a CHIRIK MALEI and TZEIREI MALEI, otherwise add them
112
+ elsif char == YUD
113
+ (@completed.last == CHIRIK ||
114
+ @completed.last == TZEIREI) ?
115
+ next : @completed << YUD
116
+
117
+ # Append a PATACH to a final CHES ( חַ )
118
+ elsif char == PATACH && # It's a PATACH
119
+ @completed.last == CHES && # Preceded by a CHES
120
+ (i == @hebword.length - 1) # At the end of the word
121
+ @completed[@completed.length - 1] += PATACH
122
+
123
+ # Otherwise, pass the letter or nekuda unchanged
124
+ else
125
+ @completed << char
126
+ end
127
+
128
+ end # end loop
129
+ @completed
130
+
131
+ end
132
+
133
+ private
134
+
135
+
136
+ # Normalize final letters to standard forms
137
+ def normalize_final_letter char
138
+ case char
139
+ when "ם" then return "מ"
140
+ when "ן" then return "נ"
141
+ when "ץ" then return "צ"
142
+ when "ף" then return "פ"
143
+ when "ך" then return "כ"
144
+ else
145
+ raise "#{char} is not a final letter\nSuggested test snippet: #{FINAL_LETTER} =~ #{char}\n"
146
+ end
147
+ end
148
+
149
+ # Normalize CHATAF nekudos to standard forms
150
+ # Raises a `RuntimeError` if the character is not one of ['ֲ','ֳ','ֱ']
151
+ def deCHATAFize chataf
152
+ case chataf
153
+ when "ֲ" then return "ַ"
154
+ when "ֳ" then return "ָ"
155
+ when "ֱ" then return "ֶ"
156
+ end
157
+ raise "#{chataf} is not a CHATAF\n\tSuggested test snippet: ['ֲ','ֳ','ֱ'].include?(#{chataf})"
158
+ end
159
+
160
+ # Return the index of the first previous character that is a letter
161
+ # * If the element at the given index is itself a letter -> return that index (the search starts at `current_loc`) #BugOrFeature?
162
+ # * If a previous character is a letter -> return its index
163
+ # * If no characters are letters -> nil
164
+ def previous_letter_index current_loc, array
165
+ current_loc.downto(0) do |i|
166
+ return i if array[i] =~ LETTER
167
+ end
168
+ nil
169
+ end
170
+ end
data/lib/readme.md ADDED
@@ -0,0 +1,120 @@
1
+ # How Transliteration Works
2
+
3
+ LectureLab uses a pile of helper classes to ease mass-editing strings
4
+
5
+ ## The HebrewWord class
6
+
7
+ `HebrewWord` takes a Hebrew word (with _nikkud_) and a _phoneme list_, which maps Hebrew phonemes (letters with optional modifiers) onto English characters.
8
+ (If phonemes are not supplied, it loads a default set. See the implementation)
9
+
10
+ Example:
11
+ ```ruby
12
+ @phonemes = {"ב" => ["v"], "בּ" => ["b","bb"]}
13
+ h = HebrewWord.new "בָּעוֹמֶר", @phonemes
14
+ h.transliterate
15
+ # => ...
16
+ ```
17
+
18
+ Let's see the implementation:
19
+ ```ruby
20
+ def transliterate list_name = nil
21
+ Transliterator.new(@hebword, list_name).transliterate
22
+ end
23
+ ```
24
+
25
+ `HebrewWord` delegates the actual work to the `Transliterator` class.
26
+
27
+ ## The Transliterator class
28
+
29
+
30
+ ```ruby
31
+ class Transliterator
32
+ def transliterate
33
+ @permuter.permutations
34
+ end
35
+ ...
36
+ ```
37
+
38
+ In the initializer:
39
+ ```ruby
40
+ @permuter = Permuter.new
41
+ ```
42
+
43
+ So `Transliterator` (and, through it, `HebrewWord`) delegates the actual permuting to the _Permuter_ class
44
+
45
+ ## The Permuter class
46
+
47
+ The `Permuter` class is a general purpose object for generating combinations:
48
+ ```ruby
49
+ p = Permuter.new
50
+ 3.times { p.add_array [1,2,3] }
51
+
52
+
53
+ p.permutations
54
+ # => [1,1,1]
55
+ [1,1,2]
56
+ [1,1,3]
57
+ [1,2,1]
58
+ ...
59
+ ```
60
+
61
+ In our case, the arrays are the possible English letters for every Hebrew phoneme:
62
+
63
+ ```ruby
64
+ def setup_permuter
65
+ heb_letters.each do |heb_letter|
66
+ @permuter.add_array @possible_english_letters[heb_letter]
67
+ end
68
+ end
69
+ ```
70
+ Suppose that:
71
+
72
+ ```ruby
73
+ @possible_english_letters = {"ב" => ["v"], "בּ" => ["b","bb"]}
74
+ @possible_english_letters["בּ"]
75
+ # => ["b","bb"]
76
+ ```
77
+
78
+ If the word contains the letter _'בּ'_, permutations will be generated containing both _'b'_ and _'bb'_.
79
+
80
+ ###### And how does Permuter work?
81
+ `Permuter` uses a basic recursive strategy to generate the permutations.
82
+
83
+ From the implementation
84
+ ```ruby
85
+ private
86
+ # permute (indices)
87
+ # Recursively generate every permutation of the arrays (Courtesy of Ari Fordsham)
88
+ #
89
+ # The classic recursive permutation algorithm:
90
+ # Imagine picking a combination lock: [0][0][0]
91
+ # Each cylinder is the index to one of the arrays
92
+ # On each recursion, we add another cylinder [0], [0][0], [0][0][0]
93
+ # When we have enough cylinders, we generate the permutation (base case)
94
+ # and iterate to the next value by dropping a cylinder, [0][0]
95
+ # iterating the loop in else, and recursing again [0][0][1]
96
+ # Simple and elegant
97
+ def permute indices
98
+ # Base case
99
+ if indices.length == @arrays.length
100
+ build_permutation indices
101
+ else
102
+ @arrays[indices.length].each_with_index do |item,i|
103
+ permute indices.dup << i
104
+ end
105
+ end
106
+ end
107
+ ```
108
+
109
+ `Permuter` now returns to `Transliterator`, which returns to `HebrewWord`, which returns to the user.
110
+
111
+ ## Summary
112
+
113
+ 1. `HebrewWord` is given a string of Hebrew text, with all the necessary vowelisations.
114
+ 2. `HebrewWord` passes the string into `Transliterator`
115
+ 3. `Transliterator` passes the string into `Phonemizer`
116
+ 4. `Phonemizer` digests the string into usable phonemes for mapping, and hands them back to `Transliterator`
117
+ 5. `Transliterator` loads the phoneme map from `PhonemeMaps`, and uses the map and string to configure the `Permuter`
118
+ 6. `Permuter` generates the transliterations, and hands them back to `Transliterator`
119
+ 7. `Transliterator` returns to `HebrewWord`
120
+ 8. `HebrewWord` returns to the user
@@ -0,0 +1,2 @@
1
+ module TranslitKit
2
+ end
@@ -0,0 +1,3 @@
1
+ module TranslitKit
2
+ VERSION = "0.9"
3
+ end
@@ -0,0 +1,115 @@
1
+ =begin
2
+ Transliterator.rb
3
+
4
+
5
+ =end
6
+
7
+ require 'permuter'
8
+ require 'phoneme_maps'
9
+ require 'phonemizer'
10
+
11
+ class Transliterator < String
12
+
13
+ # Initializer
14
+ # Expects a Unicode Hebrew word (i.e. "עַקֵדָה")
15
+ # and an optional phoneme-mapping list
16
+ def initialize string, map_name = nil
17
+ @hebword = string
18
+ @phoneme_map = fetch_phoneme_map map_name
19
+ setup_permuter
20
+ end
21
+
22
+ # Get the raw Hebrew text of the word (including NIKUD)
23
+ def raw
24
+ @hebword
25
+ end
26
+
27
+ # Alias of `raw`
28
+ def to_s
29
+ raw
30
+ end
31
+
32
+ def phoneme_map
33
+ @list_name
34
+ end
35
+
36
+ def phoneme_map= name
37
+ @phoneme_map = fetch_phoneme_map name
38
+ end
39
+ # Returns a `String` of format:
40
+ # `hebrew_text`: Permutations: `x` single | `y` short | `z` long
41
+ def inspect
42
+ "#{@hebword}: Permutations: #{transliterate(:single).length} single | #{transliterate(:short).length} short | #{transliterate(:long).length} long"
43
+ end
44
+
45
+
46
+ def phonemes
47
+ Phonemizer.new(@hebword).phonemes
48
+ end
49
+
50
+ # Return an `Array` of all possible transliterations of the word
51
+ # As defined in the optional `list_name` argument. options: [:long, :short, :single]
52
+ # Default is the currently selected map (`:short` unless another one was chosen earlier)
53
+ def transliterate list_name = nil
54
+ self.phoneme_map = list_name
55
+ setup_permuter()
56
+ generate_permutations()
57
+ end
58
+
59
+ private
60
+
61
+ # #fetch_phoneme_map(list_name)
62
+ # Returns the appropriate `phoneme_map` for transliteration
63
+ #
64
+ # If a name is supplied, use that
65
+ # options: [:long, :short, :single] (default is :short)
66
+ #
67
+ # Following init, if no list is supplied, the one selected in init is used.
68
+ #
69
+ # On init:
70
+ # >> name -> use name
71
+ # >> nil -> use :short
72
+ #
73
+ # After init
74
+ # >> name -> use name
75
+ # >> nil -> use what we've already got
76
+
77
+ def fetch_phoneme_map list_name = nil
78
+ if list_name.nil?
79
+ defined?(@phoneme_map) ? (return @phoneme_map) : list_name = :short
80
+ end
81
+
82
+ map = PhonemeMaps.new.load list_name
83
+ @list_name = list_name
84
+ map
85
+ end
86
+
87
+ # Get all permutations for `@hebword`
88
+ def generate_permutations
89
+ @permuter.permutations.
90
+ select do |pr|
91
+ # Eliminate duplicate chars
92
+ # At start and end of permutations
93
+ # i.e. "avrohom" -> keep
94
+ # "avrohomm" -> reject
95
+ pr[0] != pr[1] && # compare first 2 chars
96
+ pr[pr.length - 1] != pr[pr.length - 2] # compare last 2 chars
97
+ end
98
+ end
99
+
100
+ # Configures the versatile Permuter for permuting the word
101
+ def setup_permuter
102
+ @permuter = Permuter.new
103
+
104
+ # Get the letters of the word
105
+ heb_letters = self.phonemes
106
+
107
+ # For each letter, add the array
108
+ # of possible english letters to the permuter
109
+ heb_letters.each do |heb_letter|
110
+ en_letters = @phoneme_map[heb_letter]
111
+ if en_letters.nil? then raise "Couldn't find phoneme_map entry for letter ( #{heb_letter.chars} ) in list `#{@list_name}`\nSuggested test snippet: #{@list_name == ":custom" ? @list_name : "require \'phoneme_maps\';PhonemeMaps.new.short"}['#{heb_letter}'].nil?\n" end
112
+ @permuter.add_array en_letters
113
+ end
114
+ end
115
+ end