translit_kit 0.9

Sign up to get free protection for your applications and to get access to all the features.
Files changed (64) hide show
  1. checksums.yaml +7 -0
  2. data/MIT-LICENSE +20 -0
  3. data/README.md +86 -0
  4. data/Rakefile +29 -0
  5. data/lib/hebrewword.rb +60 -0
  6. data/lib/permuter.rb +97 -0
  7. data/lib/phoneme_maps.rb +80 -0
  8. data/lib/phoneme_maps/long.json +41 -0
  9. data/lib/phoneme_maps/short.json +39 -0
  10. data/lib/phoneme_maps/single.json +40 -0
  11. data/lib/phonemizer.rb +170 -0
  12. data/lib/readme.md +120 -0
  13. data/lib/translit_kit.rb +2 -0
  14. data/lib/translit_kit/version.rb +3 -0
  15. data/lib/transliterator.rb +115 -0
  16. data/test/dummy/README.rdoc +28 -0
  17. data/test/dummy/Rakefile +6 -0
  18. data/test/dummy/app/assets/javascripts/application.js +13 -0
  19. data/test/dummy/app/assets/stylesheets/application.css +15 -0
  20. data/test/dummy/app/controllers/application_controller.rb +5 -0
  21. data/test/dummy/app/helpers/application_helper.rb +2 -0
  22. data/test/dummy/app/views/layouts/application.html.erb +14 -0
  23. data/test/dummy/bin/bundle +3 -0
  24. data/test/dummy/bin/rails +4 -0
  25. data/test/dummy/bin/rake +4 -0
  26. data/test/dummy/bin/setup +34 -0
  27. data/test/dummy/bin/update +29 -0
  28. data/test/dummy/config.ru +4 -0
  29. data/test/dummy/config/application.rb +15 -0
  30. data/test/dummy/config/boot.rb +3 -0
  31. data/test/dummy/config/cable.yml +9 -0
  32. data/test/dummy/config/database.yml +25 -0
  33. data/test/dummy/config/environment.rb +5 -0
  34. data/test/dummy/config/environments/development.rb +54 -0
  35. data/test/dummy/config/environments/production.rb +86 -0
  36. data/test/dummy/config/environments/test.rb +42 -0
  37. data/test/dummy/config/initializers/application_controller_renderer.rb +6 -0
  38. data/test/dummy/config/initializers/assets.rb +11 -0
  39. data/test/dummy/config/initializers/backtrace_silencers.rb +7 -0
  40. data/test/dummy/config/initializers/cookies_serializer.rb +5 -0
  41. data/test/dummy/config/initializers/filter_parameter_logging.rb +4 -0
  42. data/test/dummy/config/initializers/inflections.rb +16 -0
  43. data/test/dummy/config/initializers/mime_types.rb +4 -0
  44. data/test/dummy/config/initializers/new_framework_defaults.rb +23 -0
  45. data/test/dummy/config/initializers/session_store.rb +3 -0
  46. data/test/dummy/config/initializers/wrap_parameters.rb +14 -0
  47. data/test/dummy/config/locales/en.yml +23 -0
  48. data/test/dummy/config/puma.rb +47 -0
  49. data/test/dummy/config/routes.rb +3 -0
  50. data/test/dummy/config/secrets.yml +22 -0
  51. data/test/dummy/config/spring.rb +6 -0
  52. data/test/dummy/db/test.sqlite3 +0 -0
  53. data/test/dummy/log/test.log +85939 -0
  54. data/test/dummy/public/404.html +67 -0
  55. data/test/dummy/public/422.html +67 -0
  56. data/test/dummy/public/500.html +66 -0
  57. data/test/dummy/public/favicon.ico +0 -0
  58. data/test/hebrewword_test.rb +45 -0
  59. data/test/permuter_test.rb +53 -0
  60. data/test/phoneme_maps_test.rb +29 -0
  61. data/test/phonemizer_test.rb +209 -0
  62. data/test/test_helper.rb +29 -0
  63. data/test/transliterator_test.rb +75 -0
  64. metadata +155 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 7280e8d3c76a0d829c9675616aef2913f879c9ad
4
+ data.tar.gz: 11188ff85dfe552a5057ad0bb7ca4003725343b8
5
+ SHA512:
6
+ metadata.gz: 831e6a6bb8c98af691721f055d74be5d415bfda3fe694f194133893218ae29af6df25b288b9bfeea656dc144a960c22c3dbe0d479afdf279c529676c5d0c9275
7
+ data.tar.gz: 18d2d3eee2be1383118ee6fd56547bc9974c74547a7bf4aed8bff8a9e3f08397409855f7495e16507bf6a7cb6da08f00aed17191f128a07218e4839525ac8582
data/MIT-LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright 2017 Michoel Samuels
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,86 @@
1
+ # TranslitKit
2
+
3
+ [![Build Status](https://travis-ci.org/AnalyzePlatypus/TranslitKit.svg?branch=master)](https://travis-ci.org/AnalyzePlatypus/TranslitKit)
4
+ [![Code Climate](https://codeclimate.com/github/AnalyzePlatypus/TranslitKit/badges/gpa.svg)](https://codeclimate.com/github/AnalyzePlatypus/TranslitKit)
5
+ [![Coverage Status](https://coveralls.io/repos/github/AnalyzePlatypus/TranslitKit/badge.svg?branch=master)](https://coveralls.io/github/AnalyzePlatypus/TranslitKit?branch=master)
6
+ [![Inline docs](http://inch-ci.org/github/AnalyzePlatypus/TranslitKit.svg?branch=master)](http://inch-ci.org/github/AnalyzePlatypus/TranslitKit)
7
+
8
+ *TranslitKit* is a framework for Hebrew-English transliteration.
9
+
10
+ Example:
11
+ ```ruby
12
+ require 'translit_kit'
13
+ word = HebrewWord.new "אַברָהָם"
14
+ word.transliterate(:single)
15
+ # => ["avrohom"]
16
+
17
+ # Shortcut
18
+ word.t(:single)
19
+ # => ["avrohom"]
20
+ ```
21
+ Transliteration is powered by _phoneme maps_, files that map between Hebrew _phonemes_, or units of sound, and English characters. (see below)
22
+
23
+ Three `phoneme_maps` are provided: `:long`, `:short`, and `:single`.
24
+ You can easily add your own (see below)
25
+
26
+ ```ruby
27
+ word.t(:single)
28
+ # => ["avrohom"]
29
+ word.t(:short)
30
+ # => ["avroom", "avroam", "avroem", "avrohom", "avroham",
31
+ # "avrohem", "avraom", "avraam", "avraem", "avrahom",
32
+ # "avraham", "avrahem", "avreom", "avream", "avreem",
33
+ # "avrehom", "avreham", "avrehem" ]
34
+ word.t(:long)
35
+ # => ["avroom", "avrooom", "avroohm", ... ] # 5,997 more!
36
+ ```
37
+
38
+ The default is `:short`:
39
+ ```ruby
40
+ word.t == word.t(:short)
41
+ # => true
42
+ ```
43
+ To get the total permutation count, call `HebrewWord#inspect`
44
+ ```ruby
45
+ word.inspect
46
+ # => "אַברָהָם: Permutations: 1 single | 18 short | 6000 long"
47
+ ```
48
+
49
+ ## Adding Custom Phoneme maps
50
+ ###### Format
51
+ _Phoneme Maps_ are simply JSON files, placed in the `lib/phoneme_maps` directory.
52
+
53
+ The file should map each phoneme (a `String`) to an `Array` of replacement characters.
54
+
55
+ ```json
56
+ {
57
+ "ב": ["v"],
58
+ "בּ": ["b", "bb"]
59
+ }
60
+ ```
61
+
62
+ A _phoneme_ can be a Hebrew character `א`, _nekuda_ (`ָ`), or character with modifiers, such as a _dagesh_ (`בּ`). Keep in mind that many characters will be normalized (see below).
63
+
64
+ ###### Installation
65
+ To install your custom map, place the file in `lib/phoneme_maps`
66
+
67
+ Your file will be available as the symbol `:<filename>` (the file name without the `.json` extension).
68
+
69
+ Example: `klingon.json` becomes `:klingon`
70
+
71
+ Now you can use it anywhere:
72
+ ```ruby
73
+ word.transliterate(:klingon)
74
+ # => (Results)
75
+ ```
76
+
77
+ At present, your map will not display results in `HebrewWord#inspect`
78
+
79
+ ## Appendix: Pre-Processing
80
+ When a word is transliterated, it is pre-processed to normalize certain characters.
81
+ Specifically:
82
+ * Whitespace is stripped
83
+ * The final letters `[םןךףץ]` are normalized to their standard forms
84
+ * _CHATAF_ _nekudos_ `['ֲ','ֳ','ֱ']` are normalized to their standard forms
85
+ * Full _CHIRIK_, _TZEIREI_, and _CHOLOM_ _nekudos_ have their letters removed
86
+ * _DAGESH_ characters are removed from all but the characters `[בוכפת]`
data/Rakefile ADDED
@@ -0,0 +1,29 @@
1
# Rakefile for the TranslitKit gem.
# Defines the documentation (`rake rdoc`), gem packaging, and test
# (`rake test`) tasks. Running plain `rake` runs the test suite.

# Activate the bundle so the gems from the Gemfile are on the load path.
# A missing Bundler is reported but deliberately not fatal at this point.
begin
  require 'bundler/setup'
rescue LoadError
  puts 'You must `gem install bundler` and `bundle install` to run rake tasks'
end

require 'rdoc/task'

# `rake rdoc`: generate API documentation into ./rdoc
RDoc::Task.new(:rdoc) do |rdoc|
  rdoc.rdoc_dir = 'rdoc'
  rdoc.title    = 'TranslitKit'
  rdoc.options << '--line-numbers'
  # The gem ships README.md at its root; there is no README.rdoc there,
  # so the task must point at the Markdown file.
  rdoc.rdoc_files.include('README.md')
  rdoc.rdoc_files.include('lib/**/*.rb')
end

# Register Bundler's gem packaging tasks.
Bundler::GemHelper.install_tasks

require 'rake/testtask'

# `rake test`: run every file matching test/**/*_test.rb
Rake::TestTask.new(:test) do |t|
  t.libs << 'lib'
  t.libs << 'test'
  t.pattern = 'test/**/*_test.rb'
  t.verbose = false
end

# Plain `rake` runs the tests.
task default: :test
data/lib/hebrewword.rb ADDED
@@ -0,0 +1,60 @@
1
+ =begin
2
+ HebrewWord.rb
3
+
4
+ Wraps a Hebrew word.
5
+
6
+ Methods:
7
+ * raw -> returns the original word
8
+ * to_s -> Alias to `raw`
9
+ * phonemes -> Returns an Array of phonemes (see Class::Phonemizer)
10
+ * transliterate(list_name) -> Returns an Array of transliterated strings
11
+ * t -> Alias for `transliterate`
12
+ * inspect -> Returns an informative string of the original Hebrew, and the available translit counts
13
+
14
+ =end
15
+
16
+ require 'phoneme_maps'
17
+ require 'phonemizer'
18
+ require 'transliterator'
19
+
20
+ # The user-facing transliterator class
21
# User-facing wrapper around a single Hebrew word.
#
# Holds the original text and delegates the real work to Phonemizer
# (splitting into phonemes) and Transliterator (producing the English
# spellings).
class HebrewWord

  # string - a Unicode Hebrew word, nikud included (e.g. "עַקֵדָה")
  def initialize(string)
    @hebword = string
  end

  # The Hebrew text exactly as it was passed to the initializer.
  def raw
    @hebword
  end

  # String form of the word; same value as #raw.
  def to_s
    raw
  end

  # Summary string listing how many transliterations each built-in map
  # yields, in the form:
  #   `hebrew_text`: Permutations: `x` single | `y` short | `z` long
  # Every map is transliterated in full just to be counted.
  def inspect
    counts = [:single, :short, :long].map do |map_name|
      "#{transliterate(map_name).length} #{map_name}"
    end
    "#{@hebword}: Permutations: #{counts.join(' | ')}"
  end

  # The word's phonemes, as produced by Phonemizer#phonemes.
  def phonemes
    phonemizer = Phonemizer.new(@hebword)
    phonemizer.phonemes
  end

  # Every transliteration of the word under the phoneme map `list_name`
  # (built-in options: :long, :short, :single).
  # When omitted, nil is handed to Transliterator; the documented default
  # map is :short.
  def transliterate(list_name = nil)
    transliterator = Transliterator.new(@hebword, list_name)
    transliterator.transliterate
  end

  # Shorthand for #transliterate.
  def t(list_name = nil)
    transliterate(list_name)
  end
end
data/lib/permuter.rb ADDED
@@ -0,0 +1,97 @@
1
+ =begin
2
+ Permuter.rb
3
+
4
+ Encapsulates the logic of creating permutations
5
+
6
+ Usage:
7
+ p = Permuter.new
8
+ p.add_array [0,1]
9
+ p.add_array [0,1]
10
+ p.permutations
11
+ => ["00","01","10","11"]
12
+
13
+ Methods:
14
+ #add_array arr
15
+ #permutations
16
+
17
+ #any?
18
+ #empty?
19
+ #clear
20
+
21
+ Test Suite:
22
+ Complete
23
+ =end
24
+
25
# Collects arrays and produces every way of picking one element from each,
# joined into a single String per pick.
class Permuter
  def initialize
    @arrays = []
  end

  # Register another array of options.
  # Raises a RuntimeError when given nil.
  def add_array(arr)
    raise "Cannot add nil array" if arr.nil?
    @arrays << arr
  end

  # Forget every registered array.
  def clear
    @arrays = []
  end

  # True when at least one array has been registered.
  def any?
    @arrays.any?
  end

  # True when no array has been registered.
  def empty?
    @arrays.empty?
  end

  # All combinations of the registered arrays, as an Array of Strings.
  #
  # One element is taken from each array, in registration order, and the
  # picks are concatenated. The last array registered varies fastest, so
  # [0,1] x [0,1] gives ["00", "01", "10", "11"].
  # Returns an empty Array when nothing was registered (or when any
  # registered array is empty).
  def permutations
    return [] if @arrays.empty?

    leading, *remaining = @arrays
    # Array#product walks the options in the same odometer order as
    # nested loops over each array.
    @permutations = leading.product(*remaining).map { |picks| picks.join('') }
  end
end
@@ -0,0 +1,80 @@
1
+ =begin
2
+
3
+ PhonemeMaps.rb
4
+ Loads phoneme_map files
5
+
6
+ Lazily loads by default;
7
+ The file is loaded on the first method call,
8
+ and is cached for future calls.
9
+
10
+ For eager loading, pass true in the initializer.
11
+
12
+ Methods:
13
+ * initialize(eager?) ->
14
+ * long
15
+ * short
16
+ * single
17
+ * loaded? (:list_name)
18
+
19
+ Test Suite:
20
+ Complete
21
+ =end
22
+
23
+ require 'json'
24
+
25
+ lib_directory = File.dirname(__FILE__)
26
+ FILE_DIRECTORY = "#{lib_directory}/phoneme_maps"
27
+
28
# Loads phoneme map files (JSON) from FILE_DIRECTORY, a constant defined
# at file level just above this class.
class PhonemeMaps

  # Load the phoneme map named by `symbol`.
  # The symbol is turned into "<FILE_DIRECTORY>/<symbol>.json".
  # Returns the parsed JSON (a Hash for a well-formed map file).
  # Raises RuntimeError if the file is missing or its JSON is malformed.
  def load(symbol)
    load_file "#{FILE_DIRECTORY}/#{symbol}.json"
  end

  # The directory that map files are loaded from.
  def directory
    FILE_DIRECTORY
  end

  # Parse `text` as JSON and return the result.
  # Raises a RuntimeError with troubleshooting hints (and the offending
  # text) when the JSON is malformed.
  def validate_json(text)
    JSON.parse text
  rescue JSON::ParserError
    raise "JSON is not formatted properly.\nTry validating it at JSONlint.com (Look out for missing braces and missing/extra commas)\n File contents: #{text}"
  end

  # Open `path` read-only and return the File; the caller must close it.
  # Raises a RuntimeError naming the file and listing the directory's
  # contents when the file cannot be found.
  def open_file_safely(path)
    File.open path, 'r'
  rescue Errno::ENOENT
    # Split with the File helpers rather than by hand, so a path without
    # any '/' still produces the informative error below.
    dir      = File.join(File.dirname(path), '')
    filename = File.basename(path)
    # The directory itself may be what is missing; listing it blindly
    # would raise a second, unhelpful ENOENT from inside this handler.
    contents = File.directory?(dir) ? Dir.entries(dir).inspect : '(directory does not exist)'
    raise "Unknown list name. Could not find file `#{filename}` in directory `#{dir}`.\n\n" \
          "Is the file name spelled correctly, or altered somewhere in your code?\n\n" \
          "Contents of directory:\n#{contents}"
  end

  private

  # Read the whole file at `path` and parse it with #validate_json.
  # Returns the parsed JSON.
  def load_file(path)
    file = open_file_safely(path)
    text = begin
      file.read
    ensure
      # Close the handle even if reading fails.
      file.close
    end
    validate_json text
  end

end
@@ -0,0 +1,41 @@
1
+ {
2
+ "א": ["", "a"],
3
+ "ב": ["v", "bb"],
4
+ "בּ": ["v", "b", "bb"],
5
+ "ג": ["g", "gg"],
6
+ "ד": ["d", "dd"],
7
+ "ה": ["", "h"],
8
+ "ו": ["v", "w"],
9
+ "וּּ": ["oo", "ou"],
10
+ "ז": ["z", "s", "zz", "ss"],
11
+ "ח": ["ch", "h", "kh"],
12
+ "חַ": ["ach"],
13
+ "ט": ["t", "tt", "th"],
14
+ "י": ["y"],
15
+ "כ": ["ch", "h", "k", "c", "kk", "cc"],
16
+ "כּ": ["k", "c", "kk", "cc"],
17
+ "ל": ["l", "ll"],
18
+ "מ": ["m", "mm"],
19
+ "נ": ["n", "nn"],
20
+ "ס": ["s", "ss"],
21
+ "ע": [""],
22
+ "פ": ["f", "ff", "ph", "p", "pp"],
23
+ "פּ": ["p", "pp"],
24
+ "צ": ["ts", "tz", "s", "z"],
25
+ "ק": ["k", "c", "kk", "cc"],
26
+ "ר": ["r", "rr"],
27
+ "שׁ": ["sh", "ss", "s", "ch", "sch"],
28
+ "ש": ["s", "ss", "sh"],
29
+ "ת": ["s", "ss", "t", "tt", "th"],
30
+ "תּ": ["t", "t", "th"],
31
+ "ָ": ["o", "oo", "oh", "a", "ah", "aa", "e", "ee", "i", "a"],
32
+ "ַ": ["a", "o", "ah", "oh", ""],
33
+ "ֵ": ["e", "ei", "ey", "eh", "ay", "ai", ""],
34
+ "ֶ": ["e", "eh", "ei"],
35
+ "ִ": ["i", "e", "ee"],
36
+ "ֹ": ["o", "oh", "oi", "oy", "ey", "ow"],
37
+ "וֹ": ["o", "oh", "oi", "oy", "ey", "ow"],
38
+ "וּ": ["u", "oo", "i", "ee"],
39
+ "ֻ": ["u", "oo", "i", "ee"],
40
+ "ְ": ["u", "o", "e"]
41
+ }
@@ -0,0 +1,39 @@
1
+ {
2
+ "א": [""],
3
+ "ב": ["v"],
4
+ "בּ": ["b","bb"],
5
+ "ג": ["g","gg"],
6
+ "ד": ["d","dd"],
7
+ "ה": ["","h"],
8
+ "ו": ["v"],
9
+ "ז": ["z","zz"],
10
+ "ח": ["ch"],
11
+ "חַ": ["ach"],
12
+ "ט": ["t","tt"],
13
+ "י": ["y",""],
14
+ "כ": ["ch"],
15
+ "כּ": ["k","c","kk","cc"],
16
+ "ל": ["l","ll"],
17
+ "מ": ["m","mm"],
18
+ "נ": ["n","nn"],
19
+ "ס": ["s","ss"],
20
+ "ע": ["a"],
21
+ "פ": ["f","ff","ph"],
22
+ "פּ": ["p","pp"],
23
+ "צ": ["ts","tz","tez","z"],
24
+ "ק": ["k","kk"],
25
+ "ר": ["r"],
26
+ "שׁ": ["sh"],
27
+ "ש": ["s","ss"],
28
+ "ת": ["s","ss","th","t"],
29
+ "תּ": ["t","tt"],
30
+ "ָ": ["o", "a", "e"],
31
+ "ַ": ["a"],
32
+ "ֵ": ["ay","ai","e","ei"],
33
+ "ֶ": ["e","a"],
34
+ "ִ": ["i","ee"],
35
+ "ֹ": ["a","o",""],
36
+ "וּ": ["u","oo","eu"],
37
+ "ֻ": ["u","oo","eu"],
38
+ "ְ": ["a","e","i","'"]
39
+ }