translit_kit 0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. checksums.yaml +7 -0
  2. data/MIT-LICENSE +20 -0
  3. data/README.md +86 -0
  4. data/Rakefile +29 -0
  5. data/lib/hebrewword.rb +60 -0
  6. data/lib/permuter.rb +97 -0
  7. data/lib/phoneme_maps.rb +80 -0
  8. data/lib/phoneme_maps/long.json +41 -0
  9. data/lib/phoneme_maps/short.json +39 -0
  10. data/lib/phoneme_maps/single.json +40 -0
  11. data/lib/phonemizer.rb +170 -0
  12. data/lib/readme.md +120 -0
  13. data/lib/translit_kit.rb +2 -0
  14. data/lib/translit_kit/version.rb +3 -0
  15. data/lib/transliterator.rb +115 -0
  16. data/test/dummy/README.rdoc +28 -0
  17. data/test/dummy/Rakefile +6 -0
  18. data/test/dummy/app/assets/javascripts/application.js +13 -0
  19. data/test/dummy/app/assets/stylesheets/application.css +15 -0
  20. data/test/dummy/app/controllers/application_controller.rb +5 -0
  21. data/test/dummy/app/helpers/application_helper.rb +2 -0
  22. data/test/dummy/app/views/layouts/application.html.erb +14 -0
  23. data/test/dummy/bin/bundle +3 -0
  24. data/test/dummy/bin/rails +4 -0
  25. data/test/dummy/bin/rake +4 -0
  26. data/test/dummy/bin/setup +34 -0
  27. data/test/dummy/bin/update +29 -0
  28. data/test/dummy/config.ru +4 -0
  29. data/test/dummy/config/application.rb +15 -0
  30. data/test/dummy/config/boot.rb +3 -0
  31. data/test/dummy/config/cable.yml +9 -0
  32. data/test/dummy/config/database.yml +25 -0
  33. data/test/dummy/config/environment.rb +5 -0
  34. data/test/dummy/config/environments/development.rb +54 -0
  35. data/test/dummy/config/environments/production.rb +86 -0
  36. data/test/dummy/config/environments/test.rb +42 -0
  37. data/test/dummy/config/initializers/application_controller_renderer.rb +6 -0
  38. data/test/dummy/config/initializers/assets.rb +11 -0
  39. data/test/dummy/config/initializers/backtrace_silencers.rb +7 -0
  40. data/test/dummy/config/initializers/cookies_serializer.rb +5 -0
  41. data/test/dummy/config/initializers/filter_parameter_logging.rb +4 -0
  42. data/test/dummy/config/initializers/inflections.rb +16 -0
  43. data/test/dummy/config/initializers/mime_types.rb +4 -0
  44. data/test/dummy/config/initializers/new_framework_defaults.rb +23 -0
  45. data/test/dummy/config/initializers/session_store.rb +3 -0
  46. data/test/dummy/config/initializers/wrap_parameters.rb +14 -0
  47. data/test/dummy/config/locales/en.yml +23 -0
  48. data/test/dummy/config/puma.rb +47 -0
  49. data/test/dummy/config/routes.rb +3 -0
  50. data/test/dummy/config/secrets.yml +22 -0
  51. data/test/dummy/config/spring.rb +6 -0
  52. data/test/dummy/db/test.sqlite3 +0 -0
  53. data/test/dummy/log/test.log +85939 -0
  54. data/test/dummy/public/404.html +67 -0
  55. data/test/dummy/public/422.html +67 -0
  56. data/test/dummy/public/500.html +66 -0
  57. data/test/dummy/public/favicon.ico +0 -0
  58. data/test/hebrewword_test.rb +45 -0
  59. data/test/permuter_test.rb +53 -0
  60. data/test/phoneme_maps_test.rb +29 -0
  61. data/test/phonemizer_test.rb +209 -0
  62. data/test/test_helper.rb +29 -0
  63. data/test/transliterator_test.rb +75 -0
  64. metadata +155 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 7280e8d3c76a0d829c9675616aef2913f879c9ad
4
+ data.tar.gz: 11188ff85dfe552a5057ad0bb7ca4003725343b8
5
+ SHA512:
6
+ metadata.gz: 831e6a6bb8c98af691721f055d74be5d415bfda3fe694f194133893218ae29af6df25b288b9bfeea656dc144a960c22c3dbe0d479afdf279c529676c5d0c9275
7
+ data.tar.gz: 18d2d3eee2be1383118ee6fd56547bc9974c74547a7bf4aed8bff8a9e3f08397409855f7495e16507bf6a7cb6da08f00aed17191f128a07218e4839525ac8582
data/MIT-LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright 2017 Michoel Samuels
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,86 @@
1
+ # TranslitKit
2
+
3
+ [![Build Status](https://travis-ci.org/AnalyzePlatypus/TranslitKit.svg?branch=master)](https://travis-ci.org/AnalyzePlatypus/TranslitKit)
4
+ [![Code Climate](https://codeclimate.com/github/AnalyzePlatypus/TranslitKit/badges/gpa.svg)](https://codeclimate.com/github/AnalyzePlatypus/TranslitKit)
5
+ [![Coverage Status](https://coveralls.io/repos/github/AnalyzePlatypus/TranslitKit/badge.svg?branch=master)](https://coveralls.io/github/AnalyzePlatypus/TranslitKit?branch=master)
6
+ [![Inline docs](http://inch-ci.org/github/AnalyzePlatypus/TranslitKit.svg?branch=master)](http://inch-ci.org/github/AnalyzePlatypus/TranslitKit)
7
+
8
+ *TranslitKit* is a framework for Hebrew-English transliteration.
9
+
10
+ Example:
11
+ ```ruby
12
+ require 'translit_kit'
13
+ word = HebrewWord.new "אַברָהָם"
14
+ word.transliterate(:single)
15
+ # => ["avrohom"]
16
+
17
+ # Shortcut
18
+ word.t(:single)
19
+ # => ["avrohom"]
20
+ ```
21
+ Transliteration is powered by _phoneme maps_, files that map between Hebrew _phonemes_, or units of sound, and English characters. (see below)
22
+
23
+ Three `phoneme_maps` are provided: `:long`, `:short`, and `:single`.
24
+ You can easily add your own (see below)
25
+
26
+ ```ruby
27
+ word.t(:single)
28
+ # => ["avrohom"]
29
+ word.t(:short)
30
+ # => ["avroom", "avroam", "avroem", "avrohom", "avroham",
31
+ # "avrohem", "avraom", "avraam", "avraem", "avrahom",
32
+ # "avraham", "avrahem", "avreom", "avream", "avreem",
33
+ # "avrehom", "avreham", "avrehem" ]
34
+ word.t(:long)
35
+ # => ["avroom", "avrooom", "avroohm", ... ] # 5,997 more!
36
+ ```
37
+
38
+ The default is `:short`:
39
+ ```ruby
40
+ word.t == word.t(:short)
41
+ # => true
42
+ ```
43
+ To get the total permutation count, call `HebrewWord#inspect`
44
+ ```ruby
45
+ word.inspect
46
+ # => "אַברָהָם: Permutations: 1 single | 18 short | 6000 long"
47
+ ```
48
+
49
+ ## Adding Custom Phoneme maps
50
+ ###### Format
51
+ _Phoneme Maps_ are simply JSON files, placed in the `lib/phoneme_maps` directory.
52
+
53
+ The file should map each phoneme (a `String`) to an `Array` of replacement characters.
54
+
55
+ ```json
56
+ {
57
+ "ב": ["v"],
58
+ "בּ": ["b", "bb"]
59
+ }
60
+ ```
61
+
62
+ A _phoneme_ can be a Hebrew character `א`, _nekuda_ (`ָ`), or character with modifiers, such as a _dagesh_ (`בּ`). Keep in mind that many characters will be normalized (see below).
63
+
64
+ ###### Installation
65
+ To install your custom map, place the file in `lib/phoneme_maps`
66
+
67
+ Your file will be available as the symbol `:<filename>`, without the `.json` extension.
68
+
69
+ Example: `klingon.json` becomes `:klingon`
70
+
71
+ Now you can use it anywhere:
72
+ ```ruby
73
+ word.transliterate(:klingon)
74
+ # => (Results)
75
+ ```
76
+
77
+ At present, your map will not display results in `HebrewWord#inspect`
78
+
79
+ ## Appendix: Pre-Processing
80
+ When a word is transliterated, it is pre-processed to normalize certain characters.
81
+ Specifically:
82
+ * Whitespace is stripped
83
+ * The final letters `[םןךףץ]` are normalized to their standard forms
84
+ * _CHATAF_ _nekudos_ `['ֲ','ֳ','ֱ']` are normalized to their standard forms
85
+ * Full _CHIRIK_, _TZEIREI_, and _CHOLOM_ _nekudos_ have their letters removed
86
+ * _DAGESH_ characters are removed from all but the characters `[בוכפת]`
data/Rakefile ADDED
@@ -0,0 +1,29 @@
1
# Rake tasks for TranslitKit: RDoc generation, the Bundler gem tasks,
# and the test suite (which is also the default task).

begin
  require 'bundler/setup'
rescue LoadError
  puts 'You must `gem install bundler` and `bundle install` to run rake tasks'
end

require 'rdoc/task'

RDoc::Task.new(:rdoc) do |rdoc|
  rdoc.rdoc_dir = 'rdoc'
  rdoc.title    = 'TranslitKit'
  rdoc.options << '--line-numbers'
  # The gem's top-level README is README.md; including the non-existent
  # README.rdoc silently left the README out of the generated docs.
  rdoc.rdoc_files.include('README.md')
  rdoc.rdoc_files.include('lib/**/*.rb')
end

# When Bundler could not be loaded the hint above has already been printed;
# skip the gem tasks instead of dying with a NameError so the remaining
# tasks (rdoc, test) stay usable.
Bundler::GemHelper.install_tasks if defined?(Bundler::GemHelper)

require 'rake/testtask'

# Runs every test/**/*_test.rb file with lib/ and test/ on the load path.
Rake::TestTask.new(:test) do |t|
  t.libs << 'lib'
  t.libs << 'test'
  t.pattern = 'test/**/*_test.rb'
  t.verbose = false
end

task default: :test
data/lib/hebrewword.rb ADDED
@@ -0,0 +1,60 @@
1
+ =begin
2
+ HebrewWord.rb
3
+
4
+ Wraps a Hebrew word.
5
+
6
+ Methods:
7
+ * raw -> returns the original word
8
+ * to_s -> Alias to `raw`
9
+ * phonemes -> Returns an Array of phonemes (see Class::Phonemizer)
10
+ * transliterate(list_name) -> Returns an Array of transliterated strings
11
+ * t -> Alias for `transliterate`
12
+ * inspect -> Returns an informative string of the original Hebrew, and the available translit counts
13
+
14
+ =end
15
+
16
+ require 'phoneme_maps'
17
+ require 'phonemizer'
18
+ require 'transliterator'
19
+
20
# The user-facing transliterator class.
#
# Wraps a single Hebrew word and exposes its raw text, its phonemes and
# its transliterations. The real work is delegated to `Phonemizer` and
# `Transliterator`, which are loaded by the requires at the top of this file.
class HebrewWord
  # Phoneme maps whose permutation counts are reported by #inspect,
  # in the order they appear in the output.
  INSPECTED_MAPS = [:single, :short, :long].freeze

  # Initializer
  # Expects a Unicode Hebrew word (i.e. "עַקֵדָה").
  # The string is stored as given; it is neither copied nor validated here.
  def initialize(string)
    @hebword = string
  end

  # Get the raw Hebrew text of the word (including NIKUD)
  def raw
    @hebword
  end

  # Alias of `raw`
  def to_s
    raw
  end

  # Returns a `String` of format:
  # `hebrew_text`: Permutations: `x` single | `y` short | `z` long
  #
  # Each listed map is fully transliterated just to count its results.
  def inspect
    counts = INSPECTED_MAPS.map do |map_name|
      "#{transliterate(map_name).length} #{map_name}"
    end
    "#{@hebword}: Permutations: #{counts.join(' | ')}"
  end

  # Returns the result of `Phonemizer#phonemes` for this word.
  def phonemes
    Phonemizer.new(@hebword).phonemes
  end

  # Return an `Array` of all possible transliterations of the word
  # as defined by the optional `list_name` argument.
  # Options: [:long, :short, :single]
  #
  # When omitted, `nil` is handed to `Transliterator`; the documented default
  # of `:short` is presumably applied there (not visible from this class).
  def transliterate(list_name = nil)
    Transliterator.new(@hebword, list_name).transliterate
  end

  # Alias for #transliterate
  def t(list_name = nil)
    transliterate(list_name)
  end
end
data/lib/permuter.rb ADDED
@@ -0,0 +1,97 @@
1
+ =begin
2
+ Permuter.rb
3
+
4
+ Encapsulates the logic of creating permutations
5
+
6
+ Usage:
7
+ p = Permuter.new
8
+ p.add_array [0,1]
9
+ p.add_array [0,1]
10
+ p.permutations
11
+ => ["00","01","10","11"]
12
+
13
+ Methods:
14
+ #add_array arr
15
+ #permutations
16
+
17
+ #any?
18
+ #empty?
19
+ #clear
20
+
21
+ Test Suite:
22
+ Complete
23
+ =end
24
+
25
# Collects arrays and produces every combination that takes exactly one
# element from each of them, in the order the arrays were added.
#
#   p = Permuter.new
#   p.add_array [0,1]
#   p.add_array [0,1]
#   p.permutations # => ["00","01","10","11"]
class Permuter
  def initialize
    @arrays = []
  end

  # Add an array to be permuted.
  # Raises a RuntimeError if given nil.
  # Returns the internal list of registered arrays.
  def add_array(arr)
    raise "Cannot add nil array" if arr.nil?
    @arrays << arr
  end

  # Remove all arrays
  def clear
    @arrays = []
  end

  # True once at least one array has been registered
  def any?
    @arrays.any?
  end

  # True while no array has been registered
  def empty?
    @arrays.empty?
  end

  # Get all permutations of the previously registered arrays.
  # Returns an array of strings, each the concatenation of one element per
  # registered array; the first array varies slowest, the last one fastest.
  # Returns an empty array if none were registered, or if any registered
  # array is empty.
  def permutations
    return [] if @arrays.empty?

    first, *rest = @arrays
    # Array#product yields the cartesian product in exactly the order the
    # former hand-written recursion visited it.
    first.product(*rest).map { |combination| combination.join('') }
  end
end
@@ -0,0 +1,80 @@
1
+ =begin
2
+
3
+ PhonemeMaps.rb
4
+ Loads phoneme_map files
5
+
6
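+ Each map is read from disk and parsed on every call to `load`;
7
+ nothing is loaded eagerly or cached between calls.
8
+
9
+ Maps are looked up by name in the `phoneme_maps` directory
10
+ next to this file, e.g. `load(:short)` reads `short.json`.
11
+
12
+ Methods:
13
+ * load(symbol) -> Hash parsed from `<symbol>.json`
14
+ * directory -> path of the directory that is searched
15
+ * validate_json(text) -> parsed JSON, or raises if malformed
16
+ * open_file_safely(path) -> open File, or raises if missing
17
+ * load_file(path) (private) -> reads and parses a file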
18
+
19
+ Test Suite:
20
+ Complete
21
+ =end
22
+
23
+ require 'json'
24
+
25
# Directory that holds the bundled phoneme map JSON files:
# the `phoneme_maps` folder next to this file.
lib_directory = File.dirname(__FILE__)
FILE_DIRECTORY = "#{lib_directory}/phoneme_maps".freeze

# Loads phoneme map files (JSON) by name from FILE_DIRECTORY.
# Every call reads and parses the file again; nothing is cached.
class PhonemeMaps

  # Takes a symbol, converts it into a file name
  # (`:short` -> `<FILE_DIRECTORY>/short.json`)
  # and attempts to load its contents.
  # Returns the parsed JSON (a Hash for the bundled maps).
  # Raises a RuntimeError if the file is missing or is not valid JSON.
  def load(symbol)
    load_file "#{FILE_DIRECTORY}/#{symbol}.json"
  end

  # What directory are we searching in?
  def directory
    FILE_DIRECTORY
  end

  # Parses a string as JSON and returns the result.
  # Raises an informative RuntimeError if the JSON is malformed.
  def validate_json(text)
    JSON.parse(text)
  rescue JSON::ParserError
    raise "JSON is not formatted properly.\nTry validating it at JSONlint.com (Look out for missing braces and missing/extra commas)\n File contents: #{text}"
  end

  # Opens the file at `path` for reading and returns the open File;
  # the caller is responsible for closing it.
  # The file is read as UTF-8 regardless of the process locale, since the
  # maps contain Hebrew text.
  # Raises an informative RuntimeError if the file cannot be found.
  def open_file_safely(path)
    File.open(path, 'r:UTF-8')
  rescue Errno::ENOENT
    # Split the path only on failure, and with File.dirname/basename so a
    # bare file name (no '/') no longer blows up with a NoMethodError.
    dir = File.dirname(path)
    filename = File.basename(path)
    # Listing a directory that does not exist would raise again and hide
    # the message below, so fall back to an empty listing.
    entries = Dir.exist?(dir) ? Dir.entries(dir) : []
    raise "Unknown list name. Could not find file `#{filename}` in directory `#{dir}`.\n" \
          "Is the file name spelled correctly, or altered somewhere in your code?\n" \
          "Contents of directory:\n" \
          "#{entries}"
  end

  private

  # Loads the file from the supplied path
  # and parses it with `JSON.parse` (via #validate_json).
  # Returns the parsed JSON.
  def load_file(path)
    file = open_file_safely(path)
    begin
      validate_json(file.read)
    ensure
      # Close the handle even when reading or parsing raises.
      file.close
    end
  end

end
@@ -0,0 +1,41 @@
1
+ {
2
+ "א": ["", "a"],
3
+ "ב": ["v", "bb"],
4
+ "בּ": ["v", "b", "bb"],
5
+ "ג": ["g", "gg"],
6
+ "ד": ["d", "dd"],
7
+ "ה": ["", "h"],
8
+ "ו": ["v", "w"],
9
+ "וּּ": ["oo", "ou"],
10
+ "ז": ["z", "s", "zz", "ss"],
11
+ "ח": ["ch", "h", "kh"],
12
+ "חַ": ["ach"],
13
+ "ט": ["t", "tt", "th"],
14
+ "י": ["y"],
15
+ "כ": ["ch", "h", "k", "c", "kk", "cc"],
16
+ "כּ": ["k", "c", "kk", "cc"],
17
+ "ל": ["l", "ll"],
18
+ "מ": ["m", "mm"],
19
+ "נ": ["n", "nn"],
20
+ "ס": ["s", "ss"],
21
+ "ע": [""],
22
+ "פ": ["f", "ff", "ph", "p", "pp"],
23
+ "פּ": ["p", "pp"],
24
+ "צ": ["ts", "tz", "s", "z"],
25
+ "ק": ["k", "c", "kk", "cc"],
26
+ "ר": ["r", "rr"],
27
+ "שׁ": ["sh", "ss", "s", "ch", "sch"],
28
+ "ש": ["s", "ss", "sh"],
29
+ "ת": ["s", "ss", "t", "tt", "th"],
30
+ "תּ": ["t", "t", "th"],
31
+ "ָ": ["o", "oo", "oh", "a", "ah", "aa", "e", "ee", "i", "a"],
32
+ "ַ": ["a", "o", "ah", "oh", ""],
33
+ "ֵ": ["e", "ei", "ey", "eh", "ay", "ai", ""],
34
+ "ֶ": ["e", "eh", "ei"],
35
+ "ִ": ["i", "e", "ee"],
36
+ "ֹ": ["o", "oh", "oi", "oy", "ey", "ow"],
37
+ "וֹ": ["o", "oh", "oi", "oy", "ey", "ow"],
38
+ "וּ": ["u", "oo", "i", "ee"],
39
+ "ֻ": ["u", "oo", "i", "ee"],
40
+ "ְ": ["u", "o", "e"]
41
+ }
@@ -0,0 +1,39 @@
1
+ {
2
+ "א": [""],
3
+ "ב": ["v"],
4
+ "בּ": ["b","bb"],
5
+ "ג": ["g","gg"],
6
+ "ד": ["d","dd"],
7
+ "ה": ["","h"],
8
+ "ו": ["v"],
9
+ "ז": ["z","zz"],
10
+ "ח": ["ch"],
11
+ "חַ": ["ach"],
12
+ "ט": ["t","tt"],
13
+ "י": ["y",""],
14
+ "כ": ["ch"],
15
+ "כּ": ["k","c","kk","cc"],
16
+ "ל": ["l","ll"],
17
+ "מ": ["m","mm"],
18
+ "נ": ["n","nn"],
19
+ "ס": ["s","ss"],
20
+ "ע": ["a"],
21
+ "פ": ["f","ff","ph"],
22
+ "פּ": ["p","pp"],
23
+ "צ": ["ts","tz","tez","z"],
24
+ "ק": ["k","kk"],
25
+ "ר": ["r"],
26
+ "שׁ": ["sh"],
27
+ "ש": ["s","ss"],
28
+ "ת": ["s","ss","th","t"],
29
+ "תּ": ["t","tt"],
30
+ "ָ": ["o", "a", "e"],
31
+ "ַ": ["a"],
32
+ "ֵ": ["ay","ai","e","ei"],
33
+ "ֶ": ["e","a"],
34
+ "ִ": ["i","ee"],
35
+ "ֹ": ["a","o",""],
36
+ "וּ": ["u","oo","eu"],
37
+ "ֻ": ["u","oo","eu"],
38
+ "ְ": ["a","e","i","'"]
39
+ }