interscript 0.1.6 → 2.1.0a9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (226) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +11 -0
  3. data/.rspec +3 -0
  4. data/Gemfile +29 -0
  5. data/LICENSE.adoc +31 -0
  6. data/README.md +3 -0
  7. data/Rakefile +53 -0
  8. data/bin/console +14 -0
  9. data/bin/interscript +3 -39
  10. data/bin/maps_analyze_staging +168 -0
  11. data/bin/maps_debug_compilers +58 -0
  12. data/bin/maps_debug_ordering +88 -0
  13. data/bin/maps_debug_ruby_compile +24 -0
  14. data/bin/maps_debug_step_by_step +44 -0
  15. data/bin/maps_optimize_order +112 -0
  16. data/bin/maps_v1_analyze_regexps +45 -0
  17. data/bin/maps_v1_to_v2 +426 -0
  18. data/exe/interscript +6 -0
  19. data/interscript.gemspec +31 -0
  20. data/lib/interscript.rb +81 -127
  21. data/lib/interscript/command.rb +5 -5
  22. data/lib/interscript/compiler.rb +22 -0
  23. data/lib/interscript/compiler/javascript.rb +292 -0
  24. data/lib/interscript/compiler/ruby.rb +262 -0
  25. data/lib/interscript/dsl.rb +67 -0
  26. data/lib/interscript/dsl/aliases.rb +23 -0
  27. data/lib/interscript/dsl/document.rb +46 -0
  28. data/lib/interscript/dsl/group.rb +45 -0
  29. data/lib/interscript/dsl/group/parallel.rb +6 -0
  30. data/lib/interscript/dsl/items.rb +89 -0
  31. data/lib/interscript/dsl/metadata.rb +26 -0
  32. data/lib/interscript/dsl/stage.rb +6 -0
  33. data/lib/interscript/dsl/symbol_mm.rb +11 -0
  34. data/lib/interscript/dsl/tests.rb +12 -0
  35. data/lib/interscript/interpreter.rb +251 -0
  36. data/lib/interscript/node.rb +25 -0
  37. data/lib/interscript/node/alias_def.rb +15 -0
  38. data/lib/interscript/node/dependency.rb +13 -0
  39. data/lib/interscript/node/document.rb +45 -0
  40. data/lib/interscript/node/group.rb +34 -0
  41. data/lib/interscript/node/group/parallel.rb +9 -0
  42. data/lib/interscript/node/group/sequential.rb +2 -0
  43. data/lib/interscript/node/item.rb +52 -0
  44. data/lib/interscript/node/item/alias.rb +42 -0
  45. data/lib/interscript/node/item/any.rb +61 -0
  46. data/lib/interscript/node/item/capture.rb +50 -0
  47. data/lib/interscript/node/item/group.rb +51 -0
  48. data/lib/interscript/node/item/repeat.rb +40 -0
  49. data/lib/interscript/node/item/stage.rb +23 -0
  50. data/lib/interscript/node/item/string.rb +51 -0
  51. data/lib/interscript/node/metadata.rb +18 -0
  52. data/lib/interscript/node/rule.rb +6 -0
  53. data/lib/interscript/node/rule/funcall.rb +18 -0
  54. data/lib/interscript/node/rule/run.rb +15 -0
  55. data/lib/interscript/node/rule/sub.rb +65 -0
  56. data/lib/interscript/node/stage.rb +19 -0
  57. data/lib/interscript/node/tests.rb +15 -0
  58. data/lib/interscript/stdlib.rb +211 -0
  59. data/lib/interscript/utils/regexp_converter.rb +283 -0
  60. data/lib/interscript/version.rb +1 -1
  61. data/requirements.txt +1 -0
  62. metadata +75 -339
  63. data/README.adoc +0 -298
  64. data/bin/rspec +0 -29
  65. data/lib/__pycache__/g2pwrapper.cpython-38.pyc +0 -0
  66. data/lib/g2pwrapper.py +0 -34
  67. data/lib/interscript-opal.rb +0 -2
  68. data/lib/interscript/fs.rb +0 -71
  69. data/lib/interscript/mapping.rb +0 -142
  70. data/lib/interscript/opal.rb +0 -27
  71. data/lib/interscript/opal/maps.js.erb +0 -10
  72. data/lib/interscript/opal_map_translate.rb +0 -12
  73. data/lib/model-7 +0 -0
  74. data/lib/tha-pt-b-7 +0 -0
  75. data/maps/acadsin-zho-Hani-Latn-2002.yaml +0 -38912
  76. data/maps/alalc-amh-Ethi-Latn-1997.yaml +0 -509
  77. data/maps/alalc-amh-Ethi-Latn-2011.yaml +0 -138
  78. data/maps/alalc-ara-Arab-Latn-1997.yaml +0 -1283
  79. data/maps/alalc-asm-Deva-Latn-1997.yaml +0 -159
  80. data/maps/alalc-aze-Cyrl-Latn-1997.yaml +0 -141
  81. data/maps/alalc-bel-Cyrl-Latn-1997.yaml +0 -125
  82. data/maps/alalc-ben-Beng-Latn-2017.yaml +0 -130
  83. data/maps/alalc-bul-Cyrl-Latn-1997.yaml +0 -94
  84. data/maps/alalc-ell-Grek-Latn-1997.yaml +0 -624
  85. data/maps/alalc-ell-Grek-Latn-2010.yaml +0 -627
  86. data/maps/alalc-hin-Deva-Latn-2020.yaml +0 -159
  87. data/maps/alalc-kat-Geok-Latn-1997.yaml +0 -111
  88. data/maps/alalc-kat-Geor-Latn-1997.yaml +0 -146
  89. data/maps/alalc-kor-Hang-Latn-1997.yaml +0 -94
  90. data/maps/alalc-mar-Deva-Latn-1997.yaml +0 -170
  91. data/maps/alalc-mkd-Cyrl-Latn-1997.yaml +0 -114
  92. data/maps/alalc-mkd-Cyrl-Latn-2013.yaml +0 -103
  93. data/maps/alalc-pan-Deva-Latn-1997.yaml +0 -237
  94. data/maps/alalc-rus-Cyrl-Latn-1997.yaml +0 -221
  95. data/maps/alalc-rus-Cyrl-Latn-2012.yaml +0 -162
  96. data/maps/alalc-srp-Cyrl-Latn-1997.yaml +0 -114
  97. data/maps/alalc-srp-Cyrl-Latn-2013.yaml +0 -135
  98. data/maps/alalc-ukr-Cyrl-Latn-1997.yaml +0 -141
  99. data/maps/alalc-ukr-Cyrl-Latn-2011.yaml +0 -16
  100. data/maps/apcbg-bul-Cyrl-Latn-1995.yaml +0 -283
  101. data/maps/bas-rus-Cyrl-Latn-2017-bss.yaml +0 -174
  102. data/maps/bas-rus-Cyrl-Latn-2017-oss.yaml +0 -169
  103. data/maps/bgn-jpn-Hrkt-Latn-1962.yaml +0 -292
  104. data/maps/bgn-kor-Hang-Latn-1943.yaml +0 -31
  105. data/maps/bgn-kor-Kore-Latn-1943.yaml +0 -31
  106. data/maps/bgna-bul-Cyrl-Latn-2006.yaml +0 -208
  107. data/maps/bgna-bul-Cyrl-Latn-2009.yaml +0 -208
  108. data/maps/bgnpcgn-amh-Ethi-Latn-1967.yaml +0 -528
  109. data/maps/bgnpcgn-ara-Arab-Latn-1956.yaml +0 -592
  110. data/maps/bgnpcgn-arm-Armn-Latn-1981.yaml +0 -108
  111. data/maps/bgnpcgn-aze-Cyrl-Latn-1993.yaml +0 -104
  112. data/maps/bgnpcgn-bak-Cyrl-Latn-2007.yaml +0 -184
  113. data/maps/bgnpcgn-bel-Cyrl-Latn-1979.yaml +0 -285
  114. data/maps/bgnpcgn-bul-Cyrl-Latn-1952.yaml +0 -115
  115. data/maps/bgnpcgn-bul-Cyrl-Latn-2013.yaml +0 -38
  116. data/maps/bgnpcgn-ell-Grek-Latn-1962.yaml +0 -701
  117. data/maps/bgnpcgn-ell-Grek-Latn-1996.yaml +0 -19
  118. data/maps/bgnpcgn-jpn-Hrkt-Latn-1976.yaml +0 -257
  119. data/maps/bgnpcgn-kat-Geor-Latn-1981.yaml +0 -127
  120. data/maps/bgnpcgn-kat-Geor-Latn-2009.yaml +0 -42
  121. data/maps/bgnpcgn-kor-Hang-Latn-kn-1945.yaml +0 -253
  122. data/maps/bgnpcgn-kor-Hang-Latn-rok-2011.yaml +0 -48
  123. data/maps/bgnpcgn-kor-Kore-Latn-rok-2011.yaml +0 -48
  124. data/maps/bgnpcgn-mkd-Cyrl-Latn-1981.yaml +0 -159
  125. data/maps/bgnpcgn-mkd-Cyrl-Latn-2013.yaml +0 -190
  126. data/maps/bgnpcgn-nep-Deva-Latn-2011.yaml +0 -200
  127. data/maps/bgnpcgn-per-Arab-Latn-1956.yaml +0 -92
  128. data/maps/bgnpcgn-rus-Cyrl-Latn-1947.yaml +0 -314
  129. data/maps/bgnpcgn-srp-Cyrl-Latn-2005.yaml +0 -166
  130. data/maps/bgnpcgn-ukr-Cyrl-Latn-1965.yaml +0 -162
  131. data/maps/bgnpcgn-ukr-Cyrl-Latn-2019.yaml +0 -208
  132. data/maps/bgnpcgn-zho-Hans-Latn-1979.yaml +0 -7456
  133. data/maps/bis-asm-Beng-Latn-13194-1991.yaml +0 -159
  134. data/maps/bis-ben-Beng-Latn-13194-1991.yaml +0 -156
  135. data/maps/bis-dev-Deva-Latn-13194-1991.yaml +0 -184
  136. data/maps/bis-gjr-Gujr-Latn-13194-1991.yaml +0 -166
  137. data/maps/bis-knd-Knda-Latn-13194-1991.yaml +0 -173
  138. data/maps/bis-mlm-Mlym-Latn-13194-1991.yaml +0 -176
  139. data/maps/bis-ori-Orya-Latn-13194-1991.yaml +0 -160
  140. data/maps/bis-pnj-Guru-Latn-13194-1991.yaml +0 -175
  141. data/maps/bis-tel-Telu-Latn-13194-1991.yaml +0 -170
  142. data/maps/bis-tml-Taml-Latn-13194-1991.yaml +0 -155
  143. data/maps/by-bel-Cyrl-Latn-1998.yaml +0 -168
  144. data/maps/by-bel-Cyrl-Latn-2007.yaml +0 -115
  145. data/maps/dos-nep-Deva-Latn-1997.yaml +0 -33
  146. data/maps/elot-ell-Grek-Latn-743-1982-tl.yaml +0 -684
  147. data/maps/elot-ell-Grek-Latn-743-1982-ts.yaml +0 -680
  148. data/maps/elot-ell-Grek-Latn-743-2001-tl.yaml +0 -19
  149. data/maps/elot-ell-Grek-Latn-743-2001-ts.yaml +0 -31
  150. data/maps/ggg-kat-Geor-Latn-2002.yaml +0 -88
  151. data/maps/gki-bel-Cyrl-Latn-1992.yaml +0 -33
  152. data/maps/gki-bel-Cyrl-Latn-2000.yaml +0 -201
  153. data/maps/gost-rus-Cyrl-Latn-16876-71-1983.yaml +0 -186
  154. data/maps/hk-yue-Hani-Latn-1888.yaml +0 -38497
  155. data/maps/icao-bel-Cyrl-Latn-9303.yaml +0 -136
  156. data/maps/icao-bul-Cyrl-Latn-9303.yaml +0 -118
  157. data/maps/icao-heb-Hebr-Latn-9303.yaml +0 -151
  158. data/maps/icao-mkd-Cyrl-Latn-9303.yaml +0 -117
  159. data/maps/icao-per-Arab-Latn-9303.yaml +0 -103
  160. data/maps/icao-rus-Cyrl-Latn-9303.yaml +0 -117
  161. data/maps/icao-srp-Cyrl-Latn-9303.yaml +0 -117
  162. data/maps/icao-ukr-Cyrl-Latn-9303.yaml +0 -119
  163. data/maps/iso-ara-Arab-Latn-233-1984.yaml +0 -323
  164. data/maps/iso-ell-Grek-Latn-843-1997-t1.yaml +0 -609
  165. data/maps/iso-ell-Grek-Latn-843-1997-t2.yaml +0 -40
  166. data/maps/iso-jpn-Hrkt-Latn-3602-1989.yaml +0 -62
  167. data/maps/iso-rus-Cyrl-Latn-9-1995.yaml +0 -271
  168. data/maps/iso-tha-Thai-Latn-11940-1998.yaml +0 -109
  169. data/maps/kp-kor-Hang-Latn-2002.yaml +0 -901
  170. data/maps/lshk-yue-Hani-Latn-jyutping-1993.yaml +0 -44820
  171. data/maps/mext-jpn-Hrkt-Latn-1954.yaml +0 -411
  172. data/maps/moct-kor-Hang-Latn-2000.yaml +0 -803
  173. data/maps/mofa-jpn-Hrkt-Latn-1989.yaml +0 -541
  174. data/maps/mvd-bel-Cyrl-Latn-2008.yaml +0 -225
  175. data/maps/mvd-bel-Cyrl-Latn-2010.yaml +0 -63
  176. data/maps/mvd-rus-Cyrl-Latn-2008.yaml +0 -109
  177. data/maps/mvd-rus-Cyrl-Latn-2010.yaml +0 -37
  178. data/maps/nil-kor-Hang-Hang-jamo.yaml +0 -11193
  179. data/maps/odni-aze-Cyrl-Latn-2015.yaml +0 -144
  180. data/maps/odni-bel-Cyrl-Latn-2015.yaml +0 -148
  181. data/maps/odni-bul-Cyrl-Latn-2015.yaml +0 -96
  182. data/maps/odni-hin-Deva-Latn-2015.yaml +0 -258
  183. data/maps/odni-kat-Geor-Latn-2015.yaml +0 -87
  184. data/maps/odni-kaz-Cyrl-Latn-2015.yaml +0 -148
  185. data/maps/odni-kir-Cyrl-Latn-2015.yaml +0 -136
  186. data/maps/odni-mkd-Cyrl-Latn-2015.yaml +0 -122
  187. data/maps/odni-rus-Cyrl-Latn-2015.yaml +0 -77
  188. data/maps/odni-srp-Cyrl-Latn-2015.yaml +0 -129
  189. data/maps/odni-tat-Cyrl-Latn-2015.yaml +0 -142
  190. data/maps/odni-tgk-Cyrl-Latn-2015.yaml +0 -148
  191. data/maps/odni-uig-Cyrl-Latn-2015.yaml +0 -138
  192. data/maps/odni-ukr-Cyrl-Latn-2015.yaml +0 -157
  193. data/maps/odni-urd-Arab-Latn-2015.yaml +0 -221
  194. data/maps/odni-uzb-Cyrl-Latn-2015.yaml +0 -166
  195. data/maps/royin-tha-Thai-Latn-1939-generic.yaml +0 -90
  196. data/maps/royin-tha-Thai-Latn-1968.yaml +0 -179
  197. data/maps/royin-tha-Thai-Latn-1999-chained.yaml +0 -180
  198. data/maps/royin-tha-Thai-Latn-1999.yaml +0 -76
  199. data/maps/sac-zho-Hans-Latn-1979.yaml +0 -24759
  200. data/maps/ses-ara-Arab-Latn-1930.yaml +0 -279
  201. data/maps/stategeocadastre-ukr-Cyrl-Latn-1993.yaml +0 -222
  202. data/maps/ua-ukr-Cyrl-Latn-1996.yaml +0 -193
  203. data/maps/un-ara-Arab-Latn-1971.yaml +0 -139
  204. data/maps/un-ara-Arab-Latn-1972.yaml +0 -159
  205. data/maps/un-ara-Arab-Latn-2017.yaml +0 -420
  206. data/maps/un-bel-Cyrl-Latn-2007.yaml +0 -114
  207. data/maps/un-ben-Beng-Latn-2016.yaml +0 -534
  208. data/maps/un-ell-Grek-Latn-1987-tl.yaml +0 -31
  209. data/maps/un-ell-Grek-Latn-1987-ts.yaml +0 -19
  210. data/maps/un-ell-Grek-Latn-phonetic-1987.yaml +0 -780
  211. data/maps/un-mon-Mong-Latn-2013.yaml +0 -99
  212. data/maps/un-nep-Deva-Latn-1972.yaml +0 -163
  213. data/maps/un-rus-Cyrl-Latn-1987.yaml +0 -166
  214. data/maps/un-ukr-Cyrl-Latn-1998.yaml +0 -30
  215. data/maps/ungegn-amh-Ethi-Latn-2016.yaml +0 -575
  216. data/maps/var-jpn-Hrkt-Latn-hepburn-1886.yaml +0 -406
  217. data/maps/var-jpn-Hrkt-Latn-hepburn-1954.yaml +0 -386
  218. data/maps/var-kor-Hang-Latn-mr-1939.yaml +0 -1054
  219. data/maps/var-kor-Kore-Hang-2013.yaml +0 -59754
  220. data/maps/var-kor-Kore-Latn-mr-1939.yaml +0 -36
  221. data/maps/var-tha-Thai-Thai-phonemic.yaml +0 -59
  222. data/maps/var-tha-Thai-Zsym-ipa.yaml +0 -301
  223. data/maps/var-zho-Hani-Latn-1979.yaml +0 -38908
  224. data/spec/interscript/mapping_spec.rb +0 -42
  225. data/spec/interscript_spec.rb +0 -26
  226. data/spec/spec_helper.rb +0 -3
@@ -1,144 +0,0 @@
1
- ---
2
- authority_id: odni
3
- id: 2015
4
- language: aze
5
- source_script: Cyrl
6
- destination_script: Latn
7
- name: Standards for the transliteration of Azeri personal names in written reports and products
8
- source: ICS-630-01 Annex P
9
- creation_date: 2015
10
- confirmation_date: 2015
11
- description: |
12
- This system is the Intelligence Community standard for the transliteration of Azeri person names
13
- that will be applied to all final written reports and products for IC consumers. It is not
14
- intended to eliminate variations of a name that can contribute forensic information. Rather, it is
15
- to provide an IC standard Romanized (English) transliteration from Azeri that can then be linked
16
- to forensic information in ways that will help identify the referent of the name.
17
-
18
- In cases where an individual’s name has already been transliterated in a variant spelling, the IC
19
- Standard spelling should appear first, followed by the variant spelling(s) in parentheses at the
20
- first usage. In addition, if the original Cyrillic-script spelling is known, that spelling should
21
- also appear in parentheses following the name, if possible, following best practices of the
22
- issuing organization and taking into consideration information system capabilities. For example:
23
- Rashad Sadykhov (also seen as Rashad Sadigov, Рашад Садыхов). This convention is designed to
24
- ensure that vital forensic information is not lost.
25
-
26
- For names of persons who are known to not be part of the Azeri-speaking community, use the
27
- relevant IC transliteration standard for names from that language (e.g., Yitzhak). A translator’s
28
- note may be used to clarify the known origin of the person. Spell names of individuals from
29
- languages that are written in Roman letters as they are spelled in those languages (e.g.,
30
- George Clooney, Jorge Garcia, Georges Pompidou).
31
-
32
- In the case of active senior government officials in the on-line CIA World Factbook and the on-
33
- line directory of Chiefs of State and Cabinet Members of Foreign Governments, the spellings given
34
- in these on-line reference works should be used in place of the IC Standard. For any individual
35
- who has at one time been listed in the Factbook or Chiefs of State directory but who no longer
36
- appears in those resources (i.e. is no longer a government official), the IC Standard spelling
37
- should appear first, with the spelling, if known, as it previously appeared in those resources
38
- listed within parentheses at the first usage.
39
-
40
- The primary goal is to produce a consistent Romanized transcription of names that is specifically
41
- readable to the English-speaking non-specialist. The system uses the 26 letters of the standard
42
- (English) Roman alphabet. Some ambiguities in the Romanized form will occur without the use of
43
- diacritics. However, within the context of a report, where additional information about the
44
- individual is provided, the referent will be clearly identified. This system will be used in
45
- conjunction with on-line tools, name dictionaries, and lists containing conventional spellings of
46
- names of well-known individuals.
47
-
48
- notes:
49
- - Transliterate double digraphs as a single digraph, i.e. шш -> sh, not shsh
50
- - In the Roman, no distinction is made between digraphs such as 'sh' and single contiguous letters,
51
- (e.g. 's' followed by 'h').
52
- - The Cyrillic ъ and ь are not transliterated, but instead are left out of the transliteration.
53
-
54
- tests:
55
- - source: Рашад Садыхов
56
- expected: Rashad Sadykhov
57
-
58
- map:
59
- rules:
60
- # note[1]
61
- - pattern: "(?i)(\u0492|\u0401|\u0416|\u0425|\u0427|\u0428|\u0429|\u042E|\u042F)\\1(?-i)"
62
- result: "\\1"
63
- # note[3]
64
- - pattern: \u044A|\u044C
65
- result: ""
66
-
67
- characters:
68
- '\u0410': 'A' # А
69
- '\u0411': 'B' # Б
70
- '\u0412': 'V' # В
71
- '\u0413': 'G' # Г
72
- '\u049C': 'G' # Ҝ
73
- '\u0492': 'Gh' # Ғ
74
- '\u0414': 'D' # Д
75
- '\u0415': 'E' # Е
76
- '\u0401': 'Yo' # Ё
77
- '\u04D8': 'A' # Ә
78
- '\u0416': 'Zh' # Ж
79
- '\u0417': 'Z' # З
80
- '\u0418': 'I' # И
81
- '\u0419': 'Y' # Й
82
- '\u0408': 'Y' # Ј
83
- '\u041A': 'K' # К
84
- '\u041B': 'L' # Л
85
- '\u041C': 'M' # М
86
- '\u041D': 'N' # Н
87
- '\u041E': 'O' # О
88
- '\u04E8': 'O' # Ө
89
- '\u041F': 'P' # П
90
- '\u0420': 'R' # Р
91
- '\u0421': 'S' # С
92
- '\u0422': 'T' # Т
93
- '\u0423': 'U' # У
94
- '\u04AE': 'U' # Ү
95
- '\u0424': 'F' # Ф
96
- '\u0425': 'Kh' # Х
97
- '\u04BA': 'H' # Һ
98
- '\u0427': 'Ch' # Ч
99
- '\u04B8': 'J' # Ҹ
100
- '\u0428': 'Sh' # Ш
101
- '\u0429': 'Shch' # Щ
102
- '\u042B': 'Y' # Ы
103
- '\u042D': 'E' # Э
104
- '\u042E': 'Yu' # Ю
105
- '\u042F': 'Ya' # Я
106
-
107
- '\u0430': 'a' # а
108
- '\u0431': 'b' # б
109
- '\u0432': 'v' # в
110
- '\u0433': 'g' # г
111
- '\u049D': 'g' # ҝ
112
- '\u0493': 'gh' # ғ
113
- '\u0434': 'd' # д
114
- '\u0435': 'e' # e
115
- '\u0451': 'yo' # ё
116
- '\u04D9': 'a' # ә
117
- '\u0436': 'zh' # ж
118
- '\u0437': 'z' # з
119
- '\u0438': 'i' # и
120
- '\u0439': 'y' # й
121
- '\u0458': 'y' # ј
122
- '\u043A': 'k' # к
123
- '\u043B': 'l' # л
124
- '\u043C': 'm' # м
125
- '\u043D': 'n' # н
126
- '\u043E': 'o' # о
127
- '\u04E9': 'o' # ө
128
- '\u043F': 'p' # п
129
- '\u0440': 'r' # р
130
- '\u0441': 's' # с
131
- '\u0442': 't' # т
132
- '\u0443': 'u' # у
133
- '\u04AF': 'u' # ү
134
- '\u0444': 'f' # ф
135
- '\u0445': 'kh' # х
136
- '\u04BB': 'h' # һ
137
- '\u0447': 'ch' # ч
138
- '\u04B9': 'j' # ҹ
139
- '\u0448': 'sh' # ш
140
- '\u0449': 'shch' # щ
141
- '\u044B': 'y' # ы
142
- '\u044D': 'e' # э
143
- '\u044E': 'yu' # ю
144
- '\u044F': 'ya' # я
@@ -1,148 +0,0 @@
1
- ---
2
- authority_id: odni
3
- id: 2015
4
- language: bel
5
- source_script: Cyrl
6
- destination_script: Latn
7
- name: Office of the Director Of National Intelligence Belarusian Personal Names 2015, ICS 630-01 Annex B
8
- # url:
9
- source: ICS 630-01, Annex B
10
- creation_date: 2015
11
- confirmation_date: 2015
12
- description: |
13
- This system is the Intelligence Community (IC) standard for the transliteration of Belarusian
14
- names that will be applied to all final written reports and products for IC consumers. It is not
15
- intended to eliminate variations of a name that can contribute forensic information. Rather, it is to
16
- provide an IC standard Romanized (English) transliteration from Belarusian that can then be
17
- linked to forensic information in ways that will help identify the referent of the name.
18
-
19
- In cases where an individual’s name has already been transliterated in a variant spelling, the IC
20
- Standard spelling should appear first, followed by the variant spelling(s) in parentheses at the first
21
- usage. In addition, if the original Cyrillic spelling is known, that spelling should also appear in
22
- parentheses following the name, if possible, following best practices of the issuing organization
23
- and taking into consideration information system capabilities. This convention is designed to
24
- ensure that vital forensic information is not lost.
25
-
26
- For names of persons who are known to not be part of the Belarusian-speaking community, use
27
- the relevant IC transliteration standard for names from that language (e.g., Mikhail, Yitzhak). A
28
- translator’s note may be used to clarify the known origin of the person. Spell names of
29
- individuals from languages that are written in Roman letters as they are spelled in those
30
- languages (e.g., George Clooney, Jorge Garcia, Georges Pompidou).
31
-
32
- In the case of active senior government officials in the on-line CIA World Factbook and the on-line directory of Chiefs of State and Cabinet Members of Foreign Governments, the spellings
33
- given in these on-line reference works should be used in place of the IC Standard. For any
34
- individual who has at one time been listed in the Factbook or Chiefs of State directory but who no
35
- longer appears in those resources (i.e. is no longer a government official), the IC Standard
36
- spelling should appear first, with the spelling, if known, as it previously appeared in those
37
- resources listed within parentheses at the first usage.
38
-
39
- The primary goal is to produce a consistent Romanized transcription of names that is specifically
40
- readable to the English-speaking non-specialist. The system uses the 26 letters of the standard
41
- (English) Roman alphabet. Some ambiguities in the Romanized form will occur without the use
42
- of diacritics. However, within the context of a report, where additional information about the
43
- individual is provided, the referent will be clearly identified. This system will be used in
44
- conjunction with on-line tools, name dictionaries, and lists containing conventional spellings of names of well-known individuals.
45
-
46
- notes:
47
-
48
- tests:
49
- - source: Міхаіл
50
- expected: Mikhail
51
- - source: Беларусь
52
- expected: Byelarus
53
- - source: Кастусь Каліноўскі
54
- expected: Kastus Kalinowski
55
- - source: Васіль Быкау
56
- expected: Vasil Bykau
57
- - source: Янка Купала
58
- expected: Yanka Kupala
59
- - source: Маланка
60
- expected: Malanka
61
- - source: Пакаранне
62
- expected: Pakarannye
63
- - source: Бэз
64
- expected: Bez
65
- - source: Чабор
66
- expected: Chabor
67
- - source: |
68
- Дзяўчына, дзяўчыначка пасярод гісторыі
69
- З прастадушнай шчырасьцю глядзіць на тэрыторыю.
70
- У вакне заўсёды звыклая выява:
71
- Шэры двор, шэры слуп, на слупе аб'явы.
72
- expected: |
73
- Dzyawchyna, dzyawchynachka pasyarod historyi
74
- Z prastadushnay shchyrastsyu hlyadzits na terytoryyu.
75
- U vaknye zawsyody zvyklaya vyyava:
76
- Shery dvor, shery slup, na slupye abyavy.
77
-
78
- map:
79
- characters:
80
- '\u0027' : '' # '
81
-
82
- '\u0410' : 'A' # A
83
- '\u0411' : 'B' # Б
84
- '\u0412' : 'V' # B
85
- '\u0413' : 'H' # Г
86
- '\u0490' : 'G' # Ґ
87
- '\u0414' : 'D' # Д
88
- '\u0415' : 'Ye' # Е
89
- '\u0401' : 'Yo' # Ё
90
- '\u0416' : 'Zh' # Ж
91
- '\u0417' : 'Z' # З
92
- '\u0406' : 'I' # І
93
- '\u0419' : 'Y' # Й
94
- '\u041A' : 'K' # К
95
- '\u041B' : 'L' # Л
96
- '\u041C' : 'M' # М
97
- '\u041D' : 'N' # Н
98
- '\u041E' : 'O' # О
99
- '\u041F' : 'P' # П
100
- '\u0420' : 'R' # Р
101
- '\u0421' : 'S' # С
102
- '\u0422' : 'T' # Т
103
- '\u0423' : 'U' # У
104
- '\u040E' : 'W' # Ў
105
- '\u0424' : 'F' # Ф
106
- '\u0425' : 'Kh' # Х
107
- '\u0426' : 'Ts' # Ц
108
- '\u0427' : 'Ch' # Ч
109
- '\u0428' : 'Sh' # Ш
110
- '\u042B' : 'Y' # Ы
111
- '\u042C' : '' # Ь
112
- '\u042D' : 'E' # Э
113
- '\u042E' : 'Yu' # Ю
114
- '\u042F' : 'Ya' # Я
115
-
116
- '\u0430' : 'a' # а
117
- '\u0431' : 'b' # б
118
- '\u0432' : 'v' # в
119
- '\u0433' : 'h' # г
120
- '\u0491' : 'g' # ґ
121
- '\u0434' : 'd' # д
122
- '\u0435' : 'ye' # е
123
- '\u0451' : 'yo' # ё
124
- '\u0436' : 'zh' # ж
125
- '\u0437' : 'z' # з
126
- '\u0456' : 'i' # і
127
- '\u0439' : 'y' # й
128
- '\u043A' : 'k' # к
129
- '\u043B' : 'l' # л
130
- '\u043C' : 'm' # м
131
- '\u043D' : 'n' # н
132
- '\u043E' : 'o' # о
133
- '\u043F' : 'p' # п
134
- '\u0440' : 'r' # р
135
- '\u0441' : 's' # с
136
- '\u0442' : 't' # т
137
- '\u0443' : 'u' # у
138
- '\u045E' : 'w' # ў
139
- '\u0444' : 'f' # ф
140
- '\u0445' : 'kh' # х
141
- '\u0446' : 'ts' # ц
142
- '\u0447' : 'ch' # ч
143
- '\u0448' : 'sh' # ш
144
- '\u044B' : 'y' # ы
145
- '\u044c' : '' # ь
146
- '\u044D' : 'e' # э
147
- '\u044E' : 'yu' # ю
148
- '\u044F' : 'ya' # я
@@ -1,96 +0,0 @@
1
- ---
2
- authority_id: odni
3
- id: 2015
4
- language: bul
5
- source_script: Cyrl
6
- destination_script: Latn
7
- name: Office of the Director Of National Intelligence Bulgarian Personal Names 2015, ICS-630-01 Annex O
8
- # url:
9
- source: ICS-630-01 Annex O
10
- creation_date: 2015
11
- confirmation_date: 2015
12
- description: |
13
- This system is the Intelligence Community standard for the transliteration of Bulgarian person
14
- names that will be applied to all final written reports and products for IC consumers. This
15
- standard matches both the Bulgarian national standard adopted in 2009 and the Board of
16
- Geographic Names / Permanent Committee on Geographic Names standard adopted in 2013. It is
17
- not intended to eliminate variations of a name that can contribute forensic information. Rather, it
18
- is to provide an IC standard Romanized (English) transliteration from Bulgarian that can then be
19
- linked to forensic information in ways that will help identify the referent of the name.
20
-
21
- In cases where an individual’s name has already been transliterated in a variant spelling, the IC
22
- Standard spelling should appear first, followed by the variant spelling(s) in parentheses at the first
23
- usage. In addition, if the original Cyrillic-script spelling is known, that spelling should also
24
- appear in parentheses following the name, if possible, following best practices of the issuing
25
- organization and taking into consideration information system capabilities. For example: Dobri
26
- Hristov (also seen as Dobri Khristov, Добри Христов). This convention is designed to ensure
27
- that vital forensic information is not lost.
28
-
29
- For names of persons who are known to not be part of the Bulgarian-speaking community, use
30
- the relevant IC transliteration standard for names from that language (e.g., Yitzhak). A
31
- translator’s note may be used to clarify the known origin of the person. Spell names of
32
- individuals from languages that are written in Roman letters as they are spelled in those
33
- languages (e.g., George Clooney, Jorge Garcia, Georges Pompidou).
34
-
35
- In the case of active senior government officials in the on-line CIA World Factbook and the on-line directory of Chiefs of State and Cabinet Members of Foreign Governments, the spellings
36
- given in these on-line reference works should be used in place of the IC Standard. For any
37
- individual who has at one time been listed in the Factbook or Chiefs of State directory but who no
38
- longer appears in those resources (i.e. is no longer a government official), the IC Standard
39
- spelling should appear first, with the spelling, if known, as it previously appeared in those
40
- resources listed within parentheses at the first usage.
41
-
42
- The primary goal is to produce a consistent Romanized transcription of names that is specifically
43
- readable to the English-speaking non-specialist. The system uses the 26 letters of the standard
44
- (English) Roman alphabet. Some ambiguities in the Romanized form will occur without the use
45
- of diacritics. However, within the context of a report, where additional information about the
46
- individual is provided, the referent will be clearly identified. This system will be used in
47
- conjunction with on-line tools, name dictionaries, and lists containing conventional spellings of
48
- names of well-known individuals.
49
-
50
- notes:
51
- - Transliterate double digraphs as a single digraph, i.e. шш -> sh, not shsh
52
- - In the Roman, no distinction is made between digraphs such as 'sh' and single contiguous letters (e.g. 's' followed by 'h').
53
-
54
- tests:
55
-
56
- - source: Добри Христов
57
- expected: Dobri Khristov
58
- - source: болгарица
59
- expected: bolgaritsa
60
- - source: български език
61
- expected: balgarski ezik
62
- - source: българска азбука
63
- expected: balgarska azbuka
64
- - source: градъ
65
- expected: grad
66
- - source: аз държа
67
- expected: az darzha
68
- - source: Ядеш хляба с чубрица
69
- expected: Yadesh khlyaba s chubritsa
70
-
71
-
72
- # note[1]
73
- - source: шш
74
- expected: sh
75
- - source: ччччч
76
- expected: ch
77
-
78
- map:
79
- inherit: bgnpcgn-bul-Cyrl-Latn-2013
80
-
81
- rules:
82
- # note[1]
83
- - pattern: "(.)\\1{1,}"
84
- result: "\\1"
85
-
86
- - pattern: \u042C # Ь
87
- result: "Y"
88
-
89
- - pattern: \u042A # Ъ
90
- result: "A"
91
-
92
- - pattern: \u044C # ь
93
- result: "y"
94
-
95
- - pattern: \u044A # ъ
96
- result: "a"
@@ -1,258 +0,0 @@
1
- ---
2
- authority_id: odni
3
- id: 2015
4
- language: hin
5
- source_script: Deva
6
- destination_script: Latn
7
- name: Office of the Director Of National Intelligence Hindi Urdu Personal Names 2015 System, ICS-630-01 Annex F
8
- #url:
9
- creation_date: 2015
10
- confirmation_date: 2015
11
- description: |
12
- This system is the Intelligence Community (IC) standard for the transliteration of names
13
- from Hindi and Urdu that will be applied to all final written reports and products for IC
14
- consumers. It is not intended to eliminate variations of a name that can contribute
15
- forensic information. Rather, it is to provide an IC standard Romanized (English)
16
- transliteration from standard Hindi and Urdu that can then be linked to forensic
17
- information in ways that will help identify the referent of the name.
18
-
19
- There are typically a number of ways that names can be Romanized from either
20
- Devanagari (Hindi) or modified Arabic (Urdu) scripts. Ambiguities can result from the
21
- Romanization of Hindi and Urdu names for several reasons, including the fact that some
22
- sounds in South Asian languages (e.g., retroflex consonants, voiced aspirates) have no
23
- equivalent in English or other European languages. In the case of Urdu, as in the original
24
- Arabic source of many Islamic names, short vowel markings, double consonant marks
25
- and other diacritics that would clearly distinguish the name are almost always omitted
26
- from standard written texts. And many Islamic names of Arabic or Persian origin reflect
27
- spelling distinctions from those languages that are lost in modern Urdu pronunciation
28
- (e.g., three distinct Arabic letters all represent the identical sound [s] for Urdu speakers);
29
- transliterations might either maintain those spelling distinctions or ignore them entirely.
30
- And, as in the Arabic source, names containing the Arabic definite article ‘al’ (‘ul’) show
31
- anticipatory assimilation in pronunciation (e.g., Shams al Din > Shamsuddin);
32
- transliterations may either reflect spelling or pronunciation in such cases.
33
-
34
- Because Hindi and Urdu overlap so extensively, it is desirable to correlate Hindi and
35
- Urdu transliterations as much as possible. In the area of names, this can become
36
- problematic when the Urdu spelling accurately reflects original Arabic spelling while the
37
- Hindi spelling was phonetically-based, in essence, already a transliteration. This would
38
- argue for ignoring Arabic/Urdu spelling distinctions not reflected in pronunciation in
39
- either Urdu or Hindi (e.g., letter sin versus letter sad) while maintaining Arabic/Urdu
40
- spelling distinctions that Hindi also maintains (e.g., qaf as in Qutubbin versus kaf).
41
- However, this is not always possible (see Hindi va versus Urdu wau).
42
-
43
- In cases where an individual’s name has already been transliterated, that is to be indicated
44
- – as found – in parentheses immediately following its rendition in the transliteration
45
- standard (e.g., Muhammad Khulud (Mohamed Khulood)). In addition, if the original
46
- Devanagari or Arabic-script spelling is known, that spelling should also appear in
47
- parentheses following the name, if possible, following best practices of the issuing
48
- organization and taking into consideration information system capabilities. This
49
- convention is designed to ensure that vital forensic information is not lost.
50
-
51
- For names of persons who are known to not be part of the Hindi- or Urdu-speaking
52
- community, use the relevant IC transliteration standard for names from that language
53
- (e.g., Mikhail, Yitzhak). A translator’s note may be used to clarify the known origin of
54
- the person. Spell names of individuals from languages that are written in Roman letters as
55
- they are spelled in those languages (e.g., George Clooney, Jorge Garcia, Georges
56
- Pompidou).
57
-
58
- In the case of active senior government officials in the on-line CIA World Factbook and
59
- the on-line directory of Chiefs of State and Cabinet Members of Foreign Governments,
60
- the spellings given in these on-line reference works should be used in place of the IC
61
- Standard. For any individual who has at one time been listed in the Factbook or Chiefs of
62
- State directory but who no longer appears in those resources (i.e. is no longer a
63
- government official), the IC Standard spelling should appear first, with the spelling, if
64
- known, as it previously appeared in those resources listed within parentheses at the first
65
- usage.
66
-
67
- The primary goal of this system is to produce a consistent Romanized transcription of the
68
- name that is readable to the non-specialist. The system uses the 26 letters of the standard
69
- (English) Roman alphabet. Some ambiguities in the Romanized form will occur without
70
- the use of diacritics. However, within the context of a report, where additional
71
- information about the individual is provided, the referent will be clearly identified.
72
- Additionally, this system will be used in conjunction with on-line tools, name dictionaries
73
- and lists containing conventional spellings of names of well-known individuals.
74
-
75
- notes:
76
-
77
- - |
78
- Long/Short Vowels: Long and short vowels are not distinguished in the system:
79
- The borrowed Arabic name Samir could represent two distinct names, one with a
80
- long /a/ (Saamir) and one with a long /i/ (Samiir). One solution would be to use
81
- /ee/ to stand for the long /i/, as is often done (Sameer). The IC Standard will not
82
- distinguish between these.
83
-
84
- - |
85
- No distinction is made between: retroflex and non-retroflex consonants; and
86
- nasalized vowels and vowels followed by /n/.
87
- - |
88
- No distinction is made between the several Arabic letters with the same phonetic
89
- value in Urdu: e.g., letters sin/svad, zal/ze/zoe.
90
- - |
91
- A distinction is drawn between Urdu letters qaf and kaf (and correspondingly,
92
- Hindi qa and ka).
93
- - |
94
- A distinction is drawn between aspirated (e.g., /dh/) and nonaspirated consonants
95
- (e.g., /d/), with the exception of ch/chh, both represented by /ch/.
96
- - |
97
- Double consonants: Double consonants represented by the tashdid (shaddah) are
98
- shown in most cases (e.g., Hassan, Muhammad). Exceptions: consonants
99
- represented by digraphs are not doubled (e.g., Mubashir [not Mubashshir]).
100
- - |
101
- Hamzah (glottal stop) and ayn: Unlike in the Arabic IC Standard, these are not
102
- represented in the IC standard.
103
- - |
104
- Digraphs: No distinction is made between digraphs such as /sh/ and single
105
- contiguous letters such as /s/ followed by /h/.
106
- - |
107
- Arabic definite article “al” (‘the’): Shows sun letter assimilation in the
108
- Romanized form (e.g., Abdur Rahman rather than Abdal Rahman or the
109
- Arabic IC standard 'Abd al-Rahman).
110
- - |
111
- Special Rules
112
- - |
113
- Hyphens: Hyphens (-) are NOT used to connect name elements within a name:
114
- Abdur Rahman. The single exception to this is the izafat (i.e., linking vowel in
115
- noun-link-modifier construction of Persian origin), which does show a hyphen
116
- before the /e/ and a following space: Koh-e Nur (‘mountain of light’), “Jaish-e
117
- xx” (‘Army of xx’ construction).
118
- - |
119
- Names incorporating “din” are written as one unit: Azermuddin, Badruddin,
120
- Faizuddin, Salahuddin.
121
- - |
122
- Names that incorporate Allah as part of the name show the Arabic grammatical
123
- marker /u/ rather than the /a/ of Allah: Abdullah (not Abdallah).
124
- - |
125
- Inherent short vowel /a/ in Devanagari is represented with an /a/ in Roman. Final
126
- consonants are assumed not to have a short /a/ (e.g., masc. name Ram Lal, not
127
- Rama Lala).
128
- - |
129
- As a general rule, Devanagari va is transcribed as a /v/: Vijay, Vishal, etc.
130
- Exception: /sw/ combination: Saraswati, Krishnaswami. Urdu wau, however, is
131
- transcribed as /w/: Wasim, Walid.
132
-
133
- tests:
134
- - source: "हसन मोहम्मद"
135
- expected: "hsn mohmmd"
136
- - source: "विशाल ठाकुर"
137
- expected: "vishal thakur"
138
- - source: "अमिताभ जैन"
139
- expected: "amitabh jain"
140
- - source: "आकाङ्क्षा बच्चन"
141
- expected: "akankshaa bchchn"
142
- - source: "अनुष्का शर्मा"
143
- expected: "anushka shrma"
144
- - source: "शाहरुख खान"
145
- expected: "shahrukh khan"
146
- - source: "इंजमाम उल हक"
147
- expected: "injmam ul hk"
148
- - source: "शाहिद अफरीदी"
149
- expected: "shahid aphridi"
150
- - source: "सचिन तेंडुलकर"
151
- expected: "schin tendulkr"
152
- - source: "वसीम अकरम"
153
- expected: "vsim akrm"
154
- - source: "हसन अब्दुल्ला"
155
- expected: "hsn abdulla"
156
-
157
- map:
158
-
159
- characters:
160
-
161
- # I. Vowels and Diphthongs (see Note 1)
162
-
163
- 'अ': 'a'
164
- 'आ': 'a'
165
- 'इ': 'i'
166
- 'ई': 'i'
167
- 'उ': 'u'
168
- 'ऊ': 'u'
169
- 'ऋ': 'ri'
170
- 'ए': 'e'
171
- 'ऐ': 'ai'
172
- 'ऑ': 'o'
173
- 'ओ': 'au'
174
-
175
- # II. Consonants (see Note 2)
176
- # Gutturals
177
- 'क': 'k'
178
- 'क्ष': 'ksha'
179
- 'क़': 'q'
180
- 'ख': 'kh'
181
- 'ख़': 'kh'
182
- 'ग': 'g'
183
- 'घ': 'gh'
184
- 'ग़': 'gh'
185
- 'ङ': 'n'
186
-
187
- # Palatals
188
- 'च': 'ch'
189
- 'छ': 'ch'
190
- 'ज': 'j'
191
- 'झ': 'jh'
192
- 'ज़': 'z'
193
- 'ञ': 'n'
194
-
195
- # Cerebrals
196
- 'ट': 't'
197
- 'ठ': 'th'
198
- 'ड': 'd'
199
- 'ड़': 'r'
200
- 'ढ': 'dh'
201
- 'ढ़': 'rh'
202
- 'ण': 'n'
203
-
204
- # Dentals
205
- 'त': 't'
206
- 'थ': 'th'
207
- 'द': 'd'
208
- 'ध': 'dh'
209
- 'न': 'n'
210
-
211
- # Labials
212
- 'प': 'p'
213
- 'फ': 'ph'
214
- 'फ़': 'f'
215
- 'ब': 'b'
216
- 'भ': 'bh'
217
- 'म': 'm'
218
-
219
- # Semivowels
220
- 'य': 'y'
221
- 'र': 'r'
222
- 'ल': 'l'
223
- 'व': 'v'
224
-
225
- # Sibilants
226
- 'श': 'sh'
227
- 'ष': 'sh'
228
- 'स': 's'
229
-
230
- # Aspirate
231
- 'ह': 'h'
232
-
233
- # Anusvāra
234
- 'ं': 'n'
235
-
236
- # Bisarga
237
- 'ः ': 'h'
238
-
239
- # Anunāsika
240
- 'ँ': 'n'
241
-
242
- '़': ''
243
- '्': ''
244
-
245
- # Medials # Needed for connecting consonants
246
- 'ा': "a"
247
- 'ि': "i"
248
- 'ी': "i"
249
- 'ु': "u"
250
- 'ू': "u"
251
- 'ृ': "ri"
252
- 'े': "e"
253
- 'ै': "ai"
254
- 'ॅ': "ai"
255
- 'ॊ': "o"
256
- 'ो': "o"
257
- 'ौ': "au"
258
- 'ॉ': "au"