interscript 0.1.4 → 0.1.5

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
<%# Emits Opal.global.InterscriptMaps: one entry per maps/*.yaml file, keyed by the file's basename. %>
<%# Each value is the map serialized to JSON, passed through translate_regexp, then JSON-dumped again, %>
<%# so it lands in the generated JavaScript as a string literal containing JSON text. %>
Opal.global.InterscriptMaps = {
<%# Sorted so the generated output does not depend on directory listing order. %>
<% Dir['maps/*.yaml'].sort.each do |yaml_file| %>
  "<%= File.basename(yaml_file, ".yaml") %>": <%=
    JSON.dump(Interscript::OpalMapTranslate.translate_regexp(JSON.dump(YAML.load(File.read(yaml_file)))))
  %>,
<% end %>
}
@@ -0,0 +1,12 @@
1
module Interscript
  # Rewrites regexp syntax inside serialized map text by plain string
  # substitution, replacing POSIX bracket names with \p{..} property escapes
  # and moving a trailing "?" inside two specific look-around groups.
  module OpalMapTranslate
    # Ordered [search, replacement] pairs applied by translate_regexp.
    #
    # * Search terms are Strings, so gsub matches them literally (no regexp
    #   interpretation).
    # * In a gsub replacement String a doubled backslash yields one backslash,
    #   so each replacement below emits TWO backslashes before "p{..}".
    # * Order matters: the two look-around entries search for the text that
    #   the '[:upper:]' entry has already produced.
    SUBSTITUTIONS = [
      ['[:upper:]', '\\\\\\\\p{Lu}'],
      ['[:lower:]', '\\\\\\\\p{Ll}'],
      ['[:alpha:]', '\\\\\\\\p{L}'],
      ['(?<=[\\\\p{Lu}])?', '(?<=[\\\\\\\\p{Lu}]?)'],
      ['(?=[\\\\p{Lu}])?', '(?=[\\\\\\\\p{Lu}]?)']
    ].freeze

    # Applies every SUBSTITUTIONS pair, in order, to +src+.
    #
    # @param src [String] text to rewrite; it is not modified
    # @return [String] a new String with all substitutions applied
    def self.translate_regexp(src)
      SUBSTITUTIONS.reduce(src) do |text, (search, replacement)|
        text.gsub(search, replacement)
      end
    end
  end
end
@@ -1,3 +1,3 @@
1
1
  module Interscript
2
- VERSION = "0.1.4"
2
+ VERSION = "0.1.5"
3
3
  end
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  authority_id: bgnpcgn
3
3
  id: pinyin
4
- language: chn
4
+ language: zho
5
5
  source_script: Hans
6
6
  destination_script: Latn
7
7
  name: ROMANIZATION OF CHINESE -- BGN/PCGN 1979 AGREEMENT
@@ -0,0 +1,144 @@
1
+ ---
2
+ authority_id: odni
3
+ id: 2015
4
+ language: aze
5
+ source_script: Cyrl
6
+ destination_script: Latn
7
+ name: Standards for the transliteration of Azeri personal names in written reports and products
8
+ source: ICS-630-01 Annex P
9
+ creation_date: 2015
10
+ confirmation_date: 2015
11
+ description: |
12
+ This system is the Intelligence Community standard for the transliteration of Azeri person names
13
+ that will be applied to all final written reports and products for IC consumers. It is not
14
+ intended to eliminate variations of a name that can contribute forensic information. Rather, it is
15
+ to provide an IC standard Romanized (English) transliteration from Azeri that can then be linked
16
+ to forensic information in ways that will help identify the referent of the name.
17
+
18
+ In cases where an individual’s name has already been transliterated in a variant spelling, the IC
19
+ Standard spelling should appear first, followed by the variant spelling(s) in parentheses at the
20
+ first usage. In addition, if the original Cyrillic-script spelling is known, that spelling should
21
+ also appear in parentheses following the name, if possible, following best practices of the
22
+ issuing organization and taking into consideration information system capabilities. For example:
23
+ Rashad Sadykhov (also seen as Rashad Sadigov, Рашад Садыхов). This convention is designed to
24
+ ensure that vital forensic information is not lost.
25
+
26
+ For names of persons who are known to not be part of the Azeri-speaking community, use the
27
+ relevant IC transliteration standard for names from that language (e.g., Yitzhak). A translator’s
28
+ note may be used to clarify the known origin of the person. Spell names of individuals from
29
+ languages that are written in Roman letters as they are spelled in those languages (e.g.,
30
+ George Clooney, Jorge Garcia, Georges Pompidou).
31
+
32
+ In the case of active senior government officials in the on-line CIA World Factbook and the on-
33
+ line directory of Chiefs of State and Cabinet Members of Foreign Governments, the spellings given
34
+ in these on-line reference works should be used in place of the IC Standard. For any individual
35
+ who has at one time been listed in the Factbook or Chiefs of State directory but who no longer
36
+ appears in those resources (i.e. is no longer a government official), the IC Standard spelling
37
+ should appear first, with the spelling, if known, as it previously appeared in those resources
38
+ listed within parentheses at the first usage.
39
+
40
+ The primary goal is to produce a consistent Romanized transcription of names that is specifically
41
+ readable to the English-speaking non-specialist. The system uses the 26 letters of the standard
42
+ (English) Roman alphabet. Some ambiguities in the Romanized form will occur without the use of
43
+ diacritics. However, within the context of a report, where additional information about the
44
+ individual is provided, the referent will be clearly identified. This system will be used in
45
+ conjunction with on-line tools, name dictionaries, and lists containing conventional spellings of
46
+ names of well-known individuals.
47
+
48
+ notes:
49
+ - Transliterate double digraphs as a single digraph, i.e. шш -> sh, not shsh
50
+ - In the Roman, no distinction is made between digraphs such as 'sh' and single contiguous letters,
51
+ (e.g. 's' followed by 'h').
52
+ - The Cyrillic ъ and ь are not transliterated, but instead are left out of the transliteration.
53
+
54
+ tests:
55
+ - source: Рашад Садыхов
56
+ expected: Rashad Sadykhov
57
+
58
+ map:
59
+ rules:
60
+ # note[1]
61
+ - pattern: "(?i)(\u0492|\u0401|\u0416|\u0425|\u0427|\u0428|\u0429|\u042E|\u042F)\\1(?-i)"
62
+ result: "\\1"
63
+ # note[3]
64
+ - pattern: \u042A|\u042C|\u044A|\u044C # Ъ, Ь, ъ, ь -- capitals included so all-caps names are covered too
65
+ result: ""
66
+
67
+ characters:
68
+ '\u0410': 'A' # А
69
+ '\u0411': 'B' # Б
70
+ '\u0412': 'V' # В
71
+ '\u0413': 'G' # Г
72
+ '\u049C': 'G' # Ҝ
73
+ '\u0492': 'Gh' # Ғ
74
+ '\u0414': 'D' # Д
75
+ '\u0415': 'E' # Е
76
+ '\u0401': 'Yo' # Ё
77
+ '\u04D8': 'A' # Ә
78
+ '\u0416': 'Zh' # Ж
79
+ '\u0417': 'Z' # З
80
+ '\u0418': 'I' # И
81
+ '\u0419': 'Y' # Й
82
+ '\u0408': 'Y' # Ј
83
+ '\u041A': 'K' # К
84
+ '\u041B': 'L' # Л
85
+ '\u041C': 'M' # М
86
+ '\u041D': 'N' # Н
87
+ '\u041E': 'O' # О
88
+ '\u04E8': 'O' # Ө
89
+ '\u041F': 'P' # П
90
+ '\u0420': 'R' # Р
91
+ '\u0421': 'S' # С
92
+ '\u0422': 'T' # Т
93
+ '\u0423': 'U' # У
94
+ '\u04AE': 'U' # Ү
95
+ '\u0424': 'F' # Ф
96
+ '\u0425': 'Kh' # Х
97
+ '\u04BA': 'H' # Һ
98
+ '\u0427': 'Ch' # Ч
99
+ '\u04B8': 'J' # Ҹ
100
+ '\u0428': 'Sh' # Ш
101
+ '\u0429': 'Shch' # Щ
102
+ '\u042B': 'Y' # Ы
103
+ '\u042D': 'E' # Э
104
+ '\u042E': 'Yu' # Ю
105
+ '\u042F': 'Ya' # Я
106
+
107
+ '\u0430': 'a' # а
108
+ '\u0431': 'b' # б
109
+ '\u0432': 'v' # в
110
+ '\u0433': 'g' # г
111
+ '\u049D': 'g' # ҝ
112
+ '\u0493': 'gh' # ғ
113
+ '\u0434': 'd' # д
114
+ '\u0435': 'e' # e
115
+ '\u0451': 'yo' # ё
116
+ '\u04D9': 'a' # ә
117
+ '\u0436': 'zh' # ж
118
+ '\u0437': 'z' # з
119
+ '\u0438': 'i' # и
120
+ '\u0439': 'y' # й
121
+ '\u0458': 'y' # ј
122
+ '\u043A': 'k' # к
123
+ '\u043B': 'l' # л
124
+ '\u043C': 'm' # м
125
+ '\u043D': 'n' # н
126
+ '\u043E': 'o' # о
127
+ '\u04E9': 'o' # ө
128
+ '\u043F': 'p' # п
129
+ '\u0440': 'r' # р
130
+ '\u0441': 's' # с
131
+ '\u0442': 't' # т
132
+ '\u0443': 'u' # у
133
+ '\u04AF': 'u' # ү
134
+ '\u0444': 'f' # ф
135
+ '\u0445': 'kh' # х
136
+ '\u04BB': 'h' # һ
137
+ '\u0447': 'ch' # ч
138
+ '\u04B9': 'j' # ҹ
139
+ '\u0448': 'sh' # ш
140
+ '\u0449': 'shch' # щ
141
+ '\u044B': 'y' # ы
142
+ '\u044D': 'e' # э
143
+ '\u044E': 'yu' # ю
144
+ '\u044F': 'ya' # я
@@ -0,0 +1,148 @@
1
+ ---
2
+ authority_id: odni
3
+ id: 2015
4
+ language: kaz
5
+ source_script: Cyrl
6
+ destination_script: Latn
7
+ name: Standards for the transliteration of Kazakh personal names in written reports and products
8
+ source: ICS-630-01 Annex Q
9
+ creation_date: 2015
10
+ confirmation_date: 2015
11
+ description: |
12
+ This system is the Intelligence Community standard for the transliteration of Kazakh person names
13
+ that will be applied to all final written reports and products for IC consumers. It is not
14
+ intended to eliminate variations of a name that can contribute forensic information. Rather, it is
15
+ to provide an IC standard Romanized (English) transliteration from Kazakh that can then be linked
16
+ to forensic information in ways that will help identify the referent of the name.
17
+
18
+ In cases where an individual’s name has already been transliterated in a variant spelling, the IC
19
+ Standard spelling should appear first, followed by the variant spelling(s) in parentheses at the
20
+ first usage. In addition, if the original Cyrillic-script spelling is known, that spelling should
21
+ also appear in parentheses following the name, if possible, following best practices of the
22
+ issuing organization and taking into consideration information system capabilities. For example:
23
+ Bekzat Sattarkhanov (also seen as Bekzat Sattarkanov, Бекзат Саттарханов). This convention is
24
+ designed to ensure that vital forensic information is not lost.
25
+
26
+ For names of persons who are known to not be part of the Kazakh-speaking community, use the
27
+ relevant IC transliteration standard for names from that language (e.g., Yitzhak). A translator’s
28
+ note may be used to clarify the known origin of the person. Spell names of individuals from
29
+ languages that are written in Roman letters as they are spelled in those languages (e.g.,
30
+ George Clooney, Jorge Garcia, Georges Pompidou).
31
+
32
+ In the case of active senior government officials in the on-line CIA World Factbook and the on-
33
+ line directory of Chiefs of State and Cabinet Members of Foreign Governments, the spellings given
34
+ in these on-line reference works should be used in place of the IC Standard. For any individual
35
+ who has at one time been listed in the Factbook or Chiefs of State directory but who no longer
36
+ appears in those resources (i.e. is no longer a government official), the IC Standard spelling
37
+ should appear first, with the spelling, if known, as it previously appeared in those resources
38
+ listed within parentheses at the first usage.
39
+
40
+ The primary goal is to produce a consistent Romanized transcription of names that is specifically
41
+ readable to the English-speaking non-specialist. The system uses the 26 letters of the standard
42
+ (English) Roman alphabet. Some ambiguities in the Romanized form will occur without the use of
43
+ diacritics. However, within the context of a report, where additional information about the
44
+ individual is provided, the referent will be clearly identified. This system will be used in
45
+ conjunction with on-line tools, name dictionaries, and lists containing conventional spellings of
46
+ names of well-known individuals.
47
+
48
+ notes:
49
+ - Transliterate double digraphs as a single digraph, i.e. шш -> sh, not shsh
50
+ - In the Roman, no distinction is made between digraphs such as 'sh' and single contiguous letters,
51
+ (e.g. 's' followed by 'h').
52
+ - The Cyrillic ъ and ь are not transliterated, but instead are left out of the transliteration.
53
+
54
+ tests:
55
+ - source: Бекзат Саттарханов
56
+ expected: Bekzat Sattarkhanov
57
+
58
+ map:
59
+ rules:
60
+ # note[1]
61
+ - pattern: "(?i)(\u0492|\u0401|\u0416|\u04A2|\u0425|\u0426|\u0427|\u0428|\u0429|\u042E|\u042F)\\1(?-i)"
62
+ result: "\\1"
63
+ # note[3]
64
+ - pattern: \u042A|\u042C|\u044A|\u044C # Ъ, Ь, ъ, ь -- capitals included so all-caps names are covered too
65
+ result: ""
66
+
67
+ characters:
68
+ '\u0410': 'A' # А
69
+ '\u04D8': 'A' # Ә
70
+ '\u0411': 'B' # Б
71
+ '\u0412': 'V' # В
72
+ '\u0413': 'G' # Г
73
+ '\u0492': 'Gh' # Ғ
74
+ '\u0414': 'D' # Д
75
+ '\u0415': 'E' # Е
76
+ '\u0401': 'Yo' # Ё
77
+ '\u0416': 'Zh' # Ж
78
+ '\u0417': 'Z' # З
79
+ '\u0418': 'I' # И
80
+ '\u0406': 'I' # І
81
+ '\u0419': 'Y' # Й
82
+ '\u041A': 'K' # К
83
+ '\u049A': 'Q' # Қ
84
+ '\u041B': 'L' # Л
85
+ '\u041C': 'M' # М
86
+ '\u041D': 'N' # Н
87
+ '\u04A2': 'Ng' # Ң
88
+ '\u041E': 'O' # О
89
+ '\u04E8': 'O' # Ө
90
+ '\u041F': 'P' # П
91
+ '\u0420': 'R' # Р
92
+ '\u0421': 'S' # С
93
+ '\u0422': 'T' # Т
94
+ '\u0423': 'U' # У
95
+ '\u04AE': 'U' # Ү
96
+ '\u04B0': 'U' # Ұ
97
+ '\u0424': 'F' # Ф
98
+ '\u0425': 'Kh' # Х
99
+ '\u04BA': 'H' # Һ
100
+ '\u0426': 'Ts' # Ц
101
+ '\u0427': 'Ch' # Ч
102
+ '\u0428': 'Sh' # Ш
103
+ '\u0429': 'Shch' # Щ
104
+ '\u042B': 'Y' # Ы
105
+ '\u042D': 'E' # Э
106
+ '\u042E': 'Yu' # Ю
107
+ '\u042F': 'Ya' # Я
108
+
109
+ '\u0430': 'a' # а
110
+ '\u04D9': 'a' # ә
111
+ '\u0431': 'b' # б
112
+ '\u0432': 'v' # в
113
+ '\u0433': 'g' # г
114
+ '\u0493': 'gh' # ғ
115
+ '\u0434': 'd' # д
116
+ '\u0435': 'e' # e
117
+ '\u0451': 'yo' # ё
118
+ '\u0436': 'zh' # ж
119
+ '\u0437': 'z' # з
120
+ '\u0438': 'i' # и
121
+ '\u0456': 'i' # і
122
+ '\u0439': 'y' # й
123
+ '\u043A': 'k' # к
124
+ '\u049B': 'q' # қ
125
+ '\u043B': 'l' # л
126
+ '\u043C': 'm' # м
127
+ '\u043D': 'n' # н
128
+ '\u04A3': 'ng' # ң
129
+ '\u043E': 'o' # о
130
+ '\u04E9': 'o' # ө
131
+ '\u043F': 'p' # п
132
+ '\u0440': 'r' # р
133
+ '\u0441': 's' # с
134
+ '\u0442': 't' # т
135
+ '\u0443': 'u' # у
136
+ '\u04AF': 'u' # ү
137
+ '\u04B1': 'u' # ұ
138
+ '\u0444': 'f' # ф
139
+ '\u0445': 'kh' # х
140
+ '\u04BB': 'h' # һ
141
+ '\u0446': 'ts' # ц
142
+ '\u0447': 'ch' # ч
143
+ '\u0448': 'sh' # ш
144
+ '\u0449': 'shch' # щ
145
+ '\u044B': 'y' # ы
146
+ '\u044D': 'e' # э
147
+ '\u044E': 'yu' # ю
148
+ '\u044F': 'ya' # я
@@ -0,0 +1,136 @@
1
+ ---
2
+ authority_id: odni
3
+ id: 2015
4
+ language: kir
5
+ source_script: Cyrl
6
+ destination_script: Latn
7
+ name: Standards for the transliteration of Kyrgyz personal names in written reports and products
8
+ source: ICS-630-01 Annex R
9
+ creation_date: 2015
10
+ confirmation_date: 2015
11
+ description: |
12
+ This system is the Intelligence Community standard for the transliteration of Kyrgyz person names
13
+ that will be applied to all final written reports and products for IC consumers. It is not
14
+ intended to eliminate variations of a name that can contribute forensic information. Rather, it is
15
+ to provide an IC standard Romanized (English) transliteration from Kyrgyz that can then be linked
16
+ to forensic information in ways that will help identify the referent of the name.
17
+
18
+ In cases where an individual’s name has already been transliterated in a variant spelling, the IC
19
+ Standard spelling should appear first, followed by the variant spelling(s) in parentheses at the
20
+ first usage. In addition, if the original Cyrillic-script spelling is known, that spelling should
21
+ also appear in parentheses following the name, if possible, following best practices of the
22
+ issuing organization and taking into consideration information system capabilities. For example:
23
+ Guljigit Kalykov (also seen as Guljigit Kalikov, Гульжигит Калыков). This convention is designed
24
+ to ensure that vital forensic information is not lost.
25
+
26
+ For names of persons who are known to not be part of the Kyrgyz-speaking community, use the
27
+ relevant IC transliteration standard for names from that language (e.g., Yitzhak). A translator’s
28
+ note may be used to clarify the known origin of the person. Spell names of individuals from
29
+ languages that are written in Roman letters as they are spelled in those languages (e.g.,
30
+ George Clooney, Jorge Garcia, Georges Pompidou).
31
+
32
+ In the case of active senior government officials in the on-line CIA World Factbook and the on-
33
+ line directory of Chiefs of State and Cabinet Members of Foreign Governments, the spellings given
34
+ in these on-line reference works should be used in place of the IC Standard. For any individual
35
+ who has at one time been listed in the Factbook or Chiefs of State directory but who no longer
36
+ appears in those resources (i.e. is no longer a government official), the IC Standard spelling
37
+ should appear first, with the spelling, if known, as it previously appeared in those resources
38
+ listed within parentheses at the first usage.
39
+
40
+ The primary goal is to produce a consistent Romanized transcription of names that is specifically
41
+ readable to the English-speaking non-specialist. The system uses the 26 letters of the standard
42
+ (English) Roman alphabet. Some ambiguities in the Romanized form will occur without the use of
43
+ diacritics. However, within the context of a report, where additional information about the
44
+ individual is provided, the referent will be clearly identified. This system will be used in
45
+ conjunction with on-line tools, name dictionaries, and lists containing conventional spellings
46
+ of names of well-known individuals.
47
+
48
+ notes:
49
+ - Transliterate double digraphs as a single digraph, i.e. шш -> sh, not shsh
50
+ - In the Roman, no distinction is made between digraphs such as 'sh' and single contiguous letters,
51
+ (e.g. 's' followed by 'h').
52
+ - The Cyrillic ъ and ь are not transliterated, but instead are left out of the transliteration.
53
+
54
+ tests:
55
+ - source: Гульжигит Калыков
56
+ expected: Guljigit Kalykov
57
+
58
+ map:
59
+ rules:
60
+ # note[1]
61
+ - pattern: "(?i)(\u0401|\u04A2|\u0425|\u0426|\u0427|\u0428|\u0429|\u042E|\u042F)\\1(?-i)"
62
+ result: "\\1"
63
+ # note[3]
64
+ - pattern: \u042A|\u042C|\u044A|\u044C # Ъ, Ь, ъ, ь -- capitals included so all-caps names are covered too
65
+ result: ""
66
+
67
+ characters:
68
+ '\u0410': 'A' # А
69
+ '\u0411': 'B' # Б
70
+ '\u0412': 'V' # В
71
+ '\u0413': 'G' # Г
72
+ '\u0414': 'D' # Д
73
+ '\u0415': 'E' # Е
74
+ '\u0401': 'Yo' # Ё
75
+ '\u0416': 'J' # Ж
76
+ '\u0417': 'Z' # З
77
+ '\u0418': 'I' # И
78
+ '\u0419': 'Y' # Й
79
+ '\u041A': 'K' # К
80
+ '\u041B': 'L' # Л
81
+ '\u041C': 'M' # М
82
+ '\u041D': 'N' # Н
83
+ '\u04A2': 'Ng' # Ң
84
+ '\u041E': 'O' # О
85
+ '\u04E8': 'O' # Ө
86
+ '\u041F': 'P' # П
87
+ '\u0420': 'R' # Р
88
+ '\u0421': 'S' # С
89
+ '\u0422': 'T' # Т
90
+ '\u0423': 'U' # У
91
+ '\u04AE': 'U' # Ү
92
+ '\u0424': 'F' # Ф
93
+ '\u0425': 'Kh' # Х
94
+ '\u0426': 'Ts' # Ц
95
+ '\u0427': 'Ch' # Ч
96
+ '\u0428': 'Sh' # Ш
97
+ '\u0429': 'Shch' # Щ
98
+ '\u042B': 'Y' # Ы
99
+ '\u042D': 'E' # Э
100
+ '\u042E': 'Yu' # Ю
101
+ '\u042F': 'Ya' # Я
102
+
103
+ '\u0430': 'a' # а
104
+ '\u0431': 'b' # б
105
+ '\u0432': 'v' # в
106
+ '\u0433': 'g' # г
107
+ '\u0434': 'd' # д
108
+ '\u0435': 'e' # e
109
+ '\u0451': 'yo' # ё
110
+ '\u0436': 'j' # ж
111
+ '\u0437': 'z' # з
112
+ '\u0438': 'i' # и
113
+ '\u0439': 'y' # й
114
+ '\u043A': 'k' # к
115
+ '\u043B': 'l' # л
116
+ '\u043C': 'm' # м
117
+ '\u043D': 'n' # н
118
+ '\u04A3': 'ng' # ң
119
+ '\u043E': 'o' # о
120
+ '\u04E9': 'o' # ө
121
+ '\u043F': 'p' # п
122
+ '\u0440': 'r' # р
123
+ '\u0441': 's' # с
124
+ '\u0442': 't' # т
125
+ '\u0443': 'u' # у
126
+ '\u04AF': 'u' # ү
127
+ '\u0444': 'f' # ф
128
+ '\u0445': 'kh' # х
129
+ '\u0446': 'ts' # ц
130
+ '\u0447': 'ch' # ч
131
+ '\u0448': 'sh' # ш
132
+ '\u0449': 'shch' # щ
133
+ '\u044B': 'y' # ы
134
+ '\u044D': 'e' # э
135
+ '\u044E': 'yu' # ю
136
+ '\u044F': 'ya' # я