cldr-transforms 46.0.0 → 47.0.0-BETA2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +1 -1
- package/README.md +1 -1
- package/bower.json +2 -2
- package/package.json +3 -3
- package/transforms/Arabic-Latin-BGN.txt +1 -1
- package/transforms/Arabic-Latin.txt +1 -1
- package/transforms/Bengali-Latin.txt +1 -1
- package/transforms/Cyrillic-Latin.txt +4 -4
- package/transforms/Greek-Latin.txt +8 -8
- package/transforms/Greek_Latin_UNGEGN.txt +2 -2
- package/transforms/Han-Latin-Names.txt +1 -1
- package/transforms/Han-Latin.json +2 -2
- package/transforms/Han-Latin.txt +1 -1
- package/transforms/Han-Spacedhan.txt +4 -4
- package/transforms/Hant-Latin.json +8 -0
- package/transforms/Hant-Latin.txt +100 -0
- package/transforms/Hiragana-Katakana.txt +2 -2
- package/transforms/Latin-Jamo.txt +2 -2
- package/transforms/Latin-Katakana.txt +3 -3
- package/transforms/Latin-NumericPinyin.txt +1 -1
- package/transforms/Maldivian-Latin-BGN.txt +1 -1
- package/transforms/Persian-Latin-BGN.txt +1 -1
- package/transforms/Thai-Latin.txt +1 -1
- package/transforms/Thai-ThaiLogical.txt +1 -1
- package/transforms/Thai-ThaiSemi.txt +1 -1
- package/transforms/ThaiLogical-Latin.txt +2 -2
- package/transforms/am-Ethi-t-d0-morse.txt +1 -1
- package/transforms/az-Title.txt +3 -3
- package/transforms/byn-Ethi-t-byn-latn-m0-xaleget.txt +3 -3
- package/transforms/chr-chr_FONIPA.txt +1 -1
- package/transforms/de-ASCII.txt +1 -1
- package/transforms/el-Lower.txt +2 -2
- package/transforms/el-Title.txt +3 -3
- package/transforms/it-am.txt +1 -1
- package/transforms/it-ja.txt +1 -1
- package/transforms/lt-Title.txt +7 -7
- package/transforms/tr-Title.txt +3 -3
- package/transforms/und-Ethi-t-und-latn-m0-beta_metsehaf-geminate.txt +1 -1
- package/transforms.json +1 -0
package/LICENSE
CHANGED
|
@@ -2,7 +2,7 @@ UNICODE LICENSE V3
|
|
|
2
2
|
|
|
3
3
|
COPYRIGHT AND PERMISSION NOTICE
|
|
4
4
|
|
|
5
|
-
Copyright © 2004-
|
|
5
|
+
Copyright © 2004-2025 Unicode, Inc.
|
|
6
6
|
|
|
7
7
|
NOTICE TO USER: Carefully read the following legal agreement. BY
|
|
8
8
|
DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING DATA FILES, AND/OR
|
package/README.md
CHANGED
|
@@ -19,7 +19,7 @@ the data contained here, please file a new ticket at [Unicode Jira](https://unic
|
|
|
19
19
|
|
|
20
20
|
## License
|
|
21
21
|
|
|
22
|
-
Copyright © 1991-
|
|
22
|
+
Copyright © 1991-2025 Unicode, Inc.
|
|
23
23
|
[Terms of Use](http://www.unicode.org/copyright.html)
|
|
24
24
|
|
|
25
25
|
SPDX-License-Identifier: Unicode-3.0
|
package/bower.json
CHANGED
package/package.json
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "cldr-transforms",
|
|
3
|
-
"version": "46.0.0",
|
|
3
|
+
"version": "47.0.0-BETA2",
|
|
4
4
|
"peerDependencies": {
|
|
5
|
-
"cldr-core": "
|
|
5
|
+
"cldr-core": "47.0.0-BETA2"
|
|
6
6
|
},
|
|
7
7
|
"description": "Transform data",
|
|
8
8
|
"homepage": "https://cldr.unicode.org",
|
|
@@ -24,6 +24,6 @@
|
|
|
24
24
|
},
|
|
25
25
|
"license": "Unicode-3.0",
|
|
26
26
|
"bugs": "https://cldr.unicode.org/index/bug-reports#TOC-Filing-a-Ticket",
|
|
27
|
-
"cldrVersion": "
|
|
27
|
+
"cldrVersion": "47",
|
|
28
28
|
"unicodeVersion": "16.0.0"
|
|
29
29
|
}
|
|
@@ -10,7 +10,7 @@
|
|
|
10
10
|
# Does *not* do assimilation of "al", nor hyphenation.
|
|
11
11
|
# While it could be done, we need to determine whether a prefix "al" could
|
|
12
12
|
# occur other than as the definite article (since no space is used).
|
|
13
|
-
:: [[:Arabic:][:
|
|
13
|
+
:: [[:Arabic:][:Block=Arabic:][ⁿ،؛؟ـً-ٕ٠-٬۰-۹﷼ښ][\u0611\u0670]] ;
|
|
14
14
|
:: NFKD (NFC);
|
|
15
15
|
$disambig = ̱ ;
|
|
16
16
|
$disambig2 = ̰ ;
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
# Should add variants for Russian-English, Russian-German
|
|
3
3
|
# Those can use this as a base, and then remap cases
|
|
4
4
|
# like a $hat to ya or ja.
|
|
5
|
-
# :: [\u0000-\u007E ʹ ʺ [:Cyrillic:] [:Latin:] [:
|
|
5
|
+
# :: [\u0000-\u007E ʹ ʺ [:Cyrillic:] [:Latin:] [:Nonspacing_Mark:]] ;
|
|
6
6
|
### WARNING, ̈ must be added to the generated filters, in both directions ###
|
|
7
7
|
# MINIMAL FILTER
|
|
8
8
|
# Cyrillic-Latin
|
|
@@ -267,12 +267,12 @@ $ignore = [[:Mark:]''] * ;
|
|
|
267
267
|
| K ← Q ;
|
|
268
268
|
| u ← w ;
|
|
269
269
|
| U ← W ;
|
|
270
|
-
| KS ← X } $ignore [:
|
|
271
|
-
| KS ← [:
|
|
270
|
+
| KS ← X } $ignore [:Uppercase_Letter:] ;
|
|
271
|
+
| KS ← [:Uppercase_Letter:] $ignore { X ;
|
|
272
272
|
| Ks ← X ;
|
|
273
273
|
| ks ← x ;
|
|
274
274
|
:: NFC (NFD) ;
|
|
275
275
|
# note: a global filter is more efficient, but MUST include all source chars!!
|
|
276
|
-
# :: ([\u0000-\u007E ʹ ʺ [:Cyrillic:] [:Latin:] [:
|
|
276
|
+
# :: ([\u0000-\u007E ʹ ʺ [:Cyrillic:] [:Latin:] [:Nonspacing_Mark:] ‧]);
|
|
277
277
|
# MINIMAL FILTER: Latin-Cyrillic
|
|
278
278
|
:: ( [ḫḪhH‧ˌ̈A-Za-zÀ-ÏÑ-ÖÙ-Ýà-ïñ-öù-ýÿ-ĥĨ-İĴ-ķĹ-ľŃ-ňŌ-őŔ-ťŨ-žƏƠ-ơƯ-ưǍ-ǜǞ-ǣǦ-ǰǴ-ǵǸ-țȞ-ȟȦ-ȳəʹ-ʺ̀-̂̆-̦̱̇̌̀-́̈́ʹ΅-ΆΈ-ΊΌΎ-ΐά-ΰό-ώϓЀЃЌ-ЎЙйѐѓќ-ўӁ-ӂӐ-ӑӖ-ӗḀ-ẙẛẠ-ỹἂ-ἅἊ-Ἅἒ-ἕἚ-Ἕἢ-ἥἪ-Ἥἲ-ἵἺ-Ἵὂ-ὅὊ-Ὅὒ-ὕὛὝὢ-ὥὪ-Ὥὰ-ώᾂ-ᾅᾊ-ᾍᾒ-ᾕᾚ-ᾝᾢ-ᾥᾪ-ᾭᾰᾲᾴᾸᾺ-ΆῂῄῈ-Ή῍-῎ῐῒ-ΐῘῚ-Ί῝-῞ῠῢ-ΰῨῪ-Ύ῭-΅ῲῴῸ-ΏK-Å] ) ;
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
# Rules are predicated on running NFD first, and NFC afterwards
|
|
2
|
-
# :: [\u0000-\u007F \u0370-Ͽ [:Greek:] [:
|
|
2
|
+
# :: [\u0000-\u007F \u0370-Ͽ [:Greek:] [:Nonspacing_Mark:]] ;
|
|
3
3
|
# MINIMAL FILTER GENERATED FOR: Greek-Latin
|
|
4
4
|
:: [΄´;µ·ÄËÏÖÜäëïöüÿ-āĒ-ēĪ-īŌ-ōŪ-ūŸǕ-ǜǞ-ǣǬ-ǭȪ-ȭȰ-ȳ̄̈̓-̔͂-ͅͺ;Ά-ΊΌΎ-ΡΣ-ώϐ-ϗϛϝϟϡϣϥϧϩϫϭϯ-ϵϷ-\u07FBЁЇёїӒ-ӓӚ-ӟӢ-ӧӪ-ӱӴ-ӵӸ-ӹḔ-ḗḠ-ḡḦ-ḧḮ-ḯḸ-ḹṎ-ṓṜ-ṝṺ-ṻẄ-ẅẌ-ẍẗἀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼι῁-ῄῆ-ῌ῏-ΐῖ-Ί῟-Ῥῲ-ῴῶ-ῼΩϹ] ;
|
|
5
5
|
:: NFD (NFC) ;
|
|
@@ -12,9 +12,9 @@
|
|
|
12
12
|
# ὨΣ ὩΣ ὪΣ ὫΣ
|
|
13
13
|
# Ạ, ạ, Ẹ, ẹ, Ọ, ọ
|
|
14
14
|
# Useful variables
|
|
15
|
-
$lower = [[:
|
|
16
|
-
$glower = [[:
|
|
17
|
-
$upper = [[:
|
|
15
|
+
$lower = [[:Latin:][:Greek:] & [:Ll:]];
|
|
16
|
+
$glower = [[:Greek:] & [:Ll:]];
|
|
17
|
+
$upper = [[:Latin:][:Greek:] & [:Lu:]] ;
|
|
18
18
|
$accent = [:M:] ;
|
|
19
19
|
# NOTE: restrict to just the Greek & Latin accents that we care about
|
|
20
20
|
# TODO: broaden out once iteration is fixed
|
|
@@ -220,8 +220,8 @@ $ignore = [[:Mark:]''] * ;
|
|
|
220
220
|
| B ← W } $vowel ;
|
|
221
221
|
| U ← V ;
|
|
222
222
|
| U ← W ;
|
|
223
|
-
$rough } $ignore [:
|
|
224
|
-
$ignore [:
|
|
223
|
+
$rough } $ignore [:Uppercase_Letter:] → H ;
|
|
224
|
+
$ignore [:Uppercase_Letter:] { $rough → H ;
|
|
225
225
|
$rough ← H ;
|
|
226
226
|
$rough ↔ h ;
|
|
227
227
|
# Completeness for Greek
|
|
@@ -243,7 +243,7 @@ $rough ↔ h ;
|
|
|
243
243
|
← [Ππ] { \' } [Ss] ;
|
|
244
244
|
← [Νν] { \' } $egammaLike ;
|
|
245
245
|
::NFC (NFD) ;
|
|
246
|
-
# ([\u0000-\u007F [:Latin:] [:Greek:] [:
|
|
247
|
-
# ([\u0000-\u007F · [:Latin:] [:
|
|
246
|
+
# ([\u0000-\u007F [:Latin:] [:Greek:] [:Nonspacing_Mark:]]) ;
|
|
247
|
+
# ([\u0000-\u007F · [:Latin:] [:Nonspacing_Mark:]]) ;
|
|
248
248
|
# MINIMAL FILTER GENERATED FOR: Latin-Greek BACKWARD
|
|
249
249
|
:: ( [':?A-Za-zÀ-ÅÇ-ÏÑ-ÖÙ-Ýà-åç-ïñ-öù-ýÿ-ďĒ-ĥĨ-İĴ-ķĹ-ľŃ-ňŌ-őŔ-ťŨ-žƠ-ơƯ-ưǍ-ǜǞ-ǣǦ-ǰǴ-ǵǸ-țȞ-ȟȦ-ȳ̀-̷̹-ͅ΅-ΆΈ-ΊΌΎ-ΐΪ-ΰϊ-ώϓ-ϔЀ-ЁЃЇЌ-ЎЙйѐ-ёѓїќ-ўѶ-ѷӁ-ӂӐ-ӓӖ-ӗӚ-ӟӢ-ӧӪ-ӵӸ-ӹḀ-ẙẛẠ-ỹἀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼ῁-ῄῆ-ΐῖ-Ί῝-΅ῲ-ῴῶ-ῼK-Å] ) ;
|
|
@@ -6,8 +6,8 @@
|
|
|
6
6
|
:: [[[:Greek:][:Mn:][:Me:]] [\:-;?·;·]] ;
|
|
7
7
|
::NFD (NFC) ;
|
|
8
8
|
# Useful variables
|
|
9
|
-
$lower = [[:
|
|
10
|
-
$upper = [[:
|
|
9
|
+
$lower = [[:Latin:][:Greek:] & [:Ll:]] ;
|
|
10
|
+
$upper = [[:Latin:][:Greek:] & [:Lu:]] ;
|
|
11
11
|
$accent = [[:Mn:][:Me:]] ;
|
|
12
12
|
$macron = ̄ ;
|
|
13
13
|
$ddot = ̈ ;
|
|
@@ -7,7 +7,7 @@
|
|
|
7
7
|
# Do this before ::Han-Spacedhan() to catch Han after space in original text,
|
|
8
8
|
# and to apply before all other rules.
|
|
9
9
|
$startOfHanMarker = \uFDD1;
|
|
10
|
-
[:^
|
|
10
|
+
[:^Script=Han:] { ([:Script=Han:]) → $startOfHanMarker $1;
|
|
11
11
|
# Need Spacedhan so the name transliterations get spaced properly
|
|
12
12
|
::Han-Spacedhan();
|
|
13
13
|
# Convert special name readings that depend on next character
|
package/transforms/Han-Latin.txt
CHANGED
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
# Note that Han-Spacedhan() has already been applied, so there should be spaces between Han characters.
|
|
6
6
|
藏 } \u0020? 文 →zàng;# 藏 is zàng (not cáng) if followed by 文 wén: 藏文 language Zàngwén = Tibetan
|
|
7
7
|
重 } \u0020? 庆 →chóng;# 重 is chóng (not zhòng) if followed by 庆 qìng: 重庆 city Chóngqìng
|
|
8
|
-
沈 } \u0020? 阳 →shěn
|
|
8
|
+
# "沈 } \u0020? 阳 →shěn" is obsolete for Hans, the kMandarin entry for 沈 changed from "chén" to "shěn chén" in Unicode 14
|
|
9
9
|
秘 } \u0020? 鲁 →bì;# 秘 is bì (not mì) if followed by 鲁 lǔ: 秘鲁 country Bìlǔ = Peru
|
|
10
10
|
# START AUTOGENERATED Han-Latin.xml ( Unihan kMandarin)
|
|
11
11
|
[吖錒锕阿𠼞𥥩𨉚𱚱]→ā;
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
# Only intended for internal use
|
|
2
2
|
# Make sure Han are normalized, including characters that contain them.
|
|
3
|
-
# The first set in the filter is computed with http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:tonfkd:/XXX/:]-[:
|
|
4
|
-
# Where XXX is the resolved [:
|
|
5
|
-
:: [[、。々《-』〜・㆒-㆟㈠-㉇㊀-㊰㋀-㋋ ㍘-㍰㍻-㍿㏠-㏾🈐-🈒🈔-🈺🉀-🉈🉐🉑][:
|
|
3
|
+
# The first set in the filter is computed with http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:tonfkd:/XXX/:]-[:Ideographic:]-[:sc=Han:]
|
|
4
|
+
# Where XXX is the resolved [:Ideographic:][:sc=Han:]. It needs updating with each Unicode release!
|
|
5
|
+
:: [[、。々《-』〜・㆒-㆟㈠-㉇㊀-㊰㋀-㋋ ㍘-㍰㍻-㍿㏠-㏾🈐-🈒🈔-🈺🉀-🉈🉐🉑][:Ideographic:][:sc=Han:]] nfkc;
|
|
6
6
|
:: fullwidth-halfwidth;
|
|
7
7
|
。 → '.';
|
|
8
8
|
。→ '.';
|
|
@@ -23,7 +23,7 @@
|
|
|
23
23
|
々→ '⓶';
|
|
24
24
|
〜→ '~';
|
|
25
25
|
$terminalPunct = [\.\,\:\;\?\!.,:?!。、;[:Pe:][:Pf:]];
|
|
26
|
-
$initialPunct = [:Ps:][:Pi:];
|
|
26
|
+
$initialPunct = [[:Ps:][:Pi:]];
|
|
27
27
|
# add space between any Han or terminal punctuation and letters, and
|
|
28
28
|
# between letters and Han or initial punct
|
|
29
29
|
[[:Ideographic:] $terminalPunct] {} [:Letter:] → ' ' ;
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
# Warning: does not do round-trip mapping!!
|
|
2
|
+
# Convert compounds; these are added individually, not derived from Unihan kMandarin.
|
|
3
|
+
# Here Han-Spacedhan() has not yet been applied.
|
|
4
|
+
# The following was moved from Hans-Latn; in a Hant/Taiwan context, the simplified-form city name 沈阳 should still transform to shěnyáng.
|
|
5
|
+
沈 } 阳 →shěn;# 沈 is shěn (not chén) if followed by 阳 yáng: 沈阳 city Shěnyáng
|
|
6
|
+
# START From Unicode 17, the following should be autogenerated:
|
|
7
|
+
[棓]→bàng; # U+68D3
|
|
8
|
+
[繃]→bēng; # U+7E43
|
|
9
|
+
[俾]→bì; # U+4FFE
|
|
10
|
+
[萹]→biǎn; # U+8439
|
|
11
|
+
[摽脿蔈麃]→biāo; # U+647D,813F,8508,9E83
|
|
12
|
+
[啵]→bō; # U+5575
|
|
13
|
+
[柏薄]→bó; # U+67CF,8584
|
|
14
|
+
[卜]→bǔ; # U+535C
|
|
15
|
+
[差]→chā; # U+5DEE
|
|
16
|
+
[沈]→chén; # U+6C88
|
|
17
|
+
[牚]→chēng; # U+725A
|
|
18
|
+
[埫]→chǒng; # U+57EB
|
|
19
|
+
[槭]→cù; # U+69ED
|
|
20
|
+
[噠]→dá; # U+5660
|
|
21
|
+
[蹬]→dèng; # U+8E6C
|
|
22
|
+
[地]→dì; # U+5730
|
|
23
|
+
[嗲]→diē; # U+55F2
|
|
24
|
+
[䏲跌]→dié; # U+43F2,8DCC
|
|
25
|
+
[町]→dīng; # U+753A
|
|
26
|
+
[斗]→dǒu; # U+6597
|
|
27
|
+
[都]→dū; # U+90FD
|
|
28
|
+
[碡]→dú; # U+78A1
|
|
29
|
+
[柁]→duò; # U+67C1
|
|
30
|
+
[嗯]→en; # U+55EF
|
|
31
|
+
[髪髮]→fǎ; # U+9AEA,9AEE
|
|
32
|
+
[蕃]→fān; # U+8543
|
|
33
|
+
[帆]→fán; # U+5E06
|
|
34
|
+
[氾]→fàn; # U+6C3E
|
|
35
|
+
[彷]→fǎng; # U+5F77
|
|
36
|
+
[坋]→fèn; # U+574B
|
|
37
|
+
[諷讽]→fèng; # U+8AF7,8BBD
|
|
38
|
+
[乾]→gān; # U+4E7E
|
|
39
|
+
[㪅]→gēng; # U+3A85
|
|
40
|
+
[蓇]→gǔ; # U+84C7
|
|
41
|
+
[聒]→guā; # U+8052
|
|
42
|
+
[氿]→guǐ; # U+6C3F
|
|
43
|
+
[炔]→guì; # U+7094
|
|
44
|
+
[欻]→hū; # U+6B3B
|
|
45
|
+
[砉]→huò; # U+7809
|
|
46
|
+
[𪟝]→jī; # U+2A7DD
|
|
47
|
+
[蓻]→jí; # U+84FB
|
|
48
|
+
[袷]→jiá; # U+88B7
|
|
49
|
+
[叚]→jiǎ; # U+53DA
|
|
50
|
+
[菹]→jū; # U+83F9
|
|
51
|
+
[剋]→kè; # U+524B
|
|
52
|
+
[框]→kuāng; # U+6846
|
|
53
|
+
[适]→kuò; # U+9002
|
|
54
|
+
[肋]→lè; # U+808B
|
|
55
|
+
[釐]→lí; # U+91D0
|
|
56
|
+
[峛]→lǐ; # U+5CDB
|
|
57
|
+
[𩷕]→liáng; # U+29DD5
|
|
58
|
+
[瞭]→liǎo; # U+77AD
|
|
59
|
+
[蹣]→mán; # U+8E63
|
|
60
|
+
[眄]→miǎn; # U+7704
|
|
61
|
+
[碈]→mín; # U+7888
|
|
62
|
+
[万]→mò; # U+4E07
|
|
63
|
+
[伲]→nǐ; # U+4F32
|
|
64
|
+
[耙]→pá; # U+8019
|
|
65
|
+
[芘]→pí; # U+8298
|
|
66
|
+
[諞]→pián; # U+8ADE
|
|
67
|
+
[剽]→piào; # U+527D
|
|
68
|
+
[剖頗]→pǒ; # U+5256,9817
|
|
69
|
+
[醱]→pò; # U+91B1
|
|
70
|
+
[呇]→qǐ; # U+5447
|
|
71
|
+
[癿]→qié; # U+767F
|
|
72
|
+
[芎]→qiōng; # U+828E
|
|
73
|
+
[杣]→shān; # U+6763
|
|
74
|
+
[杓]→sháo; # U+6753
|
|
75
|
+
[舍]→shè; # U+820D
|
|
76
|
+
[誰]→shéi; # U+8AB0
|
|
77
|
+
[識识]→shì; # U+8B58,8BC6
|
|
78
|
+
[楯]→shǔn; # U+696F
|
|
79
|
+
[洓]→suǒ; # U+6D13
|
|
80
|
+
[沓]→tà; # U+6C93
|
|
81
|
+
[堤隄]→tí; # U+5824,9684
|
|
82
|
+
[萎]→wēi; # U+840E
|
|
83
|
+
[硊]→wěi; # U+784A
|
|
84
|
+
[筽]→wú; # U+7B7D
|
|
85
|
+
[嘸]→wǔ; # U+5638
|
|
86
|
+
[㴔]→xī; # U+3D14
|
|
87
|
+
[𲆰]→xí; # U+321B0
|
|
88
|
+
[𲆦]→xì; # U+321A6
|
|
89
|
+
[呷]→xiá; # U+5477
|
|
90
|
+
[硍]→xiàn; # U+784D
|
|
91
|
+
[崾]→yǎo; # U+5D3E
|
|
92
|
+
[畬]→yú; # U+756C
|
|
93
|
+
[薁]→yù; # U+8581
|
|
94
|
+
[嶦]→zhān; # U+5DA6
|
|
95
|
+
[著]→zhe; # U+8457
|
|
96
|
+
[徵]→zhēng; # U+5FB5
|
|
97
|
+
[苧]→zhù; # U+82E7
|
|
98
|
+
# END From Unicode 17, the above should be autogenerated:
|
|
99
|
+
# Then run the normal Hani-Latn transform for the rest
|
|
100
|
+
::Hani-Latn();
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
# note: a global filter is more efficient, but MUST include all source chars
|
|
2
|
-
:: [[\u0000-\u007E 、。 ゙-゜ ァ-ー 。-゚ー[:Hiragana:] [:Katakana:] [:
|
|
2
|
+
:: [[\u0000-\u007E 、。 ゙-゜ ァ-ー 。-゚ー[:Hiragana:] [:Katakana:] [:Nonspacing_Mark:]]-[\u309B \u309C]];
|
|
3
3
|
:: NFKC (NFC);
|
|
4
4
|
# Hiragana-Katakana
|
|
5
5
|
# This is largely a one-to-one mapping, but it has a
|
|
@@ -173,5 +173,5 @@ $xo = [
|
|
|
173
173
|
お ← $xo {ー};
|
|
174
174
|
:: NFC (NFKC) ;
|
|
175
175
|
# note: a global filter is more efficient, but MUST include all source chars!!
|
|
176
|
-
:: ([[\u0000-\u007E 、。 ゙-゜ ァ-ー 。-゚ー[:Hiragana:] [:Katakana:] [:
|
|
176
|
+
:: ([[\u0000-\u007E 、。 ゙-゜ ァ-ー 。-゚ー[:Hiragana:] [:Katakana:] [:Nonspacing_Mark:]]-[\u309B \u309C]]);
|
|
177
177
|
# eof
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
# note: a global filter is more efficient, but MUST include all source chars
|
|
2
|
-
#:: [\u0000-\u007E 、。 ゙-゜ ァ-ー 。-゚ [:Latin:][:Katakana:] [:
|
|
2
|
+
#:: [\u0000-\u007E 、。 ゙-゜ ァ-ー 。-゚ [:Latin:][:Katakana:] [:Nonspacing_Mark:]] ;
|
|
3
3
|
# MINIMAL FILTER GENERATED FOR: Latin-Katakana
|
|
4
4
|
### WARNING -- must add width filter, both here and below!!! ###
|
|
5
5
|
:: [[ᄀ-ᄒᄚᄡ\u1160-ᅵᆪᆬ-ᆭᆰ-ᆵ←-↓│■○\u3000-。「-」゙-゚ァ-ロワヲ-ヴヷヺ-ー!-~¢-₩][',.A-Za-z~À-ÖØ-öø-ďĒ-ĥĨ-İĴ-ķĹ-ľŃ-ňŌ-őŔ-ťŨ-žƠ-ơƯ-ưǍ-ǜǞ-ǣǦ-ǭǰǴ-ǵǸ-țȞ-ȟȦ-ȳ̄Ӣ-ӣӮ-ӯḀ-ẙẠ-ỹᾱᾹῑῙῡῩK-Å]] ;
|
|
@@ -370,11 +370,11 @@ x → | ks ;
|
|
|
370
370
|
# Final cleanup
|
|
371
371
|
'~' → ; # delete stray tildes between letters
|
|
372
372
|
[:Katakana:] { '' } [:Latin:] → ; # delete stray quotes between letters
|
|
373
|
-
# [ʾ[:
|
|
373
|
+
# [ʾ[:Nonspacing_Mark:]-[゙-゜]] → ; # delete any non-spacing marks that we didn't use
|
|
374
374
|
:: NFC (NFD) ;
|
|
375
375
|
:: ([[:Katakana:][\u309B\u309C\u30A0\u30FC\uFF70\uFF9E\uFF9F]] halfwidth-fullwidth);
|
|
376
376
|
# note: a global filter is more efficient, but MUST include all source chars!!
|
|
377
|
-
#:: ([\u0000-\u007E 、。 ゙-゜ ァ-ー 。-゚ [:Latin:][:Katakana:] [:
|
|
377
|
+
#:: ([\u0000-\u007E 、。 ゙-゜ ァ-ー 。-゚ [:Latin:][:Katakana:] [:Nonspacing_Mark:]]);
|
|
378
378
|
# MINIMAL FILTER GENERATED FOR: Latin-Katakana BACKWARD
|
|
379
379
|
:: ( [[\ -~¢-£¥-¦¬̄₩。-하-ᅦᅧ-ᅬᅭ-ᅲᅳ-ᅵ│-○][~、-。がぎぐげござじずぜぞだぢづでどば-ぱび-ぴぶ-ぷべ-ぺぼ-ぽゔ゙-゛ゞァ-ヺー-ヾ][\u309B\u309C\u30A0\u30FC\uFF70\uFF9E\uFF9F]] ) ;
|
|
380
380
|
# eof
|
|
@@ -23,5 +23,5 @@ $digit = [1-5];
|
|
|
23
23
|
$1 &NumericPinyin-Pinyin($3) $2 ← ([aAeE]) ($vowel* $consonant*) ($digit);
|
|
24
24
|
$1 &NumericPinyin-Pinyin($3) $2 ← ([oO]) ([$vowel-[aeAE]]* $consonant*) ($digit);
|
|
25
25
|
$1 &NumericPinyin-Pinyin($3) $2 ← ($vowel) ($consonant*) ($digit);
|
|
26
|
-
&NumericPinyin-Pinyin($1) ← [:
|
|
26
|
+
&NumericPinyin-Pinyin($1) ← [:Letter:] {($digit)};
|
|
27
27
|
::NFC (NFD);
|
|
@@ -10,7 +10,7 @@
|
|
|
10
10
|
# In our rules, we also convert Arabic punctuation characters to Latin.
|
|
11
11
|
# These appear to be used in Maldivian text, for example in the Universal
|
|
12
12
|
# Declaration of Human Rights.
|
|
13
|
-
::[[:
|
|
13
|
+
::[[:Block=Thaana:][،؛؟٪٫٬]\uFDF2] ;
|
|
14
14
|
::NFD;
|
|
15
15
|
$wordBoundary = [^[:L:][:M:][:N:]] ;
|
|
16
16
|
$vowel = [\u07A6-\u07AF] ;
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
# This reverses the Thai LogicalOrderException vowels, and does (part of) spaces
|
|
2
2
|
# The rules that convert space into semicolon are in another file;
|
|
3
3
|
# since they have to come BEFORE the break iterator
|
|
4
|
-
$thai = [[:
|
|
4
|
+
$thai = [[:Thai:] ก-ฺเ-๛] ;
|
|
5
5
|
# First convert the semicolon back
|
|
6
6
|
' ' ← $thai { '; ' } $thai;
|
|
7
7
|
# Remove any other spaces between thai letters
|
|
@@ -18,8 +18,8 @@
|
|
|
18
18
|
#{ ( $consonant ) } [^$vowel \uE000] → | $1 \uE000 ;
|
|
19
19
|
#\uE000 → ọ ;
|
|
20
20
|
# ← ọ ;
|
|
21
|
-
$notAbove = [^\p{ccc=0}\p{ccc=
|
|
22
|
-
$notBelow = [^\p{ccc=0}\p{ccc=
|
|
21
|
+
$notAbove = [^\p{ccc=0}\p{ccc=Above}] ;
|
|
22
|
+
$notBelow = [^\p{ccc=0}\p{ccc=Below}] ;
|
|
23
23
|
# Consonants
|
|
24
24
|
# Warning: the 'h's need to be handled carefully!
|
|
25
25
|
# What we really want to say is the following, but we can't
|
|
@@ -12,7 +12,7 @@
|
|
|
12
12
|
#
|
|
13
13
|
# MINIMAL FILTER: Ethiopic-Morse Code
|
|
14
14
|
#
|
|
15
|
-
:: [[:Zs:]0-9!\?\+/@()\[\]_:;,\.'"
|
|
15
|
+
:: [[:Zs:]0-9!\?\+/@()\[\]_:;,\.'"\$=\-[:Ethiopic:]] ;
|
|
16
16
|
([:Lo:])([:Zs:]+)([:Lo:]) → | $1⁄⁂⁄$2$3 ; # ⁄⁂⁄ is assumed to be a sufficiently weird sequence that won't naturally appear in any normal content
|
|
17
17
|
#
|
|
18
18
|
########################################################################
|
package/transforms/az-Title.txt
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
# I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
|
|
2
2
|
# Make any string of letters after a cased letter be lower, with rules for i
|
|
3
|
-
[:
|
|
4
|
-
[:
|
|
5
|
-
[:
|
|
3
|
+
[:Cased:] [:Case_Ignorable:]* { İ → i;
|
|
4
|
+
[:Cased:] [:Case_Ignorable:]* { I → ı;
|
|
5
|
+
[:Cased:] [:Case_Ignorable:]* { (.) → &Any-Lower($1) ;
|
|
6
6
|
# Otherwise all lowercase go to upper (titlecase stay as is)
|
|
7
7
|
i→İ ;
|
|
8
8
|
([:Lowercase:]) → &Any-Upper($1) ;
|
|
@@ -747,16 +747,16 @@ $wordBoundary{ኦ → $ኦ ; # ETHIOPIC SYLLABLE GLOTTAL O
|
|
|
747
747
|
# Convert dot to dot if dot is followed by a number, ellipsis, or another dot.
|
|
748
748
|
\. $1 ← \.([0-9….]) ;
|
|
749
749
|
# Convert to Ethiopic Fullstop if dot is not followed by a number or another dot.
|
|
750
|
-
። $1
|
|
750
|
+
። $1 ← \.([^0-9.]) ;
|
|
751
751
|
\, $1 ← \,([0-9]) ;
|
|
752
|
-
# ፣ $1
|
|
752
|
+
# ፣ $1 ← \,([^0-9]) ;
|
|
753
753
|
፤ ↔ \; ;
|
|
754
754
|
፦ ↔ \:\- ;
|
|
755
755
|
# ፥ ↔ \: ;
|
|
756
756
|
# ፨ → "#" ;
|
|
757
757
|
# ፠ → \+ ;
|
|
758
758
|
፧ → \? ;
|
|
759
|
-
፡ $1
|
|
759
|
+
፡ $1 ← \,([^0-9]) ;
|
|
760
760
|
::Null ;
|
|
761
761
|
$1 $ጥበቅ ← $ጥበቅ ([ሀ-ፖ]) ;
|
|
762
762
|
########################################################################
|
package/transforms/de-ASCII.txt
CHANGED
package/transforms/el-Lower.txt
CHANGED
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
# and C is not followed by a sequence consisting of zero or more case-ignorable characters and then a cased letter.
|
|
5
5
|
# 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
|
|
6
6
|
# With translit rules, easiest is to handle the negative condition first, mapping in that case to the regular sigma.
|
|
7
|
-
Σ } [:
|
|
8
|
-
[:
|
|
7
|
+
Σ } [:Case_Ignorable:]* [:Cased:] → σ;
|
|
8
|
+
[:Cased:] [:Case_Ignorable:]* { Σ → ς;
|
|
9
9
|
::Any-Lower;
|
|
10
10
|
::NFC();
|
package/transforms/el-Title.txt
CHANGED
|
@@ -2,9 +2,9 @@
|
|
|
2
2
|
# Remove \0301 following Greek, with possible intervening 0308 marks.
|
|
3
3
|
# [[:Greek:] & [:Ll:]] [\u0308]? { \u0301 → ;
|
|
4
4
|
# Make any string of letters after a cased letter be lower, with rules for sigma
|
|
5
|
-
[:
|
|
6
|
-
[:
|
|
7
|
-
[:
|
|
5
|
+
[:Cased:] [:Case_Ignorable:]* { Σ } [:Case_Ignorable:]* [:Cased:] → σ;
|
|
6
|
+
[:Cased:] [:Case_Ignorable:]* { Σ → ς;
|
|
7
|
+
[:Cased:] [:Case_Ignorable:]* { (.) → &Any-Lower($1) ;
|
|
8
8
|
# Otherwise all lowercase go to upper (titlecase stay as is)
|
|
9
9
|
([:Lowercase:]) → &Any-Title($1) ;
|
|
10
10
|
::NFC();
|
package/transforms/it-am.txt
CHANGED
package/transforms/it-ja.txt
CHANGED
package/transforms/lt-Title.txt
CHANGED
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
# Make any string of letters after a cased letter be lower
|
|
2
2
|
::NFD();
|
|
3
|
-
[:
|
|
4
|
-
[:
|
|
5
|
-
[:
|
|
6
|
-
[:
|
|
7
|
-
[:
|
|
8
|
-
[:
|
|
9
|
-
[:
|
|
3
|
+
[:Cased:] [:Case_Ignorable:]* {I } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → i \u0307;
|
|
4
|
+
[:Cased:] [:Case_Ignorable:]* {J } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → j \u0307;
|
|
5
|
+
[:Cased:] [:Case_Ignorable:]* {I \u0328 } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → i \u0328 \u0307;
|
|
6
|
+
[:Cased:] [:Case_Ignorable:]* {I \u0300 → i \u0307 \u0300;
|
|
7
|
+
[:Cased:] [:Case_Ignorable:]* {I \u0301 → i \u0307 \u0301;
|
|
8
|
+
[:Cased:] [:Case_Ignorable:]* {I \u0303 → i \u0307 \u0303;
|
|
9
|
+
[:Cased:] [:Case_Ignorable:]* { (.) → &Any-Lower($1) ;
|
|
10
10
|
# Otherwise all lowercase go to upper (titlecase stay as is)
|
|
11
11
|
[:Soft_Dotted:] [^[:ccc=Not_Reordered:][:ccc=Above:]]* { \u0307 → ;
|
|
12
12
|
([:Lowercase:]) → &Any-Upper($1) ;
|
package/transforms/tr-Title.txt
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
# I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
|
|
2
2
|
# Make any string of letters after a cased letter be lower, with rules for i
|
|
3
|
-
[:
|
|
4
|
-
[:
|
|
5
|
-
[:
|
|
3
|
+
[:Cased:] [:Case_Ignorable:]* { İ → i;
|
|
4
|
+
[:Cased:] [:Case_Ignorable:]* { I → ı;
|
|
5
|
+
[:Cased:] [:Case_Ignorable:]* { (.) → &Any-Lower($1) ;
|
|
6
6
|
# Otherwise all lowercase go to upper (titlecase stay as is)
|
|
7
7
|
i→İ ;
|
|
8
8
|
([:Lowercase:]) → &Any-Upper($1) ;
|
|
@@ -12,6 +12,6 @@
|
|
|
12
12
|
########################################################################
|
|
13
13
|
#
|
|
14
14
|
:: Amharic-Amharic/Geminate ;
|
|
15
|
-
:: Ethiopic-Latin/
|
|
15
|
+
:: Ethiopic-Latin/Beta_Metsehaf ;
|
|
16
16
|
#
|
|
17
17
|
########################################################################
|