@sarmay/kaz-converter 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +148 -0
- package/dist/index.d.cts +6 -1
- package/dist/index.d.ts +6 -1
- package/dist/index.js +148 -0
- package/package.json +1 -1
package/dist/index.cjs
CHANGED
|
@@ -134,11 +134,38 @@ var EXCEPTIONS = {
|
|
|
134
134
|
"\u0627\u0644\u0645\u0627\u062A\u0649": "\u0410\u043B\u043C\u0430\u0442\u044B",
|
|
135
135
|
"\u0627\u0633\u062A\u0627\u0646\u0627": "\u0410\u0441\u0442\u0430\u043D\u0430",
|
|
136
136
|
"\u0642\u0627\u0632\u0627\u0642\u0633\u062A\u0627\u0646": "\u049A\u0430\u0437\u0430\u049B\u0441\u0442\u0430\u043D",
|
|
137
|
+
"\u0621\u0627\u0628\u062F\u0649": "\u04D9\u0431\u0434\u0456",
|
|
138
|
+
"\u0627\u0628\u062F\u0649": "\u04D9\u0431\u0434\u0456",
|
|
139
|
+
"\u0627\u0645\u0649\u0631": "\u04D9\u043C\u0456\u0440",
|
|
140
|
+
"\u0628\u06C7\u0644\u0627\u0646": "\u0431\u04B1\u043B\u0430\u043D",
|
|
141
|
+
"\u0628\u0627\u0642\u064A": "\u0431\u0430\u049B\u0438",
|
|
142
|
+
"\u0628\u0627\u0642\u064A\u062A": "\u0431\u0430\u049B\u0438\u0442",
|
|
143
|
+
"\u0628\u0627\u0642\u0649\u062A": "\u0431\u0430\u049B\u044B\u0442",
|
|
144
|
+
"\u0646\u0627\u0633\u0649\u0631": "\u043D\u0430\u0441\u044B\u0440",
|
|
145
|
+
"\u062F\u0627\u06CB\u0644\u06D5\u062A": "\u0434\u04D9\u0443\u043B\u0435\u0442",
|
|
146
|
+
"\u0628\u0648\u0644": "\u0431\u043E\u043B",
|
|
147
|
+
"\u0628\u0648\u0644\u0633\u0649\u0646": "\u0431\u043E\u043B\u0441\u044B\u043D",
|
|
148
|
+
"\u0628\u0648\u0644\u0644\u0627": "\u0431\u043E\u043B\u043B\u0430",
|
|
149
|
+
"\u0642\u0648\u062C\u0627": "\u049B\u043E\u0436\u0430",
|
|
150
|
+
"\u0639\u0648\u062C\u0627": "\u0493\u043E\u0436\u0430",
|
|
151
|
+
"\u0646\u0627\u0631": "\u043D\u0430\u0440",
|
|
152
|
+
"\u0646\u0627\u0632\u0627\u0631": "\u043D\u0430\u0437\u0430\u0440",
|
|
153
|
+
"\u0633\u0627\u0644\u064A": "\u0441\u0430\u043B\u0438",
|
|
154
|
+
"\u062F\u064A\u0627\u0633": "\u0434\u0438\u0430\u0441",
|
|
155
|
+
"\u062C\u0649\u0628\u06D5\u0643": "\u0436\u0456\u0431\u0435\u043A",
|
|
156
|
+
"\u062D\u0627\u0643\u0649\u0645": "\u0445\u0430\u043A\u0456\u043C",
|
|
157
|
+
"\u0643\u064A\u0631\u0627": "\u043A\u0438\u0440\u0430",
|
|
158
|
+
"\u064A\u0643\u0627\u0645\u0627\u0644": "\u0438\u043A\u0430\u043C\u0430\u043B",
|
|
137
159
|
"\u062C\u06C7\u06AD\u06AF\u0648": "\u0416\u04B1\u04A3\u0433\u043E",
|
|
138
160
|
"\u0634\u064A": "\u0421\u0438",
|
|
139
161
|
"\u062C\u064A\u0646\u067E\u064A\u06AD": "\u0426\u0437\u0438\u043D\u044C\u043F\u0438\u043D",
|
|
140
162
|
"\u0643\u0649\u062A\u0627\u067E": "\u043A\u0456\u0442\u0430\u043F",
|
|
141
163
|
"\u0631\u0627\u062D\u0645\u06D5\u062A": "\u0440\u0430\u0445\u043C\u0435\u0442",
|
|
164
|
+
"\u0627\u062D\u0645\u06D5\u062A": "\u0430\u0445\u043C\u0435\u0442",
|
|
165
|
+
"\u0645\u0627\u062D\u0645\u06D5\u062A": "\u043C\u0430\u0445\u043C\u0435\u0442",
|
|
166
|
+
"\u064A\u0627\u0631": "\u0438\u044F\u0440",
|
|
167
|
+
"\u064A\u0627\u0644": "\u0438\u044F\u043B",
|
|
168
|
+
"\u064A\u0627\u0632": "\u0438\u044F\u0437",
|
|
142
169
|
"\u0627\u06CB\u0649\u0644": "\u0430\u0443\u044B\u043B",
|
|
143
170
|
"\u06AF\u0628": "\u0413\u0411",
|
|
144
171
|
"\u067E\u0631\u0648\u06AF\u0631\u06D5\u0633": "\u043F\u0440\u043E\u0433\u0440\u0435\u0441\u0441",
|
|
@@ -443,6 +470,75 @@ var COMPOUND_PIVOT_ROOTS = [
|
|
|
443
470
|
"\u0648\u064A\u0649\u0646",
|
|
444
471
|
"\u062A\u06C7\u0633\u062A\u0649\u06AF"
|
|
445
472
|
];
|
|
473
|
+
/**
 * Arabic-script name components that segmentCompoundWord() splits off the
 * FRONT of a longer word.
 *
 * Order is significant: segmentCompoundWord() walks this list from the top and
 * returns on the first entry that `word.startsWith()`, so earlier entries win.
 * The last three entries also appear in NAME_SUFFIX_COMPONENTS.
 */
var NAME_PREFIX_COMPONENTS = [
  // The first three have whole-word spellings in EXCEPTIONS.
  "\u0621\u0627\u0628\u062F\u0649", "\u0627\u0628\u062F\u0649", "\u062F\u0627\u06CB\u0644\u06D5\u062A",
  // Short components that are listed as suffix components as well.
  "\u06AF\u06C7\u0644", "\u0628\u06D5\u0643", "\u0646\u06C7\u0631"
];
|
|
481
|
+
/**
 * Arabic-script name components recognised at the END of a word.
 *
 * Two consumers read this list:
 *   - segmentCompoundWord() walks it in order and splits on the FIRST entry the
 *     word ends with, so position in the list decides which ending wins;
 *   - crossesProtectedNameEnding() tests every entry, so order is irrelevant there.
 *
 * NOTE(review): "\u064A\u067E\u0627" is listed before "\u0631\u064A\u067E\u0627",
 * "\u0646\u064A\u067E\u0627" and "\u0644\u064A\u067E\u0627", and "\u064A\u0645\u0627"
 * before "\u0633\u064A\u0645\u0627" and "\u0644\u064A\u0645\u0627". Because the shorter
 * entry is itself a suffix of the longer ones, the longer ones can never be the
 * first match in segmentCompoundWord(). Confirm whether that is intended before
 * reordering; the order is kept exactly as published.
 */
var NAME_SUFFIX_COMPONENTS = [
  "\u0645\u06C7\u062D\u0627\u0645\u0628\u06D5\u062A", "\u0627\u062D\u0645\u06D5\u062A", "\u062D\u0627\u0644\u0649\u0642",
  "\u0642\u0627\u0633\u0649\u0645", "\u0633\u0627\u062F\u0649\u0642", "\u0633\u0627\u0644\u0627\u0645",
  "\u0628\u0648\u0644\u0633\u0649\u0646", "\u0628\u0648\u0644\u0644\u0627", "\u0646\u0627\u0633\u0649\u0631",
  "\u0646\u0627\u0632\u0627\u0631", "\u0628\u064A\u0631\u0627", "\u0628\u064A\u0628\u0627",
  "\u064A\u067E\u0627", "\u0631\u064A\u067E\u0627", "\u0646\u064A\u067E\u0627",
  "\u064A\u0645\u0627", "\u0633\u064A\u0645\u0627", "\u0645\u064A\u0644\u0627",
  "\u0628\u064A\u0644\u0627", "\u062C\u064A\u062F\u0627", "\u0632\u064A\u0644\u0627",
  "\u0641\u064A\u0631\u0627", "\u0646\u0627\u0631\u0627", "\u0644\u064A\u067E\u0627",
  "\u0644\u064A\u0645\u0627", "\u062F\u064A\u0628\u0627", "\u062F\u064A\u0644\u0627",
  "\u062F\u064A\u0631\u0627", "\u0644\u064A\u0646\u0627", "\u0643\u064A\u0631\u0627",
  "\u0646\u0646\u0627", "\u0627\u062F\u0627", "\u0628\u0627\u0631\u0649\u0633",
  "\u0643\u06D5\u0644\u062F\u0649", "\u06AF\u06D5\u0644\u062F\u0649", "\u0628\u06C7\u0644\u0627\u0646",
  "\u064A\u0643\u0627\u0645\u0627\u0644", "\u0628\u0627\u0642\u0649\u062A", "\u0628\u0627\u0642\u064A\u062A",
  "\u0628\u0627\u0642\u064A", "\u0642\u0648\u062C\u0627", "\u0639\u0648\u062C\u0627",
  "\u0633\u0627\u0644\u064A", "\u0642\u0627\u0644\u064A",
  // The doubled-\u064A forms come before their single-\u064A counterparts below.
  "\u062F\u064A\u064A\u0627\u0631", "\u064A\u064A\u0627\u0631", "\u064A\u064A\u0627\u0632",
  "\u064A\u064A\u0627\u0633", "\u064A\u064A\u0627",
  "\u064A\u0627\u0631", "\u064A\u0627\u0644", "\u064A\u0627\u0632",
  // Short three-letter components.
  "\u0646\u0627\u0631", "\u062C\u0627\u0646", "\u0646\u06C7\u0631",
  "\u0628\u06D5\u0643", "\u0628\u0627\u064A", "\u062D\u0627\u0646",
  "\u06AF\u06C7\u0644"
];
|
|
446
542
|
var IMPLICIT_SOFT_ROOTS = /* @__PURE__ */ new Set([
|
|
447
543
|
"\u06C7\u0645\u0649\u062A",
|
|
448
544
|
"\u062A\u06C7\u0628",
|
|
@@ -578,6 +674,7 @@ var ArabicToCyrillicConverter = class {
|
|
|
578
674
|
);
|
|
579
675
|
}
|
|
580
676
|
this.disambiguator = options.disambiguator ?? new NoopDisambiguator();
|
|
677
|
+
this.nameYSequenceStyle = options.nameYSequenceStyle ?? "normalize";
|
|
581
678
|
for (const prefix of LOANWORD_PREFIXES) {
|
|
582
679
|
this.loanwordPrefixTrie.insert(prefix);
|
|
583
680
|
}
|
|
@@ -635,6 +732,36 @@ var ArabicToCyrillicConverter = class {
|
|
|
635
732
|
}
|
|
636
733
|
return false;
|
|
637
734
|
}
|
|
735
|
+
crossesProtectedNameEnding(word, splitIndex) {
|
|
736
|
+
return NAME_SUFFIX_COMPONENTS.some((ending) => {
|
|
737
|
+
if (!word.endsWith(ending)) {
|
|
738
|
+
return false;
|
|
739
|
+
}
|
|
740
|
+
const protectedStart = word.length - ending.length;
|
|
741
|
+
return splitIndex > protectedStart && splitIndex < word.length;
|
|
742
|
+
});
|
|
743
|
+
}
|
|
744
|
+
convertNameYSequence(word) {
|
|
745
|
+
const normalized = {
|
|
746
|
+
"\u064A\u064A\u0627": "\u0438\u044F",
|
|
747
|
+
"\u064A\u064A\u0627\u0631": "\u0438\u044F\u0440",
|
|
748
|
+
"\u064A\u064A\u0627\u0632": "\u0438\u044F\u0437",
|
|
749
|
+
"\u064A\u064A\u0627\u0633": "\u0438\u044F\u0441"
|
|
750
|
+
};
|
|
751
|
+
const preserved = {
|
|
752
|
+
"\u064A\u064A\u0627": "\u0438\u0439\u0430",
|
|
753
|
+
"\u064A\u064A\u0627\u0631": "\u0438\u0439\u0430\u0440",
|
|
754
|
+
"\u064A\u064A\u0627\u0632": "\u0438\u0439\u0430\u0437",
|
|
755
|
+
"\u064A\u064A\u0627\u0633": "\u0438\u0439\u0430\u0441"
|
|
756
|
+
};
|
|
757
|
+
if (!(word in normalized)) {
|
|
758
|
+
return null;
|
|
759
|
+
}
|
|
760
|
+
if (this.nameYSequenceStyle === "preserve") {
|
|
761
|
+
return preserved[word];
|
|
762
|
+
}
|
|
763
|
+
return normalized[word];
|
|
764
|
+
}
|
|
638
765
|
isValidSuffixSequence(suffix) {
|
|
639
766
|
if (!suffix) {
|
|
640
767
|
return true;
|
|
@@ -685,6 +812,16 @@ var ArabicToCyrillicConverter = class {
|
|
|
685
812
|
if (word.includes("-")) {
|
|
686
813
|
return word.split("-");
|
|
687
814
|
}
|
|
815
|
+
for (const prefix of NAME_PREFIX_COMPONENTS) {
|
|
816
|
+
if (word.startsWith(prefix) && word.length > prefix.length) {
|
|
817
|
+
return [word.slice(0, prefix.length), word.slice(prefix.length)];
|
|
818
|
+
}
|
|
819
|
+
}
|
|
820
|
+
for (const suffix of NAME_SUFFIX_COMPONENTS) {
|
|
821
|
+
if (word.endsWith(suffix) && word.length > suffix.length) {
|
|
822
|
+
return [word.slice(0, -suffix.length), suffix];
|
|
823
|
+
}
|
|
824
|
+
}
|
|
688
825
|
if (word.startsWith(this.HAMZA)) {
|
|
689
826
|
return [word];
|
|
690
827
|
}
|
|
@@ -731,6 +868,9 @@ var ArabicToCyrillicConverter = class {
|
|
|
731
868
|
if (![...prefix].some((char) => this.arabicVowels.has(char))) {
|
|
732
869
|
continue;
|
|
733
870
|
}
|
|
871
|
+
if (this.crossesProtectedNameEnding(word, length)) {
|
|
872
|
+
continue;
|
|
873
|
+
}
|
|
734
874
|
if (this.isValidSuffixSequence(suffix)) {
|
|
735
875
|
return { matchType: "anonymous", base: prefix, suffix };
|
|
736
876
|
}
|
|
@@ -829,12 +969,20 @@ var ArabicToCyrillicConverter = class {
|
|
|
829
969
|
if (!word) {
|
|
830
970
|
return word;
|
|
831
971
|
}
|
|
972
|
+
const nameYSequence = this.convertNameYSequence(word);
|
|
973
|
+
if (nameYSequence) {
|
|
974
|
+
return nameYSequence;
|
|
975
|
+
}
|
|
832
976
|
if (hasKey(EXCEPTIONS, word)) {
|
|
833
977
|
return EXCEPTIONS[word];
|
|
834
978
|
}
|
|
835
979
|
if (hasKey(PROPER_NOUNS, word)) {
|
|
836
980
|
return PROPER_NOUNS[word];
|
|
837
981
|
}
|
|
982
|
+
const segments = this.segmentCompoundWord(word);
|
|
983
|
+
if (segments.length > 1) {
|
|
984
|
+
return word.includes("-") ? segments.map((segment) => this.convertWord(segment)).join("-") : segments.map((segment) => this.convertWord(segment)).join("");
|
|
985
|
+
}
|
|
838
986
|
if (LOANWORD_EXACT.has(word)) {
|
|
839
987
|
return this.convertWordInternal(word);
|
|
840
988
|
}
|
package/dist/index.d.cts
CHANGED
|
@@ -4,6 +4,7 @@ interface LexiconData {
|
|
|
4
4
|
}
|
|
5
5
|
type MaybePromise<T> = T | Promise<T>;
|
|
6
6
|
type RawToken = readonly [source: string, converted: string];
|
|
7
|
+
/**
 * How ArabicToCyrillicConverter spells the whole-word sequences
 * "\u064A\u064A\u0627", "\u064A\u064A\u0627\u0631", "\u064A\u064A\u0627\u0632" and "\u064A\u064A\u0627\u0633":
 * - "normalize" (used when the option is omitted): "ия", "ияр", "ияз", "ияс";
 * - "preserve": "ийа", "ийар", "ийаз", "ийас".
 */
type NameYSequenceStyle = "normalize" | "preserve";
|
|
7
8
|
interface ContextDisambiguator {
|
|
8
9
|
disambiguate(rawTokens: readonly RawToken[], contextSentence: string): MaybePromise<string[]>;
|
|
9
10
|
}
|
|
@@ -13,6 +14,7 @@ interface CyrillicToArabicOptions {
|
|
|
13
14
|
interface ArabicToCyrillicOptions {
|
|
14
15
|
useLm?: boolean;
|
|
15
16
|
disambiguator?: ContextDisambiguator;
|
|
17
|
+
nameYSequenceStyle?: NameYSequenceStyle;
|
|
16
18
|
}
|
|
17
19
|
|
|
18
20
|
declare class NoopDisambiguator implements ContextDisambiguator {
|
|
@@ -29,6 +31,7 @@ interface RootMatch {
|
|
|
29
31
|
declare class ArabicToCyrillicConverter {
|
|
30
32
|
readonly HAMZA = "\u0674";
|
|
31
33
|
private readonly disambiguator;
|
|
34
|
+
private readonly nameYSequenceStyle;
|
|
32
35
|
private readonly loanwordPrefixTrie;
|
|
33
36
|
private readonly reZwnjEtc;
|
|
34
37
|
private readonly reSpaces;
|
|
@@ -48,6 +51,8 @@ declare class ArabicToCyrillicConverter {
|
|
|
48
51
|
hasConsonantCluster(word: string): boolean;
|
|
49
52
|
isLoanwordWithEPrefix(word: string): boolean;
|
|
50
53
|
getCyrillicVowelState(cyrillicWord: string): boolean;
|
|
54
|
+
private crossesProtectedNameEnding;
|
|
55
|
+
private convertNameYSequence;
|
|
51
56
|
isValidSuffixSequence(suffix: string): boolean;
|
|
52
57
|
getHarmonyFromArabicRoot(word: string): HarmonyState;
|
|
53
58
|
segmentCompoundWord(word: string): string[];
|
|
@@ -79,4 +84,4 @@ declare class CyrillicToArabicConverter {
|
|
|
79
84
|
}
|
|
80
85
|
declare function syr2arb(text: string, options?: CyrillicToArabicOptions): string;
|
|
81
86
|
|
|
82
|
-
export { ArabicToCyrillicConverter, type ArabicToCyrillicOptions, type ContextDisambiguator, CyrillicToArabicConverter, type CyrillicToArabicOptions, type LexiconData, type MaybePromise, NoopDisambiguator, type RawToken, arb2syr, arb2syrAsync, syr2arb };
|
|
87
|
+
export { ArabicToCyrillicConverter, type ArabicToCyrillicOptions, type ContextDisambiguator, CyrillicToArabicConverter, type CyrillicToArabicOptions, type LexiconData, type MaybePromise, type NameYSequenceStyle, NoopDisambiguator, type RawToken, arb2syr, arb2syrAsync, syr2arb };
|
package/dist/index.d.ts
CHANGED
|
@@ -4,6 +4,7 @@ interface LexiconData {
|
|
|
4
4
|
}
|
|
5
5
|
type MaybePromise<T> = T | Promise<T>;
|
|
6
6
|
type RawToken = readonly [source: string, converted: string];
|
|
7
|
+
/**
 * How ArabicToCyrillicConverter spells the whole-word sequences
 * "\u064A\u064A\u0627", "\u064A\u064A\u0627\u0631", "\u064A\u064A\u0627\u0632" and "\u064A\u064A\u0627\u0633":
 * - "normalize" (used when the option is omitted): "ия", "ияр", "ияз", "ияс";
 * - "preserve": "ийа", "ийар", "ийаз", "ийас".
 */
type NameYSequenceStyle = "normalize" | "preserve";
|
|
7
8
|
interface ContextDisambiguator {
|
|
8
9
|
disambiguate(rawTokens: readonly RawToken[], contextSentence: string): MaybePromise<string[]>;
|
|
9
10
|
}
|
|
@@ -13,6 +14,7 @@ interface CyrillicToArabicOptions {
|
|
|
13
14
|
interface ArabicToCyrillicOptions {
|
|
14
15
|
useLm?: boolean;
|
|
15
16
|
disambiguator?: ContextDisambiguator;
|
|
17
|
+
nameYSequenceStyle?: NameYSequenceStyle;
|
|
16
18
|
}
|
|
17
19
|
|
|
18
20
|
declare class NoopDisambiguator implements ContextDisambiguator {
|
|
@@ -29,6 +31,7 @@ interface RootMatch {
|
|
|
29
31
|
declare class ArabicToCyrillicConverter {
|
|
30
32
|
readonly HAMZA = "\u0674";
|
|
31
33
|
private readonly disambiguator;
|
|
34
|
+
private readonly nameYSequenceStyle;
|
|
32
35
|
private readonly loanwordPrefixTrie;
|
|
33
36
|
private readonly reZwnjEtc;
|
|
34
37
|
private readonly reSpaces;
|
|
@@ -48,6 +51,8 @@ declare class ArabicToCyrillicConverter {
|
|
|
48
51
|
hasConsonantCluster(word: string): boolean;
|
|
49
52
|
isLoanwordWithEPrefix(word: string): boolean;
|
|
50
53
|
getCyrillicVowelState(cyrillicWord: string): boolean;
|
|
54
|
+
private crossesProtectedNameEnding;
|
|
55
|
+
private convertNameYSequence;
|
|
51
56
|
isValidSuffixSequence(suffix: string): boolean;
|
|
52
57
|
getHarmonyFromArabicRoot(word: string): HarmonyState;
|
|
53
58
|
segmentCompoundWord(word: string): string[];
|
|
@@ -79,4 +84,4 @@ declare class CyrillicToArabicConverter {
|
|
|
79
84
|
}
|
|
80
85
|
declare function syr2arb(text: string, options?: CyrillicToArabicOptions): string;
|
|
81
86
|
|
|
82
|
-
export { ArabicToCyrillicConverter, type ArabicToCyrillicOptions, type ContextDisambiguator, CyrillicToArabicConverter, type CyrillicToArabicOptions, type LexiconData, type MaybePromise, NoopDisambiguator, type RawToken, arb2syr, arb2syrAsync, syr2arb };
|
|
87
|
+
export { ArabicToCyrillicConverter, type ArabicToCyrillicOptions, type ContextDisambiguator, CyrillicToArabicConverter, type CyrillicToArabicOptions, type LexiconData, type MaybePromise, type NameYSequenceStyle, NoopDisambiguator, type RawToken, arb2syr, arb2syrAsync, syr2arb };
|
package/dist/index.js
CHANGED
|
@@ -103,11 +103,38 @@ var EXCEPTIONS = {
|
|
|
103
103
|
"\u0627\u0644\u0645\u0627\u062A\u0649": "\u0410\u043B\u043C\u0430\u0442\u044B",
|
|
104
104
|
"\u0627\u0633\u062A\u0627\u0646\u0627": "\u0410\u0441\u0442\u0430\u043D\u0430",
|
|
105
105
|
"\u0642\u0627\u0632\u0627\u0642\u0633\u062A\u0627\u0646": "\u049A\u0430\u0437\u0430\u049B\u0441\u0442\u0430\u043D",
|
|
106
|
+
"\u0621\u0627\u0628\u062F\u0649": "\u04D9\u0431\u0434\u0456",
|
|
107
|
+
"\u0627\u0628\u062F\u0649": "\u04D9\u0431\u0434\u0456",
|
|
108
|
+
"\u0627\u0645\u0649\u0631": "\u04D9\u043C\u0456\u0440",
|
|
109
|
+
"\u0628\u06C7\u0644\u0627\u0646": "\u0431\u04B1\u043B\u0430\u043D",
|
|
110
|
+
"\u0628\u0627\u0642\u064A": "\u0431\u0430\u049B\u0438",
|
|
111
|
+
"\u0628\u0627\u0642\u064A\u062A": "\u0431\u0430\u049B\u0438\u0442",
|
|
112
|
+
"\u0628\u0627\u0642\u0649\u062A": "\u0431\u0430\u049B\u044B\u0442",
|
|
113
|
+
"\u0646\u0627\u0633\u0649\u0631": "\u043D\u0430\u0441\u044B\u0440",
|
|
114
|
+
"\u062F\u0627\u06CB\u0644\u06D5\u062A": "\u0434\u04D9\u0443\u043B\u0435\u0442",
|
|
115
|
+
"\u0628\u0648\u0644": "\u0431\u043E\u043B",
|
|
116
|
+
"\u0628\u0648\u0644\u0633\u0649\u0646": "\u0431\u043E\u043B\u0441\u044B\u043D",
|
|
117
|
+
"\u0628\u0648\u0644\u0644\u0627": "\u0431\u043E\u043B\u043B\u0430",
|
|
118
|
+
"\u0642\u0648\u062C\u0627": "\u049B\u043E\u0436\u0430",
|
|
119
|
+
"\u0639\u0648\u062C\u0627": "\u0493\u043E\u0436\u0430",
|
|
120
|
+
"\u0646\u0627\u0631": "\u043D\u0430\u0440",
|
|
121
|
+
"\u0646\u0627\u0632\u0627\u0631": "\u043D\u0430\u0437\u0430\u0440",
|
|
122
|
+
"\u0633\u0627\u0644\u064A": "\u0441\u0430\u043B\u0438",
|
|
123
|
+
"\u062F\u064A\u0627\u0633": "\u0434\u0438\u0430\u0441",
|
|
124
|
+
"\u062C\u0649\u0628\u06D5\u0643": "\u0436\u0456\u0431\u0435\u043A",
|
|
125
|
+
"\u062D\u0627\u0643\u0649\u0645": "\u0445\u0430\u043A\u0456\u043C",
|
|
126
|
+
"\u0643\u064A\u0631\u0627": "\u043A\u0438\u0440\u0430",
|
|
127
|
+
"\u064A\u0643\u0627\u0645\u0627\u0644": "\u0438\u043A\u0430\u043C\u0430\u043B",
|
|
106
128
|
"\u062C\u06C7\u06AD\u06AF\u0648": "\u0416\u04B1\u04A3\u0433\u043E",
|
|
107
129
|
"\u0634\u064A": "\u0421\u0438",
|
|
108
130
|
"\u062C\u064A\u0646\u067E\u064A\u06AD": "\u0426\u0437\u0438\u043D\u044C\u043F\u0438\u043D",
|
|
109
131
|
"\u0643\u0649\u062A\u0627\u067E": "\u043A\u0456\u0442\u0430\u043F",
|
|
110
132
|
"\u0631\u0627\u062D\u0645\u06D5\u062A": "\u0440\u0430\u0445\u043C\u0435\u0442",
|
|
133
|
+
"\u0627\u062D\u0645\u06D5\u062A": "\u0430\u0445\u043C\u0435\u0442",
|
|
134
|
+
"\u0645\u0627\u062D\u0645\u06D5\u062A": "\u043C\u0430\u0445\u043C\u0435\u0442",
|
|
135
|
+
"\u064A\u0627\u0631": "\u0438\u044F\u0440",
|
|
136
|
+
"\u064A\u0627\u0644": "\u0438\u044F\u043B",
|
|
137
|
+
"\u064A\u0627\u0632": "\u0438\u044F\u0437",
|
|
111
138
|
"\u0627\u06CB\u0649\u0644": "\u0430\u0443\u044B\u043B",
|
|
112
139
|
"\u06AF\u0628": "\u0413\u0411",
|
|
113
140
|
"\u067E\u0631\u0648\u06AF\u0631\u06D5\u0633": "\u043F\u0440\u043E\u0433\u0440\u0435\u0441\u0441",
|
|
@@ -412,6 +439,75 @@ var COMPOUND_PIVOT_ROOTS = [
|
|
|
412
439
|
"\u0648\u064A\u0649\u0646",
|
|
413
440
|
"\u062A\u06C7\u0633\u062A\u0649\u06AF"
|
|
414
441
|
];
|
|
442
|
+
/**
 * Arabic-script name components that segmentCompoundWord() splits off the
 * FRONT of a longer word.
 *
 * Order is significant: segmentCompoundWord() walks this list from the top and
 * returns on the first entry that `word.startsWith()`, so earlier entries win.
 * The last three entries also appear in NAME_SUFFIX_COMPONENTS.
 */
var NAME_PREFIX_COMPONENTS = [
  // The first three have whole-word spellings in EXCEPTIONS.
  "\u0621\u0627\u0628\u062F\u0649", "\u0627\u0628\u062F\u0649", "\u062F\u0627\u06CB\u0644\u06D5\u062A",
  // Short components that are listed as suffix components as well.
  "\u06AF\u06C7\u0644", "\u0628\u06D5\u0643", "\u0646\u06C7\u0631"
];
|
|
450
|
+
/**
 * Arabic-script name components recognised at the END of a word.
 *
 * Two consumers read this list:
 *   - segmentCompoundWord() walks it in order and splits on the FIRST entry the
 *     word ends with, so position in the list decides which ending wins;
 *   - crossesProtectedNameEnding() tests every entry, so order is irrelevant there.
 *
 * NOTE(review): "\u064A\u067E\u0627" is listed before "\u0631\u064A\u067E\u0627",
 * "\u0646\u064A\u067E\u0627" and "\u0644\u064A\u067E\u0627", and "\u064A\u0645\u0627"
 * before "\u0633\u064A\u0645\u0627" and "\u0644\u064A\u0645\u0627". Because the shorter
 * entry is itself a suffix of the longer ones, the longer ones can never be the
 * first match in segmentCompoundWord(). Confirm whether that is intended before
 * reordering; the order is kept exactly as published.
 */
var NAME_SUFFIX_COMPONENTS = [
  "\u0645\u06C7\u062D\u0627\u0645\u0628\u06D5\u062A", "\u0627\u062D\u0645\u06D5\u062A", "\u062D\u0627\u0644\u0649\u0642",
  "\u0642\u0627\u0633\u0649\u0645", "\u0633\u0627\u062F\u0649\u0642", "\u0633\u0627\u0644\u0627\u0645",
  "\u0628\u0648\u0644\u0633\u0649\u0646", "\u0628\u0648\u0644\u0644\u0627", "\u0646\u0627\u0633\u0649\u0631",
  "\u0646\u0627\u0632\u0627\u0631", "\u0628\u064A\u0631\u0627", "\u0628\u064A\u0628\u0627",
  "\u064A\u067E\u0627", "\u0631\u064A\u067E\u0627", "\u0646\u064A\u067E\u0627",
  "\u064A\u0645\u0627", "\u0633\u064A\u0645\u0627", "\u0645\u064A\u0644\u0627",
  "\u0628\u064A\u0644\u0627", "\u062C\u064A\u062F\u0627", "\u0632\u064A\u0644\u0627",
  "\u0641\u064A\u0631\u0627", "\u0646\u0627\u0631\u0627", "\u0644\u064A\u067E\u0627",
  "\u0644\u064A\u0645\u0627", "\u062F\u064A\u0628\u0627", "\u062F\u064A\u0644\u0627",
  "\u062F\u064A\u0631\u0627", "\u0644\u064A\u0646\u0627", "\u0643\u064A\u0631\u0627",
  "\u0646\u0646\u0627", "\u0627\u062F\u0627", "\u0628\u0627\u0631\u0649\u0633",
  "\u0643\u06D5\u0644\u062F\u0649", "\u06AF\u06D5\u0644\u062F\u0649", "\u0628\u06C7\u0644\u0627\u0646",
  "\u064A\u0643\u0627\u0645\u0627\u0644", "\u0628\u0627\u0642\u0649\u062A", "\u0628\u0627\u0642\u064A\u062A",
  "\u0628\u0627\u0642\u064A", "\u0642\u0648\u062C\u0627", "\u0639\u0648\u062C\u0627",
  "\u0633\u0627\u0644\u064A", "\u0642\u0627\u0644\u064A",
  // The doubled-\u064A forms come before their single-\u064A counterparts below.
  "\u062F\u064A\u064A\u0627\u0631", "\u064A\u064A\u0627\u0631", "\u064A\u064A\u0627\u0632",
  "\u064A\u064A\u0627\u0633", "\u064A\u064A\u0627",
  "\u064A\u0627\u0631", "\u064A\u0627\u0644", "\u064A\u0627\u0632",
  // Short three-letter components.
  "\u0646\u0627\u0631", "\u062C\u0627\u0646", "\u0646\u06C7\u0631",
  "\u0628\u06D5\u0643", "\u0628\u0627\u064A", "\u062D\u0627\u0646",
  "\u06AF\u06C7\u0644"
];
|
|
415
511
|
var IMPLICIT_SOFT_ROOTS = /* @__PURE__ */ new Set([
|
|
416
512
|
"\u06C7\u0645\u0649\u062A",
|
|
417
513
|
"\u062A\u06C7\u0628",
|
|
@@ -547,6 +643,7 @@ var ArabicToCyrillicConverter = class {
|
|
|
547
643
|
);
|
|
548
644
|
}
|
|
549
645
|
this.disambiguator = options.disambiguator ?? new NoopDisambiguator();
|
|
646
|
+
this.nameYSequenceStyle = options.nameYSequenceStyle ?? "normalize";
|
|
550
647
|
for (const prefix of LOANWORD_PREFIXES) {
|
|
551
648
|
this.loanwordPrefixTrie.insert(prefix);
|
|
552
649
|
}
|
|
@@ -604,6 +701,36 @@ var ArabicToCyrillicConverter = class {
|
|
|
604
701
|
}
|
|
605
702
|
return false;
|
|
606
703
|
}
|
|
704
|
+
crossesProtectedNameEnding(word, splitIndex) {
|
|
705
|
+
return NAME_SUFFIX_COMPONENTS.some((ending) => {
|
|
706
|
+
if (!word.endsWith(ending)) {
|
|
707
|
+
return false;
|
|
708
|
+
}
|
|
709
|
+
const protectedStart = word.length - ending.length;
|
|
710
|
+
return splitIndex > protectedStart && splitIndex < word.length;
|
|
711
|
+
});
|
|
712
|
+
}
|
|
713
|
+
convertNameYSequence(word) {
|
|
714
|
+
const normalized = {
|
|
715
|
+
"\u064A\u064A\u0627": "\u0438\u044F",
|
|
716
|
+
"\u064A\u064A\u0627\u0631": "\u0438\u044F\u0440",
|
|
717
|
+
"\u064A\u064A\u0627\u0632": "\u0438\u044F\u0437",
|
|
718
|
+
"\u064A\u064A\u0627\u0633": "\u0438\u044F\u0441"
|
|
719
|
+
};
|
|
720
|
+
const preserved = {
|
|
721
|
+
"\u064A\u064A\u0627": "\u0438\u0439\u0430",
|
|
722
|
+
"\u064A\u064A\u0627\u0631": "\u0438\u0439\u0430\u0440",
|
|
723
|
+
"\u064A\u064A\u0627\u0632": "\u0438\u0439\u0430\u0437",
|
|
724
|
+
"\u064A\u064A\u0627\u0633": "\u0438\u0439\u0430\u0441"
|
|
725
|
+
};
|
|
726
|
+
if (!(word in normalized)) {
|
|
727
|
+
return null;
|
|
728
|
+
}
|
|
729
|
+
if (this.nameYSequenceStyle === "preserve") {
|
|
730
|
+
return preserved[word];
|
|
731
|
+
}
|
|
732
|
+
return normalized[word];
|
|
733
|
+
}
|
|
607
734
|
isValidSuffixSequence(suffix) {
|
|
608
735
|
if (!suffix) {
|
|
609
736
|
return true;
|
|
@@ -654,6 +781,16 @@ var ArabicToCyrillicConverter = class {
|
|
|
654
781
|
if (word.includes("-")) {
|
|
655
782
|
return word.split("-");
|
|
656
783
|
}
|
|
784
|
+
for (const prefix of NAME_PREFIX_COMPONENTS) {
|
|
785
|
+
if (word.startsWith(prefix) && word.length > prefix.length) {
|
|
786
|
+
return [word.slice(0, prefix.length), word.slice(prefix.length)];
|
|
787
|
+
}
|
|
788
|
+
}
|
|
789
|
+
for (const suffix of NAME_SUFFIX_COMPONENTS) {
|
|
790
|
+
if (word.endsWith(suffix) && word.length > suffix.length) {
|
|
791
|
+
return [word.slice(0, -suffix.length), suffix];
|
|
792
|
+
}
|
|
793
|
+
}
|
|
657
794
|
if (word.startsWith(this.HAMZA)) {
|
|
658
795
|
return [word];
|
|
659
796
|
}
|
|
@@ -700,6 +837,9 @@ var ArabicToCyrillicConverter = class {
|
|
|
700
837
|
if (![...prefix].some((char) => this.arabicVowels.has(char))) {
|
|
701
838
|
continue;
|
|
702
839
|
}
|
|
840
|
+
if (this.crossesProtectedNameEnding(word, length)) {
|
|
841
|
+
continue;
|
|
842
|
+
}
|
|
703
843
|
if (this.isValidSuffixSequence(suffix)) {
|
|
704
844
|
return { matchType: "anonymous", base: prefix, suffix };
|
|
705
845
|
}
|
|
@@ -798,12 +938,20 @@ var ArabicToCyrillicConverter = class {
|
|
|
798
938
|
if (!word) {
|
|
799
939
|
return word;
|
|
800
940
|
}
|
|
941
|
+
const nameYSequence = this.convertNameYSequence(word);
|
|
942
|
+
if (nameYSequence) {
|
|
943
|
+
return nameYSequence;
|
|
944
|
+
}
|
|
801
945
|
if (hasKey(EXCEPTIONS, word)) {
|
|
802
946
|
return EXCEPTIONS[word];
|
|
803
947
|
}
|
|
804
948
|
if (hasKey(PROPER_NOUNS, word)) {
|
|
805
949
|
return PROPER_NOUNS[word];
|
|
806
950
|
}
|
|
951
|
+
const segments = this.segmentCompoundWord(word);
|
|
952
|
+
if (segments.length > 1) {
|
|
953
|
+
return word.includes("-") ? segments.map((segment) => this.convertWord(segment)).join("-") : segments.map((segment) => this.convertWord(segment)).join("");
|
|
954
|
+
}
|
|
807
955
|
if (LOANWORD_EXACT.has(word)) {
|
|
808
956
|
return this.convertWordInternal(word);
|
|
809
957
|
}
|