nlptoolkit-morphologicalanalysis 1.0.13 → 1.0.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/Corpus/DisambiguationCorpus.d.ts +7 -0
- package/dist/Corpus/DisambiguationCorpus.js +7 -0
- package/dist/Corpus/DisambiguationCorpus.js.map +1 -1
- package/dist/MorphologicalAnalysis/FsmMorphologicalAnalyzer.d.ts +71 -3
- package/dist/MorphologicalAnalysis/FsmMorphologicalAnalyzer.js +164 -41
- package/dist/MorphologicalAnalysis/FsmMorphologicalAnalyzer.js.map +1 -1
- package/dist/MorphologicalAnalysis/FsmParse.d.ts +9 -0
- package/dist/MorphologicalAnalysis/FsmParse.js +15 -0
- package/dist/MorphologicalAnalysis/FsmParse.js.map +1 -1
- package/dist/MorphologicalAnalysis/InflectionalGroup.js +3 -2
- package/dist/MorphologicalAnalysis/InflectionalGroup.js.map +1 -1
- package/dist/MorphologicalAnalysis/MorphologicalParse.d.ts +98 -0
- package/dist/MorphologicalAnalysis/MorphologicalParse.js +161 -10
- package/dist/MorphologicalAnalysis/MorphologicalParse.js.map +1 -1
- package/dist/MorphologicalAnalysis/MorphologicalTag.d.ts +9 -1
- package/dist/MorphologicalAnalysis/MorphologicalTag.js +8 -0
- package/dist/MorphologicalAnalysis/MorphologicalTag.js.map +1 -1
- package/dist/MorphologicalAnalysis/MorphotacticEngine.d.ts +47 -0
- package/dist/MorphologicalAnalysis/MorphotacticEngine.js +51 -1
- package/dist/MorphologicalAnalysis/MorphotacticEngine.js.map +1 -1
- package/dist/MorphologicalAnalysis/Transition.d.ts +22 -8
- package/dist/MorphologicalAnalysis/Transition.js +25 -9
- package/dist/MorphologicalAnalysis/Transition.js.map +1 -1
- package/package.json +2 -2
- package/parses/ac/314/247/304/261kla.txt +57 -3
- package/parses/ak.txt +72 -3
- package/parses/aksa.txt +40 -2
- package/parses/anla.txt +57 -3
- package/parses/azal.txt +63 -4
- package/parses/bo/314/210l.txt +53 -3
- package/parses/bul.txt +53 -3
- package/parses/cenk.txt +8 -0
- package/parses/cevapla.txt +74 -4
- package/parses/cos/314/247.txt +53 -3
- package/parses/c/314/247o/314/210k.txt +54 -3
- package/parses/c/314/247/304/261k.txt +59 -3
- package/parses/del.txt +47 -3
- package/parses/doldur.txt +47 -3
- package/parses/emlak.txt +2 -0
- package/parses/git.txt +59 -3
- package/parses/giy.txt +59 -3
- package/parses/go/314/210c/314/247.txt +59 -3
- package/parses/go/314/210ster.txt +63 -4
- package/parses/hal.txt +20 -4
- package/parses/kalp.txt +29 -4
- package/parses/kavur.txt +80 -5
- package/parses/kaydol.txt +69 -4
- package/parses/resim.txt +14 -0
- package/parses/s/304/261ska.txt +24 -0
- package/parses/ye.txt +40 -2
- package/parses/yemek.txt +6 -0
- package/parses/y/304/261ka.txt +90 -5
- package/parses/y/304/261ldo/314/210nu/314/210mu/314/210.txt +6 -0
- package/pronunciations.txt +490 -0
- package/source/Corpus/DisambiguationCorpus.ts +7 -0
- package/source/MorphologicalAnalysis/FsmMorphologicalAnalyzer.ts +171 -45
- package/source/MorphologicalAnalysis/FsmParse.ts +16 -1
- package/source/MorphologicalAnalysis/InflectionalGroup.ts +3 -2
- package/source/MorphologicalAnalysis/MorphologicalParse.ts +161 -10
- package/source/MorphologicalAnalysis/MorphologicalTag.ts +9 -1
- package/source/MorphologicalAnalysis/MorphotacticEngine.ts +51 -1
- package/source/MorphologicalAnalysis/Transition.ts +25 -9
- package/suffixes.txt +6313 -0
- package/tests/DisambiguationCorpusTest.js +14 -0
- package/tests/DisambiguationCorpusTest.js.map +1 -0
- package/tests/FiniteStateMachineTest.js +96 -0
- package/tests/FiniteStateMachineTest.js.map +1 -0
- package/tests/FiniteStateMachineTest.ts +1 -1
- package/tests/FsmMorphologicalAnalyzerTest.js +250 -0
- package/tests/FsmMorphologicalAnalyzerTest.js.map +1 -0
- package/tests/FsmMorphologicalAnalyzerTest.ts +12 -6
- package/tests/FsmParseListTest.js +100 -0
- package/tests/FsmParseListTest.js.map +1 -0
- package/tests/FsmParseTest.js +68 -0
- package/tests/FsmParseTest.js.map +1 -0
- package/tests/InflectionalGroupTest.js +86 -0
- package/tests/InflectionalGroupTest.js.map +1 -0
- package/tests/MorphologicalParseTest.js +154 -0
- package/tests/MorphologicalParseTest.js.map +1 -0
- package/tests/TransitionTest.js +184 -0
- package/tests/TransitionTest.js.map +1 -0
- package/tests/TransitionTest.ts +8 -0
- package/turkish_finite_state_machine.xml +11 -3
|
@@ -15,15 +15,18 @@ import {Sentence} from "nlptoolkit-corpus/dist/Sentence";
|
|
|
15
15
|
import {Word} from "nlptoolkit-dictionary/dist/Dictionary/Word";
|
|
16
16
|
import {State} from "./State";
|
|
17
17
|
import {Queue} from "nlptoolkit-datastructure/dist/Queue";
|
|
18
|
+
import {FileUtils} from "nlptoolkit-util/dist/FileUtils";
|
|
18
19
|
|
|
19
20
|
export class FsmMorphologicalAnalyzer {
|
|
20
21
|
|
|
21
22
|
private dictionaryTrie: Trie
|
|
23
|
+
private suffixTrie: Trie
|
|
22
24
|
private parsedSurfaceForms: Map<string, string> = undefined
|
|
23
|
-
private
|
|
25
|
+
private pronunciations: Map<string, string> = undefined
|
|
26
|
+
private readonly finiteStateMachine: FiniteStateMachine
|
|
24
27
|
private static MAX_DISTANCE = 2
|
|
25
|
-
private dictionary: TxtDictionary
|
|
26
|
-
private cache: LRUCache<string, FsmParseList> = undefined
|
|
28
|
+
private readonly dictionary: TxtDictionary
|
|
29
|
+
private readonly cache: LRUCache<string, FsmParseList> = undefined
|
|
27
30
|
private mostUsedPatterns: Map<string, RegExp> = new Map<string, RegExp>()
|
|
28
31
|
|
|
29
32
|
/**
|
|
@@ -51,22 +54,59 @@ export class FsmMorphologicalAnalyzer {
|
|
|
51
54
|
} else {
|
|
52
55
|
this.finiteStateMachine = new FiniteStateMachine(fileName);
|
|
53
56
|
}
|
|
57
|
+
this.prepareSuffixTrie();
|
|
54
58
|
this.dictionaryTrie = this.dictionary.prepareTrie();
|
|
55
59
|
if (cacheSize > 0){
|
|
56
60
|
this.cache = new LRUCache<string, FsmParseList>(cacheSize);
|
|
57
61
|
}
|
|
62
|
+
this.addPronunciations("pronunciations.txt");
|
|
58
63
|
}
|
|
59
64
|
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
65
|
+
/**
|
|
66
|
+
* Constructs and returns the reverse string of a given string.
|
|
67
|
+
* @param s String to be reversed.
|
|
68
|
+
* @return Reverse of a given string.
|
|
69
|
+
*/
|
|
70
|
+
private reverseString(s: string): string{
|
|
71
|
+
let result = ""
|
|
72
|
+
for (let i = s.length - 1; i >= 0; i--){
|
|
73
|
+
result += s[i]
|
|
74
|
+
}
|
|
75
|
+
return result
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
/**
|
|
79
|
+
* Constructs the suffix trie from the input file suffixes.txt. suffixes.txt contains the most frequent 6000
|
|
80
|
+
* suffixes that a verb or a noun can take. The suffix trie is a trie that stores these suffixes in reverse form,
|
|
81
|
+
* which can be then used to match a given word for its possible suffix content.
|
|
82
|
+
*/
|
|
83
|
+
private prepareSuffixTrie(){
|
|
84
|
+
this.suffixTrie = new Trie()
|
|
85
|
+
let data = fs.readFileSync("suffixes.txt", 'utf8')
|
|
63
86
|
let lines = data.split("\n")
|
|
64
|
-
for (let
|
|
65
|
-
let
|
|
66
|
-
this.
|
|
87
|
+
for (let suffix of lines) {
|
|
88
|
+
let reverseSuffix = this.reverseString(suffix)
|
|
89
|
+
this.suffixTrie.addWord(reverseSuffix, new Word(reverseSuffix))
|
|
67
90
|
}
|
|
68
91
|
}
|
|
69
92
|
|
|
93
|
+
/**
|
|
94
|
+
* Reads the file for correct surface forms and their most frequent root forms, in other words, the surface forms
|
|
95
|
+
* which have at least one morphological analysis in Turkish.
|
|
96
|
+
* @param fileName Input file containing analyzable surface forms and their root forms.
|
|
97
|
+
*/
|
|
98
|
+
addParsedSurfaceForms(fileName: string){
|
|
99
|
+
this.parsedSurfaceForms = FileUtils.readHashMap(fileName)
|
|
100
|
+
}
|
|
101
|
+
|
|
102
|
+
/**
|
|
103
|
+
* Reads the file for foreign words and their pronunciations.
|
|
104
|
+
* @param fileName Input file containing foreign words and their pronunciations.
|
|
105
|
+
*/
|
|
106
|
+
addPronunciations(fileName: string){
|
|
107
|
+
this.pronunciations = FileUtils.readHashMap(fileName)
|
|
108
|
+
}
|
|
109
|
+
|
|
70
110
|
/**
|
|
71
111
|
* The getPossibleWords method takes {@link MorphologicalParse} and {@link MetamorphicParse} as input.
|
|
72
112
|
* First it determines whether the given morphologicalParse is the root verb and whether it contains a verb tag.
|
|
@@ -662,7 +702,7 @@ export class FsmMorphologicalAnalyzer {
|
|
|
662
702
|
*/
|
|
663
703
|
private parseWordLength(fsmParse: Array<FsmParse>, maxLength: number): Array<FsmParse>{
|
|
664
704
|
let result = new Array<FsmParse>();
|
|
665
|
-
let
|
|
705
|
+
let resultTransitionList = new Array<string>();
|
|
666
706
|
let parseQueue = new Queue<FsmParse>(1000)
|
|
667
707
|
parseQueue.enqueueAll(fsmParse)
|
|
668
708
|
while (!parseQueue.isEmpty()) {
|
|
@@ -672,11 +712,11 @@ export class FsmMorphologicalAnalyzer {
|
|
|
672
712
|
let currentState = currentFsmParse.getFinalSuffix();
|
|
673
713
|
let currentSurfaceForm = currentFsmParse.getSurfaceForm();
|
|
674
714
|
if (currentState.isEndState() && currentSurfaceForm.length <= maxLength) {
|
|
675
|
-
let
|
|
676
|
-
if (!
|
|
715
|
+
let currentTransitionList = currentSurfaceForm + " " + currentFsmParse.getFsmParseTransitionList()
|
|
716
|
+
if (!resultTransitionList.includes(currentTransitionList)) {
|
|
677
717
|
result.push(currentFsmParse);
|
|
678
718
|
currentFsmParse.constructInflectionalGroups();
|
|
679
|
-
|
|
719
|
+
resultTransitionList.push(currentTransitionList);
|
|
680
720
|
}
|
|
681
721
|
}
|
|
682
722
|
this.addNewParsesFromCurrentParseLength(currentFsmParse, parseQueue, maxLength, root);
|
|
@@ -694,7 +734,7 @@ export class FsmMorphologicalAnalyzer {
|
|
|
694
734
|
*/
|
|
695
735
|
private parseWordSurfaceForm(fsmParse: Array<FsmParse>, surfaceForm: string): Array<FsmParse>{
|
|
696
736
|
let result = new Array<FsmParse>();
|
|
697
|
-
let
|
|
737
|
+
let resultTransitionList = new Array<string>();
|
|
698
738
|
let parseQueue = new Queue<FsmParse>(1000)
|
|
699
739
|
parseQueue.enqueueAll(fsmParse)
|
|
700
740
|
while (!parseQueue.isEmpty()) {
|
|
@@ -704,11 +744,11 @@ export class FsmMorphologicalAnalyzer {
|
|
|
704
744
|
let currentState = currentFsmParse.getFinalSuffix();
|
|
705
745
|
let currentSurfaceForm = currentFsmParse.getSurfaceForm();
|
|
706
746
|
if (currentState.isEndState() && currentSurfaceForm == surfaceForm) {
|
|
707
|
-
let
|
|
708
|
-
if (!
|
|
747
|
+
let currentTransitionList = currentFsmParse.getFsmParseTransitionList()
|
|
748
|
+
if (!resultTransitionList.includes(currentTransitionList)) {
|
|
709
749
|
result.push(currentFsmParse);
|
|
710
750
|
currentFsmParse.constructInflectionalGroups();
|
|
711
|
-
|
|
751
|
+
resultTransitionList.push(currentTransitionList);
|
|
712
752
|
}
|
|
713
753
|
}
|
|
714
754
|
this.addNewParsesFromCurrentParseSurfaceForm(currentFsmParse, parseQueue, surfaceForm, root);
|
|
@@ -973,6 +1013,15 @@ export class FsmMorphologicalAnalyzer {
|
|
|
973
1013
|
return this.parseWordSurfaceForm(initialFsmParse, surfaceForm);
|
|
974
1014
|
}
|
|
975
1015
|
|
|
1016
|
+
/**
|
|
1017
|
+
* This method uses a cache to speed up pattern matching in the Fsm. mostUsedPatterns stores the compiled forms of
|
|
1018
|
+
* the previously used patterns. When Fsm tries to match a string to a pattern, first we check if it exists in
|
|
1019
|
+
* mostUsedPatterns. If it exists, we directly use the compiled pattern to match the string. Otherwise, new pattern
|
|
1020
|
+
* is compiled and put in the mostUsedPatterns.
|
|
1021
|
+
* @param expr Pattern to check
|
|
1022
|
+
* @param value String to match the pattern
|
|
1023
|
+
* @return True if the string matches the pattern, false otherwise.
|
|
1024
|
+
*/
|
|
976
1025
|
private patternMatches(expr: string, value: string): boolean{
|
|
977
1026
|
let p = this.mostUsedPatterns.get(expr);
|
|
978
1027
|
if (p == undefined){
|
|
@@ -1011,6 +1060,40 @@ export class FsmMorphologicalAnalyzer {
|
|
|
1011
1060
|
return this.patternMatches("^.*[0-9].*$", surfaceForm) && this.patternMatches("^.*[a-zA-ZçöğüşıÇÖĞÜŞİ].*$", surfaceForm);
|
|
1012
1061
|
}
|
|
1013
1062
|
|
|
1063
|
+
/**
|
|
1064
|
+
* Identifies a possible new root word for a given surface form. It also adds the new root form to the dictionary
|
|
1065
|
+
* for further usage. The method first searches the suffix trie for the reverse string of the surface form. This
|
|
1066
|
+
* way, it can identify if the word has a suffix that is in the most frequently used suffix list. Since a word can
|
|
1067
|
+
* have multiple possible suffixes, the method identifies the longest suffix and returns the substring of the
|
|
1068
|
+
* surface form that does not contain the suffix. Let's say the word is 'googlelaştırdık', it will identify 'dık' as
|
|
1069
|
+
* a suffix and will return 'googlelaştır' as a possible root form. Another example will be 'homelesslerimizle', it
|
|
1070
|
+
* will identify 'lerimizle' as suffix and will return 'homeless' as a possible root form. If the root word ends
|
|
1071
|
+
* with 'ğ', it is replaced with 'k'. 'morfolojikliğini' will return 'morfolojikliğ' then which will be replaced
|
|
1072
|
+
* with 'morfolojiklik'.
|
|
1073
|
+
* @param surfaceForm Surface form for which we will identify a possible new root form.
|
|
1074
|
+
* @return List of possible new root forms.
|
|
1075
|
+
*/
|
|
1076
|
+
private rootOfPossiblyNewWord(surfaceForm: string): Array<TxtWord>{
|
|
1077
|
+
let words = this.suffixTrie.getWordsWithPrefix(this.reverseString(surfaceForm))
|
|
1078
|
+
let candidateWord = null
|
|
1079
|
+
let candidateList = new Array<TxtWord>();
|
|
1080
|
+
for (let word of words){
|
|
1081
|
+
candidateWord = surfaceForm.substring(0, surfaceForm.length - word.getName().length)
|
|
1082
|
+
let newWord
|
|
1083
|
+
if (candidateWord.endsWith("ğ")){
|
|
1084
|
+
candidateWord = candidateWord.substring(0, candidateWord.length - 1) + "k"
|
|
1085
|
+
newWord = new TxtWord(candidateWord, "CL_ISIM")
|
|
1086
|
+
newWord.addFlag("IS_SD")
|
|
1087
|
+
} else {
|
|
1088
|
+
newWord = new TxtWord(candidateWord, "CL_ISIM")
|
|
1089
|
+
newWord.addFlag("CL_FIIL")
|
|
1090
|
+
}
|
|
1091
|
+
candidateList.push(newWord)
|
|
1092
|
+
this.dictionaryTrie.addWord(candidateWord, newWord)
|
|
1093
|
+
}
|
|
1094
|
+
return candidateList
|
|
1095
|
+
}
|
|
1096
|
+
|
|
1014
1097
|
/**
|
|
1015
1098
|
* The robustMorphologicalAnalysis is used to analyse surfaceForm String. First it gets the currentParse of the surfaceForm
|
|
1016
1099
|
* then, if the size of the currentParse is 0, and given surfaceForm is a proper noun, it adds the surfaceForm
|
|
@@ -1028,14 +1111,19 @@ export class FsmMorphologicalAnalyzer {
|
|
|
1028
1111
|
if (currentParse.size() == 0) {
|
|
1029
1112
|
let fsmParse = new Array<FsmParse>();
|
|
1030
1113
|
if (this.isProperNoun(surfaceForm)) {
|
|
1031
|
-
fsmParse.push(new FsmParse(surfaceForm, this.finiteStateMachine.getState("ProperRoot")))
|
|
1032
|
-
}
|
|
1033
|
-
|
|
1034
|
-
|
|
1035
|
-
|
|
1036
|
-
|
|
1114
|
+
fsmParse.push(new FsmParse(surfaceForm, this.finiteStateMachine.getState("ProperRoot")))
|
|
1115
|
+
}
|
|
1116
|
+
if (this.isCode(surfaceForm)) {
|
|
1117
|
+
fsmParse.push(new FsmParse(surfaceForm, this.finiteStateMachine.getState("CodeRoot")))
|
|
1118
|
+
}
|
|
1119
|
+
let newCandidateList = this.rootOfPossiblyNewWord(surfaceForm)
|
|
1120
|
+
if (newCandidateList.length != 0){
|
|
1121
|
+
for (let word of newCandidateList) {
|
|
1122
|
+
fsmParse.push(new FsmParse(word, this.finiteStateMachine.getState("VerbalRoot")))
|
|
1123
|
+
fsmParse.push(new FsmParse(word, this.finiteStateMachine.getState("NominalRoot")))
|
|
1037
1124
|
}
|
|
1038
1125
|
}
|
|
1126
|
+
fsmParse.push(new FsmParse(surfaceForm, this.finiteStateMachine.getState("NominalRoot")))
|
|
1039
1127
|
return new FsmParseList(this.parseWordSurfaceForm(fsmParse, surfaceForm));
|
|
1040
1128
|
} else {
|
|
1041
1129
|
return currentParse;
|
|
@@ -1144,22 +1232,43 @@ export class FsmMorphologicalAnalyzer {
|
|
|
1144
1232
|
return word == "" && count > 1;
|
|
1145
1233
|
}
|
|
1146
1234
|
|
|
1235
|
+
/**
|
|
1236
|
+
* Checks if a given surface form matches to a percent value. It should be something like %4, %45, %4.3 or %56.786
|
|
1237
|
+
* @param surfaceForm Surface form to be checked.
|
|
1238
|
+
* @return True if the surface form is in percent form
|
|
1239
|
+
*/
|
|
1147
1240
|
private isPercent(surfaceForm: string): boolean{
|
|
1148
1241
|
return this.patternMatches("^%(\\d\\d|\\d)$", surfaceForm) ||
|
|
1149
1242
|
this.patternMatches("^%(\\d\\d|\\d)\\.\\d+$", surfaceForm);
|
|
1150
1243
|
}
|
|
1151
1244
|
|
|
1245
|
+
/**
|
|
1246
|
+
* Checks if a given surface form matches to a time form. It should be something like 3:34, 12:56 etc.
|
|
1247
|
+
* @param surfaceForm Surface form to be checked.
|
|
1248
|
+
* @return True if the surface form is in time form
|
|
1249
|
+
*/
|
|
1152
1250
|
private isTime(surfaceForm: string): boolean{
|
|
1153
1251
|
return this.patternMatches("^(\\d\\d|\\d):(\\d\\d|\\d):(\\d\\d|\\d)$", surfaceForm) ||
|
|
1154
1252
|
this.patternMatches("^(\\d\\d|\\d):(\\d\\d|\\d)$", surfaceForm);
|
|
1155
1253
|
}
|
|
1156
1254
|
|
|
1255
|
+
/**
|
|
1256
|
+
* Checks if a given surface form matches to a range form. It should be something like 123-1400 or 12:34-15:78 or
|
|
1257
|
+
* 3.45-4.67.
|
|
1258
|
+
* @param surfaceForm Surface form to be checked.
|
|
1259
|
+
* @return True if the surface form is in range form
|
|
1260
|
+
*/
|
|
1157
1261
|
private isRange(surfaceForm: string): boolean{
|
|
1158
1262
|
return this.patternMatches("^\\d+-\\d+$", surfaceForm) ||
|
|
1159
1263
|
this.patternMatches("^(\\d\\d|\\d):(\\d\\d|\\d)-(\\d\\d|\\d):(\\d\\d|\\d)$", surfaceForm) ||
|
|
1160
1264
|
this.patternMatches("^(\\d\\d|\\d)\\.(\\d\\d|\\d)-(\\d\\d|\\d)\\.(\\d\\d|\\d)$", surfaceForm);
|
|
1161
1265
|
}
|
|
1162
1266
|
|
|
1267
|
+
/**
|
|
1268
|
+
* Checks if a given surface form matches to a date form. It should be something like 3/10/2023 or 2.3.2012
|
|
1269
|
+
* @param surfaceForm Surface form to be checked.
|
|
1270
|
+
* @return True if the surface form is in date form
|
|
1271
|
+
*/
|
|
1163
1272
|
private isDate(surfaceForm: string): boolean{
|
|
1164
1273
|
return this.patternMatches("^(\\d\\d|\\d)/(\\d\\d|\\d)/\\d+$", surfaceForm) ||
|
|
1165
1274
|
this.patternMatches("^(\\d\\d|\\d)\\.(\\d\\d|\\d)\\.\\d+$", surfaceForm);
|
|
@@ -1183,7 +1292,9 @@ export class FsmMorphologicalAnalyzer {
|
|
|
1183
1292
|
* @return fsmParseList which holds the analysis.
|
|
1184
1293
|
*/
|
|
1185
1294
|
morphologicalAnalysis(surfaceForm: string): FsmParseList{
|
|
1186
|
-
let lowerCased = surfaceForm.toLocaleLowerCase("tr")
|
|
1295
|
+
let lowerCased = surfaceForm.toLocaleLowerCase("tr")
|
|
1296
|
+
let possibleRootLowerCased = "", pronunciation = ""
|
|
1297
|
+
let isRootReplaced = false
|
|
1187
1298
|
if (this.parsedSurfaceForms != undefined && this.parsedSurfaceForms.has(lowerCased) &&
|
|
1188
1299
|
!this.isInteger(surfaceForm) && !this.isDouble(surfaceForm) && !this.isPercent(surfaceForm) &&
|
|
1189
1300
|
!this.isTime(surfaceForm) && !this.isRange(surfaceForm) && !this.isDate(surfaceForm)){
|
|
@@ -1195,9 +1306,9 @@ export class FsmMorphologicalAnalyzer {
|
|
|
1195
1306
|
return this.cache.get(surfaceForm);
|
|
1196
1307
|
}
|
|
1197
1308
|
if (this.patternMatches("^(\\w|Ç|Ş|İ|Ü|Ö)\\.$",surfaceForm)) {
|
|
1198
|
-
this.dictionaryTrie.addWord(
|
|
1309
|
+
this.dictionaryTrie.addWord(lowerCased, new TxtWord(lowerCased, "IS_OA"));
|
|
1199
1310
|
}
|
|
1200
|
-
let defaultFsmParse = this.analysis(
|
|
1311
|
+
let defaultFsmParse = this.analysis(lowerCased, this.isProperNoun(surfaceForm));
|
|
1201
1312
|
if (defaultFsmParse.length > 0) {
|
|
1202
1313
|
let fsmParseList = new FsmParseList(defaultFsmParse);
|
|
1203
1314
|
if (this.cache != undefined) {
|
|
@@ -1211,48 +1322,58 @@ export class FsmMorphologicalAnalyzer {
|
|
|
1211
1322
|
if (possibleRoot != "") {
|
|
1212
1323
|
if (possibleRoot.includes("/") || possibleRoot.includes("\\/")) {
|
|
1213
1324
|
this.dictionaryTrie.addWord(possibleRoot, new TxtWord(possibleRoot, "IS_KESIR"));
|
|
1214
|
-
fsmParse = this.analysis(
|
|
1325
|
+
fsmParse = this.analysis(lowerCased, this.isProperNoun(surfaceForm));
|
|
1215
1326
|
} else {
|
|
1216
1327
|
if (this.isDate(possibleRoot)) {
|
|
1217
1328
|
this.dictionaryTrie.addWord(possibleRoot, new TxtWord(possibleRoot, "IS_DATE"));
|
|
1218
|
-
fsmParse = this.analysis(
|
|
1329
|
+
fsmParse = this.analysis(lowerCased, this.isProperNoun(surfaceForm));
|
|
1219
1330
|
} else {
|
|
1220
1331
|
if (this.patternMatches("^\\d+/\\d+$", possibleRoot)) {
|
|
1221
1332
|
this.dictionaryTrie.addWord(possibleRoot, new TxtWord(possibleRoot, "IS_KESIR"));
|
|
1222
|
-
fsmParse = this.analysis(
|
|
1333
|
+
fsmParse = this.analysis(lowerCased, this.isProperNoun(surfaceForm));
|
|
1223
1334
|
} else {
|
|
1224
1335
|
if (this.isPercent(possibleRoot)) {
|
|
1225
1336
|
this.dictionaryTrie.addWord(possibleRoot, new TxtWord(possibleRoot, "IS_PERCENT"));
|
|
1226
|
-
fsmParse = this.analysis(
|
|
1337
|
+
fsmParse = this.analysis(lowerCased, this.isProperNoun(surfaceForm));
|
|
1227
1338
|
} else {
|
|
1228
1339
|
if (this.isTime(surfaceForm)) {
|
|
1229
1340
|
this.dictionaryTrie.addWord(possibleRoot, new TxtWord(possibleRoot, "IS_ZAMAN"));
|
|
1230
|
-
fsmParse = this.analysis(
|
|
1341
|
+
fsmParse = this.analysis(lowerCased, this.isProperNoun(surfaceForm));
|
|
1231
1342
|
} else {
|
|
1232
1343
|
if (this.isRange(surfaceForm)) {
|
|
1233
1344
|
this.dictionaryTrie.addWord(possibleRoot, new TxtWord(possibleRoot, "IS_RANGE"));
|
|
1234
|
-
fsmParse = this.analysis(
|
|
1345
|
+
fsmParse = this.analysis(lowerCased, this.isProperNoun(surfaceForm));
|
|
1235
1346
|
} else {
|
|
1236
1347
|
if (this.isInteger(possibleRoot)) {
|
|
1237
1348
|
this.dictionaryTrie.addWord(possibleRoot, new TxtWord(possibleRoot, "IS_SAYI"));
|
|
1238
|
-
fsmParse = this.analysis(
|
|
1349
|
+
fsmParse = this.analysis(lowerCased, this.isProperNoun(surfaceForm));
|
|
1239
1350
|
} else {
|
|
1240
1351
|
if (this.isDouble(possibleRoot)) {
|
|
1241
1352
|
this.dictionaryTrie.addWord(possibleRoot, new TxtWord(possibleRoot, "IS_REELSAYI"));
|
|
1242
|
-
fsmParse = this.analysis(
|
|
1353
|
+
fsmParse = this.analysis(lowerCased, this.isProperNoun(surfaceForm));
|
|
1243
1354
|
} else {
|
|
1244
|
-
if (Word.isCapital(possibleRoot)) {
|
|
1245
|
-
let newWord = undefined
|
|
1246
|
-
|
|
1247
|
-
|
|
1355
|
+
if (Word.isCapital(possibleRoot) || "QXW".includes(possibleRoot.substring(0, 1))) {
|
|
1356
|
+
let newWord = undefined
|
|
1357
|
+
possibleRootLowerCased = possibleRoot.toLocaleLowerCase("tr");
|
|
1358
|
+
if (this.pronunciations.has(possibleRootLowerCased)){
|
|
1359
|
+
isRootReplaced = true
|
|
1360
|
+
pronunciation = this.pronunciations.get(possibleRootLowerCased)
|
|
1361
|
+
if (this.dictionary.getWord(pronunciation) != null) {
|
|
1362
|
+
(<TxtWord> this.dictionary.getWord(pronunciation)).addFlag("IS_OA");
|
|
1363
|
+
} else {
|
|
1364
|
+
newWord = new TxtWord(pronunciation, "IS_OA");
|
|
1365
|
+
this.dictionaryTrie.addWord(pronunciation, newWord);
|
|
1366
|
+
}
|
|
1367
|
+
let replacedWord = pronunciation + lowerCased.substring(possibleRootLowerCased.length);
|
|
1368
|
+
fsmParse = this.analysis(replacedWord, this.isProperNoun(surfaceForm));
|
|
1248
1369
|
} else {
|
|
1249
|
-
|
|
1250
|
-
|
|
1251
|
-
|
|
1252
|
-
|
|
1253
|
-
|
|
1254
|
-
|
|
1255
|
-
fsmParse = this.analysis(
|
|
1370
|
+
if (this.dictionary.getWord(possibleRootLowerCased) != null) {
|
|
1371
|
+
(<TxtWord> this.dictionary.getWord(possibleRootLowerCased)).addFlag("IS_OA");
|
|
1372
|
+
} else {
|
|
1373
|
+
newWord = new TxtWord(possibleRootLowerCased, "IS_OA");
|
|
1374
|
+
this.dictionaryTrie.addWord(possibleRootLowerCased, newWord);
|
|
1375
|
+
}
|
|
1376
|
+
fsmParse = this.analysis(lowerCased, this.isProperNoun(surfaceForm));
|
|
1256
1377
|
}
|
|
1257
1378
|
}
|
|
1258
1379
|
}
|
|
@@ -1265,6 +1386,11 @@ export class FsmMorphologicalAnalyzer {
|
|
|
1265
1386
|
}
|
|
1266
1387
|
}
|
|
1267
1388
|
}
|
|
1389
|
+
if (!isRootReplaced){
|
|
1390
|
+
for (let parse of fsmParse){
|
|
1391
|
+
parse.restoreOriginalForm(possibleRootLowerCased, pronunciation)
|
|
1392
|
+
}
|
|
1393
|
+
}
|
|
1268
1394
|
let fsmParseList = new FsmParseList(fsmParse);
|
|
1269
1395
|
if (this.cache != undefined && fsmParseList.size() > 0) {
|
|
1270
1396
|
this.cache.add(surfaceForm, fsmParseList);
|
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
import {MorphologicalParse} from "./MorphologicalParse";
|
|
2
2
|
import {State} from "./State";
|
|
3
|
-
import {Word} from "nlptoolkit-dictionary/dist/Dictionary/Word";
|
|
4
3
|
import {TxtWord} from "nlptoolkit-dictionary/dist/Dictionary/TxtWord";
|
|
5
4
|
import {InflectionalGroup} from "./InflectionalGroup";
|
|
6
5
|
import {Transition} from "./Transition";
|
|
@@ -675,4 +674,20 @@ export class FsmParse extends MorphologicalParse{
|
|
|
675
674
|
toString(): string{
|
|
676
675
|
return this.getFsmParseTransitionList()
|
|
677
676
|
}
|
|
677
|
+
|
|
678
|
+
/**
|
|
679
|
+
* In order to morphologically parse special proper nouns in Turkish, whose affixes obey not the original but their
|
|
680
|
+
* pronunciations, the morphologicalAnalysis method replaces the original word with its pronunciation and does the
|
|
681
|
+
* rest. This method reverts it back, that is it restores its original form by replacing the pronunciations in the
|
|
682
|
+
* parses with the original form.
|
|
683
|
+
* @param original Original form of the proper noun.
|
|
684
|
+
* @param pronunciation Pronunciation of the proper noun.
|
|
685
|
+
*/
|
|
686
|
+
restoreOriginalForm(original: string, pronunciation: string){
|
|
687
|
+
this.root = new TxtWord(original, "IS_OA");
|
|
688
|
+
this.form = original + this.form.substring(pronunciation.length);
|
|
689
|
+
for (let i = 0; i < this.formList.length; i++) {
|
|
690
|
+
this.formList[i] = original + this.formList[i].substring(pronunciation.length)
|
|
691
|
+
}
|
|
692
|
+
}
|
|
678
693
|
}
|
|
@@ -28,7 +28,8 @@ export class InflectionalGroup {
|
|
|
28
28
|
"INF2", "INF3", "BSTAG", "ESTAG", "BTTAG",
|
|
29
29
|
"ETTAG", "BDTAG", "EDTAG", "INF1", "ASLONGAS",
|
|
30
30
|
"DIST", "ADAMANTLY", "PERCENT", "WITHOUTBEINGABLETOHAVEDONESO", "DIM",
|
|
31
|
-
"PERS", "FRACTION", "HASHTAG", "EMAIL", "DATE",
|
|
31
|
+
"PERS", "FRACTION", "HASHTAG", "EMAIL", "DATE",
|
|
32
|
+
"CODE", "METRIC", "POL", "URGE"];
|
|
32
33
|
static morphoTags = [MorphologicalTag.NOUN, MorphologicalTag.ADVERB, MorphologicalTag.ADJECTIVE,
|
|
33
34
|
MorphologicalTag.VERB, MorphologicalTag.A1SG, MorphologicalTag.A2SG, MorphologicalTag.A3SG, MorphologicalTag.A1PL,
|
|
34
35
|
MorphologicalTag.A2PL, MorphologicalTag.A3PL, MorphologicalTag.P1SG, MorphologicalTag.P2SG, MorphologicalTag.P3SG, MorphologicalTag.P1PL,
|
|
@@ -55,7 +56,7 @@ export class InflectionalGroup {
|
|
|
55
56
|
MorphologicalTag.ENDOFTITLE, MorphologicalTag.BEGINNINGOFDOCUMENT, MorphologicalTag.ENDOFDOCUMENT, MorphologicalTag.INFINITIVE, MorphologicalTag.ASLONGAS,
|
|
56
57
|
MorphologicalTag.DISTRIBUTIVE, MorphologicalTag.ADAMANTLY, MorphologicalTag.PERCENT, MorphologicalTag.WITHOUTBEINGABLETOHAVEDONESO, MorphologicalTag.DIMENSION,
|
|
57
58
|
MorphologicalTag.PERSONALPRONOUN, MorphologicalTag.FRACTION, MorphologicalTag.HASHTAG, MorphologicalTag.EMAIL, MorphologicalTag.DATE,
|
|
58
|
-
MorphologicalTag.CODE, MorphologicalTag.METRIC];
|
|
59
|
+
MorphologicalTag.CODE, MorphologicalTag.METRIC, MorphologicalTag.POLITE, MorphologicalTag.URGE];
|
|
59
60
|
|
|
60
61
|
/**
|
|
61
62
|
* The getMorphologicalTag method takes a String tag as an input and if the input matches with one of the elements of
|