nlptoolkit-morphologicalanalysis 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +144 -0
- package/dist/Corpus/DisambiguatedWord.d.ts +20 -0
- package/dist/Corpus/DisambiguatedWord.js +38 -0
- package/dist/Corpus/DisambiguatedWord.js.map +1 -0
- package/dist/Corpus/DisambiguationCorpus.d.ts +4 -0
- package/dist/Corpus/DisambiguationCorpus.js +54 -0
- package/dist/Corpus/DisambiguationCorpus.js.map +1 -0
- package/dist/MorphologicalAnalysis/FiniteStateMachine.d.ts +63 -0
- package/dist/MorphologicalAnalysis/FiniteStateMachine.js +178 -0
- package/dist/MorphologicalAnalysis/FiniteStateMachine.js.map +1 -0
- package/dist/MorphologicalAnalysis/FsmMorphologicalAnalyzer.d.ts +399 -0
- package/dist/MorphologicalAnalysis/FsmMorphologicalAnalyzer.js +1255 -0
- package/dist/MorphologicalAnalysis/FsmMorphologicalAnalyzer.js.map +1 -0
- package/dist/MorphologicalAnalysis/FsmParse.d.ts +290 -0
- package/dist/MorphologicalAnalysis/FsmParse.js +684 -0
- package/dist/MorphologicalAnalysis/FsmParse.js.map +1 -0
- package/dist/MorphologicalAnalysis/FsmParseList.d.ts +96 -0
- package/dist/MorphologicalAnalysis/FsmParseList.js +242 -0
- package/dist/MorphologicalAnalysis/FsmParseList.js.map +1 -0
- package/dist/MorphologicalAnalysis/InflectionalGroup.d.ts +77 -0
- package/dist/MorphologicalAnalysis/InflectionalGroup.js +213 -0
- package/dist/MorphologicalAnalysis/InflectionalGroup.js.map +1 -0
- package/dist/MorphologicalAnalysis/MetamorphicParse.d.ts +63 -0
- package/dist/MorphologicalAnalysis/MetamorphicParse.js +592 -0
- package/dist/MorphologicalAnalysis/MetamorphicParse.js.map +1 -0
- package/dist/MorphologicalAnalysis/MorphologicalParse.d.ts +301 -0
- package/dist/MorphologicalAnalysis/MorphologicalParse.js +969 -0
- package/dist/MorphologicalAnalysis/MorphologicalParse.js.map +1 -0
- package/dist/MorphologicalAnalysis/MorphologicalTag.d.ts +510 -0
- package/dist/MorphologicalAnalysis/MorphologicalTag.js +525 -0
- package/dist/MorphologicalAnalysis/MorphologicalTag.js.map +1 -0
- package/dist/MorphologicalAnalysis/State.d.ts +40 -0
- package/dist/MorphologicalAnalysis/State.js +64 -0
- package/dist/MorphologicalAnalysis/State.js.map +1 -0
- package/dist/MorphologicalAnalysis/Transition.d.ts +159 -0
- package/dist/MorphologicalAnalysis/Transition.js +751 -0
- package/dist/MorphologicalAnalysis/Transition.js.map +1 -0
- package/index.js +12 -0
- package/package.json +30 -0
- package/penntreebank.txt +208431 -0
- package/source/Corpus/DisambiguatedWord.ts +29 -0
- package/source/Corpus/DisambiguationCorpus.ts +39 -0
- package/source/MorphologicalAnalysis/FiniteStateMachine.ts +165 -0
- package/source/MorphologicalAnalysis/FsmMorphologicalAnalyzer.ts +1256 -0
- package/source/MorphologicalAnalysis/FsmParse.ts +664 -0
- package/source/MorphologicalAnalysis/FsmParseList.ts +238 -0
- package/source/MorphologicalAnalysis/InflectionalGroup.ts +210 -0
- package/source/MorphologicalAnalysis/MetamorphicParse.ts +589 -0
- package/source/MorphologicalAnalysis/MorphologicalParse.ts +995 -0
- package/source/MorphologicalAnalysis/MorphologicalTag.ts +510 -0
- package/source/MorphologicalAnalysis/State.ts +59 -0
- package/source/MorphologicalAnalysis/Transition.ts +733 -0
- package/source/tsconfig.json +13 -0
- package/tests/DisambiguationCorpusTest.ts +12 -0
- package/tests/FiniteStateMachineTest.ts +87 -0
- package/tests/FsmMorphologicalAnalyzerTest.ts +204 -0
- package/tests/FsmParseListTest.ts +90 -0
- package/tests/FsmParseTest.ts +66 -0
- package/tests/InflectionalGroupTest.ts +84 -0
- package/tests/MorphologicalParseTest.ts +152 -0
- package/tests/TransitionTest.ts +174 -0
- package/tsconfig.json +15 -0
- package/turkish_dictionary.txt +62120 -0
- package/turkish_finite_state_machine.xml +1887 -0
- package/turkish_misspellings.txt +148932 -0
|
@@ -0,0 +1,1255 @@
|
|
|
1
|
+
(function (factory) {
|
|
2
|
+
if (typeof module === "object" && typeof module.exports === "object") {
|
|
3
|
+
var v = factory(require, exports);
|
|
4
|
+
if (v !== undefined) module.exports = v;
|
|
5
|
+
}
|
|
6
|
+
else if (typeof define === "function" && define.amd) {
|
|
7
|
+
define(["require", "exports", "./FiniteStateMachine", "nlptoolkit-dictionary/dist/Dictionary/TxtDictionary", "nlptoolkit-datastructure/dist/LRUCache", "./FsmParseList", "nlptoolkit-dictionary/dist/Dictionary/WordComparator", "fs", "./Transition", "./MorphologicalTag", "nlptoolkit-dictionary/dist/Dictionary/TxtWord", "./FsmParse", "nlptoolkit-corpus/dist/Sentence", "nlptoolkit-dictionary/dist/Dictionary/Word", "./State"], factory);
|
|
8
|
+
}
|
|
9
|
+
})(function (require, exports) {
|
|
10
|
+
"use strict";
|
|
11
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
12
|
+
exports.FsmMorphologicalAnalyzer = void 0;
|
|
13
|
+
const FiniteStateMachine_1 = require("./FiniteStateMachine");
|
|
14
|
+
const TxtDictionary_1 = require("nlptoolkit-dictionary/dist/Dictionary/TxtDictionary");
|
|
15
|
+
const LRUCache_1 = require("nlptoolkit-datastructure/dist/LRUCache");
|
|
16
|
+
const FsmParseList_1 = require("./FsmParseList");
|
|
17
|
+
const WordComparator_1 = require("nlptoolkit-dictionary/dist/Dictionary/WordComparator");
|
|
18
|
+
const fs = require("fs");
|
|
19
|
+
const Transition_1 = require("./Transition");
|
|
20
|
+
const MorphologicalTag_1 = require("./MorphologicalTag");
|
|
21
|
+
const TxtWord_1 = require("nlptoolkit-dictionary/dist/Dictionary/TxtWord");
|
|
22
|
+
const FsmParse_1 = require("./FsmParse");
|
|
23
|
+
const Sentence_1 = require("nlptoolkit-corpus/dist/Sentence");
|
|
24
|
+
const Word_1 = require("nlptoolkit-dictionary/dist/Dictionary/Word");
|
|
25
|
+
const State_1 = require("./State");
|
|
26
|
+
class FsmMorphologicalAnalyzer {
|
|
27
|
+
/**
|
|
28
|
+
* Another constructor of FsmMorphologicalAnalyzer class. It generates a new TxtDictionary type dictionary from
|
|
29
|
+
* given input dictionary, with given inputs fileName and cacheSize.
|
|
30
|
+
*
|
|
31
|
+
* @param fileName the file to read the finite state machine.
|
|
32
|
+
* @param dictionaryFileNameOrDictionary the dictionary file that will be used to generate dictionaryTrie.
|
|
33
|
+
* @param cacheSize the size of the LRUCache.
|
|
34
|
+
*/
|
|
35
|
+
constructor(fileName, dictionaryFileNameOrDictionary, cacheSize) {
|
|
36
|
+
this.parsedSurfaceForms = undefined;
|
|
37
|
+
this.cache = undefined;
|
|
38
|
+
this.mostUsedPatterns = new Map();
|
|
39
|
+
if (dictionaryFileNameOrDictionary == undefined) {
|
|
40
|
+
this.dictionary = new TxtDictionary_1.TxtDictionary();
|
|
41
|
+
}
|
|
42
|
+
else {
|
|
43
|
+
if (dictionaryFileNameOrDictionary instanceof TxtDictionary_1.TxtDictionary) {
|
|
44
|
+
this.dictionary = dictionaryFileNameOrDictionary;
|
|
45
|
+
}
|
|
46
|
+
else {
|
|
47
|
+
this.dictionary = new TxtDictionary_1.TxtDictionary(WordComparator_1.WordComparator.TURKISH, dictionaryFileNameOrDictionary);
|
|
48
|
+
}
|
|
49
|
+
}
|
|
50
|
+
if (fileName == undefined) {
|
|
51
|
+
this.finiteStateMachine = new FiniteStateMachine_1.FiniteStateMachine();
|
|
52
|
+
}
|
|
53
|
+
else {
|
|
54
|
+
this.finiteStateMachine = new FiniteStateMachine_1.FiniteStateMachine(fileName);
|
|
55
|
+
}
|
|
56
|
+
this.dictionaryTrie = this.dictionary.prepareTrie();
|
|
57
|
+
if (cacheSize > 0) {
|
|
58
|
+
this.cache = new LRUCache_1.LRUCache(cacheSize);
|
|
59
|
+
}
|
|
60
|
+
}
|
|
61
|
+
addParsedSurfaceForms(fileName) {
|
|
62
|
+
this.parsedSurfaceForms = new Map();
|
|
63
|
+
let data = fs.readFileSync(fileName, 'utf8');
|
|
64
|
+
let lines = data.split("\n");
|
|
65
|
+
for (let line of lines) {
|
|
66
|
+
let items = line.split(" ");
|
|
67
|
+
this.parsedSurfaceForms.set(items[0], items[1]);
|
|
68
|
+
}
|
|
69
|
+
}
|
|
70
|
+
/**
|
|
71
|
+
* The getPossibleWords method takes {@link MorphologicalParse} and {@link MetamorphicParse} as input.
|
|
72
|
+
* First it determines whether the given morphologicalParse is the root verb and whether it contains a verb tag.
|
|
73
|
+
* Then it creates new transition with -mak and creates a new {@link Set} result.
|
|
74
|
+
* <p>
|
|
75
|
+
* It takes the given {@link MetamorphicParse} input as currentWord and if there is a compound word starting with the
|
|
76
|
+
* currentWord, it gets this compoundWord from dictionaryTrie. If there is a compoundWord and the difference of the
|
|
77
|
+
* currentWord and compundWords is less than 3 than compoundWord is added to the result, otherwise currentWord is added.
|
|
78
|
+
* <p>
|
|
79
|
+
* Then it gets the root from parse input as a currentRoot. If it is not null, and morphologicalParse input is verb,
|
|
80
|
+
* it directly adds the verb to result after making transition to currentRoot with currentWord String. Else, it creates a new
|
|
81
|
+
* transition with -lar and make this transition then adds to the result.
|
|
82
|
+
*
|
|
83
|
+
* @param morphologicalParse {@link MorphologicalParse} type input.
|
|
84
|
+
* @param metamorphicParse {@link MetamorphicParse} type input.
|
|
85
|
+
* @return {@link HashSet} result.
|
|
86
|
+
*/
|
|
87
|
+
getPossibleWords(morphologicalParse, metamorphicParse) {
|
|
88
|
+
let isRootVerb = morphologicalParse.getRootPos() == "VERB";
|
|
89
|
+
let containsVerb = morphologicalParse.containsTag(MorphologicalTag_1.MorphologicalTag.VERB);
|
|
90
|
+
let verbTransition = new Transition_1.Transition("mAk");
|
|
91
|
+
let result = new Set();
|
|
92
|
+
if (metamorphicParse == undefined || metamorphicParse.getWord() == undefined) {
|
|
93
|
+
return result;
|
|
94
|
+
}
|
|
95
|
+
let currentWord = metamorphicParse.getWord().getName();
|
|
96
|
+
let pluralIndex = -1;
|
|
97
|
+
let compoundWord = this.dictionaryTrie.getCompoundWordStartingWith(currentWord);
|
|
98
|
+
if (!isRootVerb) {
|
|
99
|
+
if (compoundWord != null && compoundWord.getName().length - currentWord.length < 3) {
|
|
100
|
+
result.add(compoundWord.getName());
|
|
101
|
+
}
|
|
102
|
+
result.add(currentWord);
|
|
103
|
+
}
|
|
104
|
+
let currentRoot = this.dictionary.getWord(metamorphicParse.getWord().getName());
|
|
105
|
+
if (currentRoot == undefined && compoundWord != undefined) {
|
|
106
|
+
currentRoot = compoundWord;
|
|
107
|
+
}
|
|
108
|
+
if (currentRoot != undefined) {
|
|
109
|
+
if (isRootVerb) {
|
|
110
|
+
let verbWord = verbTransition.makeTransition(currentRoot, currentWord);
|
|
111
|
+
result.add(verbWord);
|
|
112
|
+
}
|
|
113
|
+
let pluralWord = undefined;
|
|
114
|
+
for (let i = 1; i < metamorphicParse.size(); i++) {
|
|
115
|
+
let transition = new Transition_1.Transition(metamorphicParse.getMetaMorpheme(i), undefined, undefined);
|
|
116
|
+
if (metamorphicParse.getMetaMorpheme(i) == "lAr") {
|
|
117
|
+
pluralWord = currentWord;
|
|
118
|
+
pluralIndex = i + 1;
|
|
119
|
+
}
|
|
120
|
+
currentWord = transition.makeTransition(currentRoot, currentWord);
|
|
121
|
+
result.add(currentWord);
|
|
122
|
+
if (containsVerb) {
|
|
123
|
+
let verbWord = verbTransition.makeTransition(currentRoot, currentWord);
|
|
124
|
+
result.add(verbWord);
|
|
125
|
+
}
|
|
126
|
+
}
|
|
127
|
+
if (pluralWord != null) {
|
|
128
|
+
currentWord = pluralWord;
|
|
129
|
+
for (let i = pluralIndex; i < metamorphicParse.size(); i++) {
|
|
130
|
+
let transition = new Transition_1.Transition(metamorphicParse.getMetaMorpheme(i), undefined, undefined);
|
|
131
|
+
currentWord = transition.makeTransition(currentRoot, currentWord);
|
|
132
|
+
result.add(currentWord);
|
|
133
|
+
if (containsVerb) {
|
|
134
|
+
let verbWord = verbTransition.makeTransition(currentRoot, currentWord);
|
|
135
|
+
result.add(verbWord);
|
|
136
|
+
}
|
|
137
|
+
}
|
|
138
|
+
}
|
|
139
|
+
}
|
|
140
|
+
return result;
|
|
141
|
+
}
|
|
142
|
+
/**
|
|
143
|
+
* The getDictionary method is used to get TxtDictionary.
|
|
144
|
+
*
|
|
145
|
+
* @return TxtDictionary type dictionary.
|
|
146
|
+
*/
|
|
147
|
+
getDictionary() {
|
|
148
|
+
return this.dictionary;
|
|
149
|
+
}
|
|
150
|
+
/**
|
|
151
|
+
* The getFiniteStateMachine method is used to get FiniteStateMachine.
|
|
152
|
+
*
|
|
153
|
+
* @return FiniteStateMachine type finiteStateMachine.
|
|
154
|
+
*/
|
|
155
|
+
getFiniteStateMachine() {
|
|
156
|
+
return this.finiteStateMachine;
|
|
157
|
+
}
|
|
158
|
+
/**
|
|
159
|
+
* The isPossibleSubstring method first checks whether given short and long strings are equal to root word.
|
|
160
|
+
* Then, compares both short and long strings' chars till the last two chars of short string. In the presence of mismatch,
|
|
161
|
+
* false is returned. On the other hand, it counts the distance between two strings until it becomes greater than 2,
|
|
162
|
+
* which is the MAX_DISTANCE also finds the index of the last char.
|
|
163
|
+
* <p>
|
|
164
|
+
* If the substring is a rootWord and equals to 'ben', which is a special case or root holds the lastIdropsDuringSuffixation or
|
|
165
|
+
* lastIdropsDuringPassiveSuffixation conditions, then it returns true if distance is not greater than MAX_DISTANCE.
|
|
166
|
+
* <p>
|
|
167
|
+
* On the other hand, if the shortStrong ends with one of these chars 'e, a, p, ç, t, k' and 't 's a rootWord with
|
|
168
|
+
* the conditions of rootSoftenDuringSuffixation, vowelEChangesToIDuringYSuffixation, vowelAChangesToIDuringYSuffixation
|
|
169
|
+
* or endingKChangesIntoG then it returns true if the last index is not equal to 2 and distance is not greater than
|
|
170
|
+
* MAX_DISTANCE and false otherwise.
|
|
171
|
+
*
|
|
172
|
+
* @param shortString the possible substring.
|
|
173
|
+
* @param longString the long string to compare with substring.
|
|
174
|
+
* @param root the root of the long string.
|
|
175
|
+
* @return true if given substring is the actual substring of the longString, false otherwise.
|
|
176
|
+
*/
|
|
177
|
+
isPossibleSubstring(shortString, longString, root) {
|
|
178
|
+
let rootWord = ((shortString == root.getName()) || longString == root.getName());
|
|
179
|
+
let distance = 0, last = 1;
|
|
180
|
+
for (let j = 0; j < shortString.length; j++) {
|
|
181
|
+
if (shortString.charAt(j) != longString.charAt(j)) {
|
|
182
|
+
if (j < shortString.length - 2) {
|
|
183
|
+
return false;
|
|
184
|
+
}
|
|
185
|
+
last = shortString.length - j;
|
|
186
|
+
distance++;
|
|
187
|
+
if (distance > FsmMorphologicalAnalyzer.MAX_DISTANCE) {
|
|
188
|
+
break;
|
|
189
|
+
}
|
|
190
|
+
}
|
|
191
|
+
}
|
|
192
|
+
if (rootWord && (root.getName() == "ben" || root.getName() == "sen" ||
|
|
193
|
+
root.lastIdropsDuringSuffixation() || root.lastIdropsDuringPassiveSuffixation())) {
|
|
194
|
+
return (distance <= FsmMorphologicalAnalyzer.MAX_DISTANCE);
|
|
195
|
+
}
|
|
196
|
+
else {
|
|
197
|
+
if (shortString.endsWith("e") || shortString.endsWith("a") || shortString.endsWith("p") ||
|
|
198
|
+
shortString.endsWith("ç") || shortString.endsWith("t") || shortString.endsWith("k") ||
|
|
199
|
+
(rootWord && (root.rootSoftenDuringSuffixation() || root.vowelEChangesToIDuringYSuffixation() ||
|
|
200
|
+
root.vowelAChangesToIDuringYSuffixation() || root.endingKChangesIntoG()))) {
|
|
201
|
+
return (last != 2 && distance <= FsmMorphologicalAnalyzer.MAX_DISTANCE - 1);
|
|
202
|
+
}
|
|
203
|
+
else {
|
|
204
|
+
return (distance <= FsmMorphologicalAnalyzer.MAX_DISTANCE - 2);
|
|
205
|
+
}
|
|
206
|
+
}
|
|
207
|
+
}
|
|
208
|
+
/**
|
|
209
|
+
* The initializeParseList method initializes the given given fsm ArrayList with given root words by parsing them.
|
|
210
|
+
* <p>
|
|
211
|
+
* It checks many conditions;
|
|
212
|
+
* isPlural; if root holds the condition then it gets the state with the name of NominalRootPlural, then
|
|
213
|
+
* creates a new parsing and adds this to the input fsmParse Arraylist.
|
|
214
|
+
* Ex : Açıktohumlular
|
|
215
|
+
* <p>
|
|
216
|
+
* !isPlural and isPortmanteauEndingWithSI, if root holds the conditions then it gets the state with the
|
|
217
|
+
* name of NominalRootNoPossesive.
|
|
218
|
+
* Ex : Balarısı
|
|
219
|
+
* <p>
|
|
220
|
+
* !isPlural and isPortmanteau, if root holds the conditions then it gets the state with the name of
|
|
221
|
+
* CompoundNounRoot.
|
|
222
|
+
* Ex : Aslanağızı
|
|
223
|
+
* <p>
|
|
224
|
+
* !isPlural, !isPortmanteau and isHeader, if root holds the conditions then it gets the state with the
|
|
225
|
+
* name of HeaderRoot.
|
|
226
|
+
* Ex : </title>
|
|
227
|
+
* <p>
|
|
228
|
+
* !isPlural, !isPortmanteau and isInterjection, if root holds the conditions then it gets the state
|
|
229
|
+
* with the name of InterjectionRoot.
|
|
230
|
+
* Ex : Hey, Aa
|
|
231
|
+
* <p>
|
|
232
|
+
* !isPlural, !isPortmanteau and isDuplicate, if root holds the conditions then it gets the state
|
|
233
|
+
* with the name of DuplicateRoot.
|
|
234
|
+
* Ex : Allak,
|
|
235
|
+
* <p>
|
|
236
|
+
* !isPlural, !isPortmanteau and isNumeral, if root holds the conditions then it gets the state
|
|
237
|
+
* with the name of CardinalRoot.
|
|
238
|
+
* Ex : Yüz, bin
|
|
239
|
+
* <p>
|
|
240
|
+
* !isPlural, !isPortmanteau and isReal, if root holds the conditions then it gets the state
|
|
241
|
+
* with the name of RealRoot.
|
|
242
|
+
* Ex : 1.2
|
|
243
|
+
* <p>
|
|
244
|
+
* !isPlural, !isPortmanteau and isFraction, if root holds the conditions then it gets the state
|
|
245
|
+
* with the name of FractionRoot.
|
|
246
|
+
* Ex : 1/2
|
|
247
|
+
* <p>
|
|
248
|
+
* !isPlural, !isPortmanteau and isDate, if root holds the conditions then it gets the state
|
|
249
|
+
* with the name of DateRoot.
|
|
250
|
+
* Ex : 11/06/2018
|
|
251
|
+
* <p>
|
|
252
|
+
* !isPlural, !isPortmanteau and isPercent, if root holds the conditions then it gets the state
|
|
253
|
+
* with the name of PercentRoot.
|
|
254
|
+
* Ex : %12.5
|
|
255
|
+
* <p>
|
|
256
|
+
* !isPlural, !isPortmanteau and isRange, if root holds the conditions then it gets the state
|
|
257
|
+
* with the name of RangeRoot.
|
|
258
|
+
* Ex : 3-5
|
|
259
|
+
* <p>
|
|
260
|
+
* !isPlural, !isPortmanteau and isTime, if root holds the conditions then it gets the state
|
|
261
|
+
* with the name of TimeRoot.
|
|
262
|
+
* Ex : 13:16:08
|
|
263
|
+
* <p>
|
|
264
|
+
* !isPlural, !isPortmanteau and isOrdinal, if root holds the conditions then it gets the state
|
|
265
|
+
* with the name of OrdinalRoot.
|
|
266
|
+
* Ex : Altıncı
|
|
267
|
+
* <p>
|
|
268
|
+
* !isPlural, !isPortmanteau, and isVerb if root holds the conditions then it gets the state
|
|
269
|
+
* with the name of VerbalRoot. Or isPassive, then it gets the state with the name of PassiveHn.
|
|
270
|
+
* Ex : Anla (!isPAssive)
|
|
271
|
+
* Ex : Çağrıl (isPassive)
|
|
272
|
+
* <p>
|
|
273
|
+
* !isPlural, !isPortmanteau and isPronoun, if root holds the conditions then it gets the state
|
|
274
|
+
* with the name of PronounRoot. There are 6 different Pronoun state names, REFLEX, QUANT, QUANTPLURAL, DEMONS, PERS, QUES.
|
|
275
|
+
* REFLEX = Reflexive Pronouns Ex : kendi
|
|
276
|
+
* QUANT = Quantitative Pronouns Ex : öbür, hep, kimse, hiçbiri, bazı, kimi, biri
|
|
277
|
+
* QUANTPLURAL = Quantitative Plural Pronouns Ex : tümü, çoğu, hepsi
|
|
278
|
+
* DEMONS = Demonstrative Pronouns Ex : o, bu, şu
|
|
279
|
+
* PERS = Personal Pronouns Ex : ben, sen, o, biz, siz, onlar
|
|
280
|
+
* QUES = Interrogatıve Pronouns Ex : nere, ne, kim, hangi
|
|
281
|
+
* <p>
|
|
282
|
+
* !isPlural, !isPortmanteau and isAdjective, if root holds the conditions then it gets the state
|
|
283
|
+
* with the name of AdjectiveRoot.
|
|
284
|
+
* Ex : Absürt, Abes
|
|
285
|
+
* <p>
|
|
286
|
+
* !isPlural, !isPortmanteau and isPureAdjective, if root holds the conditions then it gets the state
|
|
287
|
+
* with the name of Adjective.
|
|
288
|
+
* Ex : Geçmiş, Cam
|
|
289
|
+
* <p>
|
|
290
|
+
* !isPlural, !isPortmanteau and isNominal, if root holds the conditions then it gets the state
|
|
291
|
+
* with the name of NominalRoot.
|
|
292
|
+
* Ex : Görüş
|
|
293
|
+
* <p>
|
|
294
|
+
* !isPlural, !isPortmanteau and isProper, if root holds the conditions then it gets the state
|
|
295
|
+
* with the name of ProperRoot.
|
|
296
|
+
* Ex : Abdi
|
|
297
|
+
* <p>
|
|
298
|
+
* !isPlural, !isPortmanteau and isQuestion, if root holds the conditions then it gets the state
|
|
299
|
+
* with the name of QuestionRoot.
|
|
300
|
+
* Ex : Mi, mü
|
|
301
|
+
* <p>
|
|
302
|
+
* !isPlural, !isPortmanteau and isDeterminer, if root holds the conditions then it gets the state
|
|
303
|
+
* with the name of DeterminerRoot.
|
|
304
|
+
* Ex : Çok, bir
|
|
305
|
+
* <p>
|
|
306
|
+
* !isPlural, !isPortmanteau and isConjunction, if root holds the conditions then it gets the state
|
|
307
|
+
* with the name of ConjunctionRoot.
|
|
308
|
+
* Ex : Ama , ancak
|
|
309
|
+
* <p>
|
|
310
|
+
* !isPlural, !isPortmanteau and isPostP, if root holds the conditions then it gets the state
|
|
311
|
+
* with the name of PostP.
|
|
312
|
+
* Ex : Ait, dair
|
|
313
|
+
* <p>
|
|
314
|
+
* !isPlural, !isPortmanteau and isAdverb, if root holds the conditions then it gets the state
|
|
315
|
+
* with the name of AdverbRoot.
|
|
316
|
+
* Ex : Acilen
|
|
317
|
+
*
|
|
318
|
+
* @param fsmParse ArrayList to initialize.
|
|
319
|
+
* @param root word to check properties and add to fsmParse according to them.
|
|
320
|
+
* @param isProper is used to check a word is proper or not.
|
|
321
|
+
*/
|
|
322
|
+
initializeParseList(fsmParse, root, isProper) {
|
|
323
|
+
let currentFsmParse;
|
|
324
|
+
if (root.isPlural()) {
|
|
325
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("NominalRootPlural"));
|
|
326
|
+
fsmParse.push(currentFsmParse);
|
|
327
|
+
}
|
|
328
|
+
else {
|
|
329
|
+
if (root.isPortmanteauEndingWithSI()) {
|
|
330
|
+
currentFsmParse = new FsmParse_1.FsmParse(root.getName().substring(0, root.getName().length - 2), this.finiteStateMachine.getState("CompoundNounRoot"));
|
|
331
|
+
fsmParse.push(currentFsmParse);
|
|
332
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("NominalRootNoPossesive"));
|
|
333
|
+
fsmParse.push(currentFsmParse);
|
|
334
|
+
}
|
|
335
|
+
else {
|
|
336
|
+
if (root.isPortmanteau()) {
|
|
337
|
+
if (root.isPortmanteauFacedVowelEllipsis()) {
|
|
338
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("NominalRootNoPossesive"));
|
|
339
|
+
fsmParse.push(currentFsmParse);
|
|
340
|
+
currentFsmParse = new FsmParse_1.FsmParse(root.getName().substring(0, root.getName().length - 2) + root.getName().charAt(root.getName().length - 1) + root.getName().charAt(root.getName().length - 2), this.finiteStateMachine.getState("CompoundNounRoot"));
|
|
341
|
+
}
|
|
342
|
+
else {
|
|
343
|
+
if (root.isPortmanteauFacedSoftening()) {
|
|
344
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("NominalRootNoPossesive"));
|
|
345
|
+
fsmParse.push(currentFsmParse);
|
|
346
|
+
switch (root.getName().charAt(root.getName().length - 2)) {
|
|
347
|
+
case 'b':
|
|
348
|
+
currentFsmParse = new FsmParse_1.FsmParse(root.getName().substring(0, root.getName().length - 2) + 'p', this.finiteStateMachine.getState("CompoundNounRoot"));
|
|
349
|
+
break;
|
|
350
|
+
case 'c':
|
|
351
|
+
currentFsmParse = new FsmParse_1.FsmParse(root.getName().substring(0, root.getName().length - 2) + 'ç', this.finiteStateMachine.getState("CompoundNounRoot"));
|
|
352
|
+
break;
|
|
353
|
+
case 'd':
|
|
354
|
+
currentFsmParse = new FsmParse_1.FsmParse(root.getName().substring(0, root.getName().length - 2) + 't', this.finiteStateMachine.getState("CompoundNounRoot"));
|
|
355
|
+
break;
|
|
356
|
+
case 'ğ':
|
|
357
|
+
currentFsmParse = new FsmParse_1.FsmParse(root.getName().substring(0, root.getName().length - 2) + 'k', this.finiteStateMachine.getState("CompoundNounRoot"));
|
|
358
|
+
break;
|
|
359
|
+
default:
|
|
360
|
+
currentFsmParse = new FsmParse_1.FsmParse(root.getName().substring(0, root.getName().length - 1), this.finiteStateMachine.getState("CompoundNounRoot"));
|
|
361
|
+
}
|
|
362
|
+
}
|
|
363
|
+
else {
|
|
364
|
+
currentFsmParse = new FsmParse_1.FsmParse(root.getName().substring(0, root.getName().length - 1), this.finiteStateMachine.getState("CompoundNounRoot"));
|
|
365
|
+
}
|
|
366
|
+
}
|
|
367
|
+
fsmParse.push(currentFsmParse);
|
|
368
|
+
}
|
|
369
|
+
else {
|
|
370
|
+
if (root.isHeader()) {
|
|
371
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("HeaderRoot"));
|
|
372
|
+
fsmParse.push(currentFsmParse);
|
|
373
|
+
}
|
|
374
|
+
if (root.isInterjection()) {
|
|
375
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("InterjectionRoot"));
|
|
376
|
+
fsmParse.push(currentFsmParse);
|
|
377
|
+
}
|
|
378
|
+
if (root.isDuplicate()) {
|
|
379
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("DuplicateRoot"));
|
|
380
|
+
fsmParse.push(currentFsmParse);
|
|
381
|
+
}
|
|
382
|
+
if (root.isNumeral()) {
|
|
383
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("CardinalRoot"));
|
|
384
|
+
fsmParse.push(currentFsmParse);
|
|
385
|
+
}
|
|
386
|
+
if (root.isReal()) {
|
|
387
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("RealRoot"));
|
|
388
|
+
fsmParse.push(currentFsmParse);
|
|
389
|
+
}
|
|
390
|
+
if (root.isFraction()) {
|
|
391
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("FractionRoot"));
|
|
392
|
+
fsmParse.push(currentFsmParse);
|
|
393
|
+
}
|
|
394
|
+
if (root.isDate()) {
|
|
395
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("DateRoot"));
|
|
396
|
+
fsmParse.push(currentFsmParse);
|
|
397
|
+
}
|
|
398
|
+
if (root.isPercent()) {
|
|
399
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("PercentRoot"));
|
|
400
|
+
fsmParse.push(currentFsmParse);
|
|
401
|
+
}
|
|
402
|
+
if (root.isRange()) {
|
|
403
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("RangeRoot"));
|
|
404
|
+
fsmParse.push(currentFsmParse);
|
|
405
|
+
}
|
|
406
|
+
if (root.isTime()) {
|
|
407
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("TimeRoot"));
|
|
408
|
+
fsmParse.push(currentFsmParse);
|
|
409
|
+
}
|
|
410
|
+
if (root.isOrdinal()) {
|
|
411
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("OrdinalRoot"));
|
|
412
|
+
fsmParse.push(currentFsmParse);
|
|
413
|
+
}
|
|
414
|
+
if (root.isVerb() || root.isPassive()) {
|
|
415
|
+
if (root.verbType() != "") {
|
|
416
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("VerbalRoot(" + root.verbType() + ")"));
|
|
417
|
+
}
|
|
418
|
+
else {
|
|
419
|
+
if (!root.isPassive()) {
|
|
420
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("VerbalRoot"));
|
|
421
|
+
}
|
|
422
|
+
else {
|
|
423
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("PassiveHn"));
|
|
424
|
+
}
|
|
425
|
+
}
|
|
426
|
+
fsmParse.push(currentFsmParse);
|
|
427
|
+
}
|
|
428
|
+
if (root.isPronoun()) {
|
|
429
|
+
if (root.getName() == "kendi") {
|
|
430
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("PronounRoot(REFLEX)"));
|
|
431
|
+
fsmParse.push(currentFsmParse);
|
|
432
|
+
}
|
|
433
|
+
if (root.getName() == "öbür" || root.getName() == "öteki" || root.getName() == "hep" || root.getName() == "kimse" || root.getName() == "diğeri" || root.getName() == "hiçbiri" || root.getName() == "böylesi" || root.getName() == "birbiri" || root.getName() == "birbirleri" || root.getName() == "biri" || root.getName() == "başkası" || root.getName() == "bazı" || root.getName() == "kimi") {
|
|
434
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("PronounRoot(QUANT)"));
|
|
435
|
+
fsmParse.push(currentFsmParse);
|
|
436
|
+
}
|
|
437
|
+
if (root.getName() == "tümü" || root.getName() == "topu" || root.getName() == "herkes" || root.getName() == "cümlesi" || root.getName() == "çoğu" || root.getName() == "birçoğu" || root.getName() == "birkaçı" || root.getName() == "birçokları" || root.getName() == "hepsi") {
|
|
438
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("PronounRoot(QUANTPLURAL)"));
|
|
439
|
+
fsmParse.push(currentFsmParse);
|
|
440
|
+
}
|
|
441
|
+
if (root.getName() == "o" || root.getName() == "bu" || root.getName() == "şu") {
|
|
442
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("PronounRoot(DEMONS)"));
|
|
443
|
+
fsmParse.push(currentFsmParse);
|
|
444
|
+
}
|
|
445
|
+
if (root.getName() == "ben" || root.getName() == "sen" || root.getName() == "o" || root.getName() == "biz" || root.getName() == "siz" || root.getName() == "onlar") {
|
|
446
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("PronounRoot(PERS)"));
|
|
447
|
+
fsmParse.push(currentFsmParse);
|
|
448
|
+
}
|
|
449
|
+
if (root.getName() == "nere" || root.getName() == "ne" || root.getName() == "kaçı" || root.getName() == "kim" || root.getName() == "hangi") {
|
|
450
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("PronounRoot(QUES)"));
|
|
451
|
+
fsmParse.push(currentFsmParse);
|
|
452
|
+
}
|
|
453
|
+
}
|
|
454
|
+
if (root.isAdjective()) {
|
|
455
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("AdjectiveRoot"));
|
|
456
|
+
fsmParse.push(currentFsmParse);
|
|
457
|
+
}
|
|
458
|
+
if (root.isPureAdjective()) {
|
|
459
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("Adjective"));
|
|
460
|
+
fsmParse.push(currentFsmParse);
|
|
461
|
+
}
|
|
462
|
+
if (root.isNominal()) {
|
|
463
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("NominalRoot"));
|
|
464
|
+
fsmParse.push(currentFsmParse);
|
|
465
|
+
}
|
|
466
|
+
if (root.isAbbreviation()) {
|
|
467
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("NominalRoot"));
|
|
468
|
+
fsmParse.push(currentFsmParse);
|
|
469
|
+
}
|
|
470
|
+
if (root.isProperNoun() && isProper) {
|
|
471
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("ProperRoot"));
|
|
472
|
+
fsmParse.push(currentFsmParse);
|
|
473
|
+
}
|
|
474
|
+
if (root.isQuestion()) {
|
|
475
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("QuestionRoot"));
|
|
476
|
+
fsmParse.push(currentFsmParse);
|
|
477
|
+
}
|
|
478
|
+
if (root.isDeterminer()) {
|
|
479
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("DeterminerRoot"));
|
|
480
|
+
fsmParse.push(currentFsmParse);
|
|
481
|
+
}
|
|
482
|
+
if (root.isConjunction()) {
|
|
483
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("ConjunctionRoot"));
|
|
484
|
+
fsmParse.push(currentFsmParse);
|
|
485
|
+
}
|
|
486
|
+
if (root.isPostP()) {
|
|
487
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("PostP"));
|
|
488
|
+
fsmParse.push(currentFsmParse);
|
|
489
|
+
}
|
|
490
|
+
if (root.isAdverb()) {
|
|
491
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("AdverbRoot"));
|
|
492
|
+
fsmParse.push(currentFsmParse);
|
|
493
|
+
}
|
|
494
|
+
}
|
|
495
|
+
}
|
|
496
|
+
}
|
|
497
|
+
}
|
|
498
|
+
/**
|
|
499
|
+
* The initializeParseListFromRoot method is used to create an {@link Array} which consists of initial fsm parsings.
|
|
500
|
+
* First, traverses this HashSet and uses each word as a root and calls initializeParseList method with this root
|
|
501
|
+
* and Array.
|
|
502
|
+
* <p>
|
|
503
|
+
*
|
|
504
|
+
* @param parseList ArrayList to initialize.
|
|
505
|
+
* @param root the root form to generate initial parse list.
|
|
506
|
+
* @param isProper is used to check a word is proper or not.
|
|
507
|
+
*/
|
|
508
|
+
initializeParseListFromRoot(parseList, root, isProper) {
|
|
509
|
+
this.initializeParseList(parseList, root, isProper);
|
|
510
|
+
if (root.obeysAndNotObeysVowelHarmonyDuringAgglutination()) {
|
|
511
|
+
let newRoot = root.clone();
|
|
512
|
+
newRoot.removeFlag("IS_UU");
|
|
513
|
+
newRoot.removeFlag("IS_UUU");
|
|
514
|
+
this.initializeParseList(parseList, newRoot, isProper);
|
|
515
|
+
}
|
|
516
|
+
if (root.rootSoftenAndNotSoftenDuringSuffixation()) {
|
|
517
|
+
let newRoot = root.clone();
|
|
518
|
+
newRoot.removeFlag("IS_SD");
|
|
519
|
+
newRoot.removeFlag("IS_SDD");
|
|
520
|
+
this.initializeParseList(parseList, newRoot, isProper);
|
|
521
|
+
}
|
|
522
|
+
if (root.lastIDropsAndNotDropDuringSuffixation()) {
|
|
523
|
+
let newRoot = root.clone();
|
|
524
|
+
newRoot.removeFlag("IS_UD");
|
|
525
|
+
newRoot.removeFlag("IS_UDD");
|
|
526
|
+
this.initializeParseList(parseList, newRoot, isProper);
|
|
527
|
+
}
|
|
528
|
+
if (root.duplicatesAndNotDuplicatesDuringSuffixation()) {
|
|
529
|
+
let newRoot = root.clone();
|
|
530
|
+
newRoot.removeFlag("IS_ST");
|
|
531
|
+
newRoot.removeFlag("IS_STT");
|
|
532
|
+
this.initializeParseList(parseList, newRoot, isProper);
|
|
533
|
+
}
|
|
534
|
+
if (root.endingKChangesIntoG() && root.containsFlag("IS_OA")) {
|
|
535
|
+
let newRoot = root.clone();
|
|
536
|
+
newRoot.removeFlag("IS_OA");
|
|
537
|
+
this.initializeParseList(parseList, newRoot, isProper);
|
|
538
|
+
}
|
|
539
|
+
}
|
|
540
|
+
/**
|
|
541
|
+
* The initializeParseListFromSurfaceForm method is used to create an {@link Array} which consists of initial fsm parsings. First,
|
|
542
|
+
* it calls getWordsWithPrefix methods by using input String surfaceForm and generates a {@link Set}. Then, traverses
|
|
543
|
+
* this HashSet and uses each word as a root and calls initializeParseListFromRoot method with this root and ArrayList.
|
|
544
|
+
* <p>
|
|
545
|
+
*
|
|
546
|
+
* @param surfaceForm the String used to generate a HashSet of words.
|
|
547
|
+
* @param isProper is used to check a word is proper or not.
|
|
548
|
+
* @return initialFsmParse ArrayList.
|
|
549
|
+
*/
|
|
550
|
+
initializeParseListFromSurfaceForm(surfaceForm, isProper) {
|
|
551
|
+
let initialFsmParse = new Array();
|
|
552
|
+
if (surfaceForm.length == 0) {
|
|
553
|
+
return initialFsmParse;
|
|
554
|
+
}
|
|
555
|
+
let words = this.dictionaryTrie.getWordsWithPrefix(surfaceForm);
|
|
556
|
+
for (let word of words) {
|
|
557
|
+
let root = word;
|
|
558
|
+
this.initializeParseListFromRoot(initialFsmParse, root, isProper);
|
|
559
|
+
}
|
|
560
|
+
return initialFsmParse;
|
|
561
|
+
}
|
|
562
|
+
/**
|
|
563
|
+
* The addNewParsesFromCurrentParse method initially gets the final suffixes from input currentFsmParse called as currentState,
|
|
564
|
+
* and by using the currentState information it gets the new analysis. Then loops through each currentState's transition.
|
|
565
|
+
* If the currentTransition is possible, it makes the transition.
|
|
566
|
+
*
|
|
567
|
+
* @param currentFsmParse FsmParse type input.
|
|
568
|
+
* @param fsmParse an ArrayList of FsmParse.
|
|
569
|
+
* @param maxLength Maximum length of the parse.
|
|
570
|
+
* @param root TxtWord used to make transition.
|
|
571
|
+
*/
|
|
572
|
+
addNewParsesFromCurrentParseLength(currentFsmParse, fsmParse, maxLength, root) {
|
|
573
|
+
let currentState = currentFsmParse.getFinalSuffix();
|
|
574
|
+
let currentSurfaceForm = currentFsmParse.getSurfaceForm();
|
|
575
|
+
for (let currentTransition of this.finiteStateMachine.getTransitions(currentState)) {
|
|
576
|
+
if (currentTransition.transitionPossibleFromParse(currentFsmParse) && (currentSurfaceForm != root.getName() ||
|
|
577
|
+
(currentSurfaceForm == root.getName() && currentTransition.transitionPossibleFromRoot(root, currentState)))) {
|
|
578
|
+
let tmp = currentTransition.makeTransition(root, currentSurfaceForm, currentFsmParse.getStartState());
|
|
579
|
+
if (tmp.length <= maxLength) {
|
|
580
|
+
let newFsmParse = currentFsmParse.clone();
|
|
581
|
+
newFsmParse.addSuffix(currentTransition.toState(), tmp, currentTransition.getWith(), currentTransition.toString(), currentTransition.toPos());
|
|
582
|
+
newFsmParse.setAgreement(currentTransition.getWith());
|
|
583
|
+
fsmParse.push(newFsmParse);
|
|
584
|
+
}
|
|
585
|
+
}
|
|
586
|
+
}
|
|
587
|
+
}
|
|
588
|
+
/**
|
|
589
|
+
* The addNewParsesFromCurrentParse method initially gets the final suffixes from input currentFsmParse called as currentState,
|
|
590
|
+
* and by using the currentState information it gets the currentSurfaceForm. Then loops through each currentState's transition.
|
|
591
|
+
* If the currentTransition is possible, it makes the transition
|
|
592
|
+
*
|
|
593
|
+
* @param currentFsmParse FsmParse type input.
|
|
594
|
+
* @param fsmParse an ArrayList of FsmParse.
|
|
595
|
+
* @param surfaceForm String to use during transition.
|
|
596
|
+
* @param root TxtWord used to make transition.
|
|
597
|
+
*/
|
|
598
|
+
addNewParsesFromCurrentParseSurfaceForm(currentFsmParse, fsmParse, surfaceForm, root) {
|
|
599
|
+
let currentState = currentFsmParse.getFinalSuffix();
|
|
600
|
+
let currentSurfaceForm = currentFsmParse.getSurfaceForm();
|
|
601
|
+
for (let currentTransition of this.finiteStateMachine.getTransitions(currentState)) {
|
|
602
|
+
if (currentTransition.transitionPossible(currentFsmParse.getSurfaceForm(), surfaceForm) && currentTransition.transitionPossibleFromParse(currentFsmParse) && (currentSurfaceForm != root.getName() || (currentSurfaceForm == root.getName() && currentTransition.transitionPossibleFromRoot(root, currentState)))) {
|
|
603
|
+
let tmp = currentTransition.makeTransition(root, currentSurfaceForm, currentFsmParse.getStartState());
|
|
604
|
+
if ((tmp.length < surfaceForm.length && this.isPossibleSubstring(tmp, surfaceForm, root)) || (tmp.length == surfaceForm.length && (root.lastIdropsDuringSuffixation() || (tmp == surfaceForm)))) {
|
|
605
|
+
let newFsmParse = currentFsmParse.clone();
|
|
606
|
+
newFsmParse.addSuffix(currentTransition.toState(), tmp, currentTransition.getWith(), currentTransition.toString(), currentTransition.toPos());
|
|
607
|
+
newFsmParse.setAgreement(currentTransition.getWith());
|
|
608
|
+
fsmParse.push(newFsmParse);
|
|
609
|
+
}
|
|
610
|
+
}
|
|
611
|
+
}
|
|
612
|
+
}
|
|
613
|
+
/**
|
|
614
|
+
* The parseExists method is used to check the existence of the parse.
|
|
615
|
+
*
|
|
616
|
+
* @param fsmParse an ArrayList of FsmParse
|
|
617
|
+
* @param surfaceForm String to use during transition.
|
|
618
|
+
* @return true when the currentState is end state and input surfaceForm id equal to currentSurfaceForm, otherwise false.
|
|
619
|
+
*/
|
|
620
|
+
parseExists(fsmParse, surfaceForm) {
|
|
621
|
+
while (fsmParse.length > 0) {
|
|
622
|
+
let currentFsmParse = fsmParse[0];
|
|
623
|
+
fsmParse.splice(0, 1);
|
|
624
|
+
let root = currentFsmParse.getWord();
|
|
625
|
+
let currentState = currentFsmParse.getFinalSuffix();
|
|
626
|
+
let currentSurfaceForm = currentFsmParse.getSurfaceForm();
|
|
627
|
+
if (currentState.isEndState() && currentSurfaceForm == surfaceForm) {
|
|
628
|
+
return true;
|
|
629
|
+
}
|
|
630
|
+
this.addNewParsesFromCurrentParseSurfaceForm(currentFsmParse, fsmParse, surfaceForm, root);
|
|
631
|
+
}
|
|
632
|
+
return false;
|
|
633
|
+
}
|
|
634
|
+
/**
|
|
635
|
+
* The parseWord method is used to parse a given fsmParse. It simply adds new parses to the current parse by
|
|
636
|
+
* using addNewParsesFromCurrentParse method.
|
|
637
|
+
*
|
|
638
|
+
* @param fsmParse an ArrayList of FsmParse
|
|
639
|
+
* @param maxLength maximum length of the surfaceform.
|
|
640
|
+
* @return result {@link Array} which has the currentFsmParse.
|
|
641
|
+
*/
|
|
642
|
+
parseWordLength(fsmParse, maxLength) {
|
|
643
|
+
let result = new Array();
|
|
644
|
+
while (fsmParse.length > 0) {
|
|
645
|
+
let currentFsmParse = fsmParse[0];
|
|
646
|
+
fsmParse.splice(0, 1);
|
|
647
|
+
let root = currentFsmParse.getWord();
|
|
648
|
+
let currentState = currentFsmParse.getFinalSuffix();
|
|
649
|
+
let currentSurfaceForm = currentFsmParse.getSurfaceForm();
|
|
650
|
+
if (currentState.isEndState() && currentSurfaceForm.length <= maxLength) {
|
|
651
|
+
let exists = false;
|
|
652
|
+
for (let i = 0; i < result.length; i++) {
|
|
653
|
+
if (currentFsmParse.getSuffixList() == result[i].getSuffixList()) {
|
|
654
|
+
exists = true;
|
|
655
|
+
break;
|
|
656
|
+
}
|
|
657
|
+
}
|
|
658
|
+
if (!exists) {
|
|
659
|
+
result.push(currentFsmParse);
|
|
660
|
+
currentFsmParse.constructInflectionalGroups();
|
|
661
|
+
}
|
|
662
|
+
}
|
|
663
|
+
this.addNewParsesFromCurrentParseLength(currentFsmParse, fsmParse, maxLength, root);
|
|
664
|
+
}
|
|
665
|
+
return result;
|
|
666
|
+
}
|
|
667
|
+
/**
|
|
668
|
+
* The parseWord method is used to parse a given fsmParse. It simply adds new parses to the current parse by
|
|
669
|
+
* using addNewParsesFromCurrentParse method.
|
|
670
|
+
*
|
|
671
|
+
* @param fsmParse an ArrayList of FsmParse
|
|
672
|
+
* @param surfaceForm String to use during transition.
|
|
673
|
+
* @return result {@link Array} which has the currentFsmParse.
|
|
674
|
+
*/
|
|
675
|
+
parseWordSurfaceForm(fsmParse, surfaceForm) {
|
|
676
|
+
let result = new Array();
|
|
677
|
+
while (fsmParse.length > 0) {
|
|
678
|
+
let currentFsmParse = fsmParse[0];
|
|
679
|
+
fsmParse.splice(0, 1);
|
|
680
|
+
let root = currentFsmParse.getWord();
|
|
681
|
+
let currentState = currentFsmParse.getFinalSuffix();
|
|
682
|
+
let currentSurfaceForm = currentFsmParse.getSurfaceForm();
|
|
683
|
+
if (currentState.isEndState() && currentSurfaceForm == surfaceForm) {
|
|
684
|
+
let exists = false;
|
|
685
|
+
for (let i = 0; i < result.length; i++) {
|
|
686
|
+
if (currentFsmParse.getSuffixList() == result[i].getSuffixList()) {
|
|
687
|
+
exists = true;
|
|
688
|
+
break;
|
|
689
|
+
}
|
|
690
|
+
}
|
|
691
|
+
if (!exists) {
|
|
692
|
+
result.push(currentFsmParse);
|
|
693
|
+
currentFsmParse.constructInflectionalGroups();
|
|
694
|
+
}
|
|
695
|
+
}
|
|
696
|
+
this.addNewParsesFromCurrentParseSurfaceForm(currentFsmParse, fsmParse, surfaceForm, root);
|
|
697
|
+
}
|
|
698
|
+
return result;
|
|
699
|
+
}
|
|
700
|
+
/**
|
|
701
|
+
* The morphologicalAnalysis with 3 inputs is used to initialize an {@link Array} and add a new FsmParse
|
|
702
|
+
* with given root and state.
|
|
703
|
+
*
|
|
704
|
+
* @param root TxtWord input.
|
|
705
|
+
* @param surfaceForm String input to use for parsing.
|
|
706
|
+
* @param state String input.
|
|
707
|
+
* @return parseWord method with newly populated FsmParse ArrayList and input surfaceForm.
|
|
708
|
+
*/
|
|
709
|
+
morphologicalAnalysisFromRoot(root, surfaceForm, state) {
|
|
710
|
+
let initialFsmParse = new Array();
|
|
711
|
+
if (state != undefined) {
|
|
712
|
+
initialFsmParse.push(new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState(state)));
|
|
713
|
+
return this.parseWordSurfaceForm(initialFsmParse, surfaceForm);
|
|
714
|
+
}
|
|
715
|
+
else {
|
|
716
|
+
this.initializeParseListFromRoot(initialFsmParse, root, this.isProperNoun(surfaceForm));
|
|
717
|
+
return this.parseWordSurfaceForm(initialFsmParse, surfaceForm);
|
|
718
|
+
}
|
|
719
|
+
}
|
|
720
|
+
distinctSurfaceFormList(parseList) {
|
|
721
|
+
let items = new Set();
|
|
722
|
+
for (let parse of parseList) {
|
|
723
|
+
items.add(parse.getSurfaceForm());
|
|
724
|
+
}
|
|
725
|
+
return items;
|
|
726
|
+
}
|
|
727
|
+
/**
|
|
728
|
+
* The generateAllParses with 2 inputs is used to generate all parses with given root. Then it calls initializeParseListFromRoot method to initialize list with newly created ArrayList, input root,
|
|
729
|
+
* and maximum length.
|
|
730
|
+
*
|
|
731
|
+
* @param root TxtWord input.
|
|
732
|
+
* @param maxLength Maximum length of the surface form.
|
|
733
|
+
* @return parseWord method with newly populated FsmParse ArrayList and maximum length.
|
|
734
|
+
*/
|
|
735
|
+
generateAllParses(root, maxLength) {
|
|
736
|
+
let initialFsmParse = new Array();
|
|
737
|
+
if (root.isProperNoun()) {
|
|
738
|
+
this.initializeParseListFromRoot(initialFsmParse, root, true);
|
|
739
|
+
}
|
|
740
|
+
this.initializeParseListFromRoot(initialFsmParse, root, false);
|
|
741
|
+
return this.parseWordLength(initialFsmParse, maxLength);
|
|
742
|
+
}
|
|
743
|
+
/**
|
|
744
|
+
* Replaces previous lemma in the sentence with the new lemma. Both lemma can contain multiple words.
|
|
745
|
+
* @param original Original sentence to be replaced with.
|
|
746
|
+
* @param previousWord Root word in the original sentence
|
|
747
|
+
* @param newWord New word to be replaced.
|
|
748
|
+
* @return Newly generated sentence by replacing the previous word in the original sentence with the new word.
|
|
749
|
+
*/
|
|
750
|
+
replaceWord(original, previousWord, newWord) {
|
|
751
|
+
let previousWordSplitted = undefined, newWordSplitted = undefined;
|
|
752
|
+
let result = new Sentence_1.Sentence();
|
|
753
|
+
let replacedWord = undefined;
|
|
754
|
+
let previousWordMultiple = previousWord.includes(" ");
|
|
755
|
+
let newWordMultiple = newWord.includes(" ");
|
|
756
|
+
let lastWord;
|
|
757
|
+
if (previousWordMultiple) {
|
|
758
|
+
previousWordSplitted = previousWord.split(" ");
|
|
759
|
+
lastWord = previousWordSplitted[previousWordSplitted.length - 1];
|
|
760
|
+
}
|
|
761
|
+
else {
|
|
762
|
+
lastWord = previousWord;
|
|
763
|
+
}
|
|
764
|
+
let newRootWord;
|
|
765
|
+
if (newWordMultiple) {
|
|
766
|
+
newWordSplitted = newWord.split(" ");
|
|
767
|
+
newRootWord = newWordSplitted[newWordSplitted.length - 1];
|
|
768
|
+
}
|
|
769
|
+
else {
|
|
770
|
+
newRootWord = newWord;
|
|
771
|
+
}
|
|
772
|
+
let newRootTxtWord = this.dictionary.getWord(newRootWord);
|
|
773
|
+
let parseList = this.morphologicalAnalysisFromSentence(original);
|
|
774
|
+
let i;
|
|
775
|
+
for (i = 0; i < parseList.length; i++) {
|
|
776
|
+
let replaced = false;
|
|
777
|
+
for (let j = 0; j < parseList[i].size(); j++) {
|
|
778
|
+
if (parseList[i].getFsmParse(j).getWord().getName() == lastWord && newRootTxtWord != undefined) {
|
|
779
|
+
replaced = true;
|
|
780
|
+
replacedWord = parseList[i].getFsmParse(j).replaceRootWord(newRootTxtWord);
|
|
781
|
+
}
|
|
782
|
+
}
|
|
783
|
+
if (replaced && replacedWord != null) {
|
|
784
|
+
if (previousWordMultiple) {
|
|
785
|
+
for (let k = 0; k < i - previousWordSplitted.length + 1; k++) {
|
|
786
|
+
result.addWord(original.getWord(k));
|
|
787
|
+
}
|
|
788
|
+
}
|
|
789
|
+
if (newWordMultiple) {
|
|
790
|
+
for (let k = 0; k < newWordSplitted.length - 1; k++) {
|
|
791
|
+
if (result.wordCount() == 0) {
|
|
792
|
+
result.addWord(new Word_1.Word((newWordSplitted[k].charAt(0) + "").toLocaleUpperCase("tr") + newWordSplitted[k].substring(1)));
|
|
793
|
+
}
|
|
794
|
+
else {
|
|
795
|
+
result.addWord(new Word_1.Word(newWordSplitted[k]));
|
|
796
|
+
}
|
|
797
|
+
}
|
|
798
|
+
}
|
|
799
|
+
if (result.wordCount() == 0) {
|
|
800
|
+
replacedWord = (replacedWord.charAt(0) + "").toLocaleUpperCase("tr") + replacedWord.substring(1);
|
|
801
|
+
}
|
|
802
|
+
result.addWord(new Word_1.Word(replacedWord));
|
|
803
|
+
if (previousWordMultiple) {
|
|
804
|
+
i++;
|
|
805
|
+
break;
|
|
806
|
+
}
|
|
807
|
+
}
|
|
808
|
+
else {
|
|
809
|
+
if (!previousWordMultiple) {
|
|
810
|
+
result.addWord(original.getWord(i));
|
|
811
|
+
}
|
|
812
|
+
}
|
|
813
|
+
}
|
|
814
|
+
if (previousWordMultiple) {
|
|
815
|
+
for (; i < parseList.length; i++) {
|
|
816
|
+
result.addWord(original.getWord(i));
|
|
817
|
+
}
|
|
818
|
+
}
|
|
819
|
+
return result;
|
|
820
|
+
}
|
|
821
|
+
/**
|
|
822
|
+
* The analysisExists method checks several cases. If the given surfaceForm is a punctuation or double then it
|
|
823
|
+
* returns true. If it is not a root word, then it initializes the parse list and returns the parseExists method with
|
|
824
|
+
* this newly initialized list and surfaceForm.
|
|
825
|
+
*
|
|
826
|
+
* @param rootWord TxtWord root.
|
|
827
|
+
* @param surfaceForm String input.
|
|
828
|
+
* @param isProper boolean variable indicates a word is proper or not.
|
|
829
|
+
* @return true if surfaceForm is punctuation or double, otherwise returns parseExist method with given surfaceForm.
|
|
830
|
+
*/
|
|
831
|
+
analysisExists(rootWord, surfaceForm, isProper) {
|
|
832
|
+
if (Word_1.Word.isPunctuation(surfaceForm)) {
|
|
833
|
+
return true;
|
|
834
|
+
}
|
|
835
|
+
if (this.isDouble(surfaceForm)) {
|
|
836
|
+
return true;
|
|
837
|
+
}
|
|
838
|
+
let initialFsmParse;
|
|
839
|
+
if (rootWord != null) {
|
|
840
|
+
initialFsmParse = new Array();
|
|
841
|
+
this.initializeParseListFromRoot(initialFsmParse, rootWord, isProper);
|
|
842
|
+
}
|
|
843
|
+
else {
|
|
844
|
+
initialFsmParse = this.initializeParseListFromSurfaceForm(surfaceForm, isProper);
|
|
845
|
+
}
|
|
846
|
+
return this.parseExists(initialFsmParse, surfaceForm);
|
|
847
|
+
}
|
|
848
|
+
/**
|
|
849
|
+
* The analysis method is used by the morphologicalAnalysis method. It gets String surfaceForm as an input and checks
|
|
850
|
+
* its type such as punctuation, number or compares with the regex for date, fraction, percent, time, range, hashtag,
|
|
851
|
+
* and mail or checks its variable type as integer or double. After finding the right case for given surfaceForm, it calls
|
|
852
|
+
* constructInflectionalGroups method which creates sub-word units.
|
|
853
|
+
*
|
|
854
|
+
* @param surfaceForm String to analyse.
|
|
855
|
+
* @param isProper is used to indicate the proper words.
|
|
856
|
+
* @return ArrayList type initialFsmParse which holds the analyses.
|
|
857
|
+
*/
|
|
858
|
+
analysis(surfaceForm, isProper) {
|
|
859
|
+
let initialFsmParse, fsmParse;
|
|
860
|
+
if (Word_1.Word.isPunctuation(surfaceForm) && surfaceForm != "%") {
|
|
861
|
+
initialFsmParse = new Array();
|
|
862
|
+
fsmParse = new FsmParse_1.FsmParse(surfaceForm, new State_1.State(("Punctuation"), true, true));
|
|
863
|
+
fsmParse.constructInflectionalGroups();
|
|
864
|
+
initialFsmParse.push(fsmParse);
|
|
865
|
+
return initialFsmParse;
|
|
866
|
+
}
|
|
867
|
+
if (this.isNumber(surfaceForm)) {
|
|
868
|
+
initialFsmParse = new Array();
|
|
869
|
+
fsmParse = new FsmParse_1.FsmParse(surfaceForm, new State_1.State(("CardinalRoot"), true, true));
|
|
870
|
+
fsmParse.constructInflectionalGroups();
|
|
871
|
+
initialFsmParse.push(fsmParse);
|
|
872
|
+
return initialFsmParse;
|
|
873
|
+
}
|
|
874
|
+
if (this.patternMatches("^\\d+/\\d+$", surfaceForm)) {
|
|
875
|
+
initialFsmParse = new Array();
|
|
876
|
+
fsmParse = new FsmParse_1.FsmParse(surfaceForm, new State_1.State(("FractionRoot"), true, true));
|
|
877
|
+
fsmParse.constructInflectionalGroups();
|
|
878
|
+
initialFsmParse.push(fsmParse);
|
|
879
|
+
fsmParse = new FsmParse_1.FsmParse(surfaceForm, new State_1.State(("DateRoot"), true, true));
|
|
880
|
+
fsmParse.constructInflectionalGroups();
|
|
881
|
+
initialFsmParse.push(fsmParse);
|
|
882
|
+
return initialFsmParse;
|
|
883
|
+
}
|
|
884
|
+
if (this.isDate(surfaceForm)) {
|
|
885
|
+
initialFsmParse = new Array();
|
|
886
|
+
fsmParse = new FsmParse_1.FsmParse(surfaceForm, new State_1.State(("DateRoot"), true, true));
|
|
887
|
+
fsmParse.constructInflectionalGroups();
|
|
888
|
+
initialFsmParse.push(fsmParse);
|
|
889
|
+
return initialFsmParse;
|
|
890
|
+
}
|
|
891
|
+
if (this.patternMatches("^\\d+\\\\/\\d+$", surfaceForm)) {
|
|
892
|
+
initialFsmParse = new Array();
|
|
893
|
+
fsmParse = new FsmParse_1.FsmParse(surfaceForm, new State_1.State(("FractionRoot"), true, true));
|
|
894
|
+
fsmParse.constructInflectionalGroups();
|
|
895
|
+
initialFsmParse.push(fsmParse);
|
|
896
|
+
return initialFsmParse;
|
|
897
|
+
}
|
|
898
|
+
if (surfaceForm == "%" || this.isPercent(surfaceForm)) {
|
|
899
|
+
initialFsmParse = new Array();
|
|
900
|
+
fsmParse = new FsmParse_1.FsmParse(surfaceForm, new State_1.State(("PercentRoot"), true, true));
|
|
901
|
+
fsmParse.constructInflectionalGroups();
|
|
902
|
+
initialFsmParse.push(fsmParse);
|
|
903
|
+
return initialFsmParse;
|
|
904
|
+
}
|
|
905
|
+
if (this.isTime(surfaceForm)) {
|
|
906
|
+
initialFsmParse = new Array();
|
|
907
|
+
fsmParse = new FsmParse_1.FsmParse(surfaceForm, new State_1.State(("TimeRoot"), true, true));
|
|
908
|
+
fsmParse.constructInflectionalGroups();
|
|
909
|
+
initialFsmParse.push(fsmParse);
|
|
910
|
+
return initialFsmParse;
|
|
911
|
+
}
|
|
912
|
+
if (this.isRange(surfaceForm)) {
|
|
913
|
+
initialFsmParse = new Array();
|
|
914
|
+
fsmParse = new FsmParse_1.FsmParse(surfaceForm, new State_1.State(("RangeRoot"), true, true));
|
|
915
|
+
fsmParse.constructInflectionalGroups();
|
|
916
|
+
initialFsmParse.push(fsmParse);
|
|
917
|
+
return initialFsmParse;
|
|
918
|
+
}
|
|
919
|
+
if (surfaceForm.startsWith("#")) {
|
|
920
|
+
initialFsmParse = new Array();
|
|
921
|
+
fsmParse = new FsmParse_1.FsmParse(surfaceForm, new State_1.State(("Hashtag"), true, true));
|
|
922
|
+
fsmParse.constructInflectionalGroups();
|
|
923
|
+
initialFsmParse.push(fsmParse);
|
|
924
|
+
return initialFsmParse;
|
|
925
|
+
}
|
|
926
|
+
if (surfaceForm.includes("@")) {
|
|
927
|
+
initialFsmParse = new Array();
|
|
928
|
+
fsmParse = new FsmParse_1.FsmParse(surfaceForm, new State_1.State(("Email"), true, true));
|
|
929
|
+
fsmParse.constructInflectionalGroups();
|
|
930
|
+
initialFsmParse.push(fsmParse);
|
|
931
|
+
return initialFsmParse;
|
|
932
|
+
}
|
|
933
|
+
if (surfaceForm.endsWith(".") && this.isInteger(surfaceForm.substring(0, surfaceForm.length - 1))) {
|
|
934
|
+
initialFsmParse = new Array();
|
|
935
|
+
fsmParse = new FsmParse_1.FsmParse(Number.parseInt(surfaceForm.substring(0, surfaceForm.length - 1)), this.finiteStateMachine.getState("OrdinalRoot"));
|
|
936
|
+
fsmParse.constructInflectionalGroups();
|
|
937
|
+
initialFsmParse.push(fsmParse);
|
|
938
|
+
return initialFsmParse;
|
|
939
|
+
}
|
|
940
|
+
if (this.isInteger(surfaceForm)) {
|
|
941
|
+
initialFsmParse = new Array();
|
|
942
|
+
fsmParse = new FsmParse_1.FsmParse(Number.parseInt(surfaceForm), this.finiteStateMachine.getState("CardinalRoot"));
|
|
943
|
+
fsmParse.constructInflectionalGroups();
|
|
944
|
+
initialFsmParse.push(fsmParse);
|
|
945
|
+
return initialFsmParse;
|
|
946
|
+
}
|
|
947
|
+
if (this.isDouble(surfaceForm)) {
|
|
948
|
+
initialFsmParse = new Array();
|
|
949
|
+
fsmParse = new FsmParse_1.FsmParse(Number.parseFloat(surfaceForm), this.finiteStateMachine.getState("RealRoot"));
|
|
950
|
+
fsmParse.constructInflectionalGroups();
|
|
951
|
+
initialFsmParse.push(fsmParse);
|
|
952
|
+
return initialFsmParse;
|
|
953
|
+
}
|
|
954
|
+
initialFsmParse = this.initializeParseListFromSurfaceForm(surfaceForm, isProper);
|
|
955
|
+
return this.parseWordSurfaceForm(initialFsmParse, surfaceForm);
|
|
956
|
+
}
|
|
957
|
+
patternMatches(expr, value) {
|
|
958
|
+
let p = this.mostUsedPatterns.get(expr);
|
|
959
|
+
if (p == undefined) {
|
|
960
|
+
p = RegExp(expr);
|
|
961
|
+
this.mostUsedPatterns.set(expr, p);
|
|
962
|
+
}
|
|
963
|
+
return value.match(p) != null;
|
|
964
|
+
}
|
|
965
|
+
/**
|
|
966
|
+
* The isProperNoun method takes surfaceForm String as input and checks its each char whether they are in the range
|
|
967
|
+
* of letters between A to Z or one of the Turkish letters such as İ, Ü, Ğ, Ş, Ç, and Ö.
|
|
968
|
+
*
|
|
969
|
+
* @param surfaceForm String to check for proper noun.
|
|
970
|
+
* @return false if surfaceForm is null or length of 0, return true if it is a letter.
|
|
971
|
+
*/
|
|
972
|
+
isProperNoun(surfaceForm) {
|
|
973
|
+
if (surfaceForm == undefined || surfaceForm.length == 0) {
|
|
974
|
+
return false;
|
|
975
|
+
}
|
|
976
|
+
return (surfaceForm.charAt(0) >= 'A' && surfaceForm.charAt(0) <= 'Z') || (surfaceForm.charAt(0) == '\u0130') ||
|
|
977
|
+
(surfaceForm.charAt(0) == '\u00dc') || (surfaceForm.charAt(0) == '\u011e') || (surfaceForm.charAt(0) == '\u015e') ||
|
|
978
|
+
(surfaceForm.charAt(0) == '\u00c7') || (surfaceForm.charAt(0) == '\u00d6'); // İ, Ü, Ğ, Ş, Ç, Ö
|
|
979
|
+
}
|
|
980
|
+
/**
|
|
981
|
+
* The robustMorphologicalAnalysis is used to analyse surfaceForm String. First it gets the currentParse of the surfaceForm
|
|
982
|
+
* then, if the size of the currentParse is 0, and given surfaceForm is a proper noun, it adds the surfaceForm
|
|
983
|
+
* whose state name is ProperRoot to an {@link Array}, of it is not a proper noon, it adds the surfaceForm
|
|
984
|
+
* whose state name is NominalRoot to the {@link Array}.
|
|
985
|
+
*
|
|
986
|
+
* @param surfaceForm String to analyse.
|
|
987
|
+
* @return FsmParseList type currentParse which holds morphological analysis of the surfaceForm.
|
|
988
|
+
*/
|
|
989
|
+
robustMorphologicalAnalysis(surfaceForm) {
|
|
990
|
+
if (surfaceForm == undefined || surfaceForm == "") {
|
|
991
|
+
return new FsmParseList_1.FsmParseList(new Array());
|
|
992
|
+
}
|
|
993
|
+
let currentParse = this.morphologicalAnalysis(surfaceForm);
|
|
994
|
+
if (currentParse.size() == 0) {
|
|
995
|
+
let fsmParse = new Array();
|
|
996
|
+
if (this.isProperNoun(surfaceForm)) {
|
|
997
|
+
fsmParse.push(new FsmParse_1.FsmParse(surfaceForm, this.finiteStateMachine.getState("ProperRoot")));
|
|
998
|
+
return new FsmParseList_1.FsmParseList(this.parseWordSurfaceForm(fsmParse, surfaceForm));
|
|
999
|
+
}
|
|
1000
|
+
else {
|
|
1001
|
+
fsmParse.push(new FsmParse_1.FsmParse(surfaceForm, this.finiteStateMachine.getState("NominalRoot")));
|
|
1002
|
+
return new FsmParseList_1.FsmParseList(this.parseWordSurfaceForm(fsmParse, surfaceForm));
|
|
1003
|
+
}
|
|
1004
|
+
}
|
|
1005
|
+
else {
|
|
1006
|
+
return currentParse;
|
|
1007
|
+
}
|
|
1008
|
+
}
|
|
1009
|
+
/**
|
|
1010
|
+
* The morphologicalAnalysis is used for debug purposes.
|
|
1011
|
+
*
|
|
1012
|
+
* @param sentence to get word from.
|
|
1013
|
+
* @return FsmParseList type result.
|
|
1014
|
+
*/
|
|
1015
|
+
morphologicalAnalysisFromSentence(sentence) {
|
|
1016
|
+
let result = new Array();
|
|
1017
|
+
for (let i = 0; i < sentence.wordCount(); i++) {
|
|
1018
|
+
let originalForm = sentence.getWord(i).getName();
|
|
1019
|
+
let spellCorrectedForm = this.dictionary.getCorrectForm(originalForm);
|
|
1020
|
+
if (spellCorrectedForm == undefined) {
|
|
1021
|
+
spellCorrectedForm = originalForm;
|
|
1022
|
+
}
|
|
1023
|
+
let wordFsmParseList = this.morphologicalAnalysis(spellCorrectedForm);
|
|
1024
|
+
result.push(wordFsmParseList);
|
|
1025
|
+
}
|
|
1026
|
+
return result;
|
|
1027
|
+
}
|
|
1028
|
+
/**
|
|
1029
|
+
* The robustMorphologicalAnalysis method takes just one argument as an input. It gets the name of the words from
|
|
1030
|
+
* input sentence then calls robustMorphologicalAnalysis with surfaceForm.
|
|
1031
|
+
*
|
|
1032
|
+
* @param sentence Sentence type input used to get surfaceForm.
|
|
1033
|
+
* @return FsmParseList array which holds the result of the analysis.
|
|
1034
|
+
*/
|
|
1035
|
+
robustMorphologicalAnalysisFromSentence(sentence) {
|
|
1036
|
+
let result = new Array();
|
|
1037
|
+
for (let i = 0; i < sentence.wordCount(); i++) {
|
|
1038
|
+
let originalForm = sentence.getWord(i).getName();
|
|
1039
|
+
let spellCorrectedForm = this.dictionary.getCorrectForm(originalForm);
|
|
1040
|
+
if (spellCorrectedForm == undefined) {
|
|
1041
|
+
spellCorrectedForm = originalForm;
|
|
1042
|
+
}
|
|
1043
|
+
let fsmParseList = this.robustMorphologicalAnalysis(spellCorrectedForm);
|
|
1044
|
+
result.push(fsmParseList);
|
|
1045
|
+
}
|
|
1046
|
+
return result;
|
|
1047
|
+
}
|
|
1048
|
+
/**
|
|
1049
|
+
* The isInteger method compares input surfaceForm with regex \+?\d+ and returns the result.
|
|
1050
|
+
* Supports positive integer checks only.
|
|
1051
|
+
*
|
|
1052
|
+
* @param surfaceForm String to check.
|
|
1053
|
+
* @return true if surfaceForm matches with the regex.
|
|
1054
|
+
*/
|
|
1055
|
+
isInteger(surfaceForm) {
|
|
1056
|
+
if (!this.patternMatches("^\\+?\\d+$", surfaceForm))
|
|
1057
|
+
return false;
|
|
1058
|
+
let len = surfaceForm.length;
|
|
1059
|
+
if (len < 10) {
|
|
1060
|
+
return true; //Most common scenario. Return after a single check.
|
|
1061
|
+
}
|
|
1062
|
+
else {
|
|
1063
|
+
if (len > 10) {
|
|
1064
|
+
return false;
|
|
1065
|
+
}
|
|
1066
|
+
else {
|
|
1067
|
+
return surfaceForm >= "2147483647";
|
|
1068
|
+
}
|
|
1069
|
+
}
|
|
1070
|
+
}
|
|
1071
|
+
/**
|
|
1072
|
+
* The isDouble method compares input surfaceForm with regex \+?(\d+)?\.\d* and returns the result.
|
|
1073
|
+
*
|
|
1074
|
+
* @param surfaceForm String to check.
|
|
1075
|
+
* @return true if surfaceForm matches with the regex.
|
|
1076
|
+
*/
|
|
1077
|
+
isDouble(surfaceForm) {
|
|
1078
|
+
return this.patternMatches("^\\+?(\\d+)?\\.\\d*$", surfaceForm);
|
|
1079
|
+
}
|
|
1080
|
+
/**
|
|
1081
|
+
* The isNumber method compares input surfaceForm with the array of written numbers and returns the result.
|
|
1082
|
+
*
|
|
1083
|
+
* @param surfaceForm String to check.
|
|
1084
|
+
* @return true if surfaceForm matches with the regex.
|
|
1085
|
+
*/
|
|
1086
|
+
isNumber(surfaceForm) {
|
|
1087
|
+
let count = 0;
|
|
1088
|
+
let numbers = ["bir", "iki", "üç", "dört", "beş", "altı", "yedi", "sekiz", "dokuz",
|
|
1089
|
+
"on", "yirmi", "otuz", "kırk", "elli", "altmış", "yetmiş", "seksen", "doksan",
|
|
1090
|
+
"yüz", "bin", "milyon", "milyar", "trilyon", "katrilyon"];
|
|
1091
|
+
let word = surfaceForm;
|
|
1092
|
+
while (word != "") {
|
|
1093
|
+
let found = false;
|
|
1094
|
+
for (let number of numbers) {
|
|
1095
|
+
if (word.startsWith(number)) {
|
|
1096
|
+
found = true;
|
|
1097
|
+
count++;
|
|
1098
|
+
word = word.substring(number.length);
|
|
1099
|
+
break;
|
|
1100
|
+
}
|
|
1101
|
+
}
|
|
1102
|
+
if (!found) {
|
|
1103
|
+
break;
|
|
1104
|
+
}
|
|
1105
|
+
}
|
|
1106
|
+
return word == "" && count > 1;
|
|
1107
|
+
}
|
|
1108
|
+
isPercent(surfaceForm) {
|
|
1109
|
+
return this.patternMatches("^%(\\d\\d|\\d)$", surfaceForm) ||
|
|
1110
|
+
this.patternMatches("^%(\\d\\d|\\d)\\.\\d+$", surfaceForm);
|
|
1111
|
+
}
|
|
1112
|
+
isTime(surfaceForm) {
|
|
1113
|
+
return this.patternMatches("^(\\d\\d|\\d):(\\d\\d|\\d):(\\d\\d|\\d)$", surfaceForm) ||
|
|
1114
|
+
this.patternMatches("^(\\d\\d|\\d):(\\d\\d|\\d)$", surfaceForm);
|
|
1115
|
+
}
|
|
1116
|
+
isRange(surfaceForm) {
|
|
1117
|
+
return this.patternMatches("^\\d+-\\d+$", surfaceForm) ||
|
|
1118
|
+
this.patternMatches("^(\\d\\d|\\d):(\\d\\d|\\d)-(\\d\\d|\\d):(\\d\\d|\\d)$", surfaceForm) ||
|
|
1119
|
+
this.patternMatches("^(\\d\\d|\\d)\\.(\\d\\d|\\d)-(\\d\\d|\\d)\\.(\\d\\d|\\d)$", surfaceForm);
|
|
1120
|
+
}
|
|
1121
|
+
isDate(surfaceForm) {
|
|
1122
|
+
return this.patternMatches("^(\\d\\d|\\d)/(\\d\\d|\\d)/\\d+$", surfaceForm) ||
|
|
1123
|
+
this.patternMatches("^(\\d\\d|\\d)\\.(\\d\\d|\\d)\\.\\d+$", surfaceForm);
|
|
1124
|
+
}
|
|
1125
|
+
/**
|
|
1126
|
+
* The morphologicalAnalysis method is used to analyse a FsmParseList by comparing with the regex.
|
|
1127
|
+
* It creates an {@link Array} fsmParse to hold the result of the analysis method. For each surfaceForm input,
|
|
1128
|
+
* it gets a substring and considers it as a possibleRoot. Then compares with the regex.
|
|
1129
|
+
* <p>
|
|
1130
|
+
* If the surfaceForm input string matches with Turkish chars like Ç, Ş, İ, Ü, Ö, it adds the surfaceForm to Trie with IS_OA tag.
|
|
1131
|
+
* If the possibleRoot contains /, then it is added to the Trie with IS_KESIR tag.
|
|
1132
|
+
* If the possibleRoot contains \d\d|\d)/(\d\d|\d)/\d+, then it is added to the Trie with IS_DATE tag.
|
|
1133
|
+
* If the possibleRoot contains \\d\d|\d, then it is added to the Trie with IS_PERCENT tag.
|
|
1134
|
+
* If the possibleRoot contains \d\d|\d):(\d\d|\d):(\d\d|\d), then it is added to the Trie with IS_ZAMAN tag.
|
|
1135
|
+
* If the possibleRoot contains \d+-\d+, then it is added to the Trie with IS_RANGE tag.
|
|
1136
|
+
* If the possibleRoot is an Integer, then it is added to the Trie with IS_SAYI tag.
|
|
1137
|
+
* If the possibleRoot is a Double, then it is added to the Trie with IS_REELSAYI tag.
|
|
1138
|
+
*
|
|
1139
|
+
* @param surfaceForm String to analyse.
|
|
1140
|
+
* @return fsmParseList which holds the analysis.
|
|
1141
|
+
*/
|
|
1142
|
+
morphologicalAnalysis(surfaceForm) {
|
|
1143
|
+
let lowerCased = surfaceForm.toLocaleLowerCase("tr");
|
|
1144
|
+
if (this.parsedSurfaceForms != undefined && this.parsedSurfaceForms.has(lowerCased) &&
|
|
1145
|
+
!this.isInteger(surfaceForm) && !this.isDouble(surfaceForm) && !this.isPercent(surfaceForm) &&
|
|
1146
|
+
!this.isTime(surfaceForm) && !this.isRange(surfaceForm) && !this.isDate(surfaceForm)) {
|
|
1147
|
+
let parses = new Array();
|
|
1148
|
+
parses.push(new FsmParse_1.FsmParse(new Word_1.Word(this.parsedSurfaceForms.get(lowerCased))));
|
|
1149
|
+
return new FsmParseList_1.FsmParseList(parses);
|
|
1150
|
+
}
|
|
1151
|
+
if (this.cache != undefined && this.cache.contains(surfaceForm)) {
|
|
1152
|
+
return this.cache.get(surfaceForm);
|
|
1153
|
+
}
|
|
1154
|
+
if (this.patternMatches("^(\\w|Ç|Ş|İ|Ü|Ö)\\.$", surfaceForm)) {
|
|
1155
|
+
this.dictionaryTrie.addWord(surfaceForm.toLocaleLowerCase("tr"), new TxtWord_1.TxtWord(surfaceForm.toLocaleLowerCase("tr"), "IS_OA"));
|
|
1156
|
+
}
|
|
1157
|
+
let defaultFsmParse = this.analysis(surfaceForm.toLocaleLowerCase("tr"), this.isProperNoun(surfaceForm));
|
|
1158
|
+
if (defaultFsmParse.length > 0) {
|
|
1159
|
+
let fsmParseList = new FsmParseList_1.FsmParseList(defaultFsmParse);
|
|
1160
|
+
if (this.cache != undefined) {
|
|
1161
|
+
this.cache.add(surfaceForm, fsmParseList);
|
|
1162
|
+
}
|
|
1163
|
+
return fsmParseList;
|
|
1164
|
+
}
|
|
1165
|
+
let fsmParse = new Array();
|
|
1166
|
+
if (surfaceForm.includes("'")) {
|
|
1167
|
+
let possibleRoot = surfaceForm.substring(0, surfaceForm.indexOf('\''));
|
|
1168
|
+
if (possibleRoot != "") {
|
|
1169
|
+
if (possibleRoot.includes("/") || possibleRoot.includes("\\/")) {
|
|
1170
|
+
this.dictionaryTrie.addWord(possibleRoot, new TxtWord_1.TxtWord(possibleRoot, "IS_KESIR"));
|
|
1171
|
+
fsmParse = this.analysis(surfaceForm.toLocaleLowerCase("tr"), this.isProperNoun(surfaceForm));
|
|
1172
|
+
}
|
|
1173
|
+
else {
|
|
1174
|
+
if (this.isDate(possibleRoot)) {
|
|
1175
|
+
this.dictionaryTrie.addWord(possibleRoot, new TxtWord_1.TxtWord(possibleRoot, "IS_DATE"));
|
|
1176
|
+
fsmParse = this.analysis(surfaceForm.toLocaleLowerCase("tr"), this.isProperNoun(surfaceForm));
|
|
1177
|
+
}
|
|
1178
|
+
else {
|
|
1179
|
+
if (this.patternMatches("^\\d+/\\d+$", possibleRoot)) {
|
|
1180
|
+
this.dictionaryTrie.addWord(possibleRoot, new TxtWord_1.TxtWord(possibleRoot, "IS_KESIR"));
|
|
1181
|
+
fsmParse = this.analysis(surfaceForm.toLocaleLowerCase("tr"), this.isProperNoun(surfaceForm));
|
|
1182
|
+
}
|
|
1183
|
+
else {
|
|
1184
|
+
if (this.isPercent(possibleRoot)) {
|
|
1185
|
+
this.dictionaryTrie.addWord(possibleRoot, new TxtWord_1.TxtWord(possibleRoot, "IS_PERCENT"));
|
|
1186
|
+
fsmParse = this.analysis(surfaceForm.toLocaleLowerCase("tr"), this.isProperNoun(surfaceForm));
|
|
1187
|
+
}
|
|
1188
|
+
else {
|
|
1189
|
+
if (this.isTime(surfaceForm)) {
|
|
1190
|
+
this.dictionaryTrie.addWord(possibleRoot, new TxtWord_1.TxtWord(possibleRoot, "IS_ZAMAN"));
|
|
1191
|
+
fsmParse = this.analysis(surfaceForm.toLocaleLowerCase("tr"), this.isProperNoun(surfaceForm));
|
|
1192
|
+
}
|
|
1193
|
+
else {
|
|
1194
|
+
if (this.isRange(surfaceForm)) {
|
|
1195
|
+
this.dictionaryTrie.addWord(possibleRoot, new TxtWord_1.TxtWord(possibleRoot, "IS_RANGE"));
|
|
1196
|
+
fsmParse = this.analysis(surfaceForm.toLocaleLowerCase("tr"), this.isProperNoun(surfaceForm));
|
|
1197
|
+
}
|
|
1198
|
+
else {
|
|
1199
|
+
if (this.isInteger(possibleRoot)) {
|
|
1200
|
+
this.dictionaryTrie.addWord(possibleRoot, new TxtWord_1.TxtWord(possibleRoot, "IS_SAYI"));
|
|
1201
|
+
fsmParse = this.analysis(surfaceForm.toLocaleLowerCase("tr"), this.isProperNoun(surfaceForm));
|
|
1202
|
+
}
|
|
1203
|
+
else {
|
|
1204
|
+
if (this.isDouble(possibleRoot)) {
|
|
1205
|
+
this.dictionaryTrie.addWord(possibleRoot, new TxtWord_1.TxtWord(possibleRoot, "IS_REELSAYI"));
|
|
1206
|
+
fsmParse = this.analysis(surfaceForm.toLocaleLowerCase("tr"), this.isProperNoun(surfaceForm));
|
|
1207
|
+
}
|
|
1208
|
+
else {
|
|
1209
|
+
if (Word_1.Word.isCapital(possibleRoot)) {
|
|
1210
|
+
let newWord = undefined;
|
|
1211
|
+
if (this.dictionary.getWord(possibleRoot.toLocaleLowerCase("tr")) != null) {
|
|
1212
|
+
this.dictionary.getWord(possibleRoot.toLocaleLowerCase("tr")).addFlag("IS_OA");
|
|
1213
|
+
}
|
|
1214
|
+
else {
|
|
1215
|
+
newWord = new TxtWord_1.TxtWord(possibleRoot.toLocaleLowerCase("tr"), "IS_OA");
|
|
1216
|
+
this.dictionaryTrie.addWord(possibleRoot.toLocaleLowerCase("tr"), newWord);
|
|
1217
|
+
}
|
|
1218
|
+
fsmParse = this.analysis(surfaceForm.toLocaleLowerCase("tr"), this.isProperNoun(surfaceForm));
|
|
1219
|
+
if (fsmParse.length == 0 && newWord != undefined) {
|
|
1220
|
+
newWord.addFlag("IS_KIS");
|
|
1221
|
+
fsmParse = this.analysis(surfaceForm.toLocaleLowerCase("tr"), this.isProperNoun(surfaceForm));
|
|
1222
|
+
}
|
|
1223
|
+
}
|
|
1224
|
+
}
|
|
1225
|
+
}
|
|
1226
|
+
}
|
|
1227
|
+
}
|
|
1228
|
+
}
|
|
1229
|
+
}
|
|
1230
|
+
}
|
|
1231
|
+
}
|
|
1232
|
+
}
|
|
1233
|
+
}
|
|
1234
|
+
let fsmParseList = new FsmParseList_1.FsmParseList(fsmParse);
|
|
1235
|
+
if (this.cache != undefined && fsmParseList.size() > 0) {
|
|
1236
|
+
this.cache.add(surfaceForm, fsmParseList);
|
|
1237
|
+
}
|
|
1238
|
+
return fsmParseList;
|
|
1239
|
+
}
|
|
1240
|
+
/**
|
|
1241
|
+
* The morphologicalAnalysisExists method calls analysisExists to check the existence of the analysis with given
|
|
1242
|
+
* root and surfaceForm.
|
|
1243
|
+
*
|
|
1244
|
+
* @param surfaceForm String to check.
|
|
1245
|
+
* @param rootWord TxtWord input root.
|
|
1246
|
+
* @return true an analysis exists, otherwise return false.
|
|
1247
|
+
*/
|
|
1248
|
+
morphologicalAnalysisExists(rootWord, surfaceForm) {
|
|
1249
|
+
return this.analysisExists(rootWord, surfaceForm.toLocaleLowerCase("tr"), true);
|
|
1250
|
+
}
|
|
1251
|
+
}
|
|
1252
|
+
exports.FsmMorphologicalAnalyzer = FsmMorphologicalAnalyzer;
|
|
1253
|
+
FsmMorphologicalAnalyzer.MAX_DISTANCE = 2;
|
|
1254
|
+
});
|
|
1255
|
+
//# sourceMappingURL=FsmMorphologicalAnalyzer.js.map
|