nlptoolkit-morphologicalanalysis 1.0.18 → 1.0.20
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/Corpus/DisambiguatedWord.js +26 -35
- package/dist/Corpus/DisambiguatedWord.js.map +1 -1
- package/dist/Corpus/DisambiguationCorpus.js +70 -47
- package/dist/Corpus/DisambiguationCorpus.js.map +1 -1
- package/dist/MorphologicalAnalysis/FiniteStateMachine.js +148 -158
- package/dist/MorphologicalAnalysis/FiniteStateMachine.js.map +1 -1
- package/dist/MorphologicalAnalysis/FsmMorphologicalAnalyzer.js +1281 -1254
- package/dist/MorphologicalAnalysis/FsmMorphologicalAnalyzer.js.map +1 -1
- package/dist/MorphologicalAnalysis/FsmParse.js +596 -603
- package/dist/MorphologicalAnalysis/FsmParse.js.map +1 -1
- package/dist/MorphologicalAnalysis/FsmParseList.js +263 -273
- package/dist/MorphologicalAnalysis/FsmParseList.js.map +1 -1
- package/dist/MorphologicalAnalysis/InflectionalGroup.js +152 -162
- package/dist/MorphologicalAnalysis/InflectionalGroup.js.map +1 -1
- package/dist/MorphologicalAnalysis/MetamorphicParse.js +120 -129
- package/dist/MorphologicalAnalysis/MetamorphicParse.js.map +1 -1
- package/dist/MorphologicalAnalysis/MorphologicalParse.js +1037 -1046
- package/dist/MorphologicalAnalysis/MorphologicalParse.js.map +1 -1
- package/dist/MorphologicalAnalysis/MorphologicalTag.js +530 -540
- package/dist/MorphologicalAnalysis/MorphologicalTag.js.map +1 -1
- package/dist/MorphologicalAnalysis/MorphotacticEngine.js +230 -240
- package/dist/MorphologicalAnalysis/MorphotacticEngine.js.map +1 -1
- package/dist/MorphologicalAnalysis/State.js +54 -60
- package/dist/MorphologicalAnalysis/State.js.map +1 -1
- package/dist/MorphologicalAnalysis/Transition.js +408 -418
- package/dist/MorphologicalAnalysis/Transition.js.map +1 -1
- package/dist/index.js +19 -25
- package/dist/index.js.map +1 -1
- package/package.json +8 -7
- package/tests/FsmParseListTest.ts +3 -3
- package/tests/FsmParseTest.ts +1 -1
- package/tsconfig.json +4 -3
- package/turkish_dictionary.txt +9114 -9114
- package/source/tsconfig.json +0 -13
|
@@ -1,160 +1,200 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
1
|
+
"use strict";
|
|
2
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
+
if (k2 === undefined) k2 = k;
|
|
4
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
5
7
|
}
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
8
|
+
Object.defineProperty(o, k2, desc);
|
|
9
|
+
}) : (function(o, m, k, k2) {
|
|
10
|
+
if (k2 === undefined) k2 = k;
|
|
11
|
+
o[k2] = m[k];
|
|
12
|
+
}));
|
|
13
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
14
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
15
|
+
}) : function(o, v) {
|
|
16
|
+
o["default"] = v;
|
|
17
|
+
});
|
|
18
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
19
|
+
var ownKeys = function(o) {
|
|
20
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
21
|
+
var ar = [];
|
|
22
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
23
|
+
return ar;
|
|
24
|
+
};
|
|
25
|
+
return ownKeys(o);
|
|
26
|
+
};
|
|
27
|
+
return function (mod) {
|
|
28
|
+
if (mod && mod.__esModule) return mod;
|
|
29
|
+
var result = {};
|
|
30
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
31
|
+
__setModuleDefault(result, mod);
|
|
32
|
+
return result;
|
|
33
|
+
};
|
|
34
|
+
})();
|
|
35
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
36
|
+
exports.FsmMorphologicalAnalyzer = void 0;
|
|
37
|
+
const Trie_1 = require("nlptoolkit-dictionary/dist/Dictionary/Trie/Trie");
|
|
38
|
+
const FiniteStateMachine_1 = require("./FiniteStateMachine");
|
|
39
|
+
const TxtDictionary_1 = require("nlptoolkit-dictionary/dist/Dictionary/TxtDictionary");
|
|
40
|
+
const LRUCache_1 = require("nlptoolkit-datastructure/dist/LRUCache");
|
|
41
|
+
const FsmParseList_1 = require("./FsmParseList");
|
|
42
|
+
const WordComparator_1 = require("nlptoolkit-dictionary/dist/Dictionary/WordComparator");
|
|
43
|
+
const fs = __importStar(require("fs"));
|
|
44
|
+
const Transition_1 = require("./Transition");
|
|
45
|
+
const MorphologicalTag_1 = require("./MorphologicalTag");
|
|
46
|
+
const TxtWord_1 = require("nlptoolkit-dictionary/dist/Dictionary/TxtWord");
|
|
47
|
+
const FsmParse_1 = require("./FsmParse");
|
|
48
|
+
const Sentence_1 = require("nlptoolkit-corpus/dist/Sentence");
|
|
49
|
+
const Word_1 = require("nlptoolkit-dictionary/dist/Dictionary/Word");
|
|
50
|
+
const State_1 = require("./State");
|
|
51
|
+
const Queue_1 = require("nlptoolkit-datastructure/dist/Queue");
|
|
52
|
+
const FileUtils_1 = require("nlptoolkit-util/dist/FileUtils");
|
|
53
|
+
class FsmMorphologicalAnalyzer {
|
|
54
|
+
dictionaryTrie;
|
|
55
|
+
suffixTrie;
|
|
56
|
+
parsedSurfaceForms = undefined;
|
|
57
|
+
pronunciations = undefined;
|
|
58
|
+
finiteStateMachine;
|
|
59
|
+
static MAX_DISTANCE = 2;
|
|
60
|
+
dictionary;
|
|
61
|
+
cache = undefined;
|
|
62
|
+
mostUsedPatterns = new Map();
|
|
63
|
+
/**
|
|
64
|
+
* Another constructor of FsmMorphologicalAnalyzer class. It generates a new TxtDictionary type dictionary from
|
|
65
|
+
* given input dictionary, with given inputs fileName and cacheSize.
|
|
66
|
+
*
|
|
67
|
+
* @param fileName the file to read the finite state machine.
|
|
68
|
+
* @param dictionaryFileNameOrDictionary the dictionary file that will be used to generate dictionaryTrie.
|
|
69
|
+
* @param cacheSize the size of the LRUCache.
|
|
70
|
+
*/
|
|
71
|
+
constructor(fileName, dictionaryFileNameOrDictionary, cacheSize) {
|
|
72
|
+
if (dictionaryFileNameOrDictionary == undefined) {
|
|
73
|
+
this.dictionary = new TxtDictionary_1.TxtDictionary();
|
|
74
|
+
}
|
|
75
|
+
else {
|
|
76
|
+
if (dictionaryFileNameOrDictionary instanceof TxtDictionary_1.TxtDictionary) {
|
|
77
|
+
this.dictionary = dictionaryFileNameOrDictionary;
|
|
56
78
|
}
|
|
57
79
|
else {
|
|
58
|
-
this.
|
|
59
|
-
}
|
|
60
|
-
this.prepareSuffixTrie();
|
|
61
|
-
this.dictionaryTrie = this.dictionary.prepareTrie();
|
|
62
|
-
if (cacheSize > 0) {
|
|
63
|
-
this.cache = new LRUCache_1.LRUCache(cacheSize);
|
|
64
|
-
}
|
|
65
|
-
this.addPronunciations("pronunciations.txt");
|
|
66
|
-
}
|
|
67
|
-
/**
|
|
68
|
-
* Constructs and returns the reverse string of a given string.
|
|
69
|
-
* @param s String to be reversed.
|
|
70
|
-
* @return Reverse of a given string.
|
|
71
|
-
*/
|
|
72
|
-
reverseString(s) {
|
|
73
|
-
let result = "";
|
|
74
|
-
for (let i = s.length - 1; i >= 0; i--) {
|
|
75
|
-
result += s[i];
|
|
80
|
+
this.dictionary = new TxtDictionary_1.TxtDictionary(WordComparator_1.WordComparator.TURKISH, dictionaryFileNameOrDictionary);
|
|
76
81
|
}
|
|
82
|
+
}
|
|
83
|
+
if (fileName == undefined) {
|
|
84
|
+
this.finiteStateMachine = new FiniteStateMachine_1.FiniteStateMachine();
|
|
85
|
+
}
|
|
86
|
+
else {
|
|
87
|
+
this.finiteStateMachine = new FiniteStateMachine_1.FiniteStateMachine(fileName);
|
|
88
|
+
}
|
|
89
|
+
this.prepareSuffixTrie();
|
|
90
|
+
this.dictionaryTrie = this.dictionary.prepareTrie();
|
|
91
|
+
if (cacheSize > 0) {
|
|
92
|
+
this.cache = new LRUCache_1.LRUCache(cacheSize);
|
|
93
|
+
}
|
|
94
|
+
this.addPronunciations("pronunciations.txt");
|
|
95
|
+
}
|
|
96
|
+
/**
|
|
97
|
+
* Constructs and returns the reverse string of a given string.
|
|
98
|
+
* @param s String to be reversed.
|
|
99
|
+
* @return Reverse of a given string.
|
|
100
|
+
*/
|
|
101
|
+
reverseString(s) {
|
|
102
|
+
let result = "";
|
|
103
|
+
for (let i = s.length - 1; i >= 0; i--) {
|
|
104
|
+
result += s[i];
|
|
105
|
+
}
|
|
106
|
+
return result;
|
|
107
|
+
}
|
|
108
|
+
/**
|
|
109
|
+
* Constructs the suffix trie from the input file suffixes.txt. suffixes.txt contains the most frequent 6000
|
|
110
|
+
* suffixes that a verb or a noun can take. The suffix trie is a trie that stores these suffixes in reverse form,
|
|
111
|
+
* which can be then used to match a given word for its possible suffix content.
|
|
112
|
+
*/
|
|
113
|
+
prepareSuffixTrie() {
|
|
114
|
+
this.suffixTrie = new Trie_1.Trie();
|
|
115
|
+
let data = fs.readFileSync("suffixes.txt", 'utf8');
|
|
116
|
+
let lines = data.split("\n");
|
|
117
|
+
for (let suffix of lines) {
|
|
118
|
+
let reverseSuffix = this.reverseString(suffix);
|
|
119
|
+
this.suffixTrie.addWord(reverseSuffix, new Word_1.Word(reverseSuffix));
|
|
120
|
+
}
|
|
121
|
+
}
|
|
122
|
+
/**
|
|
123
|
+
* Reads the file for correct surface forms and their most frequent root forms, in other words, the surface forms
|
|
124
|
+
* which have at least one morphological analysis in Turkish.
|
|
125
|
+
* @param fileName Input file containing analyzable surface forms and their root forms.
|
|
126
|
+
*/
|
|
127
|
+
addParsedSurfaceForms(fileName) {
|
|
128
|
+
this.parsedSurfaceForms = FileUtils_1.FileUtils.readHashMap(fileName);
|
|
129
|
+
}
|
|
130
|
+
/**
|
|
131
|
+
* Reads the file for foreign words and their pronunciations.
|
|
132
|
+
* @param fileName Input file containing foreign words and their pronunciations.
|
|
133
|
+
*/
|
|
134
|
+
addPronunciations(fileName) {
|
|
135
|
+
this.pronunciations = FileUtils_1.FileUtils.readHashMap(fileName);
|
|
136
|
+
}
|
|
137
|
+
/**
|
|
138
|
+
* The getPossibleWords method takes {@link MorphologicalParse} and {@link MetamorphicParse} as input.
|
|
139
|
+
* First it determines whether the given morphologicalParse is the root verb and whether it contains a verb tag.
|
|
140
|
+
* Then it creates new transition with -mak and creates a new {@link Set} result.
|
|
141
|
+
* <p>
|
|
142
|
+
* It takes the given {@link MetamorphicParse} input as currentWord and if there is a compound word starting with the
|
|
143
|
+
* currentWord, it gets this compoundWord from dictionaryTrie. If there is a compoundWord and the difference of the
|
|
144
|
+
* currentWord and compundWords is less than 3 than compoundWord is added to the result, otherwise currentWord is added.
|
|
145
|
+
* <p>
|
|
146
|
+
* Then it gets the root from parse input as a currentRoot. If it is not null, and morphologicalParse input is verb,
|
|
147
|
+
* it directly adds the verb to result after making transition to currentRoot with currentWord String. Else, it creates a new
|
|
148
|
+
* transition with -lar and make this transition then adds to the result.
|
|
149
|
+
*
|
|
150
|
+
* @param morphologicalParse {@link MorphologicalParse} type input.
|
|
151
|
+
* @param metamorphicParse {@link MetamorphicParse} type input.
|
|
152
|
+
* @return {@link HashSet} result.
|
|
153
|
+
*/
|
|
154
|
+
getPossibleWords(morphologicalParse, metamorphicParse) {
|
|
155
|
+
let isRootVerb = morphologicalParse.getRootPos() == "VERB";
|
|
156
|
+
let containsVerb = morphologicalParse.containsTag(MorphologicalTag_1.MorphologicalTag.VERB);
|
|
157
|
+
let verbTransition = new Transition_1.Transition("mAk");
|
|
158
|
+
let result = new Set();
|
|
159
|
+
if (metamorphicParse == undefined || metamorphicParse.getWord() == undefined) {
|
|
77
160
|
return result;
|
|
78
161
|
}
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
let lines = data.split("\n");
|
|
88
|
-
for (let suffix of lines) {
|
|
89
|
-
let reverseSuffix = this.reverseString(suffix);
|
|
90
|
-
this.suffixTrie.addWord(reverseSuffix, new Word_1.Word(reverseSuffix));
|
|
91
|
-
}
|
|
162
|
+
let currentWord = metamorphicParse.getWord().getName();
|
|
163
|
+
let pluralIndex = -1;
|
|
164
|
+
let compoundWord = this.dictionaryTrie.getCompoundWordStartingWith(currentWord);
|
|
165
|
+
if (!isRootVerb) {
|
|
166
|
+
if (compoundWord != null && compoundWord.getName().length - currentWord.length < 3) {
|
|
167
|
+
result.add(compoundWord.getName());
|
|
168
|
+
}
|
|
169
|
+
result.add(currentWord);
|
|
92
170
|
}
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
/**
|
|
109
|
-
* The getPossibleWords method takes {@link MorphologicalParse} and {@link MetamorphicParse} as input.
|
|
110
|
-
* First it determines whether the given morphologicalParse is the root verb and whether it contains a verb tag.
|
|
111
|
-
* Then it creates new transition with -mak and creates a new {@link Set} result.
|
|
112
|
-
* <p>
|
|
113
|
-
* It takes the given {@link MetamorphicParse} input as currentWord and if there is a compound word starting with the
|
|
114
|
-
* currentWord, it gets this compoundWord from dictionaryTrie. If there is a compoundWord and the difference of the
|
|
115
|
-
* currentWord and compundWords is less than 3 than compoundWord is added to the result, otherwise currentWord is added.
|
|
116
|
-
* <p>
|
|
117
|
-
* Then it gets the root from parse input as a currentRoot. If it is not null, and morphologicalParse input is verb,
|
|
118
|
-
* it directly adds the verb to result after making transition to currentRoot with currentWord String. Else, it creates a new
|
|
119
|
-
* transition with -lar and make this transition then adds to the result.
|
|
120
|
-
*
|
|
121
|
-
* @param morphologicalParse {@link MorphologicalParse} type input.
|
|
122
|
-
* @param metamorphicParse {@link MetamorphicParse} type input.
|
|
123
|
-
* @return {@link HashSet} result.
|
|
124
|
-
*/
|
|
125
|
-
getPossibleWords(morphologicalParse, metamorphicParse) {
|
|
126
|
-
let isRootVerb = morphologicalParse.getRootPos() == "VERB";
|
|
127
|
-
let containsVerb = morphologicalParse.containsTag(MorphologicalTag_1.MorphologicalTag.VERB);
|
|
128
|
-
let verbTransition = new Transition_1.Transition("mAk");
|
|
129
|
-
let result = new Set();
|
|
130
|
-
if (metamorphicParse == undefined || metamorphicParse.getWord() == undefined) {
|
|
131
|
-
return result;
|
|
132
|
-
}
|
|
133
|
-
let currentWord = metamorphicParse.getWord().getName();
|
|
134
|
-
let pluralIndex = -1;
|
|
135
|
-
let compoundWord = this.dictionaryTrie.getCompoundWordStartingWith(currentWord);
|
|
136
|
-
if (!isRootVerb) {
|
|
137
|
-
if (compoundWord != null && compoundWord.getName().length - currentWord.length < 3) {
|
|
138
|
-
result.add(compoundWord.getName());
|
|
171
|
+
let currentRoot = this.dictionary.getWord(metamorphicParse.getWord().getName());
|
|
172
|
+
if (currentRoot == undefined && compoundWord != undefined) {
|
|
173
|
+
currentRoot = compoundWord;
|
|
174
|
+
}
|
|
175
|
+
if (currentRoot != undefined) {
|
|
176
|
+
if (isRootVerb) {
|
|
177
|
+
let verbWord = verbTransition.makeTransition(currentRoot, currentWord);
|
|
178
|
+
result.add(verbWord);
|
|
179
|
+
}
|
|
180
|
+
let pluralWord = undefined;
|
|
181
|
+
for (let i = 1; i < metamorphicParse.size(); i++) {
|
|
182
|
+
let transition = new Transition_1.Transition(metamorphicParse.getMetaMorpheme(i), undefined, undefined);
|
|
183
|
+
if (metamorphicParse.getMetaMorpheme(i) == "lAr") {
|
|
184
|
+
pluralWord = currentWord;
|
|
185
|
+
pluralIndex = i + 1;
|
|
139
186
|
}
|
|
187
|
+
currentWord = transition.makeTransition(currentRoot, currentWord);
|
|
140
188
|
result.add(currentWord);
|
|
141
|
-
|
|
142
|
-
let currentRoot = this.dictionary.getWord(metamorphicParse.getWord().getName());
|
|
143
|
-
if (currentRoot == undefined && compoundWord != undefined) {
|
|
144
|
-
currentRoot = compoundWord;
|
|
145
|
-
}
|
|
146
|
-
if (currentRoot != undefined) {
|
|
147
|
-
if (isRootVerb) {
|
|
189
|
+
if (containsVerb) {
|
|
148
190
|
let verbWord = verbTransition.makeTransition(currentRoot, currentWord);
|
|
149
191
|
result.add(verbWord);
|
|
150
192
|
}
|
|
151
|
-
|
|
152
|
-
|
|
193
|
+
}
|
|
194
|
+
if (pluralWord != null) {
|
|
195
|
+
currentWord = pluralWord;
|
|
196
|
+
for (let i = pluralIndex; i < metamorphicParse.size(); i++) {
|
|
153
197
|
let transition = new Transition_1.Transition(metamorphicParse.getMetaMorpheme(i), undefined, undefined);
|
|
154
|
-
if (metamorphicParse.getMetaMorpheme(i) == "lAr") {
|
|
155
|
-
pluralWord = currentWord;
|
|
156
|
-
pluralIndex = i + 1;
|
|
157
|
-
}
|
|
158
198
|
currentWord = transition.makeTransition(currentRoot, currentWord);
|
|
159
199
|
result.add(currentWord);
|
|
160
200
|
if (containsVerb) {
|
|
@@ -162,1212 +202,1200 @@
|
|
|
162
202
|
result.add(verbWord);
|
|
163
203
|
}
|
|
164
204
|
}
|
|
165
|
-
if (pluralWord != null) {
|
|
166
|
-
currentWord = pluralWord;
|
|
167
|
-
for (let i = pluralIndex; i < metamorphicParse.size(); i++) {
|
|
168
|
-
let transition = new Transition_1.Transition(metamorphicParse.getMetaMorpheme(i), undefined, undefined);
|
|
169
|
-
currentWord = transition.makeTransition(currentRoot, currentWord);
|
|
170
|
-
result.add(currentWord);
|
|
171
|
-
if (containsVerb) {
|
|
172
|
-
let verbWord = verbTransition.makeTransition(currentRoot, currentWord);
|
|
173
|
-
result.add(verbWord);
|
|
174
|
-
}
|
|
175
|
-
}
|
|
176
|
-
}
|
|
177
205
|
}
|
|
178
|
-
return result;
|
|
179
206
|
}
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
207
|
+
return result;
|
|
208
|
+
}
|
|
209
|
+
/**
|
|
210
|
+
* The getDictionary method is used to get TxtDictionary.
|
|
211
|
+
*
|
|
212
|
+
* @return TxtDictionary type dictionary.
|
|
213
|
+
*/
|
|
214
|
+
getDictionary() {
|
|
215
|
+
return this.dictionary;
|
|
216
|
+
}
|
|
217
|
+
/**
|
|
218
|
+
* The getFiniteStateMachine method is used to get FiniteStateMachine.
|
|
219
|
+
*
|
|
220
|
+
* @return FiniteStateMachine type finiteStateMachine.
|
|
221
|
+
*/
|
|
222
|
+
getFiniteStateMachine() {
|
|
223
|
+
return this.finiteStateMachine;
|
|
224
|
+
}
|
|
225
|
+
/**
|
|
226
|
+
* The isPossibleSubstring method first checks whether given short and long strings are equal to root word.
|
|
227
|
+
* Then, compares both short and long strings' chars till the last two chars of short string. In the presence of mismatch,
|
|
228
|
+
* false is returned. On the other hand, it counts the distance between two strings until it becomes greater than 2,
|
|
229
|
+
* which is the MAX_DISTANCE also finds the index of the last char.
|
|
230
|
+
* <p>
|
|
231
|
+
* If the substring is a rootWord and equals to 'ben', which is a special case or root holds the lastIdropsDuringSuffixation or
|
|
232
|
+
* lastIdropsDuringPassiveSuffixation conditions, then it returns true if distance is not greater than MAX_DISTANCE.
|
|
233
|
+
* <p>
|
|
234
|
+
* On the other hand, if the shortStrong ends with one of these chars 'e, a, p, ç, t, k' and 't 's a rootWord with
|
|
235
|
+
* the conditions of rootSoftenDuringSuffixation, vowelEChangesToIDuringYSuffixation, vowelAChangesToIDuringYSuffixation
|
|
236
|
+
* or endingKChangesIntoG then it returns true if the last index is not equal to 2 and distance is not greater than
|
|
237
|
+
* MAX_DISTANCE and false otherwise.
|
|
238
|
+
*
|
|
239
|
+
* @param shortString the possible substring.
|
|
240
|
+
* @param longString the long string to compare with substring.
|
|
241
|
+
* @param root the root of the long string.
|
|
242
|
+
* @return true if given substring is the actual substring of the longString, false otherwise.
|
|
243
|
+
*/
|
|
244
|
+
isPossibleSubstring(shortString, longString, root) {
|
|
245
|
+
let rootWord = ((shortString == root.getName()) || longString == root.getName());
|
|
246
|
+
let distance = 0, last = 1;
|
|
247
|
+
for (let j = 0; j < shortString.length; j++) {
|
|
248
|
+
if (shortString.charAt(j) != longString.charAt(j)) {
|
|
249
|
+
if (j < shortString.length - 2) {
|
|
250
|
+
return false;
|
|
251
|
+
}
|
|
252
|
+
last = shortString.length - j;
|
|
253
|
+
distance++;
|
|
254
|
+
if (distance > FsmMorphologicalAnalyzer.MAX_DISTANCE) {
|
|
255
|
+
break;
|
|
228
256
|
}
|
|
229
257
|
}
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
258
|
+
}
|
|
259
|
+
if (rootWord && (root.getName() == "ben" || root.getName() == "sen" ||
|
|
260
|
+
root.lastIdropsDuringSuffixation() || root.lastIdropsDuringPassiveSuffixation())) {
|
|
261
|
+
return (distance <= FsmMorphologicalAnalyzer.MAX_DISTANCE);
|
|
262
|
+
}
|
|
263
|
+
else {
|
|
264
|
+
if (shortString.endsWith("e") || shortString.endsWith("a") || shortString.endsWith("p") ||
|
|
265
|
+
shortString.endsWith("ç") || shortString.endsWith("t") || shortString.endsWith("k") ||
|
|
266
|
+
(rootWord && (root.rootSoftenDuringSuffixation() || root.vowelEChangesToIDuringYSuffixation() ||
|
|
267
|
+
root.vowelAChangesToIDuringYSuffixation() || root.endingKChangesIntoG()))) {
|
|
268
|
+
return (last != 2 && distance <= FsmMorphologicalAnalyzer.MAX_DISTANCE - 1);
|
|
233
269
|
}
|
|
234
270
|
else {
|
|
235
|
-
|
|
236
|
-
shortString.endsWith("ç") || shortString.endsWith("t") || shortString.endsWith("k") ||
|
|
237
|
-
(rootWord && (root.rootSoftenDuringSuffixation() || root.vowelEChangesToIDuringYSuffixation() ||
|
|
238
|
-
root.vowelAChangesToIDuringYSuffixation() || root.endingKChangesIntoG()))) {
|
|
239
|
-
return (last != 2 && distance <= FsmMorphologicalAnalyzer.MAX_DISTANCE - 1);
|
|
240
|
-
}
|
|
241
|
-
else {
|
|
242
|
-
return (distance <= FsmMorphologicalAnalyzer.MAX_DISTANCE - 2);
|
|
243
|
-
}
|
|
271
|
+
return (distance <= FsmMorphologicalAnalyzer.MAX_DISTANCE - 2);
|
|
244
272
|
}
|
|
245
273
|
}
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
274
|
+
}
|
|
275
|
+
/**
|
|
276
|
+
* The initializeParseList method initializes the given given fsm ArrayList with given root words by parsing them.
|
|
277
|
+
* <p>
|
|
278
|
+
* It checks many conditions;
|
|
279
|
+
* isPlural; if root holds the condition then it gets the state with the name of NominalRootPlural, then
|
|
280
|
+
* creates a new parsing and adds this to the input fsmParse Arraylist.
|
|
281
|
+
* Ex : Açıktohumlular
|
|
282
|
+
* <p>
|
|
283
|
+
* !isPlural and isPortmanteauEndingWithSI, if root holds the conditions then it gets the state with the
|
|
284
|
+
* name of NominalRootNoPossesive.
|
|
285
|
+
* Ex : Balarısı
|
|
286
|
+
* <p>
|
|
287
|
+
* !isPlural and isPortmanteau, if root holds the conditions then it gets the state with the name of
|
|
288
|
+
* CompoundNounRoot.
|
|
289
|
+
* Ex : Aslanağızı
|
|
290
|
+
* <p>
|
|
291
|
+
* !isPlural, !isPortmanteau and isHeader, if root holds the conditions then it gets the state with the
|
|
292
|
+
* name of HeaderRoot.
|
|
293
|
+
* Ex : </title>
|
|
294
|
+
* <p>
|
|
295
|
+
* !isPlural, !isPortmanteau and isInterjection, if root holds the conditions then it gets the state
|
|
296
|
+
* with the name of InterjectionRoot.
|
|
297
|
+
* Ex : Hey, Aa
|
|
298
|
+
* <p>
|
|
299
|
+
* !isPlural, !isPortmanteau and isDuplicate, if root holds the conditions then it gets the state
|
|
300
|
+
* with the name of DuplicateRoot.
|
|
301
|
+
* Ex : Allak,
|
|
302
|
+
* !isPlural, !isPortmanteau and isCode, if root holds the conditions then it gets the state
|
|
303
|
+
* with the name of CodeRoot.
|
|
304
|
+
* Ex : 9400f,
|
|
305
|
+
* <p>
|
|
306
|
+
* !isPlural, !isPortmanteau and isMetric, if root holds the conditions then it gets the state
|
|
307
|
+
* with the name of MetricRoot.
|
|
308
|
+
* Ex : 11x8x12,
|
|
309
|
+
* <p>
|
|
310
|
+
* !isPlural, !isPortmanteau and isNumeral, if root holds the conditions then it gets the state
|
|
311
|
+
* with the name of CardinalRoot.
|
|
312
|
+
* Ex : Yüz, bin
|
|
313
|
+
* <p>
|
|
314
|
+
* !isPlural, !isPortmanteau and isReal, if root holds the conditions then it gets the state
|
|
315
|
+
* with the name of RealRoot.
|
|
316
|
+
* Ex : 1.2
|
|
317
|
+
* <p>
|
|
318
|
+
* !isPlural, !isPortmanteau and isFraction, if root holds the conditions then it gets the state
|
|
319
|
+
* with the name of FractionRoot.
|
|
320
|
+
* Ex : 1/2
|
|
321
|
+
* <p>
|
|
322
|
+
* !isPlural, !isPortmanteau and isDate, if root holds the conditions then it gets the state
|
|
323
|
+
* with the name of DateRoot.
|
|
324
|
+
* Ex : 11/06/2018
|
|
325
|
+
* <p>
|
|
326
|
+
* !isPlural, !isPortmanteau and isPercent, if root holds the conditions then it gets the state
|
|
327
|
+
* with the name of PercentRoot.
|
|
328
|
+
* Ex : %12.5
|
|
329
|
+
* <p>
|
|
330
|
+
* !isPlural, !isPortmanteau and isRange, if root holds the conditions then it gets the state
|
|
331
|
+
* with the name of RangeRoot.
|
|
332
|
+
* Ex : 3-5
|
|
333
|
+
* <p>
|
|
334
|
+
* !isPlural, !isPortmanteau and isTime, if root holds the conditions then it gets the state
|
|
335
|
+
* with the name of TimeRoot.
|
|
336
|
+
* Ex : 13:16:08
|
|
337
|
+
* <p>
|
|
338
|
+
* !isPlural, !isPortmanteau and isOrdinal, if root holds the conditions then it gets the state
|
|
339
|
+
* with the name of OrdinalRoot.
|
|
340
|
+
* Ex : Altıncı
|
|
341
|
+
* <p>
|
|
342
|
+
* !isPlural, !isPortmanteau, and isVerb if root holds the conditions then it gets the state
|
|
343
|
+
* with the name of VerbalRoot. Or isPassive, then it gets the state with the name of PassiveHn.
|
|
344
|
+
* Ex : Anla (!isPAssive)
|
|
345
|
+
* Ex : Çağrıl (isPassive)
|
|
346
|
+
* <p>
|
|
347
|
+
* !isPlural, !isPortmanteau and isPronoun, if root holds the conditions then it gets the state
|
|
348
|
+
* with the name of PronounRoot. There are 6 different Pronoun state names, REFLEX, QUANT, QUANTPLURAL, DEMONS, PERS, QUES.
|
|
349
|
+
* REFLEX = Reflexive Pronouns Ex : kendi
|
|
350
|
+
* QUANT = Quantitative Pronouns Ex : öbür, hep, kimse, hiçbiri, bazı, kimi, biri
|
|
351
|
+
* QUANTPLURAL = Quantitative Plural Pronouns Ex : tümü, çoğu, hepsi
|
|
352
|
+
* DEMONS = Demonstrative Pronouns Ex : o, bu, şu
|
|
353
|
+
* PERS = Personal Pronouns Ex : ben, sen, o, biz, siz, onlar
|
|
354
|
+
* QUES = Interrogatıve Pronouns Ex : nere, ne, kim, hangi
|
|
355
|
+
* <p>
|
|
356
|
+
* !isPlural, !isPortmanteau and isAdjective, if root holds the conditions then it gets the state
|
|
357
|
+
* with the name of AdjectiveRoot.
|
|
358
|
+
* Ex : Absürt, Abes
|
|
359
|
+
* <p>
|
|
360
|
+
* !isPlural, !isPortmanteau and isPureAdjective, if root holds the conditions then it gets the state
|
|
361
|
+
* with the name of Adjective.
|
|
362
|
+
* Ex : Geçmiş, Cam
|
|
363
|
+
* <p>
|
|
364
|
+
* !isPlural, !isPortmanteau and isNominal, if root holds the conditions then it gets the state
|
|
365
|
+
* with the name of NominalRoot.
|
|
366
|
+
* Ex : Görüş
|
|
367
|
+
* <p>
|
|
368
|
+
* !isPlural, !isPortmanteau and isProper, if root holds the conditions then it gets the state
|
|
369
|
+
* with the name of ProperRoot.
|
|
370
|
+
* Ex : Abdi
|
|
371
|
+
* <p>
|
|
372
|
+
* !isPlural, !isPortmanteau and isQuestion, if root holds the conditions then it gets the state
|
|
373
|
+
* with the name of QuestionRoot.
|
|
374
|
+
* Ex : Mi, mü
|
|
375
|
+
* <p>
|
|
376
|
+
* !isPlural, !isPortmanteau and isDeterminer, if root holds the conditions then it gets the state
|
|
377
|
+
* with the name of DeterminerRoot.
|
|
378
|
+
* Ex : Çok, bir
|
|
379
|
+
* <p>
|
|
380
|
+
* !isPlural, !isPortmanteau and isConjunction, if root holds the conditions then it gets the state
|
|
381
|
+
* with the name of ConjunctionRoot.
|
|
382
|
+
* Ex : Ama , ancak
|
|
383
|
+
* <p>
|
|
384
|
+
* !isPlural, !isPortmanteau and isPostP, if root holds the conditions then it gets the state
|
|
385
|
+
* with the name of PostP.
|
|
386
|
+
* Ex : Ait, dair
|
|
387
|
+
* <p>
|
|
388
|
+
* !isPlural, !isPortmanteau and isAdverb, if root holds the conditions then it gets the state
|
|
389
|
+
* with the name of AdverbRoot.
|
|
390
|
+
* Ex : Acilen
|
|
391
|
+
*
|
|
392
|
+
* @param fsmParse ArrayList to initialize.
|
|
393
|
+
* @param root word to check properties and add to fsmParse according to them.
|
|
394
|
+
* @param isProper is used to check a word is proper or not.
|
|
395
|
+
*/
|
|
396
|
+
initializeParseList(fsmParse, root, isProper) {
|
|
397
|
+
let currentFsmParse;
|
|
398
|
+
if (root.isPlural()) {
|
|
399
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("NominalRootPlural"));
|
|
400
|
+
fsmParse.push(currentFsmParse);
|
|
401
|
+
}
|
|
402
|
+
else {
|
|
403
|
+
if (root.isPortmanteauEndingWithSI()) {
|
|
404
|
+
currentFsmParse = new FsmParse_1.FsmParse(root.getName().substring(0, root.getName().length - 2), this.finiteStateMachine.getState("CompoundNounRoot"));
|
|
405
|
+
fsmParse.push(currentFsmParse);
|
|
406
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("NominalRootNoPossesive"));
|
|
371
407
|
fsmParse.push(currentFsmParse);
|
|
372
408
|
}
|
|
373
409
|
else {
|
|
374
|
-
if (root.
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
if (root.isPortmanteauFacedVowelEllipsis()) {
|
|
410
|
+
if (root.isPortmanteau()) {
|
|
411
|
+
if (root.isPortmanteauFacedVowelEllipsis()) {
|
|
412
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("NominalRootNoPossesive"));
|
|
413
|
+
fsmParse.push(currentFsmParse);
|
|
414
|
+
currentFsmParse = new FsmParse_1.FsmParse(root.getName().substring(0, root.getName().length - 2) + root.getName().charAt(root.getName().length - 1) + root.getName().charAt(root.getName().length - 2), this.finiteStateMachine.getState("CompoundNounRoot"));
|
|
415
|
+
}
|
|
416
|
+
else {
|
|
417
|
+
if (root.isPortmanteauFacedSoftening()) {
|
|
383
418
|
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("NominalRootNoPossesive"));
|
|
384
419
|
fsmParse.push(currentFsmParse);
|
|
385
|
-
|
|
420
|
+
switch (root.getName().charAt(root.getName().length - 2)) {
|
|
421
|
+
case 'b':
|
|
422
|
+
currentFsmParse = new FsmParse_1.FsmParse(root.getName().substring(0, root.getName().length - 2) + 'p', this.finiteStateMachine.getState("CompoundNounRoot"));
|
|
423
|
+
break;
|
|
424
|
+
case 'c':
|
|
425
|
+
currentFsmParse = new FsmParse_1.FsmParse(root.getName().substring(0, root.getName().length - 2) + 'ç', this.finiteStateMachine.getState("CompoundNounRoot"));
|
|
426
|
+
break;
|
|
427
|
+
case 'd':
|
|
428
|
+
currentFsmParse = new FsmParse_1.FsmParse(root.getName().substring(0, root.getName().length - 2) + 't', this.finiteStateMachine.getState("CompoundNounRoot"));
|
|
429
|
+
break;
|
|
430
|
+
case 'ğ':
|
|
431
|
+
currentFsmParse = new FsmParse_1.FsmParse(root.getName().substring(0, root.getName().length - 2) + 'k', this.finiteStateMachine.getState("CompoundNounRoot"));
|
|
432
|
+
break;
|
|
433
|
+
default:
|
|
434
|
+
currentFsmParse = new FsmParse_1.FsmParse(root.getName().substring(0, root.getName().length - 1), this.finiteStateMachine.getState("CompoundNounRoot"));
|
|
435
|
+
}
|
|
386
436
|
}
|
|
387
437
|
else {
|
|
388
|
-
|
|
389
|
-
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("NominalRootNoPossesive"));
|
|
390
|
-
fsmParse.push(currentFsmParse);
|
|
391
|
-
switch (root.getName().charAt(root.getName().length - 2)) {
|
|
392
|
-
case 'b':
|
|
393
|
-
currentFsmParse = new FsmParse_1.FsmParse(root.getName().substring(0, root.getName().length - 2) + 'p', this.finiteStateMachine.getState("CompoundNounRoot"));
|
|
394
|
-
break;
|
|
395
|
-
case 'c':
|
|
396
|
-
currentFsmParse = new FsmParse_1.FsmParse(root.getName().substring(0, root.getName().length - 2) + 'ç', this.finiteStateMachine.getState("CompoundNounRoot"));
|
|
397
|
-
break;
|
|
398
|
-
case 'd':
|
|
399
|
-
currentFsmParse = new FsmParse_1.FsmParse(root.getName().substring(0, root.getName().length - 2) + 't', this.finiteStateMachine.getState("CompoundNounRoot"));
|
|
400
|
-
break;
|
|
401
|
-
case 'ğ':
|
|
402
|
-
currentFsmParse = new FsmParse_1.FsmParse(root.getName().substring(0, root.getName().length - 2) + 'k', this.finiteStateMachine.getState("CompoundNounRoot"));
|
|
403
|
-
break;
|
|
404
|
-
default:
|
|
405
|
-
currentFsmParse = new FsmParse_1.FsmParse(root.getName().substring(0, root.getName().length - 1), this.finiteStateMachine.getState("CompoundNounRoot"));
|
|
406
|
-
}
|
|
407
|
-
}
|
|
408
|
-
else {
|
|
409
|
-
currentFsmParse = new FsmParse_1.FsmParse(root.getName().substring(0, root.getName().length - 1), this.finiteStateMachine.getState("CompoundNounRoot"));
|
|
410
|
-
}
|
|
438
|
+
currentFsmParse = new FsmParse_1.FsmParse(root.getName().substring(0, root.getName().length - 1), this.finiteStateMachine.getState("CompoundNounRoot"));
|
|
411
439
|
}
|
|
440
|
+
}
|
|
441
|
+
fsmParse.push(currentFsmParse);
|
|
442
|
+
}
|
|
443
|
+
else {
|
|
444
|
+
if (root.isHeader()) {
|
|
445
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("HeaderRoot"));
|
|
412
446
|
fsmParse.push(currentFsmParse);
|
|
413
447
|
}
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
if (root.
|
|
464
|
-
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("
|
|
465
|
-
fsmParse.push(currentFsmParse);
|
|
448
|
+
if (root.isInterjection()) {
|
|
449
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("InterjectionRoot"));
|
|
450
|
+
fsmParse.push(currentFsmParse);
|
|
451
|
+
}
|
|
452
|
+
if (root.isDuplicate()) {
|
|
453
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("DuplicateRoot"));
|
|
454
|
+
fsmParse.push(currentFsmParse);
|
|
455
|
+
}
|
|
456
|
+
if (root.isCode()) {
|
|
457
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("CodeRoot"));
|
|
458
|
+
fsmParse.push(currentFsmParse);
|
|
459
|
+
}
|
|
460
|
+
if (root.isMetric()) {
|
|
461
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("MetricRoot"));
|
|
462
|
+
fsmParse.push(currentFsmParse);
|
|
463
|
+
}
|
|
464
|
+
if (root.isNumeral()) {
|
|
465
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("CardinalRoot"));
|
|
466
|
+
fsmParse.push(currentFsmParse);
|
|
467
|
+
}
|
|
468
|
+
if (root.isReal()) {
|
|
469
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("RealRoot"));
|
|
470
|
+
fsmParse.push(currentFsmParse);
|
|
471
|
+
}
|
|
472
|
+
if (root.isFraction()) {
|
|
473
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("FractionRoot"));
|
|
474
|
+
fsmParse.push(currentFsmParse);
|
|
475
|
+
}
|
|
476
|
+
if (root.isDate()) {
|
|
477
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("DateRoot"));
|
|
478
|
+
fsmParse.push(currentFsmParse);
|
|
479
|
+
}
|
|
480
|
+
if (root.isPercent()) {
|
|
481
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("PercentRoot"));
|
|
482
|
+
fsmParse.push(currentFsmParse);
|
|
483
|
+
}
|
|
484
|
+
if (root.isRange()) {
|
|
485
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("RangeRoot"));
|
|
486
|
+
fsmParse.push(currentFsmParse);
|
|
487
|
+
}
|
|
488
|
+
if (root.isTime()) {
|
|
489
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("TimeRoot"));
|
|
490
|
+
fsmParse.push(currentFsmParse);
|
|
491
|
+
}
|
|
492
|
+
if (root.isOrdinal()) {
|
|
493
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("OrdinalRoot"));
|
|
494
|
+
fsmParse.push(currentFsmParse);
|
|
495
|
+
}
|
|
496
|
+
if (root.isVerb() || root.isPassive()) {
|
|
497
|
+
if (root.verbType() != "") {
|
|
498
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("VerbalRoot(" + root.verbType() + ")"));
|
|
466
499
|
}
|
|
467
|
-
|
|
468
|
-
if (root.
|
|
469
|
-
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("VerbalRoot
|
|
500
|
+
else {
|
|
501
|
+
if (!root.isPassive()) {
|
|
502
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("VerbalRoot"));
|
|
470
503
|
}
|
|
471
504
|
else {
|
|
472
|
-
|
|
473
|
-
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("VerbalRoot"));
|
|
474
|
-
}
|
|
475
|
-
else {
|
|
476
|
-
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("PassiveHn"));
|
|
477
|
-
}
|
|
478
|
-
}
|
|
479
|
-
fsmParse.push(currentFsmParse);
|
|
480
|
-
}
|
|
481
|
-
if (root.isPronoun()) {
|
|
482
|
-
if (root.getName() == "kendi") {
|
|
483
|
-
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("PronounRoot(REFLEX)"));
|
|
484
|
-
fsmParse.push(currentFsmParse);
|
|
485
|
-
}
|
|
486
|
-
if (root.getName() == "öbür" || root.getName() == "öteki" || root.getName() == "hep" || root.getName() == "kimse" || root.getName() == "diğeri" || root.getName() == "hiçbiri" || root.getName() == "böylesi" || root.getName() == "birbiri" || root.getName() == "birbirleri" || root.getName() == "biri" || root.getName() == "başkası" || root.getName() == "bazı" || root.getName() == "kimi") {
|
|
487
|
-
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("PronounRoot(QUANT)"));
|
|
488
|
-
fsmParse.push(currentFsmParse);
|
|
505
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("PassiveHn"));
|
|
489
506
|
}
|
|
490
|
-
if (root.getName() == "tümü" || root.getName() == "topu" || root.getName() == "herkes" || root.getName() == "cümlesi" || root.getName() == "çoğu" || root.getName() == "birçoğu" || root.getName() == "birkaçı" || root.getName() == "birçokları" || root.getName() == "hepsi") {
|
|
491
|
-
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("PronounRoot(QUANTPLURAL)"));
|
|
492
|
-
fsmParse.push(currentFsmParse);
|
|
493
|
-
}
|
|
494
|
-
if (root.getName() == "o" || root.getName() == "bu" || root.getName() == "şu") {
|
|
495
|
-
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("PronounRoot(DEMONS)"));
|
|
496
|
-
fsmParse.push(currentFsmParse);
|
|
497
|
-
}
|
|
498
|
-
if (root.getName() == "ben" || root.getName() == "sen" || root.getName() == "o" || root.getName() == "biz" || root.getName() == "siz" || root.getName() == "onlar") {
|
|
499
|
-
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("PronounRoot(PERS)"));
|
|
500
|
-
fsmParse.push(currentFsmParse);
|
|
501
|
-
}
|
|
502
|
-
if (root.getName() == "nere" || root.getName() == "ne" || root.getName() == "kaçı" || root.getName() == "kim" || root.getName() == "hangi") {
|
|
503
|
-
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("PronounRoot(QUES)"));
|
|
504
|
-
fsmParse.push(currentFsmParse);
|
|
505
|
-
}
|
|
506
|
-
}
|
|
507
|
-
if (root.isAdjective()) {
|
|
508
|
-
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("AdjectiveRoot"));
|
|
509
|
-
fsmParse.push(currentFsmParse);
|
|
510
507
|
}
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("NominalRoot"));
|
|
517
|
-
fsmParse.push(currentFsmParse);
|
|
518
|
-
}
|
|
519
|
-
if (root.isAbbreviation()) {
|
|
520
|
-
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("NominalRoot"));
|
|
521
|
-
fsmParse.push(currentFsmParse);
|
|
522
|
-
}
|
|
523
|
-
if (root.isProperNoun() && isProper) {
|
|
524
|
-
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("ProperRoot"));
|
|
508
|
+
fsmParse.push(currentFsmParse);
|
|
509
|
+
}
|
|
510
|
+
if (root.isPronoun()) {
|
|
511
|
+
if (root.getName() == "kendi") {
|
|
512
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("PronounRoot(REFLEX)"));
|
|
525
513
|
fsmParse.push(currentFsmParse);
|
|
526
514
|
}
|
|
527
|
-
if (root.
|
|
528
|
-
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("
|
|
515
|
+
if (root.getName() == "öbür" || root.getName() == "öteki" || root.getName() == "hep" || root.getName() == "kimse" || root.getName() == "diğeri" || root.getName() == "hiçbiri" || root.getName() == "böylesi" || root.getName() == "birbiri" || root.getName() == "birbirleri" || root.getName() == "biri" || root.getName() == "başkası" || root.getName() == "bazı" || root.getName() == "kimi") {
|
|
516
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("PronounRoot(QUANT)"));
|
|
529
517
|
fsmParse.push(currentFsmParse);
|
|
530
518
|
}
|
|
531
|
-
if (root.
|
|
532
|
-
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("
|
|
519
|
+
if (root.getName() == "tümü" || root.getName() == "topu" || root.getName() == "herkes" || root.getName() == "cümlesi" || root.getName() == "çoğu" || root.getName() == "birçoğu" || root.getName() == "birkaçı" || root.getName() == "birçokları" || root.getName() == "hepsi") {
|
|
520
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("PronounRoot(QUANTPLURAL)"));
|
|
533
521
|
fsmParse.push(currentFsmParse);
|
|
534
522
|
}
|
|
535
|
-
if (root.
|
|
536
|
-
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("
|
|
523
|
+
if (root.getName() == "o" || root.getName() == "bu" || root.getName() == "şu") {
|
|
524
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("PronounRoot(DEMONS)"));
|
|
537
525
|
fsmParse.push(currentFsmParse);
|
|
538
526
|
}
|
|
539
|
-
if (root.
|
|
540
|
-
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("
|
|
527
|
+
if (root.getName() == "ben" || root.getName() == "sen" || root.getName() == "o" || root.getName() == "biz" || root.getName() == "siz" || root.getName() == "onlar") {
|
|
528
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("PronounRoot(PERS)"));
|
|
541
529
|
fsmParse.push(currentFsmParse);
|
|
542
530
|
}
|
|
543
|
-
if (root.
|
|
544
|
-
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("
|
|
531
|
+
if (root.getName() == "nere" || root.getName() == "ne" || root.getName() == "kaçı" || root.getName() == "kim" || root.getName() == "hangi") {
|
|
532
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("PronounRoot(QUES)"));
|
|
545
533
|
fsmParse.push(currentFsmParse);
|
|
546
534
|
}
|
|
547
535
|
}
|
|
536
|
+
if (root.isAdjective()) {
|
|
537
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("AdjectiveRoot"));
|
|
538
|
+
fsmParse.push(currentFsmParse);
|
|
539
|
+
}
|
|
540
|
+
if (root.isPureAdjective()) {
|
|
541
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("Adjective"));
|
|
542
|
+
fsmParse.push(currentFsmParse);
|
|
543
|
+
}
|
|
544
|
+
if (root.isNominal()) {
|
|
545
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("NominalRoot"));
|
|
546
|
+
fsmParse.push(currentFsmParse);
|
|
547
|
+
}
|
|
548
|
+
if (root.isAbbreviation()) {
|
|
549
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("NominalRoot"));
|
|
550
|
+
fsmParse.push(currentFsmParse);
|
|
551
|
+
}
|
|
552
|
+
if (root.isProperNoun() && isProper) {
|
|
553
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("ProperRoot"));
|
|
554
|
+
fsmParse.push(currentFsmParse);
|
|
555
|
+
}
|
|
556
|
+
if (root.isQuestion()) {
|
|
557
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("QuestionRoot"));
|
|
558
|
+
fsmParse.push(currentFsmParse);
|
|
559
|
+
}
|
|
560
|
+
if (root.isDeterminer()) {
|
|
561
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("DeterminerRoot"));
|
|
562
|
+
fsmParse.push(currentFsmParse);
|
|
563
|
+
}
|
|
564
|
+
if (root.isConjunction()) {
|
|
565
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("ConjunctionRoot"));
|
|
566
|
+
fsmParse.push(currentFsmParse);
|
|
567
|
+
}
|
|
568
|
+
if (root.isPostP()) {
|
|
569
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("PostP"));
|
|
570
|
+
fsmParse.push(currentFsmParse);
|
|
571
|
+
}
|
|
572
|
+
if (root.isAdverb()) {
|
|
573
|
+
currentFsmParse = new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState("AdverbRoot"));
|
|
574
|
+
fsmParse.push(currentFsmParse);
|
|
575
|
+
}
|
|
548
576
|
}
|
|
549
577
|
}
|
|
550
578
|
}
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
579
|
+
}
|
|
580
|
+
/**
|
|
581
|
+
* The initializeParseListFromRoot method is used to create an {@link Array} which consists of initial fsm parsings.
|
|
582
|
+
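* First, it calls the initializeParseList method with the given root and Array. Then, for each alternation condition
|
|
583
|
+
* that holds for the root, it clones the root, removes the related flags and calls initializeParseList again with the clone.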
|
|
584
|
+
* <p>
|
|
585
|
+
*
|
|
586
|
+
* @param parseList ArrayList to initialize.
|
|
587
|
+
* @param root the root form to generate initial parse list.
|
|
588
|
+
* @param isProper is used to check whether a word is proper or not.
|
|
589
|
+
*/
|
|
590
|
+
initializeParseListFromRoot(parseList, root, isProper) {
|
|
591
|
+
this.initializeParseList(parseList, root, isProper);
|
|
592
|
+
if (root.obeysAndNotObeysVowelHarmonyDuringAgglutination()) {
|
|
593
|
+
let newRoot = root.clone();
|
|
594
|
+
newRoot.removeFlag("IS_UU");
|
|
595
|
+
newRoot.removeFlag("IS_UUU");
|
|
596
|
+
this.initializeParseList(parseList, newRoot, isProper);
|
|
597
|
+
}
|
|
598
|
+
if (root.rootSoftenAndNotSoftenDuringSuffixation()) {
|
|
599
|
+
let newRoot = root.clone();
|
|
600
|
+
newRoot.removeFlag("IS_SD");
|
|
601
|
+
newRoot.removeFlag("IS_SDD");
|
|
602
|
+
this.initializeParseList(parseList, newRoot, isProper);
|
|
603
|
+
}
|
|
604
|
+
if (root.lastIDropsAndNotDropDuringSuffixation()) {
|
|
605
|
+
let newRoot = root.clone();
|
|
606
|
+
newRoot.removeFlag("IS_UD");
|
|
607
|
+
newRoot.removeFlag("IS_UDD");
|
|
608
|
+
this.initializeParseList(parseList, newRoot, isProper);
|
|
609
|
+
}
|
|
610
|
+
if (root.duplicatesAndNotDuplicatesDuringSuffixation()) {
|
|
611
|
+
let newRoot = root.clone();
|
|
612
|
+
newRoot.removeFlag("IS_ST");
|
|
613
|
+
newRoot.removeFlag("IS_STT");
|
|
614
|
+
this.initializeParseList(parseList, newRoot, isProper);
|
|
615
|
+
}
|
|
616
|
+
if (root.endingKChangesIntoG() && root.containsFlag("IS_OA")) {
|
|
617
|
+
let newRoot = root.clone();
|
|
618
|
+
newRoot.removeFlag("IS_OA");
|
|
619
|
+
this.initializeParseList(parseList, newRoot, isProper);
|
|
592
620
|
}
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
|
|
621
|
+
}
|
|
622
|
+
/**
|
|
623
|
+
* The initializeParseListFromSurfaceForm method is used to create an {@link Array} which consists of initial fsm parsings. First,
|
|
624
|
+
* it calls the getWordsWithPrefix method with the input String surfaceForm and generates a {@link Set}. Then, it traverses
|
|
625
|
+
* this HashSet and uses each word as a root and calls initializeParseListFromRoot method with this root and ArrayList.
|
|
626
|
+
* <p>
|
|
627
|
+
*
|
|
628
|
+
* @param surfaceForm the String used to generate a HashSet of words.
|
|
629
|
+
* @param isProper is used to check whether a word is proper or not.
|
|
630
|
+
* @return initialFsmParse ArrayList.
|
|
631
|
+
*/
|
|
632
|
+
initializeParseListFromSurfaceForm(surfaceForm, isProper) {
|
|
633
|
+
let initialFsmParse = new Array();
|
|
634
|
+
if (surfaceForm.length == 0) {
|
|
635
|
+
return initialFsmParse;
|
|
636
|
+
}
|
|
637
|
+
let words = this.dictionaryTrie.getWordsWithPrefix(surfaceForm);
|
|
638
|
+
for (let word of words) {
|
|
639
|
+
let root = word;
|
|
640
|
+
this.initializeParseListFromRoot(initialFsmParse, root, isProper);
|
|
641
|
+
}
|
|
642
|
+
return initialFsmParse;
|
|
643
|
+
}
|
|
644
|
+
/**
|
|
645
|
+
* The addNewParsesFromCurrentParseLength method initially gets the final suffix from input currentFsmParse called as currentState,
|
|
646
|
+
* and by using the currentState information it gets the new analysis. Then loops through each currentState's transition.
|
|
647
|
+
* If the currentTransition is possible, it makes the transition.
|
|
648
|
+
*
|
|
649
|
+
* @param currentFsmParse FsmParse type input.
|
|
650
|
+
* @param fsmParse a Queue of FsmParse onto which the newly generated parses are enqueued.
|
|
651
|
+
* @param maxLength Maximum length of the parse.
|
|
652
|
+
* @param root TxtWord used to make transition.
|
|
653
|
+
*/
|
|
654
|
+
addNewParsesFromCurrentParseLength(currentFsmParse, fsmParse, maxLength, root) {
|
|
655
|
+
let currentState = currentFsmParse.getFinalSuffix();
|
|
656
|
+
let currentSurfaceForm = currentFsmParse.getSurfaceForm();
|
|
657
|
+
for (let currentTransition of this.finiteStateMachine.getTransitions(currentState)) {
|
|
658
|
+
if (currentTransition.transitionPossibleFromParse(currentFsmParse) && (currentSurfaceForm != root.getName() ||
|
|
659
|
+
(currentSurfaceForm == root.getName() && currentTransition.transitionPossibleFromRoot(root, currentState)))) {
|
|
660
|
+
let tmp = currentTransition.makeTransition(root, currentSurfaceForm, currentFsmParse.getStartState());
|
|
661
|
+
if (tmp.length <= maxLength) {
|
|
662
|
+
let newFsmParse = currentFsmParse.clone();
|
|
663
|
+
newFsmParse.addSuffix(currentTransition.toState(), tmp, currentTransition.getWith(), currentTransition.toString(), currentTransition.toPos());
|
|
664
|
+
newFsmParse.setAgreement(currentTransition.getWith());
|
|
665
|
+
fsmParse.enqueue(newFsmParse);
|
|
666
|
+
}
|
|
607
667
|
}
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
|
|
668
|
+
}
|
|
669
|
+
}
|
|
670
|
+
/**
|
|
671
|
+
* The addNewParsesFromCurrentParseSurfaceForm method initially gets the final suffix from input currentFsmParse called as currentState,
|
|
672
|
+
* and by using the currentState information it gets the currentSurfaceForm. Then loops through each currentState's transition.
|
|
673
|
+
* If the currentTransition is possible, it makes the transition
|
|
674
|
+
*
|
|
675
|
+
* @param currentFsmParse FsmParse type input.
|
|
676
|
+
* @param fsmParse a Queue of FsmParse onto which the newly generated parses are enqueued.
|
|
677
|
+
* @param surfaceForm String to use during transition.
|
|
678
|
+
* @param root TxtWord used to make transition.
|
|
679
|
+
*/
|
|
680
|
+
addNewParsesFromCurrentParseSurfaceForm(currentFsmParse, fsmParse, surfaceForm, root) {
|
|
681
|
+
let currentState = currentFsmParse.getFinalSuffix();
|
|
682
|
+
let currentSurfaceForm = currentFsmParse.getSurfaceForm();
|
|
683
|
+
for (let currentTransition of this.finiteStateMachine.getTransitions(currentState)) {
|
|
684
|
+
if (currentTransition.transitionPossible(currentFsmParse.getSurfaceForm(), surfaceForm) && currentTransition.transitionPossibleFromParse(currentFsmParse) && (currentSurfaceForm != root.getName() || (currentSurfaceForm == root.getName() && currentTransition.transitionPossibleFromRoot(root, currentState)))) {
|
|
685
|
+
let tmp = currentTransition.makeTransition(root, currentSurfaceForm, currentFsmParse.getStartState());
|
|
686
|
+
if ((tmp.length < surfaceForm.length && this.isPossibleSubstring(tmp, surfaceForm, root)) || (tmp.length == surfaceForm.length && (root.lastIdropsDuringSuffixation() || (tmp == surfaceForm)))) {
|
|
687
|
+
let newFsmParse = currentFsmParse.clone();
|
|
688
|
+
newFsmParse.addSuffix(currentTransition.toState(), tmp, currentTransition.getWith(), currentTransition.toString(), currentTransition.toPos());
|
|
689
|
+
newFsmParse.setAgreement(currentTransition.getWith());
|
|
690
|
+
fsmParse.enqueue(newFsmParse);
|
|
691
|
+
}
|
|
612
692
|
}
|
|
613
|
-
return initialFsmParse;
|
|
614
693
|
}
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
|
|
694
|
+
}
|
|
695
|
+
/**
|
|
696
|
+
* The parseExists method is used to check the existence of the parse.
|
|
697
|
+
*
|
|
698
|
+
* @param fsmParse an ArrayList of FsmParse
|
|
699
|
+
* @param surfaceForm String to use during transition.
|
|
700
|
+
* @return true when the currentState is end state and input surfaceForm is equal to currentSurfaceForm, otherwise false.
|
|
701
|
+
*/
|
|
702
|
+
parseExists(fsmParse, surfaceForm) {
|
|
703
|
+
let parseQueue = new Queue_1.Queue(1000);
|
|
704
|
+
parseQueue.enqueueAll(fsmParse);
|
|
705
|
+
while (!parseQueue.isEmpty()) {
|
|
706
|
+
let currentFsmParse = parseQueue.peek();
|
|
707
|
+
parseQueue.dequeue();
|
|
708
|
+
let root = currentFsmParse.getWord();
|
|
626
709
|
let currentState = currentFsmParse.getFinalSuffix();
|
|
627
710
|
let currentSurfaceForm = currentFsmParse.getSurfaceForm();
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
(currentSurfaceForm == root.getName() && currentTransition.transitionPossibleFromRoot(root, currentState)))) {
|
|
631
|
-
let tmp = currentTransition.makeTransition(root, currentSurfaceForm, currentFsmParse.getStartState());
|
|
632
|
-
if (tmp.length <= maxLength) {
|
|
633
|
-
let newFsmParse = currentFsmParse.clone();
|
|
634
|
-
newFsmParse.addSuffix(currentTransition.toState(), tmp, currentTransition.getWith(), currentTransition.toString(), currentTransition.toPos());
|
|
635
|
-
newFsmParse.setAgreement(currentTransition.getWith());
|
|
636
|
-
fsmParse.enqueue(newFsmParse);
|
|
637
|
-
}
|
|
638
|
-
}
|
|
711
|
+
if (currentState.isEndState() && currentSurfaceForm == surfaceForm) {
|
|
712
|
+
return true;
|
|
639
713
|
}
|
|
714
|
+
this.addNewParsesFromCurrentParseSurfaceForm(currentFsmParse, parseQueue, surfaceForm, root);
|
|
640
715
|
}
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
716
|
+
return false;
|
|
717
|
+
}
|
|
718
|
+
/**
|
|
719
|
+
* The parseWordLength method is used to parse a given fsmParse. It simply adds new parses to the current parse by
|
|
720
|
+
* using the addNewParsesFromCurrentParseLength method.
|
|
721
|
+
*
|
|
722
|
+
* @param fsmParse an ArrayList of FsmParse
|
|
723
|
+
* @param maxLength maximum length of the surfaceform.
|
|
724
|
+
* @return result {@link Array} which has the currentFsmParse.
|
|
725
|
+
*/
|
|
726
|
+
parseWordLength(fsmParse, maxLength) {
|
|
727
|
+
let result = new Array();
|
|
728
|
+
let resultTransitionList = new Array();
|
|
729
|
+
let parseQueue = new Queue_1.Queue(1000);
|
|
730
|
+
parseQueue.enqueueAll(fsmParse);
|
|
731
|
+
while (!parseQueue.isEmpty()) {
|
|
732
|
+
let currentFsmParse = parseQueue.peek();
|
|
733
|
+
parseQueue.dequeue();
|
|
734
|
+
let root = currentFsmParse.getWord();
|
|
652
735
|
let currentState = currentFsmParse.getFinalSuffix();
|
|
653
736
|
let currentSurfaceForm = currentFsmParse.getSurfaceForm();
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
newFsmParse.setAgreement(currentTransition.getWith());
|
|
661
|
-
fsmParse.enqueue(newFsmParse);
|
|
662
|
-
}
|
|
737
|
+
if (currentState.isEndState() && currentSurfaceForm.length <= maxLength) {
|
|
738
|
+
let currentTransitionList = currentSurfaceForm + " " + currentFsmParse.getFsmParseTransitionList();
|
|
739
|
+
if (!resultTransitionList.includes(currentTransitionList)) {
|
|
740
|
+
result.push(currentFsmParse);
|
|
741
|
+
currentFsmParse.constructInflectionalGroups();
|
|
742
|
+
resultTransitionList.push(currentTransitionList);
|
|
663
743
|
}
|
|
664
744
|
}
|
|
745
|
+
this.addNewParsesFromCurrentParseLength(currentFsmParse, parseQueue, maxLength, root);
|
|
665
746
|
}
|
|
666
|
-
|
|
667
|
-
|
|
668
|
-
|
|
669
|
-
|
|
670
|
-
|
|
671
|
-
|
|
672
|
-
|
|
673
|
-
|
|
674
|
-
|
|
675
|
-
|
|
676
|
-
|
|
677
|
-
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
|
|
681
|
-
|
|
682
|
-
|
|
683
|
-
|
|
747
|
+
return result;
|
|
748
|
+
}
|
|
749
|
+
/**
|
|
750
|
+
* The parseWordSurfaceForm method is used to parse a given fsmParse. It simply adds new parses to the current parse by
|
|
751
|
+
* using the addNewParsesFromCurrentParseSurfaceForm method.
|
|
752
|
+
*
|
|
753
|
+
* @param fsmParse an ArrayList of FsmParse
|
|
754
|
+
* @param surfaceForm String to use during transition.
|
|
755
|
+
* @return result {@link Array} which has the currentFsmParse.
|
|
756
|
+
*/
|
|
757
|
+
parseWordSurfaceForm(fsmParse, surfaceForm) {
|
|
758
|
+
let result = new Array();
|
|
759
|
+
let resultTransitionList = new Array();
|
|
760
|
+
let parseQueue = new Queue_1.Queue(1000);
|
|
761
|
+
parseQueue.enqueueAll(fsmParse);
|
|
762
|
+
while (!parseQueue.isEmpty()) {
|
|
763
|
+
let currentFsmParse = parseQueue.peek();
|
|
764
|
+
parseQueue.dequeue();
|
|
765
|
+
let root = currentFsmParse.getWord();
|
|
766
|
+
let currentState = currentFsmParse.getFinalSuffix();
|
|
767
|
+
let currentSurfaceForm = currentFsmParse.getSurfaceForm();
|
|
768
|
+
if (currentState.isEndState() && currentSurfaceForm == surfaceForm) {
|
|
769
|
+
let currentTransitionList = currentFsmParse.getFsmParseTransitionList();
|
|
770
|
+
if (!resultTransitionList.includes(currentTransitionList)) {
|
|
771
|
+
result.push(currentFsmParse);
|
|
772
|
+
currentFsmParse.constructInflectionalGroups();
|
|
773
|
+
resultTransitionList.push(currentTransitionList);
|
|
684
774
|
}
|
|
685
|
-
this.addNewParsesFromCurrentParseSurfaceForm(currentFsmParse, parseQueue, surfaceForm, root);
|
|
686
775
|
}
|
|
687
|
-
|
|
776
|
+
this.addNewParsesFromCurrentParseSurfaceForm(currentFsmParse, parseQueue, surfaceForm, root);
|
|
688
777
|
}
|
|
689
|
-
|
|
690
|
-
|
|
691
|
-
|
|
692
|
-
|
|
693
|
-
|
|
694
|
-
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
|
|
699
|
-
|
|
700
|
-
|
|
701
|
-
|
|
702
|
-
|
|
703
|
-
|
|
704
|
-
|
|
705
|
-
let root = currentFsmParse.getWord();
|
|
706
|
-
let currentState = currentFsmParse.getFinalSuffix();
|
|
707
|
-
let currentSurfaceForm = currentFsmParse.getSurfaceForm();
|
|
708
|
-
if (currentState.isEndState() && currentSurfaceForm.length <= maxLength) {
|
|
709
|
-
let currentTransitionList = currentSurfaceForm + " " + currentFsmParse.getFsmParseTransitionList();
|
|
710
|
-
if (!resultTransitionList.includes(currentTransitionList)) {
|
|
711
|
-
result.push(currentFsmParse);
|
|
712
|
-
currentFsmParse.constructInflectionalGroups();
|
|
713
|
-
resultTransitionList.push(currentTransitionList);
|
|
714
|
-
}
|
|
715
|
-
}
|
|
716
|
-
this.addNewParsesFromCurrentParseLength(currentFsmParse, parseQueue, maxLength, root);
|
|
717
|
-
}
|
|
718
|
-
return result;
|
|
778
|
+
return result;
|
|
779
|
+
}
|
|
780
|
+
/**
|
|
781
|
+
* The morphologicalAnalysis with 3 inputs is used to initialize an {@link Array} and add a new FsmParse
|
|
782
|
+
* with given root and state.
|
|
783
|
+
*
|
|
784
|
+
* @param root TxtWord input.
|
|
785
|
+
* @param surfaceForm String input to use for parsing.
|
|
786
|
+
* @param state String input.
|
|
787
|
+
* @return parseWord method with newly populated FsmParse ArrayList and input surfaceForm.
|
|
788
|
+
*/
|
|
789
|
+
morphologicalAnalysisFromRoot(root, surfaceForm, state) {
|
|
790
|
+
let initialFsmParse = new Array();
|
|
791
|
+
if (state != undefined) {
|
|
792
|
+
initialFsmParse.push(new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState(state)));
|
|
793
|
+
return this.parseWordSurfaceForm(initialFsmParse, surfaceForm);
|
|
719
794
|
}
|
|
720
|
-
|
|
721
|
-
|
|
722
|
-
|
|
723
|
-
*
|
|
724
|
-
* @param fsmParse an ArrayList of FsmParse
|
|
725
|
-
* @param surfaceForm String to use during transition.
|
|
726
|
-
* @return result {@link Array} which has the currentFsmParse.
|
|
727
|
-
*/
|
|
728
|
-
parseWordSurfaceForm(fsmParse, surfaceForm) {
|
|
729
|
-
let result = new Array();
|
|
730
|
-
let resultTransitionList = new Array();
|
|
731
|
-
let parseQueue = new Queue_1.Queue(1000);
|
|
732
|
-
parseQueue.enqueueAll(fsmParse);
|
|
733
|
-
while (!parseQueue.isEmpty()) {
|
|
734
|
-
let currentFsmParse = parseQueue.peek();
|
|
735
|
-
parseQueue.dequeue();
|
|
736
|
-
let root = currentFsmParse.getWord();
|
|
737
|
-
let currentState = currentFsmParse.getFinalSuffix();
|
|
738
|
-
let currentSurfaceForm = currentFsmParse.getSurfaceForm();
|
|
739
|
-
if (currentState.isEndState() && currentSurfaceForm == surfaceForm) {
|
|
740
|
-
let currentTransitionList = currentFsmParse.getFsmParseTransitionList();
|
|
741
|
-
if (!resultTransitionList.includes(currentTransitionList)) {
|
|
742
|
-
result.push(currentFsmParse);
|
|
743
|
-
currentFsmParse.constructInflectionalGroups();
|
|
744
|
-
resultTransitionList.push(currentTransitionList);
|
|
745
|
-
}
|
|
746
|
-
}
|
|
747
|
-
this.addNewParsesFromCurrentParseSurfaceForm(currentFsmParse, parseQueue, surfaceForm, root);
|
|
748
|
-
}
|
|
749
|
-
return result;
|
|
795
|
+
else {
|
|
796
|
+
this.initializeParseListFromRoot(initialFsmParse, root, this.isProperNoun(surfaceForm));
|
|
797
|
+
return this.parseWordSurfaceForm(initialFsmParse, surfaceForm);
|
|
750
798
|
}
|
|
751
|
-
|
|
752
|
-
|
|
753
|
-
|
|
754
|
-
|
|
755
|
-
|
|
756
|
-
* @param surfaceForm String input to use for parsing.
|
|
757
|
-
* @param state String input.
|
|
758
|
-
* @return parseWord method with newly populated FsmParse ArrayList and input surfaceForm.
|
|
759
|
-
*/
|
|
760
|
-
morphologicalAnalysisFromRoot(root, surfaceForm, state) {
|
|
761
|
-
let initialFsmParse = new Array();
|
|
762
|
-
if (state != undefined) {
|
|
763
|
-
initialFsmParse.push(new FsmParse_1.FsmParse(root, this.finiteStateMachine.getState(state)));
|
|
764
|
-
return this.parseWordSurfaceForm(initialFsmParse, surfaceForm);
|
|
765
|
-
}
|
|
766
|
-
else {
|
|
767
|
-
this.initializeParseListFromRoot(initialFsmParse, root, this.isProperNoun(surfaceForm));
|
|
768
|
-
return this.parseWordSurfaceForm(initialFsmParse, surfaceForm);
|
|
769
|
-
}
|
|
799
|
+
}
|
|
800
|
+
distinctSurfaceFormList(parseList) {
|
|
801
|
+
let items = new Set();
|
|
802
|
+
for (let parse of parseList) {
|
|
803
|
+
items.add(parse.getSurfaceForm());
|
|
770
804
|
}
|
|
771
|
-
|
|
772
|
-
|
|
773
|
-
|
|
774
|
-
|
|
775
|
-
|
|
776
|
-
|
|
777
|
-
|
|
778
|
-
|
|
779
|
-
|
|
780
|
-
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
|
|
784
|
-
|
|
785
|
-
|
|
786
|
-
|
|
787
|
-
|
|
788
|
-
|
|
789
|
-
|
|
790
|
-
|
|
791
|
-
|
|
792
|
-
|
|
793
|
-
|
|
794
|
-
|
|
795
|
-
|
|
796
|
-
|
|
797
|
-
|
|
798
|
-
|
|
799
|
-
|
|
800
|
-
|
|
801
|
-
|
|
802
|
-
|
|
803
|
-
|
|
804
|
-
|
|
805
|
-
|
|
806
|
-
|
|
807
|
-
|
|
808
|
-
|
|
809
|
-
|
|
810
|
-
|
|
811
|
-
|
|
812
|
-
|
|
813
|
-
|
|
814
|
-
|
|
815
|
-
|
|
816
|
-
|
|
817
|
-
|
|
818
|
-
|
|
819
|
-
|
|
820
|
-
|
|
821
|
-
|
|
805
|
+
return items;
|
|
806
|
+
}
|
|
807
|
+
/**
|
|
808
|
+
* The generateAllParses with 2 inputs is used to generate all parses with given root. Then it calls initializeParseListFromRoot method to initialize list with newly created ArrayList, input root,
|
|
809
|
+
* and maximum length.
|
|
810
|
+
*
|
|
811
|
+
* @param root TxtWord input.
|
|
812
|
+
* @param maxLength Maximum length of the surface form.
|
|
813
|
+
* @return parseWord method with newly populated FsmParse ArrayList and maximum length.
|
|
814
|
+
*/
|
|
815
|
+
generateAllParses(root, maxLength) {
|
|
816
|
+
let initialFsmParse = new Array();
|
|
817
|
+
if (root.isProperNoun()) {
|
|
818
|
+
this.initializeParseListFromRoot(initialFsmParse, root, true);
|
|
819
|
+
}
|
|
820
|
+
this.initializeParseListFromRoot(initialFsmParse, root, false);
|
|
821
|
+
return this.parseWordLength(initialFsmParse, maxLength);
|
|
822
|
+
}
|
|
823
|
+
/**
|
|
824
|
+
* Replaces the previous lemma in the sentence with the new lemma. Both lemmas can contain multiple words.
|
|
825
|
+
* @param original Original sentence in which the replacement is made.
|
|
826
|
+
* @param previousWord Root word in the original sentence
|
|
827
|
+
* @param newWord New word that replaces the previous word.
|
|
828
|
+
* @return Newly generated sentence by replacing the previous word in the original sentence with the new word.
|
|
829
|
+
*/
|
|
830
|
+
replaceWord(original, previousWord, newWord) {
|
|
831
|
+
let previousWordSplitted = undefined, newWordSplitted = undefined;
|
|
832
|
+
let result = new Sentence_1.Sentence();
|
|
833
|
+
let replacedWord = undefined;
|
|
834
|
+
let previousWordMultiple = previousWord.includes(" ");
|
|
835
|
+
let newWordMultiple = newWord.includes(" ");
|
|
836
|
+
let lastWord;
|
|
837
|
+
if (previousWordMultiple) {
|
|
838
|
+
previousWordSplitted = previousWord.split(" ");
|
|
839
|
+
lastWord = previousWordSplitted[previousWordSplitted.length - 1];
|
|
840
|
+
}
|
|
841
|
+
else {
|
|
842
|
+
lastWord = previousWord;
|
|
843
|
+
}
|
|
844
|
+
let newRootWord;
|
|
845
|
+
if (newWordMultiple) {
|
|
846
|
+
newWordSplitted = newWord.split(" ");
|
|
847
|
+
newRootWord = newWordSplitted[newWordSplitted.length - 1];
|
|
848
|
+
}
|
|
849
|
+
else {
|
|
850
|
+
newRootWord = newWord;
|
|
851
|
+
}
|
|
852
|
+
let newRootTxtWord = this.dictionary.getWord(newRootWord);
|
|
853
|
+
let parseList = this.morphologicalAnalysisFromSentence(original);
|
|
854
|
+
let i;
|
|
855
|
+
for (i = 0; i < parseList.length; i++) {
|
|
856
|
+
let replaced = false;
|
|
857
|
+
for (let j = 0; j < parseList[i].size(); j++) {
|
|
858
|
+
if (parseList[i].getFsmParse(j).getWord().getName() == lastWord && newRootTxtWord != undefined) {
|
|
859
|
+
replaced = true;
|
|
860
|
+
replacedWord = parseList[i].getFsmParse(j).replaceRootWord(newRootTxtWord);
|
|
861
|
+
}
|
|
822
862
|
}
|
|
823
|
-
|
|
824
|
-
|
|
825
|
-
|
|
826
|
-
|
|
827
|
-
let replaced = false;
|
|
828
|
-
for (let j = 0; j < parseList[i].size(); j++) {
|
|
829
|
-
if (parseList[i].getFsmParse(j).getWord().getName() == lastWord && newRootTxtWord != undefined) {
|
|
830
|
-
replaced = true;
|
|
831
|
-
replacedWord = parseList[i].getFsmParse(j).replaceRootWord(newRootTxtWord);
|
|
863
|
+
if (replaced && replacedWord != null) {
|
|
864
|
+
if (previousWordMultiple) {
|
|
865
|
+
for (let k = 0; k < i - previousWordSplitted.length + 1; k++) {
|
|
866
|
+
result.addWord(original.getWord(k));
|
|
832
867
|
}
|
|
833
868
|
}
|
|
834
|
-
if (
|
|
835
|
-
|
|
836
|
-
|
|
837
|
-
result.addWord(
|
|
869
|
+
if (newWordMultiple) {
|
|
870
|
+
for (let k = 0; k < newWordSplitted.length - 1; k++) {
|
|
871
|
+
if (result.wordCount() == 0) {
|
|
872
|
+
result.addWord(new Word_1.Word((newWordSplitted[k].charAt(0) + "").toLocaleUpperCase("tr") + newWordSplitted[k].substring(1)));
|
|
838
873
|
}
|
|
839
|
-
|
|
840
|
-
|
|
841
|
-
for (let k = 0; k < newWordSplitted.length - 1; k++) {
|
|
842
|
-
if (result.wordCount() == 0) {
|
|
843
|
-
result.addWord(new Word_1.Word((newWordSplitted[k].charAt(0) + "").toLocaleUpperCase("tr") + newWordSplitted[k].substring(1)));
|
|
844
|
-
}
|
|
845
|
-
else {
|
|
846
|
-
result.addWord(new Word_1.Word(newWordSplitted[k]));
|
|
847
|
-
}
|
|
874
|
+
else {
|
|
875
|
+
result.addWord(new Word_1.Word(newWordSplitted[k]));
|
|
848
876
|
}
|
|
849
877
|
}
|
|
850
|
-
if (result.wordCount() == 0) {
|
|
851
|
-
replacedWord = (replacedWord.charAt(0) + "").toLocaleUpperCase("tr") + replacedWord.substring(1);
|
|
852
|
-
}
|
|
853
|
-
result.addWord(new Word_1.Word(replacedWord));
|
|
854
|
-
if (previousWordMultiple) {
|
|
855
|
-
i++;
|
|
856
|
-
break;
|
|
857
|
-
}
|
|
858
878
|
}
|
|
859
|
-
|
|
860
|
-
|
|
861
|
-
|
|
862
|
-
|
|
879
|
+
if (result.wordCount() == 0) {
|
|
880
|
+
replacedWord = (replacedWord.charAt(0) + "").toLocaleUpperCase("tr") + replacedWord.substring(1);
|
|
881
|
+
}
|
|
882
|
+
result.addWord(new Word_1.Word(replacedWord));
|
|
883
|
+
if (previousWordMultiple) {
|
|
884
|
+
i++;
|
|
885
|
+
break;
|
|
863
886
|
}
|
|
864
887
|
}
|
|
865
|
-
|
|
866
|
-
|
|
888
|
+
else {
|
|
889
|
+
if (!previousWordMultiple) {
|
|
867
890
|
result.addWord(original.getWord(i));
|
|
868
891
|
}
|
|
869
892
|
}
|
|
870
|
-
return result;
|
|
871
893
|
}
|
|
872
|
-
|
|
873
|
-
|
|
874
|
-
|
|
875
|
-
* this newly initialized list and surfaceForm.
|
|
876
|
-
*
|
|
877
|
-
* @param rootWord TxtWord root.
|
|
878
|
-
* @param surfaceForm String input.
|
|
879
|
-
* @param isProper boolean variable indicates a word is proper or not.
|
|
880
|
-
* @return true if surfaceForm is punctuation or double, otherwise returns parseExist method with given surfaceForm.
|
|
881
|
-
*/
|
|
882
|
-
analysisExists(rootWord, surfaceForm, isProper) {
|
|
883
|
-
if (Word_1.Word.isPunctuation(surfaceForm)) {
|
|
884
|
-
return true;
|
|
885
|
-
}
|
|
886
|
-
if (this.isDouble(surfaceForm)) {
|
|
887
|
-
return true;
|
|
888
|
-
}
|
|
889
|
-
let initialFsmParse;
|
|
890
|
-
if (rootWord != null) {
|
|
891
|
-
initialFsmParse = new Array();
|
|
892
|
-
this.initializeParseListFromRoot(initialFsmParse, rootWord, isProper);
|
|
893
|
-
}
|
|
894
|
-
else {
|
|
895
|
-
initialFsmParse = this.initializeParseListFromSurfaceForm(surfaceForm, isProper);
|
|
896
|
-
}
|
|
897
|
-
return this.parseExists(initialFsmParse, surfaceForm);
|
|
898
|
-
}
|
|
899
|
-
/**
|
|
900
|
-
* The analysis method is used by the morphologicalAnalysis method. It gets String surfaceForm as an input and checks
|
|
901
|
-
* its type such as punctuation, number or compares with the regex for date, fraction, percent, time, range, hashtag,
|
|
902
|
-
* and mail or checks its variable type as integer or double. After finding the right case for given surfaceForm, it calls
|
|
903
|
-
* constructInflectionalGroups method which creates sub-word units.
|
|
904
|
-
*
|
|
905
|
-
* @param surfaceForm String to analyse.
|
|
906
|
-
* @param isProper is used to indicate the proper words.
|
|
907
|
-
* @return ArrayList type initialFsmParse which holds the analyses.
|
|
908
|
-
*/
|
|
909
|
-
analysis(surfaceForm, isProper) {
|
|
910
|
-
let initialFsmParse, fsmParse;
|
|
911
|
-
if (Word_1.Word.isPunctuation(surfaceForm) && surfaceForm != "%") {
|
|
912
|
-
initialFsmParse = new Array();
|
|
913
|
-
fsmParse = new FsmParse_1.FsmParse(surfaceForm, new State_1.State(("Punctuation"), true, true));
|
|
914
|
-
fsmParse.constructInflectionalGroups();
|
|
915
|
-
initialFsmParse.push(fsmParse);
|
|
916
|
-
return initialFsmParse;
|
|
917
|
-
}
|
|
918
|
-
if (this.isNumber(surfaceForm)) {
|
|
919
|
-
initialFsmParse = new Array();
|
|
920
|
-
fsmParse = new FsmParse_1.FsmParse(surfaceForm, new State_1.State(("CardinalRoot"), true, true));
|
|
921
|
-
fsmParse.constructInflectionalGroups();
|
|
922
|
-
initialFsmParse.push(fsmParse);
|
|
923
|
-
return initialFsmParse;
|
|
924
|
-
}
|
|
925
|
-
if (this.patternMatches("^\\d+/\\d+$", surfaceForm)) {
|
|
926
|
-
initialFsmParse = new Array();
|
|
927
|
-
fsmParse = new FsmParse_1.FsmParse(surfaceForm, new State_1.State(("FractionRoot"), true, true));
|
|
928
|
-
fsmParse.constructInflectionalGroups();
|
|
929
|
-
initialFsmParse.push(fsmParse);
|
|
930
|
-
fsmParse = new FsmParse_1.FsmParse(surfaceForm, new State_1.State(("DateRoot"), true, true));
|
|
931
|
-
fsmParse.constructInflectionalGroups();
|
|
932
|
-
initialFsmParse.push(fsmParse);
|
|
933
|
-
return initialFsmParse;
|
|
934
|
-
}
|
|
935
|
-
if (this.isDate(surfaceForm)) {
|
|
936
|
-
initialFsmParse = new Array();
|
|
937
|
-
fsmParse = new FsmParse_1.FsmParse(surfaceForm, new State_1.State(("DateRoot"), true, true));
|
|
938
|
-
fsmParse.constructInflectionalGroups();
|
|
939
|
-
initialFsmParse.push(fsmParse);
|
|
940
|
-
return initialFsmParse;
|
|
941
|
-
}
|
|
942
|
-
if (this.patternMatches("^\\d+\\\\/\\d+$", surfaceForm)) {
|
|
943
|
-
initialFsmParse = new Array();
|
|
944
|
-
fsmParse = new FsmParse_1.FsmParse(surfaceForm, new State_1.State(("FractionRoot"), true, true));
|
|
945
|
-
fsmParse.constructInflectionalGroups();
|
|
946
|
-
initialFsmParse.push(fsmParse);
|
|
947
|
-
return initialFsmParse;
|
|
948
|
-
}
|
|
949
|
-
if (surfaceForm == "%" || this.isPercent(surfaceForm)) {
|
|
950
|
-
initialFsmParse = new Array();
|
|
951
|
-
fsmParse = new FsmParse_1.FsmParse(surfaceForm, new State_1.State(("PercentRoot"), true, true));
|
|
952
|
-
fsmParse.constructInflectionalGroups();
|
|
953
|
-
initialFsmParse.push(fsmParse);
|
|
954
|
-
return initialFsmParse;
|
|
955
|
-
}
|
|
956
|
-
if (this.isTime(surfaceForm)) {
|
|
957
|
-
initialFsmParse = new Array();
|
|
958
|
-
fsmParse = new FsmParse_1.FsmParse(surfaceForm, new State_1.State(("TimeRoot"), true, true));
|
|
959
|
-
fsmParse.constructInflectionalGroups();
|
|
960
|
-
initialFsmParse.push(fsmParse);
|
|
961
|
-
return initialFsmParse;
|
|
962
|
-
}
|
|
963
|
-
if (this.isRange(surfaceForm)) {
|
|
964
|
-
initialFsmParse = new Array();
|
|
965
|
-
fsmParse = new FsmParse_1.FsmParse(surfaceForm, new State_1.State(("RangeRoot"), true, true));
|
|
966
|
-
fsmParse.constructInflectionalGroups();
|
|
967
|
-
initialFsmParse.push(fsmParse);
|
|
968
|
-
return initialFsmParse;
|
|
969
|
-
}
|
|
970
|
-
if (surfaceForm.startsWith("#")) {
|
|
971
|
-
initialFsmParse = new Array();
|
|
972
|
-
fsmParse = new FsmParse_1.FsmParse(surfaceForm, new State_1.State(("Hashtag"), true, true));
|
|
973
|
-
fsmParse.constructInflectionalGroups();
|
|
974
|
-
initialFsmParse.push(fsmParse);
|
|
975
|
-
return initialFsmParse;
|
|
976
|
-
}
|
|
977
|
-
if (surfaceForm.includes("@")) {
|
|
978
|
-
initialFsmParse = new Array();
|
|
979
|
-
fsmParse = new FsmParse_1.FsmParse(surfaceForm, new State_1.State(("Email"), true, true));
|
|
980
|
-
fsmParse.constructInflectionalGroups();
|
|
981
|
-
initialFsmParse.push(fsmParse);
|
|
982
|
-
return initialFsmParse;
|
|
983
|
-
}
|
|
984
|
-
if (surfaceForm.endsWith(".") && this.isInteger(surfaceForm.substring(0, surfaceForm.length - 1))) {
|
|
985
|
-
initialFsmParse = new Array();
|
|
986
|
-
fsmParse = new FsmParse_1.FsmParse(Number.parseInt(surfaceForm.substring(0, surfaceForm.length - 1)), this.finiteStateMachine.getState("OrdinalRoot"));
|
|
987
|
-
fsmParse.constructInflectionalGroups();
|
|
988
|
-
initialFsmParse.push(fsmParse);
|
|
989
|
-
return initialFsmParse;
|
|
990
|
-
}
|
|
991
|
-
if (this.isInteger(surfaceForm)) {
|
|
992
|
-
initialFsmParse = new Array();
|
|
993
|
-
fsmParse = new FsmParse_1.FsmParse(Number.parseInt(surfaceForm), this.finiteStateMachine.getState("CardinalRoot"));
|
|
994
|
-
fsmParse.constructInflectionalGroups();
|
|
995
|
-
initialFsmParse.push(fsmParse);
|
|
996
|
-
return initialFsmParse;
|
|
997
|
-
}
|
|
998
|
-
if (this.isDouble(surfaceForm)) {
|
|
999
|
-
initialFsmParse = new Array();
|
|
1000
|
-
fsmParse = new FsmParse_1.FsmParse(Number.parseFloat(surfaceForm), this.finiteStateMachine.getState("RealRoot"));
|
|
1001
|
-
fsmParse.constructInflectionalGroups();
|
|
1002
|
-
initialFsmParse.push(fsmParse);
|
|
1003
|
-
return initialFsmParse;
|
|
894
|
+
if (previousWordMultiple) {
|
|
895
|
+
for (; i < parseList.length; i++) {
|
|
896
|
+
result.addWord(original.getWord(i));
|
|
1004
897
|
}
|
|
898
|
+
}
|
|
899
|
+
return result;
|
|
900
|
+
}
|
|
901
|
+
/**
|
|
902
|
+
* The analysisExists method checks several cases. If the given surfaceForm is a punctuation or double then it
|
|
903
|
+
* returns true. If it is not a root word, then it initializes the parse list and returns the parseExists method with
|
|
904
|
+
* this newly initialized list and surfaceForm.
|
|
905
|
+
*
|
|
906
|
+
* @param rootWord TxtWord root.
|
|
907
|
+
* @param surfaceForm String input.
|
|
908
|
+
* @param isProper boolean variable indicates a word is proper or not.
|
|
909
|
+
* @return true if surfaceForm is punctuation or double, otherwise returns parseExist method with given surfaceForm.
|
|
910
|
+
*/
|
|
911
|
+
analysisExists(rootWord, surfaceForm, isProper) {
|
|
912
|
+
if (Word_1.Word.isPunctuation(surfaceForm)) {
|
|
913
|
+
return true;
|
|
914
|
+
}
|
|
915
|
+
if (this.isDouble(surfaceForm)) {
|
|
916
|
+
return true;
|
|
917
|
+
}
|
|
918
|
+
let initialFsmParse;
|
|
919
|
+
if (rootWord != null) {
|
|
920
|
+
initialFsmParse = new Array();
|
|
921
|
+
this.initializeParseListFromRoot(initialFsmParse, rootWord, isProper);
|
|
922
|
+
}
|
|
923
|
+
else {
|
|
1005
924
|
initialFsmParse = this.initializeParseListFromSurfaceForm(surfaceForm, isProper);
|
|
1006
|
-
return this.parseWordSurfaceForm(initialFsmParse, surfaceForm);
|
|
1007
925
|
}
|
|
1008
|
-
|
|
1009
|
-
|
|
1010
|
-
|
|
1011
|
-
|
|
1012
|
-
|
|
1013
|
-
|
|
1014
|
-
|
|
1015
|
-
|
|
1016
|
-
|
|
1017
|
-
|
|
1018
|
-
|
|
1019
|
-
|
|
1020
|
-
|
|
1021
|
-
|
|
1022
|
-
|
|
1023
|
-
|
|
1024
|
-
|
|
1025
|
-
|
|
1026
|
-
|
|
1027
|
-
|
|
1028
|
-
|
|
1029
|
-
|
|
1030
|
-
|
|
1031
|
-
|
|
1032
|
-
|
|
1033
|
-
|
|
1034
|
-
|
|
926
|
+
return this.parseExists(initialFsmParse, surfaceForm);
|
|
927
|
+
}
|
|
928
|
+
/**
|
|
929
|
+
* The analysis method is used by the morphologicalAnalysis method. It gets String surfaceForm as an input and checks
|
|
930
|
+
* its type such as punctuation, number or compares with the regex for date, fraction, percent, time, range, hashtag,
|
|
931
|
+
* and mail or checks its variable type as integer or double. After finding the right case for given surfaceForm, it calls
|
|
932
|
+
* constructInflectionalGroups method which creates sub-word units.
|
|
933
|
+
*
|
|
934
|
+
* @param surfaceForm String to analyse.
|
|
935
|
+
* @param isProper is used to indicate the proper words.
|
|
936
|
+
* @return ArrayList type initialFsmParse which holds the analyses.
|
|
937
|
+
*/
|
|
938
|
+
analysis(surfaceForm, isProper) {
|
|
939
|
+
let initialFsmParse, fsmParse;
|
|
940
|
+
if (Word_1.Word.isPunctuation(surfaceForm) && surfaceForm != "%") {
|
|
941
|
+
initialFsmParse = new Array();
|
|
942
|
+
fsmParse = new FsmParse_1.FsmParse(surfaceForm, new State_1.State(("Punctuation"), true, true));
|
|
943
|
+
fsmParse.constructInflectionalGroups();
|
|
944
|
+
initialFsmParse.push(fsmParse);
|
|
945
|
+
return initialFsmParse;
|
|
946
|
+
}
|
|
947
|
+
if (this.isNumber(surfaceForm)) {
|
|
948
|
+
initialFsmParse = new Array();
|
|
949
|
+
fsmParse = new FsmParse_1.FsmParse(surfaceForm, new State_1.State(("CardinalRoot"), true, true));
|
|
950
|
+
fsmParse.constructInflectionalGroups();
|
|
951
|
+
initialFsmParse.push(fsmParse);
|
|
952
|
+
return initialFsmParse;
|
|
953
|
+
}
|
|
954
|
+
if (this.patternMatches("^\\d+/\\d+$", surfaceForm)) {
|
|
955
|
+
initialFsmParse = new Array();
|
|
956
|
+
fsmParse = new FsmParse_1.FsmParse(surfaceForm, new State_1.State(("FractionRoot"), true, true));
|
|
957
|
+
fsmParse.constructInflectionalGroups();
|
|
958
|
+
initialFsmParse.push(fsmParse);
|
|
959
|
+
fsmParse = new FsmParse_1.FsmParse(surfaceForm, new State_1.State(("DateRoot"), true, true));
|
|
960
|
+
fsmParse.constructInflectionalGroups();
|
|
961
|
+
initialFsmParse.push(fsmParse);
|
|
962
|
+
return initialFsmParse;
|
|
963
|
+
}
|
|
964
|
+
if (this.isDate(surfaceForm)) {
|
|
965
|
+
initialFsmParse = new Array();
|
|
966
|
+
fsmParse = new FsmParse_1.FsmParse(surfaceForm, new State_1.State(("DateRoot"), true, true));
|
|
967
|
+
fsmParse.constructInflectionalGroups();
|
|
968
|
+
initialFsmParse.push(fsmParse);
|
|
969
|
+
return initialFsmParse;
|
|
970
|
+
}
|
|
971
|
+
if (this.patternMatches("^\\d+\\\\/\\d+$", surfaceForm)) {
|
|
972
|
+
initialFsmParse = new Array();
|
|
973
|
+
fsmParse = new FsmParse_1.FsmParse(surfaceForm, new State_1.State(("FractionRoot"), true, true));
|
|
974
|
+
fsmParse.constructInflectionalGroups();
|
|
975
|
+
initialFsmParse.push(fsmParse);
|
|
976
|
+
return initialFsmParse;
|
|
977
|
+
}
|
|
978
|
+
if (surfaceForm == "%" || this.isPercent(surfaceForm)) {
|
|
979
|
+
initialFsmParse = new Array();
|
|
980
|
+
fsmParse = new FsmParse_1.FsmParse(surfaceForm, new State_1.State(("PercentRoot"), true, true));
|
|
981
|
+
fsmParse.constructInflectionalGroups();
|
|
982
|
+
initialFsmParse.push(fsmParse);
|
|
983
|
+
return initialFsmParse;
|
|
984
|
+
}
|
|
985
|
+
if (this.isTime(surfaceForm)) {
|
|
986
|
+
initialFsmParse = new Array();
|
|
987
|
+
fsmParse = new FsmParse_1.FsmParse(surfaceForm, new State_1.State(("TimeRoot"), true, true));
|
|
988
|
+
fsmParse.constructInflectionalGroups();
|
|
989
|
+
initialFsmParse.push(fsmParse);
|
|
990
|
+
return initialFsmParse;
|
|
991
|
+
}
|
|
992
|
+
if (this.isRange(surfaceForm)) {
|
|
993
|
+
initialFsmParse = new Array();
|
|
994
|
+
fsmParse = new FsmParse_1.FsmParse(surfaceForm, new State_1.State(("RangeRoot"), true, true));
|
|
995
|
+
fsmParse.constructInflectionalGroups();
|
|
996
|
+
initialFsmParse.push(fsmParse);
|
|
997
|
+
return initialFsmParse;
|
|
998
|
+
}
|
|
999
|
+
if (surfaceForm.startsWith("#")) {
|
|
1000
|
+
initialFsmParse = new Array();
|
|
1001
|
+
fsmParse = new FsmParse_1.FsmParse(surfaceForm, new State_1.State(("Hashtag"), true, true));
|
|
1002
|
+
fsmParse.constructInflectionalGroups();
|
|
1003
|
+
initialFsmParse.push(fsmParse);
|
|
1004
|
+
return initialFsmParse;
|
|
1005
|
+
}
|
|
1006
|
+
if (surfaceForm.includes("@")) {
|
|
1007
|
+
initialFsmParse = new Array();
|
|
1008
|
+
fsmParse = new FsmParse_1.FsmParse(surfaceForm, new State_1.State(("Email"), true, true));
|
|
1009
|
+
fsmParse.constructInflectionalGroups();
|
|
1010
|
+
initialFsmParse.push(fsmParse);
|
|
1011
|
+
return initialFsmParse;
|
|
1012
|
+
}
|
|
1013
|
+
if (surfaceForm.endsWith(".") && this.isInteger(surfaceForm.substring(0, surfaceForm.length - 1))) {
|
|
1014
|
+
initialFsmParse = new Array();
|
|
1015
|
+
fsmParse = new FsmParse_1.FsmParse(Number.parseInt(surfaceForm.substring(0, surfaceForm.length - 1)), this.finiteStateMachine.getState("OrdinalRoot"));
|
|
1016
|
+
fsmParse.constructInflectionalGroups();
|
|
1017
|
+
initialFsmParse.push(fsmParse);
|
|
1018
|
+
return initialFsmParse;
|
|
1019
|
+
}
|
|
1020
|
+
if (this.isInteger(surfaceForm)) {
|
|
1021
|
+
initialFsmParse = new Array();
|
|
1022
|
+
fsmParse = new FsmParse_1.FsmParse(Number.parseInt(surfaceForm), this.finiteStateMachine.getState("CardinalRoot"));
|
|
1023
|
+
fsmParse.constructInflectionalGroups();
|
|
1024
|
+
initialFsmParse.push(fsmParse);
|
|
1025
|
+
return initialFsmParse;
|
|
1026
|
+
}
|
|
1027
|
+
if (this.isDouble(surfaceForm)) {
|
|
1028
|
+
initialFsmParse = new Array();
|
|
1029
|
+
fsmParse = new FsmParse_1.FsmParse(Number.parseFloat(surfaceForm), this.finiteStateMachine.getState("RealRoot"));
|
|
1030
|
+
fsmParse.constructInflectionalGroups();
|
|
1031
|
+
initialFsmParse.push(fsmParse);
|
|
1032
|
+
return initialFsmParse;
|
|
1033
|
+
}
|
|
1034
|
+
initialFsmParse = this.initializeParseListFromSurfaceForm(surfaceForm, isProper);
|
|
1035
|
+
return this.parseWordSurfaceForm(initialFsmParse, surfaceForm);
|
|
1036
|
+
}
|
|
1037
|
+
/**
|
|
1038
|
+
* This method uses cache idea to speed up pattern matching in Fsm. mostUsedPatterns stores the compiled forms of
|
|
1039
|
+
* the previously used patterns. When Fsm tries to match a string to a pattern, first we check if it exists in
|
|
1040
|
+
* mostUsedPatterns. If it exists, we directly use the compiled pattern to match the string. Otherwise, new pattern
|
|
1041
|
+
* is compiled and put in the mostUsedPatterns.
|
|
1042
|
+
* @param expr Pattern to check
|
|
1043
|
+
* @param value String to match the pattern
|
|
1044
|
+
* @return True if the string matches the pattern, false otherwise.
|
|
1045
|
+
*/
|
|
1046
|
+
patternMatches(expr, value) {
|
|
1047
|
+
let p = this.mostUsedPatterns.get(expr);
|
|
1048
|
+
if (p == undefined) {
|
|
1049
|
+
p = RegExp(expr);
|
|
1050
|
+
this.mostUsedPatterns.set(expr, p);
|
|
1051
|
+
}
|
|
1052
|
+
return value.match(p) != null;
|
|
1053
|
+
}
|
|
1054
|
+
/**
|
|
1055
|
+
* The isProperNoun method takes surfaceForm String as input and checks whether its first character is in the range
|
|
1056
|
+
* of letters between A to Z or one of the Turkish letters such as İ, Ü, Ğ, Ş, Ç, and Ö.
|
|
1057
|
+
*
|
|
1058
|
+
* @param surfaceForm String to check for proper noun.
|
|
1059
|
+
* @return false if surfaceForm is undefined or has length 0; true if its first character is one of those uppercase letters, false otherwise.
|
|
1060
|
+
*/
|
|
1061
|
+
isProperNoun(surfaceForm) {
|
|
1062
|
+
if (surfaceForm == undefined || surfaceForm.length == 0) {
|
|
1063
|
+
return false;
|
|
1064
|
+
}
|
|
1065
|
+
return (surfaceForm.charAt(0) >= 'A' && surfaceForm.charAt(0) <= 'Z') || (surfaceForm.charAt(0) == '\u0130') ||
|
|
1066
|
+
(surfaceForm.charAt(0) == '\u00dc') || (surfaceForm.charAt(0) == '\u011e') || (surfaceForm.charAt(0) == '\u015e') ||
|
|
1067
|
+
(surfaceForm.charAt(0) == '\u00c7') || (surfaceForm.charAt(0) == '\u00d6'); // İ, Ü, Ğ, Ş, Ç, Ö
|
|
1068
|
+
}
|
|
1069
|
+
/**
|
|
1070
|
+
* The isCode method takes surfaceForm String as input and checks if it consists of both letters and numbers
|
|
1071
|
+
*
|
|
1072
|
+
* @param surfaceForm String to check for code-like word.
|
|
1073
|
+
* @return true if it is a code-like word, return false otherwise.
|
|
1074
|
+
*/
|
|
1075
|
+
isCode(surfaceForm) {
|
|
1076
|
+
if (surfaceForm == undefined || surfaceForm.length == 0) {
|
|
1077
|
+
return false;
|
|
1078
|
+
}
|
|
1079
|
+
return this.patternMatches("^.*[0-9].*$", surfaceForm) && this.patternMatches("^.*[a-zA-ZçöğüşıÇÖĞÜŞİ].*$", surfaceForm);
|
|
1080
|
+
}
|
|
1081
|
+
/**
|
|
1082
|
+
* Identifies a possible new root word for a given surface form. It also adds the new root form to the dictionary
|
|
1083
|
+
* for further usage. The method first searches the suffix trie for the reverse string of the surface form. This
|
|
1084
|
+
* way, it can identify if the word has a suffix that is in the most frequently used suffix list. Since a word can
|
|
1085
|
+
* have multiple possible suffixes, the method identifies the longest suffix and returns the substring of the
|
|
1086
|
+
* surface form that does not contain the suffix. Let's say the word is 'googlelaştırdık', it will identify 'dık' as
|
|
1087
|
+
* a suffix and will return 'googlelaştır' as a possible root form. Another example will be 'homelesslerimizle', it
|
|
1088
|
+
* will identify 'lerimizle' as suffix and will return 'homeless' as a possible root form. If the root word ends
|
|
1089
|
+
* with 'ğ', it is replaced with 'k'. 'morfolojikliğini' will return 'morfolojikliğ' then which will be replaced
|
|
1090
|
+
* with 'morfolojiklik'.
|
|
1091
|
+
* @param surfaceForm Surface form for which we will identify a possible new root form.
|
|
1092
|
+
* @return Possible new root form.
|
|
1093
|
+
*/
|
|
1094
|
+
rootOfPossiblyNewWord(surfaceForm) {
|
|
1095
|
+
let words = this.suffixTrie.getWordsWithPrefix(this.reverseString(surfaceForm));
|
|
1096
|
+
let candidateWord = null;
|
|
1097
|
+
let candidateList = new Array();
|
|
1098
|
+
for (let word of words) {
|
|
1099
|
+
candidateWord = surfaceForm.substring(0, surfaceForm.length - word.getName().length);
|
|
1100
|
+
let newWord;
|
|
1101
|
+
if (candidateWord.endsWith("ğ")) {
|
|
1102
|
+
candidateWord = candidateWord.substring(0, candidateWord.length - 1) + "k";
|
|
1103
|
+
newWord = new TxtWord_1.TxtWord(candidateWord, "CL_ISIM");
|
|
1104
|
+
newWord.addFlag("IS_SD");
|
|
1035
1105
|
}
|
|
1036
|
-
|
|
1037
|
-
|
|
1038
|
-
|
|
1039
|
-
}
|
|
1040
|
-
/**
|
|
1041
|
-
* The isCode method takes surfaceForm String as input and checks if it consists of both letters and numbers
|
|
1042
|
-
*
|
|
1043
|
-
* @param surfaceForm String to check for code-like word.
|
|
1044
|
-
* @return true if it is a code-like word, return false otherwise.
|
|
1045
|
-
*/
|
|
1046
|
-
isCode(surfaceForm) {
|
|
1047
|
-
if (surfaceForm == undefined || surfaceForm.length == 0) {
|
|
1048
|
-
return false;
|
|
1106
|
+
else {
|
|
1107
|
+
newWord = new TxtWord_1.TxtWord(candidateWord, "CL_ISIM");
|
|
1108
|
+
newWord.addFlag("CL_FIIL");
|
|
1049
1109
|
}
|
|
1050
|
-
|
|
1051
|
-
|
|
1052
|
-
|
|
1053
|
-
|
|
1054
|
-
|
|
1055
|
-
|
|
1056
|
-
|
|
1057
|
-
|
|
1058
|
-
|
|
1059
|
-
|
|
1060
|
-
|
|
1061
|
-
|
|
1062
|
-
|
|
1063
|
-
|
|
1064
|
-
|
|
1065
|
-
|
|
1066
|
-
|
|
1067
|
-
|
|
1068
|
-
|
|
1069
|
-
|
|
1070
|
-
|
|
1071
|
-
|
|
1072
|
-
|
|
1073
|
-
candidateWord = candidateWord.substring(0, candidateWord.length - 1) + "k";
|
|
1074
|
-
newWord = new TxtWord_1.TxtWord(candidateWord, "CL_ISIM");
|
|
1075
|
-
newWord.addFlag("IS_SD");
|
|
1076
|
-
}
|
|
1077
|
-
else {
|
|
1078
|
-
newWord = new TxtWord_1.TxtWord(candidateWord, "CL_ISIM");
|
|
1079
|
-
newWord.addFlag("CL_FIIL");
|
|
1080
|
-
}
|
|
1081
|
-
candidateList.push(newWord);
|
|
1082
|
-
this.dictionaryTrie.addWord(candidateWord, newWord);
|
|
1110
|
+
candidateList.push(newWord);
|
|
1111
|
+
this.dictionaryTrie.addWord(candidateWord, newWord);
|
|
1112
|
+
}
|
|
1113
|
+
return candidateList;
|
|
1114
|
+
}
|
|
1115
|
+
/**
|
|
1116
|
+
* The robustMorphologicalAnalysis is used to analyse surfaceForm String. First it gets the currentParse of the surfaceForm
|
|
1117
|
+
* then, if the size of the currentParse is 0, and given surfaceForm is a proper noun, it adds the surfaceForm
|
|
1118
|
+
* whose state name is ProperRoot to an {@link Array}, or if it is not a proper noun, it adds the surfaceForm
|
|
1119
|
+
* whose state name is NominalRoot to the {@link Array}.
|
|
1120
|
+
*
|
|
1121
|
+
* @param surfaceForm String to analyse.
|
|
1122
|
+
* @return FsmParseList type currentParse which holds morphological analysis of the surfaceForm.
|
|
1123
|
+
*/
|
|
1124
|
+
robustMorphologicalAnalysis(surfaceForm) {
|
|
1125
|
+
if (surfaceForm == undefined || surfaceForm == "") {
|
|
1126
|
+
return new FsmParseList_1.FsmParseList(new Array());
|
|
1127
|
+
}
|
|
1128
|
+
let currentParse = this.morphologicalAnalysis(surfaceForm);
|
|
1129
|
+
if (currentParse.size() == 0) {
|
|
1130
|
+
let fsmParse = new Array();
|
|
1131
|
+
if (this.isProperNoun(surfaceForm)) {
|
|
1132
|
+
fsmParse.push(new FsmParse_1.FsmParse(surfaceForm, this.finiteStateMachine.getState("ProperRoot")));
|
|
1083
1133
|
}
|
|
1084
|
-
|
|
1085
|
-
|
|
1086
|
-
/**
|
|
1087
|
-
* The robustMorphologicalAnalysis is used to analyse surfaceForm String. First it gets the currentParse of the surfaceForm
|
|
1088
|
-
* then, if the size of the currentParse is 0, and given surfaceForm is a proper noun, it adds the surfaceForm
|
|
1089
|
-
* whose state name is ProperRoot to an {@link Array}, or if it is not a proper noun, it adds the surfaceForm
|
|
1090
|
-
* whose state name is NominalRoot to the {@link Array}.
|
|
1091
|
-
*
|
|
1092
|
-
* @param surfaceForm String to analyse.
|
|
1093
|
-
* @return FsmParseList type currentParse which holds morphological analysis of the surfaceForm.
|
|
1094
|
-
*/
|
|
1095
|
-
robustMorphologicalAnalysis(surfaceForm) {
|
|
1096
|
-
if (surfaceForm == undefined || surfaceForm == "") {
|
|
1097
|
-
return new FsmParseList_1.FsmParseList(new Array());
|
|
1134
|
+
if (this.isCode(surfaceForm)) {
|
|
1135
|
+
fsmParse.push(new FsmParse_1.FsmParse(surfaceForm, this.finiteStateMachine.getState("CodeRoot")));
|
|
1098
1136
|
}
|
|
1099
|
-
let
|
|
1100
|
-
if (
|
|
1101
|
-
let
|
|
1102
|
-
|
|
1103
|
-
fsmParse.push(new FsmParse_1.FsmParse(
|
|
1104
|
-
}
|
|
1105
|
-
if (this.isCode(surfaceForm)) {
|
|
1106
|
-
fsmParse.push(new FsmParse_1.FsmParse(surfaceForm, this.finiteStateMachine.getState("CodeRoot")));
|
|
1107
|
-
}
|
|
1108
|
-
let newCandidateList = this.rootOfPossiblyNewWord(surfaceForm);
|
|
1109
|
-
if (newCandidateList.length != 0) {
|
|
1110
|
-
for (let word of newCandidateList) {
|
|
1111
|
-
fsmParse.push(new FsmParse_1.FsmParse(word, this.finiteStateMachine.getState("VerbalRoot")));
|
|
1112
|
-
fsmParse.push(new FsmParse_1.FsmParse(word, this.finiteStateMachine.getState("NominalRoot")));
|
|
1113
|
-
}
|
|
1137
|
+
let newCandidateList = this.rootOfPossiblyNewWord(surfaceForm);
|
|
1138
|
+
if (newCandidateList.length != 0) {
|
|
1139
|
+
for (let word of newCandidateList) {
|
|
1140
|
+
fsmParse.push(new FsmParse_1.FsmParse(word, this.finiteStateMachine.getState("VerbalRoot")));
|
|
1141
|
+
fsmParse.push(new FsmParse_1.FsmParse(word, this.finiteStateMachine.getState("NominalRoot")));
|
|
1114
1142
|
}
|
|
1115
|
-
fsmParse.push(new FsmParse_1.FsmParse(surfaceForm, this.finiteStateMachine.getState("NominalRoot")));
|
|
1116
|
-
return new FsmParseList_1.FsmParseList(this.parseWordSurfaceForm(fsmParse, surfaceForm));
|
|
1117
|
-
}
|
|
1118
|
-
else {
|
|
1119
|
-
return currentParse;
|
|
1120
1143
|
}
|
|
1144
|
+
fsmParse.push(new FsmParse_1.FsmParse(surfaceForm, this.finiteStateMachine.getState("NominalRoot")));
|
|
1145
|
+
return new FsmParseList_1.FsmParseList(this.parseWordSurfaceForm(fsmParse, surfaceForm));
|
|
1121
1146
|
}
|
|
1122
|
-
|
|
1123
|
-
|
|
1124
|
-
*
|
|
1125
|
-
* @param sentence to get word from.
|
|
1126
|
-
* @return FsmParseList type result.
|
|
1127
|
-
*/
|
|
1128
|
-
morphologicalAnalysisFromSentence(sentence) {
|
|
1129
|
-
let result = new Array();
|
|
1130
|
-
for (let i = 0; i < sentence.wordCount(); i++) {
|
|
1131
|
-
let originalForm = sentence.getWord(i).getName();
|
|
1132
|
-
let spellCorrectedForm = this.dictionary.getCorrectForm(originalForm);
|
|
1133
|
-
if (spellCorrectedForm == undefined) {
|
|
1134
|
-
spellCorrectedForm = originalForm;
|
|
1135
|
-
}
|
|
1136
|
-
let wordFsmParseList = this.morphologicalAnalysis(spellCorrectedForm);
|
|
1137
|
-
result.push(wordFsmParseList);
|
|
1138
|
-
}
|
|
1139
|
-
return result;
|
|
1147
|
+
else {
|
|
1148
|
+
return currentParse;
|
|
1140
1149
|
}
|
|
1141
|
-
|
|
1142
|
-
|
|
1143
|
-
|
|
1144
|
-
|
|
1145
|
-
|
|
1146
|
-
|
|
1147
|
-
|
|
1148
|
-
|
|
1149
|
-
|
|
1150
|
-
|
|
1151
|
-
|
|
1152
|
-
|
|
1153
|
-
|
|
1154
|
-
|
|
1155
|
-
|
|
1156
|
-
|
|
1157
|
-
|
|
1158
|
-
|
|
1159
|
-
|
|
1150
|
+
}
|
|
1151
|
+
/**
|
|
1152
|
+
* The morphologicalAnalysis is used for debug purposes.
|
|
1153
|
+
*
|
|
1154
|
+
* @param sentence to get word from.
|
|
1155
|
+
* @return FsmParseList type result.
|
|
1156
|
+
*/
|
|
1157
|
+
morphologicalAnalysisFromSentence(sentence) {
|
|
1158
|
+
let result = new Array();
|
|
1159
|
+
for (let i = 0; i < sentence.wordCount(); i++) {
|
|
1160
|
+
let originalForm = sentence.getWord(i).getName();
|
|
1161
|
+
let spellCorrectedForm = this.dictionary.getCorrectForm(originalForm);
|
|
1162
|
+
if (spellCorrectedForm == undefined) {
|
|
1163
|
+
spellCorrectedForm = originalForm;
|
|
1164
|
+
}
|
|
1165
|
+
let wordFsmParseList = this.morphologicalAnalysis(spellCorrectedForm);
|
|
1166
|
+
result.push(wordFsmParseList);
|
|
1167
|
+
}
|
|
1168
|
+
return result;
|
|
1169
|
+
}
|
|
1170
|
+
/**
|
|
1171
|
+
* The robustMorphologicalAnalysis method takes just one argument as an input. It gets the name of the words from
|
|
1172
|
+
* input sentence then calls robustMorphologicalAnalysis with surfaceForm.
|
|
1173
|
+
*
|
|
1174
|
+
* @param sentence Sentence type input used to get surfaceForm.
|
|
1175
|
+
* @return FsmParseList array which holds the result of the analysis.
|
|
1176
|
+
*/
|
|
1177
|
+
robustMorphologicalAnalysisFromSentence(sentence) {
|
|
1178
|
+
let result = new Array();
|
|
1179
|
+
for (let i = 0; i < sentence.wordCount(); i++) {
|
|
1180
|
+
let originalForm = sentence.getWord(i).getName();
|
|
1181
|
+
let spellCorrectedForm = this.dictionary.getCorrectForm(originalForm);
|
|
1182
|
+
if (spellCorrectedForm == undefined) {
|
|
1183
|
+
spellCorrectedForm = originalForm;
|
|
1184
|
+
}
|
|
1185
|
+
let fsmParseList = this.robustMorphologicalAnalysis(spellCorrectedForm);
|
|
1186
|
+
result.push(fsmParseList);
|
|
1160
1187
|
}
|
|
1161
|
-
|
|
1162
|
-
|
|
1163
|
-
|
|
1164
|
-
|
|
1165
|
-
|
|
1166
|
-
|
|
1167
|
-
|
|
1168
|
-
|
|
1169
|
-
|
|
1188
|
+
return result;
|
|
1189
|
+
}
|
|
1190
|
+
/**
|
|
1191
|
+
* The isInteger method compares input surfaceForm with regex \+?\d+ and returns the result.
|
|
1192
|
+
* Supports positive integer checks only.
|
|
1193
|
+
*
|
|
1194
|
+
* @param surfaceForm String to check.
|
|
1195
|
+
* @return true if surfaceForm matches with the regex.
|
|
1196
|
+
*/
|
|
1197
|
+
isInteger(surfaceForm) {
|
|
1198
|
+
if (!this.patternMatches("^[+-]?\\d+$", surfaceForm))
|
|
1199
|
+
return false;
|
|
1200
|
+
let len = surfaceForm.length;
|
|
1201
|
+
if (len < 10) {
|
|
1202
|
+
return true;
|
|
1203
|
+
}
|
|
1204
|
+
else {
|
|
1205
|
+
if (len > 10) {
|
|
1170
1206
|
return false;
|
|
1171
|
-
let len = surfaceForm.length;
|
|
1172
|
-
if (len < 10) {
|
|
1173
|
-
return true;
|
|
1174
1207
|
}
|
|
1175
1208
|
else {
|
|
1176
|
-
|
|
1177
|
-
return false;
|
|
1178
|
-
}
|
|
1179
|
-
else {
|
|
1180
|
-
return surfaceForm >= "2147483647";
|
|
1181
|
-
}
|
|
1209
|
+
return surfaceForm >= "2147483647";
|
|
1182
1210
|
}
|
|
1183
1211
|
}
|
|
1184
|
-
|
|
1185
|
-
|
|
1186
|
-
|
|
1187
|
-
|
|
1188
|
-
|
|
1189
|
-
|
|
1190
|
-
|
|
1191
|
-
|
|
1192
|
-
|
|
1193
|
-
|
|
1194
|
-
|
|
1195
|
-
|
|
1196
|
-
|
|
1197
|
-
|
|
1198
|
-
|
|
1199
|
-
|
|
1200
|
-
|
|
1201
|
-
|
|
1202
|
-
|
|
1203
|
-
|
|
1204
|
-
|
|
1205
|
-
|
|
1206
|
-
|
|
1207
|
-
|
|
1208
|
-
|
|
1209
|
-
|
|
1210
|
-
|
|
1211
|
-
|
|
1212
|
-
|
|
1213
|
-
}
|
|
1214
|
-
}
|
|
1215
|
-
if (!found) {
|
|
1212
|
+
}
|
|
1213
|
+
/**
|
|
1214
|
+
* The isDouble method compares input surfaceForm with regex \+?(\d+)?\.\d* and returns the result.
|
|
1215
|
+
*
|
|
1216
|
+
* @param surfaceForm String to check.
|
|
1217
|
+
* @return true if surfaceForm matches with the regex.
|
|
1218
|
+
*/
|
|
1219
|
+
isDouble(surfaceForm) {
|
|
1220
|
+
return this.patternMatches("^[+-]?(\\d+)?\\.\\d*$", surfaceForm);
|
|
1221
|
+
}
|
|
1222
|
+
/**
|
|
1223
|
+
* The isNumber method compares input surfaceForm with the array of written numbers and returns the result.
|
|
1224
|
+
*
|
|
1225
|
+
* @param surfaceForm String to check.
|
|
1226
|
+
* @return true if surfaceForm matches with the regex.
|
|
1227
|
+
*/
|
|
1228
|
+
isNumber(surfaceForm) {
|
|
1229
|
+
let count = 0;
|
|
1230
|
+
let numbers = ["bir", "iki", "üç", "dört", "beş", "altı", "yedi", "sekiz", "dokuz",
|
|
1231
|
+
"on", "yirmi", "otuz", "kırk", "elli", "altmış", "yetmiş", "seksen", "doksan",
|
|
1232
|
+
"yüz", "bin", "milyon", "milyar", "trilyon", "katrilyon"];
|
|
1233
|
+
let word = surfaceForm;
|
|
1234
|
+
while (word != "") {
|
|
1235
|
+
let found = false;
|
|
1236
|
+
for (let number of numbers) {
|
|
1237
|
+
if (word.startsWith(number)) {
|
|
1238
|
+
found = true;
|
|
1239
|
+
count++;
|
|
1240
|
+
word = word.substring(number.length);
|
|
1216
1241
|
break;
|
|
1217
1242
|
}
|
|
1218
1243
|
}
|
|
1219
|
-
|
|
1220
|
-
|
|
1221
|
-
/**
|
|
1222
|
-
* Checks if a given surface form matches to a percent value. It should be something like %4, %45, %4.3 or %56.786
|
|
1223
|
-
* @param surfaceForm Surface form to be checked.
|
|
1224
|
-
* @return True if the surface form is in percent form
|
|
1225
|
-
*/
|
|
1226
|
-
isPercent(surfaceForm) {
|
|
1227
|
-
return this.patternMatches("^%(\\d\\d|\\d)$", surfaceForm) ||
|
|
1228
|
-
this.patternMatches("^%(\\d\\d|\\d)\\.\\d+$", surfaceForm);
|
|
1229
|
-
}
|
|
1230
|
-
/**
|
|
1231
|
-
* Checks if a given surface form matches to a time form. It should be something like 3:34, 12:56 etc.
|
|
1232
|
-
* @param surfaceForm Surface form to be checked.
|
|
1233
|
-
* @return True if the surface form is in time form
|
|
1234
|
-
*/
|
|
1235
|
-
isTime(surfaceForm) {
|
|
1236
|
-
return this.patternMatches("^(\\d\\d|\\d):(\\d\\d|\\d):(\\d\\d|\\d)$", surfaceForm) ||
|
|
1237
|
-
this.patternMatches("^(\\d\\d|\\d):(\\d\\d|\\d)$", surfaceForm);
|
|
1238
|
-
}
|
|
1239
|
-
/**
|
|
1240
|
-
* Checks if a given surface form matches to a range form. It should be something like 123-1400 or 12:34-15:78 or
|
|
1241
|
-
* 3.45-4.67.
|
|
1242
|
-
* @param surfaceForm Surface form to be checked.
|
|
1243
|
-
* @return True if the surface form is in range form
|
|
1244
|
-
*/
|
|
1245
|
-
isRange(surfaceForm) {
|
|
1246
|
-
return this.patternMatches("^\\d+-\\d+$", surfaceForm) ||
|
|
1247
|
-
this.patternMatches("^(\\d\\d|\\d):(\\d\\d|\\d)-(\\d\\d|\\d):(\\d\\d|\\d)$", surfaceForm) ||
|
|
1248
|
-
this.patternMatches("^(\\d\\d|\\d)\\.(\\d\\d|\\d)-(\\d\\d|\\d)\\.(\\d\\d|\\d)$", surfaceForm);
|
|
1249
|
-
}
|
|
1250
|
-
/**
|
|
1251
|
-
* Checks if a given surface form matches to a date form. It should be something like 3/10/2023 or 2.3.2012
|
|
1252
|
-
* @param surfaceForm Surface form to be checked.
|
|
1253
|
-
* @return True if the surface form is in date form
|
|
1254
|
-
*/
|
|
1255
|
-
isDate(surfaceForm) {
|
|
1256
|
-
return this.patternMatches("^(\\d\\d|\\d)/(\\d\\d|\\d)/\\d+$", surfaceForm) ||
|
|
1257
|
-
this.patternMatches("^(\\d\\d|\\d)\\.(\\d\\d|\\d)\\.\\d+$", surfaceForm);
|
|
1258
|
-
}
|
|
1259
|
-
/**
|
|
1260
|
-
* The morphologicalAnalysis method is used to analyse a FsmParseList by comparing with the regex.
|
|
1261
|
-
* It creates an {@link Array} fsmParse to hold the result of the analysis method. For each surfaceForm input,
|
|
1262
|
-
* it gets a substring and considers it as a possibleRoot. Then compares with the regex.
|
|
1263
|
-
* <p>
|
|
1264
|
-
* If the surfaceForm input string matches with Turkish chars like Ç, Ş, İ, Ü, Ö, it adds the surfaceForm to Trie with IS_OA tag.
|
|
1265
|
-
* If the possibleRoot contains /, then it is added to the Trie with IS_KESIR tag.
|
|
1266
|
-
* If the possibleRoot contains \d\d|\d)/(\d\d|\d)/\d+, then it is added to the Trie with IS_DATE tag.
|
|
1267
|
-
* If the possibleRoot contains \\d\d|\d, then it is added to the Trie with IS_PERCENT tag.
|
|
1268
|
-
* If the possibleRoot contains \d\d|\d):(\d\d|\d):(\d\d|\d), then it is added to the Trie with IS_ZAMAN tag.
|
|
1269
|
-
* If the possibleRoot contains \d+-\d+, then it is added to the Trie with IS_RANGE tag.
|
|
1270
|
-
* If the possibleRoot is an Integer, then it is added to the Trie with IS_SAYI tag.
|
|
1271
|
-
* If the possibleRoot is a Double, then it is added to the Trie with IS_REELSAYI tag.
|
|
1272
|
-
*
|
|
1273
|
-
* @param surfaceForm String to analyse.
|
|
1274
|
-
* @return fsmParseList which holds the analysis.
|
|
1275
|
-
*/
|
|
1276
|
-
morphologicalAnalysis(surfaceForm) {
|
|
1277
|
-
let lowerCased = surfaceForm.toLocaleLowerCase("tr");
|
|
1278
|
-
let possibleRootLowerCased = "", pronunciation = "";
|
|
1279
|
-
let isRootReplaced = false;
|
|
1280
|
-
if (this.parsedSurfaceForms != undefined && this.parsedSurfaceForms.has(lowerCased) &&
|
|
1281
|
-
!this.isInteger(surfaceForm) && !this.isDouble(surfaceForm) && !this.isPercent(surfaceForm) &&
|
|
1282
|
-
!this.isTime(surfaceForm) && !this.isRange(surfaceForm) && !this.isDate(surfaceForm)) {
|
|
1283
|
-
let parses = new Array();
|
|
1284
|
-
parses.push(new FsmParse_1.FsmParse(new Word_1.Word(this.parsedSurfaceForms.get(lowerCased))));
|
|
1285
|
-
return new FsmParseList_1.FsmParseList(parses);
|
|
1286
|
-
}
|
|
1287
|
-
if (this.cache != undefined && this.cache.contains(surfaceForm)) {
|
|
1288
|
-
return this.cache.get(surfaceForm);
|
|
1244
|
+
if (!found) {
|
|
1245
|
+
break;
|
|
1289
1246
|
}
|
|
1290
|
-
|
|
1291
|
-
|
|
1247
|
+
}
|
|
1248
|
+
return word == "" && count > 1;
|
|
1249
|
+
}
|
|
1250
|
+
/**
|
|
1251
|
+
* Checks if a given surface form matches to a percent value. It should be something like %4, %45, %4.3 or %56.786
|
|
1252
|
+
* @param surfaceForm Surface form to be checked.
|
|
1253
|
+
* @return True if the surface form is in percent form
|
|
1254
|
+
*/
|
|
1255
|
+
isPercent(surfaceForm) {
|
|
1256
|
+
return this.patternMatches("^%(\\d\\d|\\d)$", surfaceForm) ||
|
|
1257
|
+
this.patternMatches("^%(\\d\\d|\\d)\\.\\d+$", surfaceForm);
|
|
1258
|
+
}
|
|
1259
|
+
/**
|
|
1260
|
+
* Checks if a given surface form matches to a time form. It should be something like 3:34, 12:56 etc.
|
|
1261
|
+
* @param surfaceForm Surface form to be checked.
|
|
1262
|
+
* @return True if the surface form is in time form
|
|
1263
|
+
*/
|
|
1264
|
+
isTime(surfaceForm) {
|
|
1265
|
+
return this.patternMatches("^(\\d\\d|\\d):(\\d\\d|\\d):(\\d\\d|\\d)$", surfaceForm) ||
|
|
1266
|
+
this.patternMatches("^(\\d\\d|\\d):(\\d\\d|\\d)$", surfaceForm);
|
|
1267
|
+
}
|
|
1268
|
+
/**
|
|
1269
|
+
* Checks if a given surface form matches to a range form. It should be something like 123-1400 or 12:34-15:78 or
|
|
1270
|
+
* 3.45-4.67.
|
|
1271
|
+
* @param surfaceForm Surface form to be checked.
|
|
1272
|
+
* @return True if the surface form is in range form
|
|
1273
|
+
*/
|
|
1274
|
+
isRange(surfaceForm) {
|
|
1275
|
+
return this.patternMatches("^\\d+-\\d+$", surfaceForm) ||
|
|
1276
|
+
this.patternMatches("^(\\d\\d|\\d):(\\d\\d|\\d)-(\\d\\d|\\d):(\\d\\d|\\d)$", surfaceForm) ||
|
|
1277
|
+
this.patternMatches("^(\\d\\d|\\d)\\.(\\d\\d|\\d)-(\\d\\d|\\d)\\.(\\d\\d|\\d)$", surfaceForm);
|
|
1278
|
+
}
|
|
1279
|
+
/**
|
|
1280
|
+
* Checks if a given surface form matches to a date form. It should be something like 3/10/2023 or 2.3.2012
|
|
1281
|
+
* @param surfaceForm Surface form to be checked.
|
|
1282
|
+
* @return True if the surface form is in date form
|
|
1283
|
+
*/
|
|
1284
|
+
isDate(surfaceForm) {
|
|
1285
|
+
return this.patternMatches("^(\\d\\d|\\d)/(\\d\\d|\\d)/\\d+$", surfaceForm) ||
|
|
1286
|
+
this.patternMatches("^(\\d\\d|\\d)\\.(\\d\\d|\\d)\\.\\d+$", surfaceForm);
|
|
1287
|
+
}
|
|
1288
|
+
/**
|
|
1289
|
+
* The morphologicalAnalysis method is used to analyse a FsmParseList by comparing with the regex.
|
|
1290
|
+
* It creates an {@link Array} fsmParse to hold the result of the analysis method. For each surfaceForm input,
|
|
1291
|
+
* it gets a substring and considers it as a possibleRoot. Then compares with the regex.
|
|
1292
|
+
* <p>
|
|
1293
|
+
* If the surfaceForm input string matches with Turkish chars like Ç, Ş, İ, Ü, Ö, it adds the surfaceForm to Trie with IS_OA tag.
|
|
1294
|
+
* If the possibleRoot contains /, then it is added to the Trie with IS_KESIR tag.
|
|
1295
|
+
* If the possibleRoot contains \d\d|\d)/(\d\d|\d)/\d+, then it is added to the Trie with IS_DATE tag.
|
|
1296
|
+
* If the possibleRoot contains \\d\d|\d, then it is added to the Trie with IS_PERCENT tag.
|
|
1297
|
+
* If the possibleRoot contains \d\d|\d):(\d\d|\d):(\d\d|\d), then it is added to the Trie with IS_ZAMAN tag.
|
|
1298
|
+
* If the possibleRoot contains \d+-\d+, then it is added to the Trie with IS_RANGE tag.
|
|
1299
|
+
* If the possibleRoot is an Integer, then it is added to the Trie with IS_SAYI tag.
|
|
1300
|
+
* If the possibleRoot is a Double, then it is added to the Trie with IS_REELSAYI tag.
|
|
1301
|
+
*
|
|
1302
|
+
* @param surfaceForm String to analyse.
|
|
1303
|
+
* @return fsmParseList which holds the analysis.
|
|
1304
|
+
*/
|
|
1305
|
+
morphologicalAnalysis(surfaceForm) {
|
|
1306
|
+
let lowerCased = surfaceForm.toLocaleLowerCase("tr");
|
|
1307
|
+
let possibleRootLowerCased = "", pronunciation = "";
|
|
1308
|
+
let isRootReplaced = false;
|
|
1309
|
+
if (this.parsedSurfaceForms != undefined && this.parsedSurfaceForms.has(lowerCased) &&
|
|
1310
|
+
!this.isInteger(surfaceForm) && !this.isDouble(surfaceForm) && !this.isPercent(surfaceForm) &&
|
|
1311
|
+
!this.isTime(surfaceForm) && !this.isRange(surfaceForm) && !this.isDate(surfaceForm)) {
|
|
1312
|
+
let parses = new Array();
|
|
1313
|
+
parses.push(new FsmParse_1.FsmParse(new Word_1.Word(this.parsedSurfaceForms.get(lowerCased))));
|
|
1314
|
+
return new FsmParseList_1.FsmParseList(parses);
|
|
1315
|
+
}
|
|
1316
|
+
if (this.cache != undefined && this.cache.contains(surfaceForm)) {
|
|
1317
|
+
return this.cache.get(surfaceForm);
|
|
1318
|
+
}
|
|
1319
|
+
if (this.patternMatches("^(\\w|Ç|Ş|İ|Ü|Ö)\\.$", surfaceForm)) {
|
|
1320
|
+
this.dictionaryTrie.addWord(lowerCased, new TxtWord_1.TxtWord(lowerCased, "IS_OA"));
|
|
1321
|
+
}
|
|
1322
|
+
let defaultFsmParse = this.analysis(lowerCased, this.isProperNoun(surfaceForm));
|
|
1323
|
+
if (defaultFsmParse.length > 0) {
|
|
1324
|
+
let fsmParseList = new FsmParseList_1.FsmParseList(defaultFsmParse);
|
|
1325
|
+
if (this.cache != undefined) {
|
|
1326
|
+
this.cache.add(surfaceForm, fsmParseList);
|
|
1292
1327
|
}
|
|
1293
|
-
|
|
1294
|
-
|
|
1295
|
-
|
|
1296
|
-
|
|
1297
|
-
|
|
1328
|
+
return fsmParseList;
|
|
1329
|
+
}
|
|
1330
|
+
let fsmParse = new Array();
|
|
1331
|
+
if (surfaceForm.includes("'")) {
|
|
1332
|
+
let possibleRoot = surfaceForm.substring(0, surfaceForm.indexOf('\''));
|
|
1333
|
+
if (possibleRoot != "") {
|
|
1334
|
+
if (possibleRoot.includes("/") || possibleRoot.includes("\\/")) {
|
|
1335
|
+
this.dictionaryTrie.addWord(possibleRoot, new TxtWord_1.TxtWord(possibleRoot, "IS_KESIR"));
|
|
1336
|
+
fsmParse = this.analysis(lowerCased, this.isProperNoun(surfaceForm));
|
|
1298
1337
|
}
|
|
1299
|
-
|
|
1300
|
-
|
|
1301
|
-
|
|
1302
|
-
if (surfaceForm.includes("'")) {
|
|
1303
|
-
let possibleRoot = surfaceForm.substring(0, surfaceForm.indexOf('\''));
|
|
1304
|
-
if (possibleRoot != "") {
|
|
1305
|
-
if (possibleRoot.includes("/") || possibleRoot.includes("\\/")) {
|
|
1306
|
-
this.dictionaryTrie.addWord(possibleRoot, new TxtWord_1.TxtWord(possibleRoot, "IS_KESIR"));
|
|
1338
|
+
else {
|
|
1339
|
+
if (this.isDate(possibleRoot)) {
|
|
1340
|
+
this.dictionaryTrie.addWord(possibleRoot, new TxtWord_1.TxtWord(possibleRoot, "IS_DATE"));
|
|
1307
1341
|
fsmParse = this.analysis(lowerCased, this.isProperNoun(surfaceForm));
|
|
1308
1342
|
}
|
|
1309
1343
|
else {
|
|
1310
|
-
if (this.
|
|
1311
|
-
this.dictionaryTrie.addWord(possibleRoot, new TxtWord_1.TxtWord(possibleRoot, "
|
|
1344
|
+
if (this.patternMatches("^\\d+/\\d+$", possibleRoot)) {
|
|
1345
|
+
this.dictionaryTrie.addWord(possibleRoot, new TxtWord_1.TxtWord(possibleRoot, "IS_KESIR"));
|
|
1312
1346
|
fsmParse = this.analysis(lowerCased, this.isProperNoun(surfaceForm));
|
|
1313
1347
|
}
|
|
1314
1348
|
else {
|
|
1315
|
-
if (this.
|
|
1316
|
-
this.dictionaryTrie.addWord(possibleRoot, new TxtWord_1.TxtWord(possibleRoot, "
|
|
1349
|
+
if (this.isPercent(possibleRoot)) {
|
|
1350
|
+
this.dictionaryTrie.addWord(possibleRoot, new TxtWord_1.TxtWord(possibleRoot, "IS_PERCENT"));
|
|
1317
1351
|
fsmParse = this.analysis(lowerCased, this.isProperNoun(surfaceForm));
|
|
1318
1352
|
}
|
|
1319
1353
|
else {
|
|
1320
|
-
if (this.
|
|
1321
|
-
this.dictionaryTrie.addWord(possibleRoot, new TxtWord_1.TxtWord(possibleRoot, "
|
|
1354
|
+
if (this.isTime(surfaceForm)) {
|
|
1355
|
+
this.dictionaryTrie.addWord(possibleRoot, new TxtWord_1.TxtWord(possibleRoot, "IS_ZAMAN"));
|
|
1322
1356
|
fsmParse = this.analysis(lowerCased, this.isProperNoun(surfaceForm));
|
|
1323
1357
|
}
|
|
1324
1358
|
else {
|
|
1325
|
-
if (this.
|
|
1326
|
-
this.dictionaryTrie.addWord(possibleRoot, new TxtWord_1.TxtWord(possibleRoot, "
|
|
1359
|
+
if (this.isRange(surfaceForm)) {
|
|
1360
|
+
this.dictionaryTrie.addWord(possibleRoot, new TxtWord_1.TxtWord(possibleRoot, "IS_RANGE"));
|
|
1327
1361
|
fsmParse = this.analysis(lowerCased, this.isProperNoun(surfaceForm));
|
|
1328
1362
|
}
|
|
1329
1363
|
else {
|
|
1330
|
-
if (this.
|
|
1331
|
-
this.dictionaryTrie.addWord(possibleRoot, new TxtWord_1.TxtWord(possibleRoot, "
|
|
1364
|
+
if (this.isInteger(possibleRoot)) {
|
|
1365
|
+
this.dictionaryTrie.addWord(possibleRoot, new TxtWord_1.TxtWord(possibleRoot, "IS_SAYI"));
|
|
1332
1366
|
fsmParse = this.analysis(lowerCased, this.isProperNoun(surfaceForm));
|
|
1333
1367
|
}
|
|
1334
1368
|
else {
|
|
1335
|
-
if (this.
|
|
1336
|
-
this.dictionaryTrie.addWord(possibleRoot, new TxtWord_1.TxtWord(possibleRoot, "
|
|
1369
|
+
if (this.isDouble(possibleRoot)) {
|
|
1370
|
+
this.dictionaryTrie.addWord(possibleRoot, new TxtWord_1.TxtWord(possibleRoot, "IS_REELSAYI"));
|
|
1337
1371
|
fsmParse = this.analysis(lowerCased, this.isProperNoun(surfaceForm));
|
|
1338
1372
|
}
|
|
1339
1373
|
else {
|
|
1340
|
-
if (
|
|
1341
|
-
|
|
1342
|
-
|
|
1343
|
-
|
|
1344
|
-
|
|
1345
|
-
|
|
1346
|
-
|
|
1347
|
-
|
|
1348
|
-
if (this.pronunciations.has(possibleRootLowerCased)) {
|
|
1349
|
-
isRootReplaced = true;
|
|
1350
|
-
pronunciation = this.pronunciations.get(possibleRootLowerCased);
|
|
1351
|
-
if (this.dictionary.getWord(pronunciation) != null) {
|
|
1352
|
-
this.dictionary.getWord(pronunciation).addFlag("IS_OA");
|
|
1353
|
-
}
|
|
1354
|
-
else {
|
|
1355
|
-
newWord = new TxtWord_1.TxtWord(pronunciation, "IS_OA");
|
|
1356
|
-
this.dictionaryTrie.addWord(pronunciation, newWord);
|
|
1357
|
-
}
|
|
1358
|
-
let replacedWord = pronunciation + lowerCased.substring(possibleRootLowerCased.length);
|
|
1359
|
-
fsmParse = this.analysis(replacedWord, this.isProperNoun(surfaceForm));
|
|
1374
|
+
if (Word_1.Word.isCapital(possibleRoot) || "QXW".includes(possibleRoot.substring(0, 1))) {
|
|
1375
|
+
let newWord = undefined;
|
|
1376
|
+
possibleRootLowerCased = possibleRoot.toLocaleLowerCase("tr");
|
|
1377
|
+
if (this.pronunciations.has(possibleRootLowerCased)) {
|
|
1378
|
+
isRootReplaced = true;
|
|
1379
|
+
pronunciation = this.pronunciations.get(possibleRootLowerCased);
|
|
1380
|
+
if (this.dictionary.getWord(pronunciation) != null) {
|
|
1381
|
+
this.dictionary.getWord(pronunciation).addFlag("IS_OA");
|
|
1360
1382
|
}
|
|
1361
1383
|
else {
|
|
1362
|
-
|
|
1363
|
-
|
|
1364
|
-
}
|
|
1365
|
-
else {
|
|
1366
|
-
newWord = new TxtWord_1.TxtWord(possibleRootLowerCased, "IS_OA");
|
|
1367
|
-
this.dictionaryTrie.addWord(possibleRootLowerCased, newWord);
|
|
1368
|
-
}
|
|
1369
|
-
fsmParse = this.analysis(lowerCased, this.isProperNoun(surfaceForm));
|
|
1384
|
+
newWord = new TxtWord_1.TxtWord(pronunciation, "IS_OA");
|
|
1385
|
+
this.dictionaryTrie.addWord(pronunciation, newWord);
|
|
1370
1386
|
}
|
|
1387
|
+
let replacedWord = pronunciation + lowerCased.substring(possibleRootLowerCased.length);
|
|
1388
|
+
fsmParse = this.analysis(replacedWord, this.isProperNoun(surfaceForm));
|
|
1389
|
+
}
|
|
1390
|
+
else {
|
|
1391
|
+
if (this.dictionary.getWord(possibleRootLowerCased) != null) {
|
|
1392
|
+
this.dictionary.getWord(possibleRootLowerCased).addFlag("IS_OA");
|
|
1393
|
+
}
|
|
1394
|
+
else {
|
|
1395
|
+
newWord = new TxtWord_1.TxtWord(possibleRootLowerCased, "IS_OA");
|
|
1396
|
+
this.dictionaryTrie.addWord(possibleRootLowerCased, newWord);
|
|
1397
|
+
}
|
|
1398
|
+
fsmParse = this.analysis(lowerCased, this.isProperNoun(surfaceForm));
|
|
1371
1399
|
}
|
|
1372
1400
|
}
|
|
1373
1401
|
}
|
|
@@ -1379,30 +1407,29 @@
|
|
|
1379
1407
|
}
|
|
1380
1408
|
}
|
|
1381
1409
|
}
|
|
1382
|
-
|
|
1383
|
-
|
|
1384
|
-
|
|
1385
|
-
|
|
1386
|
-
}
|
|
1387
|
-
let fsmParseList = new FsmParseList_1.FsmParseList(fsmParse);
|
|
1388
|
-
if (this.cache != undefined && fsmParseList.size() > 0) {
|
|
1389
|
-
this.cache.add(surfaceForm, fsmParseList);
|
|
1410
|
+
}
|
|
1411
|
+
if (!isRootReplaced) {
|
|
1412
|
+
for (let parse of fsmParse) {
|
|
1413
|
+
parse.restoreOriginalForm(possibleRootLowerCased, pronunciation);
|
|
1390
1414
|
}
|
|
1391
|
-
return fsmParseList;
|
|
1392
1415
|
}
|
|
1393
|
-
|
|
1394
|
-
|
|
1395
|
-
|
|
1396
|
-
*
|
|
1397
|
-
* @param surfaceForm String to check.
|
|
1398
|
-
* @param rootWord TxtWord input root.
|
|
1399
|
-
* @return true an analysis exists, otherwise return false.
|
|
1400
|
-
*/
|
|
1401
|
-
morphologicalAnalysisExists(rootWord, surfaceForm) {
|
|
1402
|
-
return this.analysisExists(rootWord, surfaceForm.toLocaleLowerCase("tr"), true);
|
|
1416
|
+
let fsmParseList = new FsmParseList_1.FsmParseList(fsmParse);
|
|
1417
|
+
if (this.cache != undefined && fsmParseList.size() > 0) {
|
|
1418
|
+
this.cache.add(surfaceForm, fsmParseList);
|
|
1403
1419
|
}
|
|
1420
|
+
return fsmParseList;
|
|
1404
1421
|
}
|
|
1405
|
-
|
|
1406
|
-
|
|
1407
|
-
|
|
1422
|
+
/**
|
|
1423
|
+
* The morphologicalAnalysisExists method calls analysisExists to check the existence of the analysis with given
|
|
1424
|
+
* root and surfaceForm.
|
|
1425
|
+
*
|
|
1426
|
+
* @param surfaceForm String to check.
|
|
1427
|
+
* @param rootWord TxtWord input root.
|
|
1428
|
+
* @return true an analysis exists, otherwise return false.
|
|
1429
|
+
*/
|
|
1430
|
+
morphologicalAnalysisExists(rootWord, surfaceForm) {
|
|
1431
|
+
return this.analysisExists(rootWord, surfaceForm.toLocaleLowerCase("tr"), true);
|
|
1432
|
+
}
|
|
1433
|
+
}
|
|
1434
|
+
exports.FsmMorphologicalAnalyzer = FsmMorphologicalAnalyzer;
|
|
1408
1435
|
//# sourceMappingURL=FsmMorphologicalAnalyzer.js.map
|