nlptoolkit-morphologicalanalysis 1.0.14 → 1.0.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83) hide show
  1. package/README.md +1 -1
  2. package/dist/Corpus/DisambiguationCorpus.d.ts +7 -0
  3. package/dist/Corpus/DisambiguationCorpus.js +7 -0
  4. package/dist/Corpus/DisambiguationCorpus.js.map +1 -1
  5. package/dist/MorphologicalAnalysis/FsmMorphologicalAnalyzer.d.ts +67 -3
  6. package/dist/MorphologicalAnalysis/FsmMorphologicalAnalyzer.js +136 -64
  7. package/dist/MorphologicalAnalysis/FsmMorphologicalAnalyzer.js.map +1 -1
  8. package/dist/MorphologicalAnalysis/FsmParse.d.ts +9 -0
  9. package/dist/MorphologicalAnalysis/FsmParse.js +15 -0
  10. package/dist/MorphologicalAnalysis/FsmParse.js.map +1 -1
  11. package/dist/MorphologicalAnalysis/InflectionalGroup.js +3 -2
  12. package/dist/MorphologicalAnalysis/InflectionalGroup.js.map +1 -1
  13. package/dist/MorphologicalAnalysis/MorphologicalParse.d.ts +98 -0
  14. package/dist/MorphologicalAnalysis/MorphologicalParse.js +161 -10
  15. package/dist/MorphologicalAnalysis/MorphologicalParse.js.map +1 -1
  16. package/dist/MorphologicalAnalysis/MorphologicalTag.d.ts +9 -1
  17. package/dist/MorphologicalAnalysis/MorphologicalTag.js +8 -0
  18. package/dist/MorphologicalAnalysis/MorphologicalTag.js.map +1 -1
  19. package/dist/MorphologicalAnalysis/MorphotacticEngine.d.ts +47 -0
  20. package/dist/MorphologicalAnalysis/MorphotacticEngine.js +51 -1
  21. package/dist/MorphologicalAnalysis/MorphotacticEngine.js.map +1 -1
  22. package/dist/MorphologicalAnalysis/Transition.d.ts +22 -8
  23. package/dist/MorphologicalAnalysis/Transition.js +25 -9
  24. package/dist/MorphologicalAnalysis/Transition.js.map +1 -1
  25. package/package.json +2 -2
  26. package/parses/ac/314/247/304/261kla.txt +57 -3
  27. package/parses/ak.txt +72 -3
  28. package/parses/aksa.txt +40 -2
  29. package/parses/anla.txt +57 -3
  30. package/parses/azal.txt +63 -4
  31. package/parses/bo/314/210l.txt +53 -3
  32. package/parses/bul.txt +53 -3
  33. package/parses/cenk.txt +8 -0
  34. package/parses/cevapla.txt +74 -4
  35. package/parses/cos/314/247.txt +53 -3
  36. package/parses/c/314/247o/314/210k.txt +54 -3
  37. package/parses/c/314/247/304/261k.txt +59 -3
  38. package/parses/del.txt +47 -3
  39. package/parses/doldur.txt +47 -3
  40. package/parses/emlak.txt +2 -0
  41. package/parses/git.txt +59 -3
  42. package/parses/giy.txt +59 -3
  43. package/parses/go/314/210c/314/247.txt +59 -3
  44. package/parses/go/314/210ster.txt +63 -4
  45. package/parses/hal.txt +20 -4
  46. package/parses/kalp.txt +29 -4
  47. package/parses/kavur.txt +80 -5
  48. package/parses/kaydol.txt +69 -4
  49. package/parses/resim.txt +14 -0
  50. package/parses/s/304/261ska.txt +24 -0
  51. package/parses/ye.txt +40 -2
  52. package/parses/yemek.txt +6 -0
  53. package/parses/y/304/261ka.txt +90 -5
  54. package/parses/y/304/261ldo/314/210nu/314/210mu/314/210.txt +6 -0
  55. package/pronunciations.txt +490 -0
  56. package/source/Corpus/DisambiguationCorpus.ts +7 -0
  57. package/source/MorphologicalAnalysis/FsmMorphologicalAnalyzer.ts +141 -67
  58. package/source/MorphologicalAnalysis/FsmParse.ts +16 -1
  59. package/source/MorphologicalAnalysis/InflectionalGroup.ts +3 -2
  60. package/source/MorphologicalAnalysis/MorphologicalParse.ts +161 -10
  61. package/source/MorphologicalAnalysis/MorphologicalTag.ts +9 -1
  62. package/source/MorphologicalAnalysis/MorphotacticEngine.ts +51 -1
  63. package/source/MorphologicalAnalysis/Transition.ts +25 -9
  64. package/tests/DisambiguationCorpusTest.js +14 -0
  65. package/tests/DisambiguationCorpusTest.js.map +1 -0
  66. package/tests/FiniteStateMachineTest.js +96 -0
  67. package/tests/FiniteStateMachineTest.js.map +1 -0
  68. package/tests/FiniteStateMachineTest.ts +1 -1
  69. package/tests/FsmMorphologicalAnalyzerTest.js +250 -0
  70. package/tests/FsmMorphologicalAnalyzerTest.js.map +1 -0
  71. package/tests/FsmMorphologicalAnalyzerTest.ts +9 -10
  72. package/tests/FsmParseListTest.js +100 -0
  73. package/tests/FsmParseListTest.js.map +1 -0
  74. package/tests/FsmParseTest.js +68 -0
  75. package/tests/FsmParseTest.js.map +1 -0
  76. package/tests/InflectionalGroupTest.js +86 -0
  77. package/tests/InflectionalGroupTest.js.map +1 -0
  78. package/tests/MorphologicalParseTest.js +154 -0
  79. package/tests/MorphologicalParseTest.js.map +1 -0
  80. package/tests/TransitionTest.js +184 -0
  81. package/tests/TransitionTest.js.map +1 -0
  82. package/tests/TransitionTest.ts +8 -0
  83. package/turkish_finite_state_machine.xml +11 -3
package/README.md CHANGED
@@ -32,7 +32,7 @@ For Developers
32
32
  ============
33
33
 
34
34
  You can also see [Python](https://github.com/starlangsoftware/TurkishMorphologicalAnalysis-Py),
35
- [Java](https://github.com/starlangsoftware/TurkishMorphologicalAnalysis), [C++](https://github.com/starlangsoftware/TurkishMorphologicalAnalysis-CPP),
35
+ [Java](https://github.com/starlangsoftware/TurkishMorphologicalAnalysis), [C++](https://github.com/starlangsoftware/TurkishMorphologicalAnalysis-CPP), [C](https://github.com/starlangsoftware/TurkishMorphologicalAnalysis-C),
36
36
  [Swift](https://github.com/starlangsoftware/TurkishMorphologicalAnalysis-Swift), [Cython](https://github.com/starlangsoftware/TurkishMorphologicalAnalysis-Cy),
37
37
  or [C#](https://github.com/starlangsoftware/TurkishMorphologicalAnalysis-CS) repository.
38
38
 
@@ -1,4 +1,11 @@
1
1
  import { Corpus } from "nlptoolkit-corpus/dist/Corpus";
2
2
  export declare class DisambiguationCorpus extends Corpus {
3
+ /**
4
+ * Constructor which takes a file name {@link String} as an input and reads the file line by line. It takes each word of the line,
5
+ * and creates a new {@link DisambiguatedWord} with current word and its {@link MorphologicalParse}. It also creates a new {@link Sentence}
6
+ * when a new sentence starts, and adds each word to this sentence till the end of that sentence.
7
+ *
8
+ * @param fileName File which will be read and parsed.
9
+ */
3
10
  constructor(fileName?: string);
4
11
  }
@@ -16,6 +16,13 @@
16
16
  const MorphologicalParse_1 = require("../MorphologicalAnalysis/MorphologicalParse");
17
17
  const Sentence_1 = require("nlptoolkit-corpus/dist/Sentence");
18
18
  class DisambiguationCorpus extends Corpus_1.Corpus {
19
+ /**
20
+ * Constructor which takes a file name {@link String} as an input and reads the file line by line. It takes each word of the line,
21
+ * and creates a new {@link DisambiguatedWord} with current word and its {@link MorphologicalParse}. It also creates a new {@link Sentence}
22
+ * when a new sentence starts, and adds each word to this sentence till the end of that sentence.
23
+ *
24
+ * @param fileName File which will be read and parsed.
25
+ */
19
26
  constructor(fileName) {
20
27
  super();
21
28
  if (fileName != undefined) {
@@ -1 +1 @@
1
- {"version":3,"file":"DisambiguationCorpus.js","sourceRoot":"","sources":["../../source/Corpus/DisambiguationCorpus.ts"],"names":[],"mappings":";;;;;;;;;;;;IAAA,0DAAqD;IACrD,yBAAyB;IACzB,2DAAsD;IACtD,oFAA+E;IAC/E,8DAAyD;IAEzD,MAAa,oBAAqB,SAAQ,eAAM;QAE5C,YAAY,QAAiB;YACzB,KAAK,EAAE,CAAC;YACR,IAAI,QAAQ,IAAI,SAAS,EAAC;gBACtB,IAAI,WAAW,GAAG,SAAS,CAAC;gBAC5B,IAAI,IAAI,GAAG,EAAE,CAAC,YAAY,CAAC,QAAQ,EAAE,MAAM,CAAC,CAAA;gBAC5C,IAAI,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAA;gBAC5B,KAAK,IAAI,IAAI,IAAI,KAAK,EAAE;oBACpB,IAAI,IAAI,GAAG,IAAI,CAAC,SAAS,CAAC,CAAC,EAAE,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC;oBACjD,IAAI,KAAK,GAAG,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC;oBACnD,IAAI,IAAI,IAAI,EAAE,IAAI,KAAK,IAAI,EAAE,EAAE;wBAC3B,IAAI,OAAO,GAAG,IAAI,qCAAiB,CAAC,IAAI,EAAE,IAAI,uCAAkB,CAAC,KAAK,CAAC,CAAC,CAAC;wBACzE,IAAI,IAAI,IAAI,KAAK,EAAE;4BACf,WAAW,GAAG,IAAI,mBAAQ,EAAE,CAAC;yBAChC;6BAAM;4BACH,IAAI,IAAI,IAAI,MAAM,EAAE;gCAChB,IAAI,CAAC,WAAW,CAAC,WAAW,CAAC,CAAC;6BACjC;iCAAM;gCACH,IAAI,IAAI,IAAI,OAAO,IAAI,IAAI,IAAI,QAAQ,IAAI,IAAI,IAAI,SAAS,IAAI,IAAI,IAAI,UAAU,EAAE;iCACnF;qCAAM;oCACH,IAAI,WAAW,IAAI,IAAI,EAAE;wCACrB,WAAW,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;qCAChC;iCACJ;6BACJ;yBACJ;qBACJ;iBACJ;aACJ;QACL,CAAC;KAEJ;IAhCD,oDAgCC"}
1
+ {"version":3,"file":"DisambiguationCorpus.js","sourceRoot":"","sources":["../../source/Corpus/DisambiguationCorpus.ts"],"names":[],"mappings":";;;;;;;;;;;;IAAA,0DAAqD;IACrD,yBAAyB;IACzB,2DAAsD;IACtD,oFAA+E;IAC/E,8DAAyD;IAEzD,MAAa,oBAAqB,SAAQ,eAAM;QAE5C;;;;;;WAMG;QACH,YAAY,QAAiB;YACzB,KAAK,EAAE,CAAC;YACR,IAAI,QAAQ,IAAI,SAAS,EAAC;gBACtB,IAAI,WAAW,GAAG,SAAS,CAAC;gBAC5B,IAAI,IAAI,GAAG,EAAE,CAAC,YAAY,CAAC,QAAQ,EAAE,MAAM,CAAC,CAAA;gBAC5C,IAAI,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAA;gBAC5B,KAAK,IAAI,IAAI,IAAI,KAAK,EAAE;oBACpB,IAAI,IAAI,GAAG,IAAI,CAAC,SAAS,CAAC,CAAC,EAAE,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC;oBACjD,IAAI,KAAK,GAAG,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC;oBACnD,IAAI,IAAI,IAAI,EAAE,IAAI,KAAK,IAAI,EAAE,EAAE;wBAC3B,IAAI,OAAO,GAAG,IAAI,qCAAiB,CAAC,IAAI,EAAE,IAAI,uCAAkB,CAAC,KAAK,CAAC,CAAC,CAAC;wBACzE,IAAI,IAAI,IAAI,KAAK,EAAE;4BACf,WAAW,GAAG,IAAI,mBAAQ,EAAE,CAAC;yBAChC;6BAAM;4BACH,IAAI,IAAI,IAAI,MAAM,EAAE;gCAChB,IAAI,CAAC,WAAW,CAAC,WAAW,CAAC,CAAC;6BACjC;iCAAM;gCACH,IAAI,IAAI,IAAI,OAAO,IAAI,IAAI,IAAI,QAAQ,IAAI,IAAI,IAAI,SAAS,IAAI,IAAI,IAAI,UAAU,EAAE;iCACnF;qCAAM;oCACH,IAAI,WAAW,IAAI,IAAI,EAAE;wCACrB,WAAW,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;qCAChC;iCACJ;6BACJ;yBACJ;qBACJ;iBACJ;aACJ;QACL,CAAC;KAEJ;IAvCD,oDAuCC"}
@@ -10,10 +10,11 @@ export declare class FsmMorphologicalAnalyzer {
10
10
  private dictionaryTrie;
11
11
  private suffixTrie;
12
12
  private parsedSurfaceForms;
13
- private finiteStateMachine;
13
+ private pronunciations;
14
+ private readonly finiteStateMachine;
14
15
  private static MAX_DISTANCE;
15
- private dictionary;
16
- private cache;
16
+ private readonly dictionary;
17
+ private readonly cache;
17
18
  private mostUsedPatterns;
18
19
  /**
19
20
  * Another constructor of FsmMorphologicalAnalyzer class. It generates a new TxtDictionary type dictionary from
@@ -24,9 +25,29 @@ export declare class FsmMorphologicalAnalyzer {
24
25
  * @param cacheSize the size of the LRUCache.
25
26
  */
26
27
  constructor(fileName?: string, dictionaryFileNameOrDictionary?: any, cacheSize?: number);
28
+ /**
29
+ * Constructs and returns the reverse string of a given string.
30
+ * @param s String to be reversed.
31
+ * @return Reverse of a given string.
32
+ */
27
33
  private reverseString;
34
+ /**
35
+ * Constructs the suffix trie from the input file suffixes.txt. suffixes.txt contains the most frequent 6000
36
+ * suffixes that a verb or a noun can take. The suffix trie is a trie that stores these suffixes in reverse form,
37
+ * which can be then used to match a given word for its possible suffix content.
38
+ */
28
39
  private prepareSuffixTrie;
40
+ /**
41
+ * Reads the file for correct surface forms and their most frequent root forms, in other words, the surface forms
42
+ * which have at least one morphological analysis in Turkish.
43
+ * @param fileName Input file containing analyzable surface forms and their root forms.
44
+ */
29
45
  addParsedSurfaceForms(fileName: string): void;
46
+ /**
47
+ * Reads the file for foreign words and their pronunciations.
48
+ * @param fileName Input file containing foreign words and their pronunciations.
49
+ */
50
+ addPronunciations(fileName: string): void;
30
51
  /**
31
52
  * The getPossibleWords method takes {@link MorphologicalParse} and {@link MetamorphicParse} as input.
32
53
  * First it determines whether the given morphologicalParse is the root verb and whether it contains a verb tag.
@@ -319,6 +340,15 @@ export declare class FsmMorphologicalAnalyzer {
319
340
  * @return ArrayList type initialFsmParse which holds the analyses.
320
341
  */
321
342
  analysis(surfaceForm: string, isProper: boolean): Array<FsmParse>;
343
+ /**
344
+ * This method uses cache idea to speed up pattern matching in Fsm. mostUsedPatterns stores the compiled forms of
345
+ * the previously used patterns. When Fsm tries to match a string to a pattern, first we check if it exists in
346
+ * mostUsedPatterns. If it exists, we directly use the compiled pattern to match the string. Otherwise, new pattern
347
+ * is compiled and put in the mostUsedPatterns.
348
+ * @param expr Pattern to check
349
+ * @param value String to match the pattern
350
+ * @return True if the string matches the pattern, false otherwise.
351
+ */
322
352
  private patternMatches;
323
353
  /**
324
354
  * The isProperNoun method takes surfaceForm String as input and checks its each char whether they are in the range
@@ -335,6 +365,19 @@ export declare class FsmMorphologicalAnalyzer {
335
365
  * @return true if it is a code-like word, return false otherwise.
336
366
  */
337
367
  isCode(surfaceForm: string): boolean;
368
+ /**
369
+ * Identifies a possible new root word for a given surface form. It also adds the new root form to the dictionary
370
+ * for further usage. The method first searches the suffix trie for the reverse string of the surface form. This
371
+ * way, it can identify if the word has a suffix that is in the most frequently used suffix list. Since a word can
372
+ * have multiple possible suffixes, the method identifies the longest suffix and returns the substring of the
373
+ * surface form that does not contain the suffix. Let's say the word is 'googlelaştırdık', it will identify 'dık' as
374
+ * a suffix and will return 'googlelaştır' as a possible root form. Another example will be 'homelesslerimizle', it
375
+ * will identify 'lerimizle' as suffix and will return 'homeless' as a possible root form. If the root word ends
376
+ * with 'ğ', it is replaced with 'k'. 'morfolojikliğini' will return 'morfolojikliğ', which will then be replaced
377
+ * with 'morfolojiklik'.
378
+ * @param surfaceForm Surface form for which we will identify a possible new root form.
379
+ * @return Possible new root form.
380
+ */
338
381
  private rootOfPossiblyNewWord;
339
382
  /**
340
383
  * The robustMorphologicalAnalysis is used to analyse surfaceForm String. First it gets the currentParse of the surfaceForm
@@ -383,9 +426,30 @@ export declare class FsmMorphologicalAnalyzer {
383
426
  * @return true if surfaceForm matches with the regex.
384
427
  */
385
428
  private isNumber;
429
+ /**
430
+ * Checks if a given surface form matches to a percent value. It should be something like %4, %45, %4.3 or %56.786
431
+ * @param surfaceForm Surface form to be checked.
432
+ * @return True if the surface form is in percent form
433
+ */
386
434
  private isPercent;
435
+ /**
436
+ * Checks if a given surface form matches to a time form. It should be something like 3:34, 12:56 etc.
437
+ * @param surfaceForm Surface form to be checked.
438
+ * @return True if the surface form is in time form
439
+ */
387
440
  private isTime;
441
+ /**
442
+ * Checks if a given surface form matches to a range form. It should be something like 123-1400 or 12:34-15:78 or
443
+ * 3.45-4.67.
444
+ * @param surfaceForm Surface form to be checked.
445
+ * @return True if the surface form is in range form
446
+ */
388
447
  private isRange;
448
+ /**
449
+ * Checks if a given surface form matches to a date form. It should be something like 3/10/2023 or 2.3.2012
450
+ * @param surfaceForm Surface form to be checked.
451
+ * @return True if the surface form is in date form
452
+ */
389
453
  private isDate;
390
454
  /**
391
455
  * The morphologicalAnalysis method is used to analyse a FsmParseList by comparing with the regex.
@@ -4,7 +4,7 @@
4
4
  if (v !== undefined) module.exports = v;
5
5
  }
6
6
  else if (typeof define === "function" && define.amd) {
7
- define(["require", "exports", "nlptoolkit-dictionary/dist/Dictionary/Trie/Trie", "./FiniteStateMachine", "nlptoolkit-dictionary/dist/Dictionary/TxtDictionary", "nlptoolkit-datastructure/dist/LRUCache", "./FsmParseList", "nlptoolkit-dictionary/dist/Dictionary/WordComparator", "fs", "./Transition", "./MorphologicalTag", "nlptoolkit-dictionary/dist/Dictionary/TxtWord", "./FsmParse", "nlptoolkit-corpus/dist/Sentence", "nlptoolkit-dictionary/dist/Dictionary/Word", "./State", "nlptoolkit-datastructure/dist/Queue"], factory);
7
+ define(["require", "exports", "nlptoolkit-dictionary/dist/Dictionary/Trie/Trie", "./FiniteStateMachine", "nlptoolkit-dictionary/dist/Dictionary/TxtDictionary", "nlptoolkit-datastructure/dist/LRUCache", "./FsmParseList", "nlptoolkit-dictionary/dist/Dictionary/WordComparator", "fs", "./Transition", "./MorphologicalTag", "nlptoolkit-dictionary/dist/Dictionary/TxtWord", "./FsmParse", "nlptoolkit-corpus/dist/Sentence", "nlptoolkit-dictionary/dist/Dictionary/Word", "./State", "nlptoolkit-datastructure/dist/Queue", "nlptoolkit-util/dist/FileUtils"], factory);
8
8
  }
9
9
  })(function (require, exports) {
10
10
  "use strict";
@@ -25,6 +25,7 @@
25
25
  const Word_1 = require("nlptoolkit-dictionary/dist/Dictionary/Word");
26
26
  const State_1 = require("./State");
27
27
  const Queue_1 = require("nlptoolkit-datastructure/dist/Queue");
28
+ const FileUtils_1 = require("nlptoolkit-util/dist/FileUtils");
28
29
  class FsmMorphologicalAnalyzer {
29
30
  /**
30
31
  * Another constructor of FsmMorphologicalAnalyzer class. It generates a new TxtDictionary type dictionary from
@@ -36,6 +37,7 @@
36
37
  */
37
38
  constructor(fileName, dictionaryFileNameOrDictionary, cacheSize) {
38
39
  this.parsedSurfaceForms = undefined;
40
+ this.pronunciations = undefined;
39
41
  this.cache = undefined;
40
42
  this.mostUsedPatterns = new Map();
41
43
  if (dictionaryFileNameOrDictionary == undefined) {
@@ -60,7 +62,13 @@
60
62
  if (cacheSize > 0) {
61
63
  this.cache = new LRUCache_1.LRUCache(cacheSize);
62
64
  }
65
+ this.addPronunciations("pronunciations.txt");
63
66
  }
67
+ /**
68
+ * Constructs and returns the reverse string of a given string.
69
+ * @param s String to be reversed.
70
+ * @return Reverse of a given string.
71
+ */
64
72
  reverseString(s) {
65
73
  let result = "";
66
74
  for (let i = s.length - 1; i >= 0; i--) {
@@ -68,6 +76,11 @@
68
76
  }
69
77
  return result;
70
78
  }
79
+ /**
80
+ * Constructs the suffix trie from the input file suffixes.txt. suffixes.txt contains the most frequent 6000
81
+ * suffixes that a verb or a noun can take. The suffix trie is a trie that stores these suffixes in reverse form,
82
+ * which can be then used to match a given word for its possible suffix content.
83
+ */
71
84
  prepareSuffixTrie() {
72
85
  this.suffixTrie = new Trie_1.Trie();
73
86
  let data = fs.readFileSync("suffixes.txt", 'utf8');
@@ -77,14 +90,20 @@
77
90
  this.suffixTrie.addWord(reverseSuffix, new Word_1.Word(reverseSuffix));
78
91
  }
79
92
  }
93
+ /**
94
+ * Reads the file for correct surface forms and their most frequent root forms, in other words, the surface forms
95
+ * which have at least one morphological analysis in Turkish.
96
+ * @param fileName Input file containing analyzable surface forms and their root forms.
97
+ */
80
98
  addParsedSurfaceForms(fileName) {
81
- this.parsedSurfaceForms = new Map();
82
- let data = fs.readFileSync(fileName, 'utf8');
83
- let lines = data.split("\n");
84
- for (let line of lines) {
85
- let items = line.split(" ");
86
- this.parsedSurfaceForms.set(items[0], items[1]);
87
- }
99
+ this.parsedSurfaceForms = FileUtils_1.FileUtils.readHashMap(fileName);
100
+ }
101
+ /**
102
+ * Reads the file for foreign words and their pronunciations.
103
+ * @param fileName Input file containing foreign words and their pronunciations.
104
+ */
105
+ addPronunciations(fileName) {
106
+ this.pronunciations = FileUtils_1.FileUtils.readHashMap(fileName);
88
107
  }
89
108
  /**
90
109
  * The getPossibleWords method takes {@link MorphologicalParse} and {@link MetamorphicParse} as input.
@@ -677,7 +696,7 @@
677
696
  */
678
697
  parseWordLength(fsmParse, maxLength) {
679
698
  let result = new Array();
680
- let resultSuffixList = new Array();
699
+ let resultTransitionList = new Array();
681
700
  let parseQueue = new Queue_1.Queue(1000);
682
701
  parseQueue.enqueueAll(fsmParse);
683
702
  while (!parseQueue.isEmpty()) {
@@ -687,11 +706,11 @@
687
706
  let currentState = currentFsmParse.getFinalSuffix();
688
707
  let currentSurfaceForm = currentFsmParse.getSurfaceForm();
689
708
  if (currentState.isEndState() && currentSurfaceForm.length <= maxLength) {
690
- let currentSuffixList = currentFsmParse.getSuffixList();
691
- if (!resultSuffixList.includes(currentSuffixList)) {
709
+ let currentTransitionList = currentSurfaceForm + " " + currentFsmParse.getFsmParseTransitionList();
710
+ if (!resultTransitionList.includes(currentTransitionList)) {
692
711
  result.push(currentFsmParse);
693
712
  currentFsmParse.constructInflectionalGroups();
694
- resultSuffixList.push(currentSuffixList);
713
+ resultTransitionList.push(currentTransitionList);
695
714
  }
696
715
  }
697
716
  this.addNewParsesFromCurrentParseLength(currentFsmParse, parseQueue, maxLength, root);
@@ -708,7 +727,7 @@
708
727
  */
709
728
  parseWordSurfaceForm(fsmParse, surfaceForm) {
710
729
  let result = new Array();
711
- let resultSuffixList = new Array();
730
+ let resultTransitionList = new Array();
712
731
  let parseQueue = new Queue_1.Queue(1000);
713
732
  parseQueue.enqueueAll(fsmParse);
714
733
  while (!parseQueue.isEmpty()) {
@@ -718,11 +737,11 @@
718
737
  let currentState = currentFsmParse.getFinalSuffix();
719
738
  let currentSurfaceForm = currentFsmParse.getSurfaceForm();
720
739
  if (currentState.isEndState() && currentSurfaceForm == surfaceForm) {
721
- let currentSuffixList = currentFsmParse.getSuffixList();
722
- if (!resultSuffixList.includes(currentSuffixList)) {
740
+ let currentTransitionList = currentFsmParse.getFsmParseTransitionList();
741
+ if (!resultTransitionList.includes(currentTransitionList)) {
723
742
  result.push(currentFsmParse);
724
743
  currentFsmParse.constructInflectionalGroups();
725
- resultSuffixList.push(currentSuffixList);
744
+ resultTransitionList.push(currentTransitionList);
726
745
  }
727
746
  }
728
747
  this.addNewParsesFromCurrentParseSurfaceForm(currentFsmParse, parseQueue, surfaceForm, root);
@@ -986,6 +1005,15 @@
986
1005
  initialFsmParse = this.initializeParseListFromSurfaceForm(surfaceForm, isProper);
987
1006
  return this.parseWordSurfaceForm(initialFsmParse, surfaceForm);
988
1007
  }
1008
+ /**
1009
+ * This method uses cache idea to speed up pattern matching in Fsm. mostUsedPatterns stores the compiled forms of
1010
+ * the previously used patterns. When Fsm tries to match a string to a pattern, first we check if it exists in
1011
+ * mostUsedPatterns. If it exists, we directly use the compiled pattern to match the string. Otherwise, new pattern
1012
+ * is compiled and put in the mostUsedPatterns.
1013
+ * @param expr Pattern to check
1014
+ * @param value String to match the pattern
1015
+ * @return True if the string matches the pattern, false otherwise.
1016
+ */
989
1017
  patternMatches(expr, value) {
990
1018
  let p = this.mostUsedPatterns.get(expr);
991
1019
  if (p == undefined) {
@@ -1021,31 +1049,39 @@
1021
1049
  }
1022
1050
  return this.patternMatches("^.*[0-9].*$", surfaceForm) && this.patternMatches("^.*[a-zA-ZçöğüşıÇÖĞÜŞİ].*$", surfaceForm);
1023
1051
  }
1052
+ /**
1053
+ * Identifies a possible new root word for a given surface form. It also adds the new root form to the dictionary
1054
+ * for further usage. The method first searches the suffix trie for the reverse string of the surface form. This
1055
+ * way, it can identify if the word has a suffix that is in the most frequently used suffix list. Since a word can
1056
+ * have multiple possible suffixes, the method identifies the longest suffix and returns the substring of the
1057
+ * surface form that does not contain the suffix. Let's say the word is 'googlelaştırdık', it will identify 'dık' as
1058
+ * a suffix and will return 'googlelaştır' as a possible root form. Another example will be 'homelesslerimizle', it
1059
+ * will identify 'lerimizle' as suffix and will return 'homeless' as a possible root form. If the root word ends
1060
+ * with 'ğ', it is replaced with 'k'. 'morfolojikliğini' will return 'morfolojikliğ', which will then be replaced
1061
+ * with 'morfolojiklik'.
1062
+ * @param surfaceForm Surface form for which we will identify a possible new root form.
1063
+ * @return Possible new root form.
1064
+ */
1024
1065
  rootOfPossiblyNewWord(surfaceForm) {
1025
1066
  let words = this.suffixTrie.getWordsWithPrefix(this.reverseString(surfaceForm));
1026
- let maxLength = 0;
1027
- let longestWord = null;
1067
+ let candidateWord = null;
1068
+ let candidateList = new Array();
1028
1069
  for (let word of words) {
1029
- if (word.getName().length > maxLength) {
1030
- longestWord = surfaceForm.substring(0, surfaceForm.length - word.getName().length);
1031
- maxLength = word.getName().length;
1032
- }
1033
- }
1034
- if (maxLength != 0) {
1070
+ candidateWord = surfaceForm.substring(0, surfaceForm.length - word.getName().length);
1035
1071
  let newWord;
1036
- if (longestWord.endsWith("ğ")) {
1037
- longestWord = longestWord.substring(0, longestWord.length - 1) + "k";
1038
- newWord = new TxtWord_1.TxtWord(longestWord, "CL_ISIM");
1072
+ if (candidateWord.endsWith("ğ")) {
1073
+ candidateWord = candidateWord.substring(0, candidateWord.length - 1) + "k";
1074
+ newWord = new TxtWord_1.TxtWord(candidateWord, "CL_ISIM");
1039
1075
  newWord.addFlag("IS_SD");
1040
1076
  }
1041
1077
  else {
1042
- newWord = new TxtWord_1.TxtWord(longestWord, "CL_ISIM");
1078
+ newWord = new TxtWord_1.TxtWord(candidateWord, "CL_ISIM");
1043
1079
  newWord.addFlag("CL_FIIL");
1044
1080
  }
1045
- this.dictionaryTrie.addWord(longestWord, newWord);
1046
- return newWord;
1081
+ candidateList.push(newWord);
1082
+ this.dictionaryTrie.addWord(candidateWord, newWord);
1047
1083
  }
1048
- return null;
1084
+ return candidateList;
1049
1085
  }
1050
1086
  /**
1051
1087
  * The robustMorphologicalAnalysis is used to analyse surfaceForm String. First it gets the currentParse of the surfaceForm
@@ -1066,21 +1102,17 @@
1066
1102
  if (this.isProperNoun(surfaceForm)) {
1067
1103
  fsmParse.push(new FsmParse_1.FsmParse(surfaceForm, this.finiteStateMachine.getState("ProperRoot")));
1068
1104
  }
1069
- else {
1070
- if (this.isCode(surfaceForm)) {
1071
- fsmParse.push(new FsmParse_1.FsmParse(surfaceForm, this.finiteStateMachine.getState("CodeRoot")));
1072
- }
1073
- else {
1074
- let newRoot = this.rootOfPossiblyNewWord(surfaceForm);
1075
- if (newRoot != null) {
1076
- fsmParse.push(new FsmParse_1.FsmParse(newRoot, this.finiteStateMachine.getState("VerbalRoot")));
1077
- fsmParse.push(new FsmParse_1.FsmParse(newRoot, this.finiteStateMachine.getState("NominalRoot")));
1078
- }
1079
- else {
1080
- fsmParse.push(new FsmParse_1.FsmParse(surfaceForm, this.finiteStateMachine.getState("NominalRoot")));
1081
- }
1105
+ if (this.isCode(surfaceForm)) {
1106
+ fsmParse.push(new FsmParse_1.FsmParse(surfaceForm, this.finiteStateMachine.getState("CodeRoot")));
1107
+ }
1108
+ let newCandidateList = this.rootOfPossiblyNewWord(surfaceForm);
1109
+ if (newCandidateList.length != 0) {
1110
+ for (let word of newCandidateList) {
1111
+ fsmParse.push(new FsmParse_1.FsmParse(word, this.finiteStateMachine.getState("VerbalRoot")));
1112
+ fsmParse.push(new FsmParse_1.FsmParse(word, this.finiteStateMachine.getState("NominalRoot")));
1082
1113
  }
1083
1114
  }
1115
+ fsmParse.push(new FsmParse_1.FsmParse(surfaceForm, this.finiteStateMachine.getState("NominalRoot")));
1084
1116
  return new FsmParseList_1.FsmParseList(this.parseWordSurfaceForm(fsmParse, surfaceForm));
1085
1117
  }
1086
1118
  else {
@@ -1186,19 +1218,40 @@
1186
1218
  }
1187
1219
  return word == "" && count > 1;
1188
1220
  }
1221
+ /**
1222
+ * Checks if a given surface form matches to a percent value. It should be something like %4, %45, %4.3 or %56.786
1223
+ * @param surfaceForm Surface form to be checked.
1224
+ * @return True if the surface form is in percent form
1225
+ */
1189
1226
  isPercent(surfaceForm) {
1190
1227
  return this.patternMatches("^%(\\d\\d|\\d)$", surfaceForm) ||
1191
1228
  this.patternMatches("^%(\\d\\d|\\d)\\.\\d+$", surfaceForm);
1192
1229
  }
1230
+ /**
1231
+ * Checks if a given surface form matches to a time form. It should be something like 3:34, 12:56 etc.
1232
+ * @param surfaceForm Surface form to be checked.
1233
+ * @return True if the surface form is in time form
1234
+ */
1193
1235
  isTime(surfaceForm) {
1194
1236
  return this.patternMatches("^(\\d\\d|\\d):(\\d\\d|\\d):(\\d\\d|\\d)$", surfaceForm) ||
1195
1237
  this.patternMatches("^(\\d\\d|\\d):(\\d\\d|\\d)$", surfaceForm);
1196
1238
  }
1239
+ /**
1240
+ * Checks if a given surface form matches to a range form. It should be something like 123-1400 or 12:34-15:78 or
1241
+ * 3.45-4.67.
1242
+ * @param surfaceForm Surface form to be checked.
1243
+ * @return True if the surface form is in range form
1244
+ */
1197
1245
  isRange(surfaceForm) {
1198
1246
  return this.patternMatches("^\\d+-\\d+$", surfaceForm) ||
1199
1247
  this.patternMatches("^(\\d\\d|\\d):(\\d\\d|\\d)-(\\d\\d|\\d):(\\d\\d|\\d)$", surfaceForm) ||
1200
1248
  this.patternMatches("^(\\d\\d|\\d)\\.(\\d\\d|\\d)-(\\d\\d|\\d)\\.(\\d\\d|\\d)$", surfaceForm);
1201
1249
  }
1250
+ /**
1251
+ * Checks if a given surface form matches to a date form. It should be something like 3/10/2023 or 2.3.2012
1252
+ * @param surfaceForm Surface form to be checked.
1253
+ * @return True if the surface form is in date form
1254
+ */
1202
1255
  isDate(surfaceForm) {
1203
1256
  return this.patternMatches("^(\\d\\d|\\d)/(\\d\\d|\\d)/\\d+$", surfaceForm) ||
1204
1257
  this.patternMatches("^(\\d\\d|\\d)\\.(\\d\\d|\\d)\\.\\d+$", surfaceForm);
@@ -1222,6 +1275,8 @@
1222
1275
  */
1223
1276
  morphologicalAnalysis(surfaceForm) {
1224
1277
  let lowerCased = surfaceForm.toLocaleLowerCase("tr");
1278
+ let possibleRootLowerCased = "", pronunciation = "";
1279
+ let isRootReplaced = false;
1225
1280
  if (this.parsedSurfaceForms != undefined && this.parsedSurfaceForms.has(lowerCased) &&
1226
1281
  !this.isInteger(surfaceForm) && !this.isDouble(surfaceForm) && !this.isPercent(surfaceForm) &&
1227
1282
  !this.isTime(surfaceForm) && !this.isRange(surfaceForm) && !this.isDate(surfaceForm)) {
@@ -1233,9 +1288,9 @@
1233
1288
  return this.cache.get(surfaceForm);
1234
1289
  }
1235
1290
  if (this.patternMatches("^(\\w|Ç|Ş|İ|Ü|Ö)\\.$", surfaceForm)) {
1236
- this.dictionaryTrie.addWord(surfaceForm.toLocaleLowerCase("tr"), new TxtWord_1.TxtWord(surfaceForm.toLocaleLowerCase("tr"), "IS_OA"));
1291
+ this.dictionaryTrie.addWord(lowerCased, new TxtWord_1.TxtWord(lowerCased, "IS_OA"));
1237
1292
  }
1238
- let defaultFsmParse = this.analysis(surfaceForm.toLocaleLowerCase("tr"), this.isProperNoun(surfaceForm));
1293
+ let defaultFsmParse = this.analysis(lowerCased, this.isProperNoun(surfaceForm));
1239
1294
  if (defaultFsmParse.length > 0) {
1240
1295
  let fsmParseList = new FsmParseList_1.FsmParseList(defaultFsmParse);
1241
1296
  if (this.cache != undefined) {
@@ -1249,57 +1304,69 @@
1249
1304
  if (possibleRoot != "") {
1250
1305
  if (possibleRoot.includes("/") || possibleRoot.includes("\\/")) {
1251
1306
  this.dictionaryTrie.addWord(possibleRoot, new TxtWord_1.TxtWord(possibleRoot, "IS_KESIR"));
1252
- fsmParse = this.analysis(surfaceForm.toLocaleLowerCase("tr"), this.isProperNoun(surfaceForm));
1307
+ fsmParse = this.analysis(lowerCased, this.isProperNoun(surfaceForm));
1253
1308
  }
1254
1309
  else {
1255
1310
  if (this.isDate(possibleRoot)) {
1256
1311
  this.dictionaryTrie.addWord(possibleRoot, new TxtWord_1.TxtWord(possibleRoot, "IS_DATE"));
1257
- fsmParse = this.analysis(surfaceForm.toLocaleLowerCase("tr"), this.isProperNoun(surfaceForm));
1312
+ fsmParse = this.analysis(lowerCased, this.isProperNoun(surfaceForm));
1258
1313
  }
1259
1314
  else {
1260
1315
  if (this.patternMatches("^\\d+/\\d+$", possibleRoot)) {
1261
1316
  this.dictionaryTrie.addWord(possibleRoot, new TxtWord_1.TxtWord(possibleRoot, "IS_KESIR"));
1262
- fsmParse = this.analysis(surfaceForm.toLocaleLowerCase("tr"), this.isProperNoun(surfaceForm));
1317
+ fsmParse = this.analysis(lowerCased, this.isProperNoun(surfaceForm));
1263
1318
  }
1264
1319
  else {
1265
1320
  if (this.isPercent(possibleRoot)) {
1266
1321
  this.dictionaryTrie.addWord(possibleRoot, new TxtWord_1.TxtWord(possibleRoot, "IS_PERCENT"));
1267
- fsmParse = this.analysis(surfaceForm.toLocaleLowerCase("tr"), this.isProperNoun(surfaceForm));
1322
+ fsmParse = this.analysis(lowerCased, this.isProperNoun(surfaceForm));
1268
1323
  }
1269
1324
  else {
1270
1325
  if (this.isTime(surfaceForm)) {
1271
1326
  this.dictionaryTrie.addWord(possibleRoot, new TxtWord_1.TxtWord(possibleRoot, "IS_ZAMAN"));
1272
- fsmParse = this.analysis(surfaceForm.toLocaleLowerCase("tr"), this.isProperNoun(surfaceForm));
1327
+ fsmParse = this.analysis(lowerCased, this.isProperNoun(surfaceForm));
1273
1328
  }
1274
1329
  else {
1275
1330
  if (this.isRange(surfaceForm)) {
1276
1331
  this.dictionaryTrie.addWord(possibleRoot, new TxtWord_1.TxtWord(possibleRoot, "IS_RANGE"));
1277
- fsmParse = this.analysis(surfaceForm.toLocaleLowerCase("tr"), this.isProperNoun(surfaceForm));
1332
+ fsmParse = this.analysis(lowerCased, this.isProperNoun(surfaceForm));
1278
1333
  }
1279
1334
  else {
1280
1335
  if (this.isInteger(possibleRoot)) {
1281
1336
  this.dictionaryTrie.addWord(possibleRoot, new TxtWord_1.TxtWord(possibleRoot, "IS_SAYI"));
1282
- fsmParse = this.analysis(surfaceForm.toLocaleLowerCase("tr"), this.isProperNoun(surfaceForm));
1337
+ fsmParse = this.analysis(lowerCased, this.isProperNoun(surfaceForm));
1283
1338
  }
1284
1339
  else {
1285
1340
  if (this.isDouble(possibleRoot)) {
1286
1341
  this.dictionaryTrie.addWord(possibleRoot, new TxtWord_1.TxtWord(possibleRoot, "IS_REELSAYI"));
1287
- fsmParse = this.analysis(surfaceForm.toLocaleLowerCase("tr"), this.isProperNoun(surfaceForm));
1342
+ fsmParse = this.analysis(lowerCased, this.isProperNoun(surfaceForm));
1288
1343
  }
1289
1344
  else {
1290
- if (Word_1.Word.isCapital(possibleRoot)) {
1345
+ if (Word_1.Word.isCapital(possibleRoot) || "QXW".includes(possibleRoot.substring(0, 1))) {
1291
1346
  let newWord = undefined;
1292
- if (this.dictionary.getWord(possibleRoot.toLocaleLowerCase("tr")) != null) {
1293
- this.dictionary.getWord(possibleRoot.toLocaleLowerCase("tr")).addFlag("IS_OA");
1347
+ possibleRootLowerCased = possibleRoot.toLocaleLowerCase("tr");
1348
+ if (this.pronunciations.has(possibleRootLowerCased)) {
1349
+ isRootReplaced = true;
1350
+ pronunciation = this.pronunciations.get(possibleRootLowerCased);
1351
+ if (this.dictionary.getWord(pronunciation) != null) {
1352
+ this.dictionary.getWord(pronunciation).addFlag("IS_OA");
1353
+ }
1354
+ else {
1355
+ newWord = new TxtWord_1.TxtWord(pronunciation, "IS_OA");
1356
+ this.dictionaryTrie.addWord(pronunciation, newWord);
1357
+ }
1358
+ let replacedWord = pronunciation + lowerCased.substring(possibleRootLowerCased.length);
1359
+ fsmParse = this.analysis(replacedWord, this.isProperNoun(surfaceForm));
1294
1360
  }
1295
1361
  else {
1296
- newWord = new TxtWord_1.TxtWord(possibleRoot.toLocaleLowerCase("tr"), "IS_OA");
1297
- this.dictionaryTrie.addWord(possibleRoot.toLocaleLowerCase("tr"), newWord);
1298
- }
1299
- fsmParse = this.analysis(surfaceForm.toLocaleLowerCase("tr"), this.isProperNoun(surfaceForm));
1300
- if (fsmParse.length == 0 && newWord != undefined) {
1301
- newWord.addFlag("IS_KIS");
1302
- fsmParse = this.analysis(surfaceForm.toLocaleLowerCase("tr"), this.isProperNoun(surfaceForm));
1362
+ if (this.dictionary.getWord(possibleRootLowerCased) != null) {
1363
+ this.dictionary.getWord(possibleRootLowerCased).addFlag("IS_OA");
1364
+ }
1365
+ else {
1366
+ newWord = new TxtWord_1.TxtWord(possibleRootLowerCased, "IS_OA");
1367
+ this.dictionaryTrie.addWord(possibleRootLowerCased, newWord);
1368
+ }
1369
+ fsmParse = this.analysis(lowerCased, this.isProperNoun(surfaceForm));
1303
1370
  }
1304
1371
  }
1305
1372
  }
@@ -1312,6 +1379,11 @@
1312
1379
  }
1313
1380
  }
1314
1381
  }
1382
+ if (!isRootReplaced) {
1383
+ for (let parse of fsmParse) {
1384
+ parse.restoreOriginalForm(possibleRootLowerCased, pronunciation);
1385
+ }
1386
+ }
1315
1387
  let fsmParseList = new FsmParseList_1.FsmParseList(fsmParse);
1316
1388
  if (this.cache != undefined && fsmParseList.size() > 0) {
1317
1389
  this.cache.add(surfaceForm, fsmParseList);