nlptoolkit-dictionary 1.0.36__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. nlptoolkit_dictionary-1.0.36/Dictionary/Dictionary.py +229 -0
  2. nlptoolkit_dictionary-1.0.36/Dictionary/ExceptionalWord.py +52 -0
  3. nlptoolkit_dictionary-1.0.36/Dictionary/Pos.py +40 -0
  4. nlptoolkit_dictionary-1.0.36/Dictionary/Trie/Trie.py +88 -0
  5. nlptoolkit_dictionary-1.0.36/Dictionary/Trie/TrieNode.py +75 -0
  6. nlptoolkit_dictionary-1.0.36/Dictionary/Trie/__init__.py +0 -0
  7. nlptoolkit_dictionary-1.0.36/Dictionary/TxtDictionary.py +477 -0
  8. nlptoolkit_dictionary-1.0.36/Dictionary/TxtWord.py +899 -0
  9. nlptoolkit_dictionary-1.0.36/Dictionary/VectorizedDictionary.py +163 -0
  10. nlptoolkit_dictionary-1.0.36/Dictionary/VectorizedWord.py +38 -0
  11. nlptoolkit_dictionary-1.0.36/Dictionary/Word.py +389 -0
  12. nlptoolkit_dictionary-1.0.36/Dictionary/__init__.py +0 -0
  13. nlptoolkit_dictionary-1.0.36/Dictionary/data/__init__.py +0 -0
  14. nlptoolkit_dictionary-1.0.36/Dictionary/data/turkish_dictionary.txt +62120 -0
  15. nlptoolkit_dictionary-1.0.36/Dictionary/data/turkish_misspellings.txt +148932 -0
  16. nlptoolkit_dictionary-1.0.36/Dictionary/data/turkish_morphological_lexicon.txt +48155 -0
  17. nlptoolkit_dictionary-1.0.36/LICENSE +674 -0
  18. nlptoolkit_dictionary-1.0.36/Language/EnglishLanguage.py +7 -0
  19. nlptoolkit_dictionary-1.0.36/Language/Language.py +5 -0
  20. nlptoolkit_dictionary-1.0.36/Language/TurkishLanguage.py +192 -0
  21. nlptoolkit_dictionary-1.0.36/Language/__init__.py +0 -0
  22. nlptoolkit_dictionary-1.0.36/NlpToolkit_Dictionary.egg-info/PKG-INFO +286 -0
  23. nlptoolkit_dictionary-1.0.36/NlpToolkit_Dictionary.egg-info/SOURCES.txt +36 -0
  24. nlptoolkit_dictionary-1.0.36/NlpToolkit_Dictionary.egg-info/dependency_links.txt +1 -0
  25. nlptoolkit_dictionary-1.0.36/NlpToolkit_Dictionary.egg-info/requires.txt +2 -0
  26. nlptoolkit_dictionary-1.0.36/NlpToolkit_Dictionary.egg-info/top_level.txt +3 -0
  27. nlptoolkit_dictionary-1.0.36/PKG-INFO +286 -0
  28. nlptoolkit_dictionary-1.0.36/README.md +275 -0
  29. nlptoolkit_dictionary-1.0.36/Syllibification/Syllable.py +28 -0
  30. nlptoolkit_dictionary-1.0.36/Syllibification/SyllableList.py +80 -0
  31. nlptoolkit_dictionary-1.0.36/Syllibification/__init__.py +0 -0
  32. nlptoolkit_dictionary-1.0.36/setup.cfg +4 -0
  33. nlptoolkit_dictionary-1.0.36/setup.py +20 -0
@@ -0,0 +1,229 @@
1
+ from Dictionary.Word import Word
2
+
3
+
4
class Dictionary:
    """
    A list of words that is queried by binary search using a pluggable comparator.

    The lookup methods assume that ``words`` is already ordered according to ``self.comparator``; nothing in this
    class sorts the list or inserts into it.
    """

    words: list
    filename: str

    def __init__(self, comparator=None):
        """
        Creates an empty dictionary.

        PARAMETERS
        ----------
        comparator : callable
            Two-argument function returning a negative, zero or positive integer for a pair of words. When omitted,
            turkishLowerCaseComparator is used.
        """
        self.words = []
        self.filename = ""
        if comparator is None:
            self.comparator = Dictionary.turkishLowerCaseComparator
        else:
            self.comparator = comparator

    @staticmethod
    def _compareWithAlphabet(wordA: Word, wordB: Word, alphabet: str) -> int:
        """
        Shared implementation of the two Turkish comparators.

        A character's rank is its position in alphabet; every character that does not occur in alphabet shares one
        rank that sorts after all alphabet characters. Names are compared rank by rank, then by length. If they
        are still tied but the names differ (they differ only in characters outside the alphabet), the tie is
        broken by plain code point order so that 0 is returned for identical names only.

        PARAMETERS
        ----------
        wordA : Word
            First word to compare.
        wordB : Word
            Second word to compare.
        alphabet : str
            Characters in collation order.

        RETURNS
        -------
        int
            -1, 0 or 1.
        """
        name_a = wordA.getName()
        name_b = wordB.getName()
        outside_rank = len(alphabet)
        for char_a, char_b in zip(name_a, name_b):
            if char_a == char_b:
                continue
            rank_a = alphabet.find(char_a)
            rank_b = alphabet.find(char_b)
            if rank_a < 0:
                rank_a = outside_rank
            if rank_b < 0:
                rank_b = outside_rank
            if rank_a != rank_b:
                return -1 if rank_a < rank_b else 1
            # Both characters are outside the alphabet: keep scanning, later positions take precedence.
        if len(name_a) != len(name_b):
            return -1 if len(name_a) < len(name_b) else 1
        if name_a == name_b:
            return 0
        # Same length and same ranks everywhere, yet different text: order by code point instead of reporting
        # the two names as equal, which would let the binary search return the wrong entry.
        return -1 if name_a < name_b else 1

    @staticmethod
    def turkishLowerCaseComparator(wordA: Word, wordB: Word) -> int:
        """
        Compares two words using the lowercase Turkish alphabet order. Characters outside that alphabet (including
        uppercase letters) sort after all lowercase Turkish letters.

        PARAMETERS
        ----------
        wordA : Word
            First word to compare.
        wordB : Word
            Second word to compare.

        RETURNS
        -------
        int
            0 if the names are equal; -1 if wordA sorts before wordB; 1 if wordA sorts after wordB.
        """
        return Dictionary._compareWithAlphabet(wordA, wordB, "abcçdefgğhıijklmnoöprsştuüvyz")

    @staticmethod
    def turkishIgnoreCaseComparator(wordA: Word, wordB: Word) -> int:
        """
        Compares two words using a Turkish alphabet order in which each lowercase letter is immediately followed by
        its uppercase form. Note that a lowercase letter and its uppercase form still compare as different.

        PARAMETERS
        ----------
        wordA : Word
            First word to compare.
        wordB : Word
            Second word to compare.

        RETURNS
        -------
        int
            0 if the names are equal; -1 if wordA sorts before wordB; 1 if wordA sorts after wordB.
        """
        return Dictionary._compareWithAlphabet(
            wordA, wordB, "aAbBcCçÇdDeEfFgGğĞhHıIiİjJkKlLmMnNoOöÖpPrRsSşŞtTuUüÜvVyYzZ")

    def getWord(self, name: str) -> Word:
        """
        The getWord method takes a String name as an input and performs binary search within words list. If the
        resulting position is greater than or equal to 0, it returns the item at that position, None otherwise.

        PARAMETERS
        ----------
        name : str
            String input.

        RETURNS
        -------
        Word
            the item at found index of words list, None if cannot be found.
        """
        word = Word(name)
        middle = self.__getPosition(word)
        if middle >= 0:
            return self.words[middle]
        return None

    def getWordIndex(self, name: str) -> int:
        """
        The getWordIndex method takes a String name as an input and performs binary search within words list. If
        the resulting position is greater than or equal to 0, it returns that position, -1 otherwise.

        PARAMETERS
        ----------
        name : str
            String input.

        RETURNS
        -------
        int
            found index of words list, -1 if cannot be found.
        """
        word = Word(name)
        middle = self.__getPosition(word)
        if middle >= 0:
            return middle
        return -1

    def removeWord(self, name: str):
        """
        RemoveWord removes a word with the given name. Nothing happens when the name is not found.

        PARAMETERS
        ----------
        name : str
            Name of the word to be removed.
        """
        index = self.getWordIndex(name)
        if index != -1:
            self.words.pop(index)

    def size(self) -> int:
        """
        The size method returns the size of the words list.

        RETURNS
        -------
        int
            The size of the words list.
        """
        return len(self.words)

    def getWordWithIndex(self, index: int) -> Word:
        """
        The getWordWithIndex method which takes an index as an input and returns the value at given index of words
        list.

        PARAMETERS
        ----------
        index : int
            index to get the value.

        RETURNS
        -------
        Word
            The value at given index of words list.
        """
        return self.words[index]

    def longestWordSize(self) -> int:
        """
        The longestWordSize method returns the length of the longest word name in the words list.

        RETURNS
        -------
        int
            The maximum name length, 0 when the words list is empty.
        """
        return max((len(word.getName()) for word in self.words), default=0)

    def __getPosition(self, word: Word) -> int:
        """
        Checks if a given word exists in the dictionary by performing a binary search on the words list.

        PARAMETERS
        ----------
        word : Word
            Searched word.

        RETURNS
        -------
        int
            the index of the searched word, if it is contained in the words list; otherwise,
            (-(insertion point) - 1). The insertion point is defined as the point at which the word would be
            inserted into the words list.
        """
        lo = 0
        hi = len(self.words) - 1
        while lo <= hi:
            mid = (lo + hi) // 2
            # Evaluate the comparator once per probe; the result decides all three branches.
            order = self.comparator(self.words[mid], word)
            if order < 0:
                lo = mid + 1
            elif order > 0:
                hi = mid - 1
            else:
                return mid
        return -(lo + 1)

    def getWordStartingWith(self, _hash: str) -> int:
        """
        The getWordStartingWith method takes a String hash as an input and performs binary search within words
        list. If the word is found, it returns its index; otherwise it returns the insertion point, i.e. the index
        at which the word would be inserted.

        PARAMETERS
        ----------
        _hash : str
            String input.

        RETURNS
        -------
        int
            Found index of words list, insertion point if cannot be found.
        """
        word = Word(_hash)
        middle = self.__getPosition(word)
        if middle < 0:
            return -middle - 1
        else:
            return middle

    def __repr__(self):
        return f"{self.words}"
@@ -0,0 +1,52 @@
1
+ from Dictionary.Word import Word
2
+ from Dictionary.Pos import Pos
3
+
4
+
5
class ExceptionalWord(Word):
    """
    A Word that additionally stores a root string and a part of speech.
    """

    __root: str
    __pos: Pos

    def __init__(self, name: str, root: str, pos: Pos):
        """
        Initialises the Word base class with the given name and stores the root and the part of speech.

        PARAMETERS
        ----------
        name : str
            Name handed to the Word constructor.
        root : str
            Root stored for this word.
        pos : Pos
            Part of speech stored for this word.
        """
        super().__init__(name)
        self.__pos = pos
        self.__root = root

    def getRoot(self) -> str:
        """
        Accessor for the stored root.

        RETURNS
        -------
        str
            The root given at construction time.
        """
        return self.__root

    def getPos(self) -> Pos:
        """
        Accessor for the stored part of speech.

        RETURNS
        -------
        Pos
            The part of speech given at construction time.
        """
        return self.__pos

    def __repr__(self):
        # self.name is not assigned in this class; presumably set by Word.__init__ - confirm against Word.
        return f"{self.name} {self.__root} {self.__pos}"
@@ -0,0 +1,40 @@
1
+ from enum import Enum, auto
2
+
3
+
4
class Pos(Enum):
    """
    Parts of speech.
    """

    # Member values are assigned by auto() in declaration order.
    ADJECTIVE = auto()     # Adjective.
    NOUN = auto()          # Noun.
    VERB = auto()          # Verb.
    ADVERB = auto()        # Adverb.
    CONJUNCTION = auto()   # Conjunction.
    INTERJECTION = auto()  # Interjection.
    PREPOSITION = auto()   # Preposition.
    PRONOUN = auto()       # Pronoun.
@@ -0,0 +1,88 @@
1
+ from Dictionary.Trie.TrieNode import TrieNode
2
+ from Dictionary.Word import Word
3
+ from Dictionary.TxtWord import TxtWord
4
+
5
+
6
class Trie:
    """
    Thin wrapper around a root TrieNode offering insertion and two lookups by character path.
    """

    __root_node: TrieNode

    def __init__(self):
        """
        Creates the trie with a fresh, empty TrieNode as its root.
        """
        self.__root_node = TrieNode()

    def addWord(self, word: str, root: Word):
        """
        Delegates to the root node's addWord with the given word and root.

        PARAMETERS
        ----------
        word : str
            String whose characters form the path in the trie.
        root : Word
            Word handed to the root node together with the path.
        """
        self.__root_node.addWord(word, root)

    def getWordsWithPrefix(self, surfaceForm: str) -> set[Word]:
        """
        Walks down the trie one character of surfaceForm at a time and gathers the words of every node reached.
        The root node's own words are not included, and the walk stops at the first character that has no child.

        PARAMETERS
        ----------
        surfaceForm : str
            String whose characters are followed from the root.

        RETURNS
        -------
        set
            Union of the word sets of all nodes visited below the root.
        """
        collected = set()
        node = self.__root_node
        for ch in surfaceForm:
            node = node.getChild(ch)
            if node is None:
                break
            collected.update(node.getWords())
        return collected

    def getCompundWordStartingWith(self, _hash: str) -> TxtWord:
        """
        Follows the characters of _hash from the root. If some character has no child, None is returned.
        Otherwise the words of the node reached are scanned and the first one whose isPortmanteau() is true is
        returned.

        PARAMETERS
        ----------
        _hash : str
            String whose characters are followed from the root.

        RETURNS
        -------
        TxtWord
            A portmanteau word stored at the reached node, None if the path does not exist or the node holds no
            such word.
        """
        node = self.__root_node
        for ch in _hash:
            node = node.getChild(ch)
            if node is None:
                return None
        # The loop returns as soon as a child is missing, so node is a real TrieNode here.
        return next((candidate for candidate in node.getWords() if candidate.isPortmanteau()), None)
@@ -0,0 +1,75 @@
1
+ from __future__ import annotations
2
+ from Dictionary.Word import Word
3
+
4
+
5
class TrieNode:
    """
    A node of the trie: a mapping from a character to a child node plus the set of words stored at this node.
    """

    __children: dict[str, TrieNode]
    __words: set[Word]

    def __init__(self):
        """
        A constructor of TrieNode class which creates an empty children dictionary and an empty words set.
        """
        self.__children = {}
        self.__words = set()

    def addWord(self,
                word: str,
                root: Word,
                index=0):
        """
        The addWord method takes a String word, a Word root and an index as inputs. Starting at this node, it
        follows (creating nodes where necessary) the characters of word from position index onwards and adds root
        to the words set of the node reached when index equals the length of word.

        The walk is iterative, so the length of word is not limited by the interpreter's recursion limit.

        PARAMETERS
        ----------
        word : str
            String input.
        root : Word
            Word input to add.
        index : int
            Position in word at which the walk starts.

        RAISES
        ------
        IndexError
            If index is greater than the length of word.
        """
        node = self
        while index != len(word):
            # Indexing (rather than slicing) keeps the IndexError for an index past the end of word.
            ch = word[index]
            child = node.__children.get(ch)
            if child is None:
                child = TrieNode()
                node.__children[ch] = child
            node = child
            index += 1
        node.__words.add(root)

    def getChild(self, ch: str) -> TrieNode:
        """
        The getChild method takes a character and gets its corresponding value from children dictionary.

        PARAMETERS
        ----------
        ch : str
            Character input.

        RETURNS
        -------
        TrieNode
            the value from children dictionary, None if there is no child for the character.
        """
        return self.__children.get(ch)

    def getWords(self) -> set[Word]:
        """
        The getWords method returns the words set.

        RETURNS
        -------
        set
            the words set of this node (the set itself, not a copy).
        """
        return self.__words