pre-bpe-morph-tr 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pre_bpe_morph_tr-0.1.0/LICENSE.md +32 -0
- pre_bpe_morph_tr-0.1.0/PKG-INFO +7 -0
- pre_bpe_morph_tr-0.1.0/README.md +19 -0
- pre_bpe_morph_tr-0.1.0/custom_tokenizer/__init__.py +0 -0
- pre_bpe_morph_tr-0.1.0/custom_tokenizer/detokenizer.py +95 -0
- pre_bpe_morph_tr-0.1.0/custom_tokenizer/dictionary.py +130 -0
- pre_bpe_morph_tr-0.1.0/custom_tokenizer/morph_tokenizer.py +144 -0
- pre_bpe_morph_tr-0.1.0/custom_tokenizer/utils.py +44 -0
- pre_bpe_morph_tr-0.1.0/custom_tokenizer/word_generator.py +267 -0
- pre_bpe_morph_tr-0.1.0/pre_bpe_morph_tr.egg-info/PKG-INFO +7 -0
- pre_bpe_morph_tr-0.1.0/pre_bpe_morph_tr.egg-info/SOURCES.txt +20 -0
- pre_bpe_morph_tr-0.1.0/pre_bpe_morph_tr.egg-info/dependency_links.txt +1 -0
- pre_bpe_morph_tr-0.1.0/pre_bpe_morph_tr.egg-info/requires.txt +2 -0
- pre_bpe_morph_tr-0.1.0/pre_bpe_morph_tr.egg-info/top_level.txt +2 -0
- pre_bpe_morph_tr-0.1.0/pyproject.toml +14 -0
- pre_bpe_morph_tr-0.1.0/setup.cfg +4 -0
- pre_bpe_morph_tr-0.1.0/tests/test_decode_and_encode.py +147 -0
- pre_bpe_morph_tr-0.1.0/tests/test_generation.py +83 -0
- pre_bpe_morph_tr-0.1.0/tests/test_morph_tokenizer.py +60 -0
- pre_bpe_morph_tr-0.1.0/tests/test_sentence_tokenizer.py +35 -0
- pre_bpe_morph_tr-0.1.0/zemberek/__init__.py +22 -0
- pre_bpe_morph_tr-0.1.0/zemberek/examples.py +102 -0
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
Python port of the framework,
|
|
2
|
+
|
|
3
|
+
Copyright 2020 Loodos
|
|
4
|
+
|
|
5
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
6
|
+
you may not use this file except in compliance with the License.
|
|
7
|
+
You may obtain a copy of the License at
|
|
8
|
+
|
|
9
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
|
10
|
+
|
|
11
|
+
Unless required by applicable law or agreed to in writing, software
|
|
12
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
13
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
14
|
+
See the License for the specific language governing permissions and
|
|
15
|
+
limitations under the License.
|
|
16
|
+
|
|
17
|
+
=======================================================================
|
|
18
|
+
Original framework written in Java,
|
|
19
|
+
|
|
20
|
+
Copyright 2018 Ahmet A. Akın, Mehmet D. Akın
|
|
21
|
+
|
|
22
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
23
|
+
you may not use this file except in compliance with the License.
|
|
24
|
+
You may obtain a copy of the License at
|
|
25
|
+
|
|
26
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
|
27
|
+
|
|
28
|
+
Unless required by applicable law or agreed to in writing, software
|
|
29
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
30
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
31
|
+
See the License for the specific language governing permissions and
|
|
32
|
+
limitations under the License.
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
This project is derived from the Python port of Zemberek by Loodos, which is itself based on the original Zemberek Java project by Ahmet A. Akın and Mehmet D. Akın.
|
|
2
|
+
|
|
3
|
+
# Goal of pre_bpe_morph
|
|
4
|
+
This package recognizes verbs, nouns, and named entities together with their suffixes. Before BPE is applied, it strips the suffixes from each word and replaces them with dedicated special tokens so that they stay identifiable. It also inserts a word-type token (Verb/Noun/NamedEntity) before each run of suffix tokens. The goal is to simplify Turkish grammar rules for small language models.
|
|
5
|
+
|
|
6
|
+
## Usage
|
|
7
|
+
```python
|
|
8
|
+
from custom_tokenizer.morph_tokenizer import MorphTokenizer
|
|
9
|
+
tokenizer=MorphTokenizer("<|", "|>")
|
|
10
|
+
|
|
11
|
+
tokenizer.tokenize("gülüveriniz")
|
|
12
|
+
# response: ['g', 'ü', 'l', '<|Verb|>', '<|Hastily|>', '<|Req|>']
|
|
13
|
+
|
|
14
|
+
tokenizer.detokenize(['k', 'o', 'ş', '<|Verb|>', '<|Fut|>', '<|A1sg|>'])
|
|
15
|
+
# response: "koşacağım"
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
## Developed for language model training
|
|
19
|
+
Since this is developed for LM training, I tried to avoid preprocessing text (like converting "hal" to "hâl", or lowercasing words). The text should remain unchanged when encoded and then decoded.
|
|
File without changes
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
from typing import List
|
|
2
|
+
from collections import OrderedDict
|
|
3
|
+
|
|
4
|
+
from custom_tokenizer.word_generator import CustomWordGenerator
|
|
5
|
+
|
|
6
|
+
class MorphDetokenizer:
    """Turn a list of character and special tokens back into plain text.

    A token is "special" when it is wrapped in ``tk_start`` / ``tk_end``.
    A special token whose inner text is one of ``pos_tags`` marks the word
    type of the characters collected before it; the special tokens that
    follow it are the suffix ids handed to the word generator.
    """

    def __init__(self, tk_start: str = "<|", tk_end: str = "|>", cache_limit: int = 100):
        """Store the token markers and create the generator and its cache.

        Args:
            tk_start: Text that opens a special token.
            tk_end: Text that closes a special token.
            cache_limit: Number of generated words kept in the cache before
                the least recently used entry is evicted.
        """
        self.tk_start = tk_start
        self.tk_end = tk_end
        self.tk_start_len = len(tk_start)
        self.tk_end_len = len(tk_end)
        self.pos_tags = {"Noun", "Verb", "NamedEntity"}
        self.generator = CustomWordGenerator()
        # Maps (root, word_type, suffix-tuple) -> generated word, oldest first.
        self.generation_cache = OrderedDict()
        self.cache_limit = cache_limit

    def _is_special_token(self, token: str) -> bool:
        """Return True when ``token`` is wrapped in the start and end markers."""
        # The length check rejects tokens such as "<|>" in which the two
        # markers would have to overlap to both match.
        return (
            len(token) >= self.tk_start_len + self.tk_end_len
            and token.startswith(self.tk_start)
            and token.endswith(self.tk_end)
        )

    def _strip_token(self, token: str) -> str:
        """Return the text between the start and end markers of ``token``."""
        # An explicit end index is used because ``token[a:-0]`` would be
        # empty when the end marker has length zero.
        return token[self.tk_start_len : len(token) - self.tk_end_len]

    def _is_word_char(self, token: str) -> bool:
        """Return True when ``token`` is alphanumeric and thus part of a word."""
        return token.isalnum()

    def _collect_suffixes(self, tokens: List[str], start_idx: int) -> tuple[List[str], int]:
        """Collect all contiguous suffix tokens starting at start_idx.

        Collection stops at the first token that is not special or whose
        inner text is a word-type tag.

        Returns:
            The suffix ids (markers stripped) and the index of the first
            token that was not consumed.
        """
        suffixes = []
        n = len(tokens)
        i = start_idx
        while i < n and self._is_special_token(tokens[i]):
            suffix_inner = self._strip_token(tokens[i])
            # A word-type tag belongs to the next word, not to this suffix run.
            if suffix_inner in self.pos_tags:
                break
            suffixes.append(suffix_inner)
            i += 1
        return suffixes, i

    def _reconstruct_and_append(self, word_type: str, suffixes: List[str], current_chars: List[str], result_parts: List[str]):
        """Reconstruct the word from the root and suffixes and append it to result_parts.

        Results are memoised in ``generation_cache``; a hit is moved to the
        most-recent end, and the oldest entry is dropped once the cache grows
        past ``cache_limit``.
        """
        root = "".join(current_chars)
        cache_key = (root, word_type, tuple(suffixes))

        if cache_key in self.generation_cache:
            reconstructed = self.generation_cache[cache_key]
            self.generation_cache.move_to_end(cache_key)
        else:
            reconstructed = self.generator.generate_word(root, word_type, suffixes)
            self.generation_cache[cache_key] = reconstructed
            if len(self.generation_cache) > self.cache_limit:
                self.generation_cache.popitem(last=False)

        result_parts.append(reconstructed)

    def detokenize(self, tokens: List[str]) -> str:
        """Rebuild the text represented by ``tokens``.

        Alphanumeric tokens are accumulated as the current root.  A special
        token ends the root: the root and the following suffix run are passed
        to the generator.  Any other token is copied to the output unchanged.
        """
        result_parts = []
        current_chars = []

        i = 0
        n = len(tokens)
        while i < n:
            token = tokens[i]

            if self._is_special_token(token):
                inner = self._strip_token(token)
                suffixes, i = self._collect_suffixes(tokens, i + 1)
                if inner in self.pos_tags:
                    # POS tag followed by (possibly zero) suffixes.
                    word_type = inner
                else:
                    # Suffix token without a preceding POS tag: it is itself
                    # the first suffix and the word is treated as a noun.
                    word_type = "Noun"
                    suffixes.insert(0, inner)
                self._reconstruct_and_append(word_type, suffixes, current_chars, result_parts)
                current_chars = []
                continue

            if self._is_word_char(token):
                current_chars.append(token)
            else:
                # Whitespace or punctuation: flush the pending root as-is.
                if current_chars:
                    result_parts.append("".join(current_chars))
                    current_chars = []
                result_parts.append(token)
            i += 1

        # Append any remaining characters
        if current_chars:
            result_parts.append("".join(current_chars))

        return "".join(result_parts)
|
|
95
|
+
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
from zemberek.morphology.morphotactics.turkish_morphotactics import TurkishMorphotactics, get_morpheme_map
|
|
2
|
+
|
|
3
|
+
# Referencing the class makes sure its defining module has been executed, so
# the morpheme registry is populated before it is read below.
_ = TurkishMorphotactics

morpheme_map = get_morpheme_map()

# Morpheme ids in a fixed order; the position of an id is its integer index.
idx2morpheme_id = [
    # word classes
    "Root", "Noun", "Adj", "Verb", "Pron", "Adv", "Conj", "Punc", "Ques",
    "Postp", "Det", "Num", "Dup", "Interj",
    # agreement
    "A1sg", "A2sg", "A3sg", "A1pl", "A2pl", "A3pl",
    # possession
    "Pnon", "P1sg", "P2sg", "P3sg", "P1pl", "P2pl", "P3pl",
    # case
    "Nom", "Dat", "Acc", "Abl", "Loc", "Ins", "Gen", "Equ",
    # derivation
    "Dim", "Ness", "With", "Without", "Related", "JustLike", "Rel", "Agt",
    "Become", "Acquire", "Ly",
    "Caus", "Recip", "Reflex", "Able", "Pass",
    "Inf1", "Inf2", "Inf3", "ActOf",
    "PastPart", "NarrPart", "FutPart", "PresPart", "AorPart",
    "NotState", "FeelLike", "EverSince", "Repeat", "Almost", "Hastily",
    "Stay", "Start",
    "AsIf", "While", "When", "SinceDoingSo", "AsLongAs", "ByDoingSo",
    "Adamantly", "AfterDoingSo", "WithoutHavingDoneSo",
    "WithoutBeingAbleToHaveDoneSo",
    "Zero", "Cop", "Neg", "Unable",
    # tense / mood
    "Pres", "Past", "Narr", "Cond", "Prog1", "Prog2", "Aor", "Fut",
    "Imp", "Req", "Opt", "Desr", "Neces",
    "Intrj",
]

# Reverse lookup: morpheme id -> index in idx2morpheme_id.
morpheme_id2idx = dict(zip(idx2morpheme_id, range(len(idx2morpheme_id))))

# Index -> morpheme object taken from the registry.
idx2morpheme = dict(enumerate(morpheme_map[name] for name in idx2morpheme_id))

# Ids that name a word class rather than a suffix.
wordPos_ids = {
    "Root", "Noun", "Adj", "Verb", "Pron", "Adv", "Conj",
    "Punc", "Ques", "Postp", "Det", "Num", "Dup", "Interj",
}
|
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
from typing import List
|
|
2
|
+
from zemberek.morphology.analysis.word_analysis import WordAnalysis
|
|
3
|
+
from zemberek import TurkishSentenceExtractor
|
|
4
|
+
from custom_tokenizer.utils import get_morphology, match_capitilization, is_morph_analysis_ok
|
|
5
|
+
|
|
6
|
+
from zemberek.core.turkish import PrimaryPos, SecondaryPos
|
|
7
|
+
from custom_tokenizer.detokenizer import MorphDetokenizer
|
|
8
|
+
from zemberek.core.turkish.root_attribute import RootAttribute
|
|
9
|
+
|
|
10
|
+
class MorphTokenizer:
    """Split text into characters plus special tokens for word type and suffixes.

    Words that the morphological analyser resolves into a stem and at least
    one non-empty suffix are emitted as the stem's characters, an optional
    word-type token and one token per suffix.  Everything else is emitted
    character by character.
    """

    def __init__(self, tk_start, tk_end):
        """Create a tokenizer whose special tokens are wrapped in the given markers.

        Args:
            tk_start: Text placed before a special token's name.
            tk_end: Text placed after a special token's name.
        """
        self.tk_start = tk_start
        self.tk_end = tk_end
        self.morphology = get_morphology()
        self.extractor = TurkishSentenceExtractor()
        self.special_token = lambda s: tk_start + s + tk_end
        # Created on the first detokenize() call and then reused, so that the
        # detokenizer's generation cache survives between calls.
        self._detokenizer = None

    def tokenize(self, text: str) -> List[str]:
        """Tokenize ``text`` sentence by sentence, keeping the text between sentences."""
        sentences = self.extractor.from_paragraph(text)

        all_tokens = []
        current_pos = 0

        for sentence_text in sentences:
            # Find the sentence in the original text to capture preceding whitespace/newlines
            start_idx = text.find(sentence_text, current_pos)
            if start_idx < 0:
                # The sentence does not occur verbatim after current_pos.
                # Assume it starts right here instead of slicing with -1,
                # which would corrupt both the prefix and the offset.
                start_idx = current_pos
            all_tokens.extend(text[current_pos:start_idx])

            # Tokenize the sentence itself
            all_tokens.extend(self.__tokenize_sentence(sentence_text))

            current_pos = start_idx + len(sentence_text)

        # Add any trailing whitespace after the last sentence
        all_tokens.extend(text[current_pos:])

        return all_tokens

    def detokenize(self, tokens: List[str]) -> str:
        """Rebuild text from ``tokens`` using a MorphDetokenizer with this tokenizer's markers."""
        detokenizer = getattr(self, "_detokenizer", None)
        # Rebuild only when missing or when the markers were changed on this
        # instance after the detokenizer was created.
        if (
            detokenizer is None
            or detokenizer.tk_start != self.tk_start
            or detokenizer.tk_end != self.tk_end
        ):
            detokenizer = MorphDetokenizer(tk_start=self.tk_start, tk_end=self.tk_end)
            self._detokenizer = detokenizer
        return detokenizer.detokenize(tokens)

    def __tokenize_sentence(self, sentence: str) -> List[str]:
        """Analyse one sentence and return its flat token list."""
        after = self.morphology.analyze_sentence(sentence)

        whitespaces = self.__collect_whitespaces(sentence, after)
        words = [self.__get_word_tokens(word_analysis) for word_analysis in after]

        split_by_words = self.__reconstruct_sentence(words, whitespaces)
        return [token for word in split_by_words for token in word]

    def __get_word_type(self, item, original_surface: str) -> "str | None":
        """Return "Verb", "Noun", "NamedEntity", or None for unknown/punctuation items."""
        if item.primary_pos == PrimaryPos.Verb:
            return "Verb"

        # Check for numeric words recognized by Zemberek
        numeric_secondary_pos = {
            SecondaryPos.Cardinal, SecondaryPos.Clock, SecondaryPos.Date,
            SecondaryPos.Ordinal, SecondaryPos.Percentage, SecondaryPos.Ratio,
            SecondaryPos.Real, SecondaryPos.Distribution, SecondaryPos.Range
        }
        if item.secondary_pos in numeric_secondary_pos:
            return "Noun"

        # NamedEntity is anything with an apostrophe (that isn't a verb or numeric)
        if "'" in original_surface:
            return "NamedEntity"

        if item.primary_pos not in [PrimaryPos.Unknown, PrimaryPos.Punctuation]:
            return "Noun"
        return None

    def __get_word_tokens(self, word_analysis: "WordAnalysis") -> List[str]:
        """Return the tokens for one analysed word.

        The word is returned as plain characters when it fails
        ``is_morph_analysis_ok``, has no analysis, or has no non-empty suffix.
        Otherwise the first analysis is used.
        """
        original_surface = word_analysis.inp

        if not is_morph_analysis_ok(original_surface) or not word_analysis.analysis_results:
            return list(original_surface)  # declare word as unknown

        best = word_analysis.analysis_results[0]
        item = best.item

        word_type = self.__get_word_type(item, original_surface)

        tokens = []
        suffixes = []
        for i, m_data in enumerate(best.morpheme_data_list):
            if i == 0:
                # Use the lemma/root form for the stem rather than the surface allomorph
                if (not item.is_unknown()) and (RootAttribute.CompoundP3sg not in item.attributes):
                    stem = item.normalized_lemma()
                else:
                    stem = m_data.surface
                # For apostrophe words the text before the apostrophe is the stem.
                if "'" in original_surface and not item.is_unknown():
                    stem = original_surface.split("'")[0]
                tokens.extend(match_capitilization(original_surface, stem))
            elif len(m_data.surface) > 0:
                suffixes.append(self.special_token(m_data.morpheme.id_))

        if not suffixes:
            # Nothing to abstract away: keep the surface form untouched.
            return list(original_surface)

        if word_type:
            tokens.append(self.special_token(word_type))
        tokens.extend(suffixes)
        return tokens

    def __reconstruct_sentence(self, words: List[List[str]], whitespaces: List[str]) -> List[List[str]]:
        """Interleave gaps and words as gap0, word0, gap1, word1, ...; empty gaps are skipped."""
        sentence = []
        for i in range(len(words) + len(whitespaces)):
            slot, is_word = divmod(i, 2)
            if is_word:
                if slot < len(words):
                    sentence.append(words[slot])
            elif slot < len(whitespaces) and whitespaces[slot]:
                sentence.append(list(whitespaces[slot]))
        return sentence

    def __collect_whitespaces(self, sentence: str, analyses: List["WordAnalysis"]) -> List[str]:
        """Return the text before each analysed word plus the text after the last one.

        The result always has ``len(analyses) + 1`` entries.
        """
        whitespaces = []
        current_pos = 0

        for wa in analyses:
            original_surface = wa.inp
            # Find the start of this word in the original sentence
            start_idx = sentence.find(original_surface, current_pos)
            if start_idx < 0:
                # Word not found verbatim; assume it starts at the current
                # offset rather than slicing with -1.
                start_idx = current_pos

            # The gap before this word
            whitespaces.append(sentence[current_pos:start_idx])

            # Move current_pos past the word
            current_pos = start_idx + len(original_surface)

        # Add the trailing characters (if any)
        whitespaces.append(sentence[current_pos:])

        return whitespaces
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
from zemberek import TurkishMorphology
|
|
2
|
+
from zemberek.core.turkish.turkish_alphabet import TurkishAlphabet
|
|
3
|
+
import logging
|
|
4
|
+
|
|
5
|
+
# Module-level caches: each holds None until the matching get_*() function
# below creates the object on first use, after which it is reused.
_morphotactics = None
_morphology = None
|
|
7
|
+
|
|
8
|
+
def get_morphotactics():
    """Return the shared TurkishMorphotactics, building it on the first call.

    The instance is built from the default root lexicon and stored in the
    module-level ``_morphotactics`` so later calls return the same object.
    """
    global _morphotactics
    if _morphotactics is not None:
        return _morphotactics

    # Imported here rather than at module import time.
    from zemberek.morphology.lexicon.root_lexicon import RootLexicon
    from zemberek.morphology.morphotactics.turkish_morphotactics import TurkishMorphotactics

    _morphotactics = TurkishMorphotactics(RootLexicon.get_default())
    return _morphotactics
|
|
16
|
+
|
|
17
|
+
def get_morphology():
    """Return the shared TurkishMorphology, creating it with defaults on the first call."""
    global _morphology
    if _morphology is not None:
        return _morphology
    _morphology = TurkishMorphology.create_with_defaults()
    return _morphology
|
|
22
|
+
|
|
23
|
+
def match_capitilization(ref: str, target: str) -> str:
    """Return ``target`` with its capitalisation adjusted to follow ``ref``.

    * Either string empty: ``target`` is returned unchanged.
    * ``ref`` is all upper case and longer than one character: the whole of
      ``target`` is upper-cased.
    * ``ref`` starts with an upper-case character: only the first character
      of ``target`` is upper-cased.
    * Otherwise only the first character of ``target`` is lower-cased.

    Case conversion goes through the Turkish alphabet's translation maps
    before the built-in ``upper()`` / ``lower()``.
    """
    if not (ref and target):
        return target

    turkish = TurkishAlphabet.INSTANCE
    head, tail = target[0], target[1:]

    if len(ref) > 1 and ref.isupper():
        return target.translate(turkish.upper_map).upper()
    if ref[0].isupper():
        return head.translate(turkish.upper_map).upper() + tail
    return head.translate(turkish.lower_map).lower() + tail
|
|
36
|
+
|
|
37
|
+
def is_morph_analysis_ok(word: str) -> bool:
    """Decide from its spelling whether ``word`` should be morphologically split.

    The first character is ignored when looking for apostrophes.  The word is
    rejected when the rest contains more than one apostrophe, when the word
    is all upper case and contains an apostrophe, or when the part after the
    last apostrophe (the whole rest if there is none) has an upper-case
    character.
    """
    segments = word[1:].split("'")
    if len(segments) > 2:
        return False
    if word.isupper():
        return len(segments) == 1
    return not any(ch.isupper() for ch in segments[-1])
|
|
@@ -0,0 +1,267 @@
|
|
|
1
|
+
from custom_tokenizer.utils import get_morphotactics, match_capitilization
|
|
2
|
+
from zemberek.morphology.morphotactics.morpheme import Morpheme
|
|
3
|
+
from zemberek.morphology.lexicon.dictionary_item import DictionaryItem
|
|
4
|
+
from zemberek.morphology.morphotactics.stem_transition import StemTransition
|
|
5
|
+
from zemberek.core.turkish.root_attribute import RootAttribute
|
|
6
|
+
from zemberek.core.turkish import PhoneticAttribute
|
|
7
|
+
from zemberek.morphology.analysis.attributes_helper import AttributesHelper
|
|
8
|
+
from zemberek.core.turkish.primary_pos import PrimaryPos
|
|
9
|
+
from zemberek.core.turkish.secondary_pos import SecondaryPos
|
|
10
|
+
from zemberek.core.turkish.turkish_alphabet import TurkishAlphabet
|
|
11
|
+
from zemberek.morphology.generator.word_generator import WordGenerator
|
|
12
|
+
from custom_tokenizer.dictionary import morpheme_map
|
|
13
|
+
from zemberek.morphology.analysis.tr.pronunciation_guesser import PronunciationGuesser
|
|
14
|
+
import logging
|
|
15
|
+
from typing import List, Union, Literal, Set
|
|
16
|
+
|
|
17
|
+
class CustomWordGenerator:
|
|
18
|
+
def __init__(self):
|
|
19
|
+
self.guesser = PronunciationGuesser()
|
|
20
|
+
self.alphabet = TurkishAlphabet.INSTANCE
|
|
21
|
+
self.morphotactics = get_morphotactics()
|
|
22
|
+
|
|
23
|
+
def generate_word(self, root: str, word_type: Literal["Noun", "Verb", "NamedEntity"], suffixes: Union[List[Morpheme], List[str]]) -> str:
|
|
24
|
+
"""Generate a word form using Zemberek's WordGenerator.
|
|
25
|
+
|
|
26
|
+
Handles dictionary items, unknown words, and NamedEntity special logic.
|
|
27
|
+
"""
|
|
28
|
+
if not suffixes:
|
|
29
|
+
return root
|
|
30
|
+
|
|
31
|
+
suffix_objs = [s if isinstance(s, Morpheme) else morpheme_map[s] for s in suffixes]
|
|
32
|
+
p_pos, s_pos = self._get_pos_enums(word_type)
|
|
33
|
+
|
|
34
|
+
lexicon = self.morphotactics.get_root_lexicon()
|
|
35
|
+
matching_items = self._find_lexicon_items(root, word_type, p_pos, lexicon)
|
|
36
|
+
candidates = self._get_stem_candidates(root, matching_items, word_type, p_pos, s_pos)
|
|
37
|
+
|
|
38
|
+
generator = WordGenerator(self.morphotactics)
|
|
39
|
+
results = generator.generate(morphemes=tuple(suffix_objs), candidates=tuple(candidates))
|
|
40
|
+
|
|
41
|
+
if results:
|
|
42
|
+
return self._apply_post_processing(root, results[0].surface, word_type)
|
|
43
|
+
|
|
44
|
+
forced_result = self.force_suffixes_on_word(root, word_type == "NamedEntity", suffix_objs)
|
|
45
|
+
return match_capitilization(root, forced_result)
|
|
46
|
+
|
|
47
|
+
def _get_pos_enums(self, word_type: str) -> tuple[PrimaryPos, SecondaryPos]:
|
|
48
|
+
if word_type in ["Noun", "NamedEntity"]:
|
|
49
|
+
p_pos = PrimaryPos.Noun
|
|
50
|
+
elif word_type == "Adj":
|
|
51
|
+
p_pos = PrimaryPos.Adjective
|
|
52
|
+
else: # Verb and others
|
|
53
|
+
p_pos = PrimaryPos.Verb
|
|
54
|
+
|
|
55
|
+
s_pos = SecondaryPos.ProperNoun if word_type == "NamedEntity" else SecondaryPos.None_
|
|
56
|
+
return p_pos, s_pos
|
|
57
|
+
|
|
58
|
+
def _filter_lexicon_items(self, items: List[DictionaryItem], word_type: str, p_pos: PrimaryPos = None) -> List[DictionaryItem]:
|
|
59
|
+
if p_pos is not None:
|
|
60
|
+
items = [item for item in items if item.primary_pos == p_pos]
|
|
61
|
+
else:
|
|
62
|
+
items = [item for item in items if item.primary_pos != PrimaryPos.Verb]
|
|
63
|
+
|
|
64
|
+
if word_type == "Noun":
|
|
65
|
+
items = [item for item in items if item.secondary_pos != SecondaryPos.ProperNoun]
|
|
66
|
+
elif word_type == "NamedEntity":
|
|
67
|
+
proper_items = [item for item in items if item.secondary_pos == SecondaryPos.ProperNoun]
|
|
68
|
+
if proper_items:
|
|
69
|
+
items = proper_items
|
|
70
|
+
|
|
71
|
+
return items
|
|
72
|
+
|
|
73
|
+
def _find_lexicon_items(self, root: str, word_type: str, p_pos: PrimaryPos, lexicon) -> List[DictionaryItem]:
|
|
74
|
+
lex_key = root
|
|
75
|
+
if word_type == "Verb":
|
|
76
|
+
lex_key = self._add_Inf1_suffix(root)
|
|
77
|
+
|
|
78
|
+
items = self._filter_lexicon_items(lexicon.item_map.get(lex_key, []), word_type, p_pos)
|
|
79
|
+
|
|
80
|
+
if not items and (root.istitle() or root.isupper()):
|
|
81
|
+
alt_lex_key = root.translate(self.alphabet.lower_map).lower()
|
|
82
|
+
if word_type == "Verb":
|
|
83
|
+
alt_lex_key = self._add_Inf1_suffix(alt_lex_key)
|
|
84
|
+
items = self._filter_lexicon_items(lexicon.item_map.get(alt_lex_key, []), word_type, p_pos)
|
|
85
|
+
|
|
86
|
+
# If still no items found, try any available POS as a fallback
|
|
87
|
+
# (but exclude Verb to avoid inappropriate morphological rules)
|
|
88
|
+
if not items:
|
|
89
|
+
items = self._filter_lexicon_items(lexicon.item_map.get(lex_key, []), word_type)
|
|
90
|
+
if not items and (root.istitle() or root.isupper()):
|
|
91
|
+
alt_lex_key = root.translate(self.alphabet.lower_map).lower()
|
|
92
|
+
if word_type == "Verb":
|
|
93
|
+
alt_lex_key = self._add_Inf1_suffix(alt_lex_key)
|
|
94
|
+
items = self._filter_lexicon_items(lexicon.item_map.get(alt_lex_key, []), word_type)
|
|
95
|
+
|
|
96
|
+
return items
|
|
97
|
+
|
|
98
|
+
def _get_stem_candidates(self, root: str, items: List[DictionaryItem], word_type: str, p_pos: PrimaryPos, s_pos: SecondaryPos) -> List[StemTransition]:
|
|
99
|
+
candidates = []
|
|
100
|
+
has_vowel = any(self.alphabet.is_vowel(c) for c in root)
|
|
101
|
+
for item in items:
|
|
102
|
+
if word_type == "NamedEntity" and not has_vowel:
|
|
103
|
+
# For vowel-less NamedEntities (abbreviations), ensure they have phonetic attributes
|
|
104
|
+
# even if using a dictionary item, as some dictionary entries might be missing them.
|
|
105
|
+
start_state = self.morphotactics.noun_S
|
|
106
|
+
phonetic_attrs = self._get_phonetic_attributes(root)
|
|
107
|
+
candidates.append(StemTransition(root, item, phonetic_attrs, start_state))
|
|
108
|
+
else:
|
|
109
|
+
candidates.extend(self.morphotactics.stem_transitions.get_transitions_for_item(item))
|
|
110
|
+
|
|
111
|
+
if not candidates:
|
|
112
|
+
candidates.append(self._create_stem_transition(root, p_pos, s_pos))
|
|
113
|
+
return candidates
|
|
114
|
+
|
|
115
|
+
def _turkish_lower(self, s: str) -> str:
|
|
116
|
+
return s.translate(self.alphabet.lower_map).lower()
|
|
117
|
+
|
|
118
|
+
def _apply_post_processing(self, root: str, generated_surface: str, word_type: str) -> str:
|
|
119
|
+
is_named_entity = word_type == "NamedEntity"
|
|
120
|
+
is_number = self.alphabet.contains_digit(root)
|
|
121
|
+
root_lower = self._turkish_lower(root)
|
|
122
|
+
generated_lower = self._turkish_lower(generated_surface)
|
|
123
|
+
|
|
124
|
+
if (is_named_entity or is_number) and generated_lower != root_lower:
|
|
125
|
+
if generated_lower.startswith(root_lower):
|
|
126
|
+
suffix = generated_lower[len(root_lower):]
|
|
127
|
+
return root + "'" + suffix
|
|
128
|
+
|
|
129
|
+
return match_capitilization(root, generated_surface)
|
|
130
|
+
|
|
131
|
+
def force_suffixes_on_word(self, root: str, is_named_entity: bool, suffixes: List[Morpheme]) -> str:
|
|
132
|
+
logging.warning(
|
|
133
|
+
f"Couldn't add suffixes: {[s.id_ for s in suffixes]} to \"{root}\" "
|
|
134
|
+
f"via zemberek's own method. Deploying work around"
|
|
135
|
+
)
|
|
136
|
+
|
|
137
|
+
generator = WordGenerator(self.morphotactics)
|
|
138
|
+
current_surface = root
|
|
139
|
+
apostrophe_added = False
|
|
140
|
+
|
|
141
|
+
for suffix in suffixes:
|
|
142
|
+
if suffix.id_ == "Rel":
|
|
143
|
+
current_surface += "ki"
|
|
144
|
+
continue
|
|
145
|
+
|
|
146
|
+
generated_surface = self._try_force_generate_suffix(
|
|
147
|
+
current_surface, suffix, is_named_entity, apostrophe_added, generator
|
|
148
|
+
)
|
|
149
|
+
|
|
150
|
+
if generated_surface:
|
|
151
|
+
current_surface, apostrophe_added = self._update_forced_surface(
|
|
152
|
+
current_surface, generated_surface, is_named_entity, apostrophe_added
|
|
153
|
+
)
|
|
154
|
+
else:
|
|
155
|
+
logging.error(f"Could not generate suffix {suffix.id_} for {current_surface}")
|
|
156
|
+
|
|
157
|
+
return current_surface
|
|
158
|
+
|
|
159
|
+
def _try_force_generate_suffix(self, current_surface: str, suffix: Morpheme, is_named_entity: bool, apostrophe_added: bool, generator: WordGenerator) -> Union[str, None]:
|
|
160
|
+
possible_pos = self._get_primary_pos_for_suffix(suffix)
|
|
161
|
+
for p_pos in possible_pos:
|
|
162
|
+
s_pos = SecondaryPos.ProperNoun if is_named_entity and not apostrophe_added else SecondaryPos.None_
|
|
163
|
+
candidate = self._create_stem_transition(current_surface, p_pos, s_pos)
|
|
164
|
+
results = generator.generate(morphemes=(suffix,), candidates=(candidate,))
|
|
165
|
+
if results:
|
|
166
|
+
return results[0].surface
|
|
167
|
+
return None
|
|
168
|
+
|
|
169
|
+
def _update_forced_surface(self, current_surface: str, generated_surface: str, is_named_entity: bool, apostrophe_added: bool) -> tuple[str, bool]:
|
|
170
|
+
if is_named_entity and not apostrophe_added and generated_surface != current_surface:
|
|
171
|
+
current_lower = self._turkish_lower(current_surface)
|
|
172
|
+
generated_lower = self._turkish_lower(generated_surface)
|
|
173
|
+
if generated_lower.startswith(current_lower):
|
|
174
|
+
suffix_surface = generated_lower[len(current_lower):]
|
|
175
|
+
return f"{current_surface}'{suffix_surface}", True
|
|
176
|
+
else:
|
|
177
|
+
return generated_surface, False
|
|
178
|
+
return generated_surface, apostrophe_added
|
|
179
|
+
|
|
180
|
+
def _get_primary_pos_for_suffix(self, morpheme: Morpheme) -> List[PrimaryPos]:
|
|
181
|
+
m_id = morpheme.id_
|
|
182
|
+
|
|
183
|
+
noun_suffixes = {
|
|
184
|
+
"Pnon", "P1sg", "P2sg", "P3sg", "P1pl", "P2pl", "P3pl", "Nom", "Dat", "Acc", "Abl", "Loc", "Ins", "Gen", "Equ",
|
|
185
|
+
"Dim", "Ness", "With", "Without", "Related", "JustLike", "Rel", "Agt", "Become", "Acquire", "Zero", "Root",
|
|
186
|
+
"A1sg", "A2sg", "A3sg", "A1pl", "A2pl", "A3pl", "Past", "Narr", "Cond", "Cop", "Noun"
|
|
187
|
+
}
|
|
188
|
+
|
|
189
|
+
adjective_suffixes = {
|
|
190
|
+
"Ly", "AsIf", "Agt", "JustLike", "Become", "Acquire"
|
|
191
|
+
}
|
|
192
|
+
|
|
193
|
+
verb_suffixes = {
|
|
194
|
+
"Caus", "Recip", "Reflex", "Able", "Pass", "Neg", "Unable", "Pres", "Prog1", "Prog2", "Aor", "Fut", "Imp", "Req",
|
|
195
|
+
"Opt", "Desr", "Neces", "Inf1", "Inf2", "Inf3", "ActOf", "PastPart", "NarrPart", "FutPart", "PresPart", "AorPart",
|
|
196
|
+
"NotState", "FeelLike", "EverSince", "Repeat", "Almost", "Hastily", "Stay", "Start", "AsIf", "While",
|
|
197
|
+
"When", "SinceDoingSo", "AsLongAs", "ByDoingSo", "Adamantly", "AfterDoingSo", "WithoutHavingDoneSo",
|
|
198
|
+
"WithoutBeingAbleToHaveDoneSo", "A1sg", "A2sg", "A3sg", "A1pl", "A2pl", "A3pl", "Past", "Narr", "Cond", "Cop", "Verb",
|
|
199
|
+
"Intrj"
|
|
200
|
+
}
|
|
201
|
+
|
|
202
|
+
results = []
|
|
203
|
+
if m_id in noun_suffixes: results.append(PrimaryPos.Noun)
|
|
204
|
+
if m_id in adjective_suffixes: results.append(PrimaryPos.Adjective)
|
|
205
|
+
if m_id in verb_suffixes: results.append(PrimaryPos.Verb)
|
|
206
|
+
|
|
207
|
+
if results: return results
|
|
208
|
+
|
|
209
|
+
return [PrimaryPos.Unknown]
|
|
210
|
+
|
|
211
|
+
def _is_single_syllable(self, word: str) -> bool:
|
|
212
|
+
from zemberek.core.turkish.turkish_alphabet import TurkishAlphabet
|
|
213
|
+
vowel_count = sum(1 for char in word if TurkishAlphabet.INSTANCE.is_vowel(char))
|
|
214
|
+
return vowel_count == 1
|
|
215
|
+
|
|
216
|
+
def _add_Inf1_suffix(self, verb: str) -> str:
|
|
217
|
+
alphabet =TurkishAlphabet()
|
|
218
|
+
is_frontal = alphabet.get_last_vowel(verb).is_frontal()
|
|
219
|
+
sfx = "mek" if is_frontal else "mak"
|
|
220
|
+
return verb+sfx
|
|
221
|
+
|
|
222
|
+
def _create_stem_transition(self, root: str, p_pos: PrimaryPos, s_pos: SecondaryPos = SecondaryPos.None_) -> StemTransition:
|
|
223
|
+
attributes = set()
|
|
224
|
+
if p_pos == PrimaryPos.Verb:
|
|
225
|
+
if self._is_single_syllable(root):
|
|
226
|
+
attributes.add(RootAttribute.Aorist_A)
|
|
227
|
+
else:
|
|
228
|
+
attributes.add(RootAttribute.Aorist_I)
|
|
229
|
+
|
|
230
|
+
dummy_item = DictionaryItem(root, root, p_pos, s_pos, attributes=attributes)
|
|
231
|
+
|
|
232
|
+
# Determine the appropriate start state based on PrimaryPos
|
|
233
|
+
if p_pos == PrimaryPos.Verb:
|
|
234
|
+
start_state = self._get_verb_root_state(root)
|
|
235
|
+
elif p_pos == PrimaryPos.Adjective:
|
|
236
|
+
start_state = self.morphotactics.adjectiveRoot_ST
|
|
237
|
+
else: # Noun and others
|
|
238
|
+
start_state = self.morphotactics.noun_S
|
|
239
|
+
|
|
240
|
+
phonetic_attrs = self._get_phonetic_attributes(root)
|
|
241
|
+
res = StemTransition(root, dummy_item, phonetic_attrs, start_state)
|
|
242
|
+
return res
|
|
243
|
+
|
|
244
|
+
def _get_verb_root_state(self, root: str):
    """Pick the morphotactics start state for a verb root.

    Args:
        root: The verb root in any letter case.

    Returns:
        ``self.morphotactics.vDeYeRoot_S`` when the lower-cased root is one of
        "di", "yi", "de", "ye"; otherwise ``self.morphotactics.verbRoot_S``.
    """
    # str.lower() expands the dotted capital "İ" to "i" + U+0307, which would
    # make an upper-case "Dİ"/"Yİ" miss the lookup below. Fold it to a plain
    # "i" first; this is a no-op if lower_map already handles that letter.
    root_lower = root.translate(self.alphabet.lower_map).replace("\u0130", "i").lower()
    if root_lower in {"di", "yi", "de", "ye"}:
        return self.morphotactics.vDeYeRoot_S
    # Unknown verb roots should use the general verb root state.
    # Vowel-ending stems without dictionary evidence should not be assumed
    # to be vowel-drop verbs, otherwise suffixes like Fut fail.
    return self.morphotactics.verbRoot_S
|
|
252
|
+
|
|
253
|
+
def _get_phonetic_attributes(self, root: str) -> Set[PhoneticAttribute]:
    """Compute the morphemic attributes used to attach suffixes to ``root``.

    A non-empty root without any vowel is first spelled out through the
    guesser's letter pronunciations; a root containing a digit is spelled out
    through the digit-aware pronunciation. In all other cases the attributes
    are taken from the root itself.
    """
    if root and not any(self.alphabet.is_vowel(ch) for ch in root):
        # Letter names are usually defined for lowercase letters in the guesser
        spelled_out = self.guesser.to_turkish_letter_pronunciations(
            root.translate(self.alphabet.lower_map).lower()
        )
        if spelled_out:
            return AttributesHelper.get_morphemic_attributes(spelled_out)

    spoken_form = root
    if self.alphabet.contains_digit(root):
        spoken_form = self.guesser.to_turkish_letter_pronunciation_with_digit(root)
    return AttributesHelper.get_morphemic_attributes(spoken_form)
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
LICENSE.md
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
custom_tokenizer/__init__.py
|
|
5
|
+
custom_tokenizer/detokenizer.py
|
|
6
|
+
custom_tokenizer/dictionary.py
|
|
7
|
+
custom_tokenizer/morph_tokenizer.py
|
|
8
|
+
custom_tokenizer/utils.py
|
|
9
|
+
custom_tokenizer/word_generator.py
|
|
10
|
+
pre_bpe_morph_tr.egg-info/PKG-INFO
|
|
11
|
+
pre_bpe_morph_tr.egg-info/SOURCES.txt
|
|
12
|
+
pre_bpe_morph_tr.egg-info/dependency_links.txt
|
|
13
|
+
pre_bpe_morph_tr.egg-info/requires.txt
|
|
14
|
+
pre_bpe_morph_tr.egg-info/top_level.txt
|
|
15
|
+
tests/test_decode_and_encode.py
|
|
16
|
+
tests/test_generation.py
|
|
17
|
+
tests/test_morph_tokenizer.py
|
|
18
|
+
tests/test_sentence_tokenizer.py
|
|
19
|
+
zemberek/__init__.py
|
|
20
|
+
zemberek/examples.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools<82.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "pre_bpe_morph_tr"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
dependencies = [
|
|
9
|
+
"antlr4-python3-runtime==4.8",
|
|
10
|
+
"numpy>=1.19.0",
|
|
11
|
+
]
|
|
12
|
+
|
|
13
|
+
[tool.setuptools]
|
|
14
|
+
packages = ["zemberek", "custom_tokenizer"]
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
import logging
|
|
4
|
+
|
|
5
|
+
# Add the parent directory to sys.path to allow importing from the root
|
|
6
|
+
sys.path.append(str(Path(__file__).resolve().parent.parent))
|
|
7
|
+
|
|
8
|
+
import pandas as pd
|
|
9
|
+
from custom_tokenizer.morph_tokenizer import MorphTokenizer
|
|
10
|
+
|
|
11
|
+
# Configure logging to hide noisy library messages.
# zemberek's package __init__ attaches a handler to the root logger at import
# time, and basicConfig() does nothing once the root logger has handlers, so
# the root level is also set explicitly to make the ERROR threshold stick.
logging.basicConfig(level=logging.ERROR)
logging.getLogger().setLevel(logging.ERROR)
|
|
13
|
+
|
|
14
|
+
def run_decode_encode_test():
    """Round-trip every tweet of TurkishTweets.xlsx through the tokenizer and report.

    The workbook is expected next to this file. Each non-empty cell of the
    tweet column is tokenized and detokenized; a result that differs from the
    input, or an exception, is recorded as a failure. A summary and up to ten
    failures are printed, and the complete report is written to
    ``logs/failed_tests.log`` beside this file. Exits with status 1 when the
    workbook is missing or cannot be read.
    """
    xlsx_path = Path(__file__).parent / "TurkishTweets.xlsx"
    if not xlsx_path.exists():
        print(f"Error: Could not find TurkishTweets.xlsx at {xlsx_path}")
        sys.exit(1)

    print(f"Reading Excel file: {xlsx_path.name}...")
    try:
        df = pd.read_excel(xlsx_path)
    except Exception as e:
        for message in (
            f"Error reading Excel file: {e}",
            "Please ensure that 'pandas' and 'openpyxl' are installed in your environment.",
            "To install them, run:",
            " ./.venv/bin/pip install pandas openpyxl",
        ):
            print(message)
        sys.exit(1)

    # Pick the first column whose name mentions "tweet" (case-insensitive);
    # fall back to the first column of the sheet.
    tweet_col = next((col for col in df.columns if "tweet" in str(col).lower()), None)
    if tweet_col is not None:
        print(f"Using column: '{tweet_col}'")
    else:
        tweet_col = df.columns[0]
        print(f"Could not find a column named 'Tweets'. Using the first column: '{tweet_col}'")

    print("Initializing MorphTokenizer...")
    tokenizer = MorphTokenizer("<|", "|>")

    failures = []
    total_count = success_count = 0

    print(f"Processing {len(df)} rows for the encode-decode round-trip test...")

    for idx, row in df.iterrows():
        cell = row[tweet_col]
        if pd.isna(cell):
            continue

        original_tweet = str(cell)
        total_count += 1

        try:
            tokens = tokenizer.tokenize(original_tweet)      # encode
            decoded_tweet = tokenizer.detokenize(tokens)     # decode

            if decoded_tweet != original_tweet:
                failures.append({
                    "row": idx + 2,  # 1-based index, account for header row (+2)
                    "original": original_tweet,
                    "decoded": decoded_tweet,
                    "tokens": tokens
                })
            else:
                success_count += 1
        except Exception as e:
            failures.append({
                "row": idx + 2,
                "original": original_tweet,
                "error": str(e)
            })

    # The same statistics lines go to the console and to the log file.
    stats = [
        f"Total tweets processed: {total_count}",
        f"Successful round-trips: {success_count}",
        f"Failed round-trips: {len(failures)}",
    ]
    if total_count > 0:
        success_rate = (success_count / total_count) * 100
        stats.append(f"Success Rate: {success_rate:.2f}%")

    def detail_lines(entry):
        # Lines describing one failure, without trailing newlines.
        lines = [f" Original: {repr(entry['original'])}"]
        if "error" in entry:
            lines.append(f" Error: {entry['error']}")
        else:
            lines.append(f" Decoded: {repr(entry['decoded'])}")
            lines.append(f" Tokens: {entry['tokens']}")
        return lines

    # Console summary
    console_rule = "=" * 40
    print("\n" + console_rule)
    print(" TEST RESULTS SUMMARY ")
    print(console_rule)
    for line in stats:
        print(line)
    print(console_rule)

    # Console: first few failures, if any
    if not failures:
        print("\nAll tweets successfully passed the encode-decode round-trip test!")
    else:
        print(f"\nShowing up to the first 10 mismatches out of {len(failures)} total failures:")
        for number, entry in enumerate(failures[:10], start=1):
            print(f"\nMismatch #{number} (Excel Row {entry['row']}):")
            for line in detail_lines(entry):
                print(line)

    # Log file: every failure
    logs_dir = Path(__file__).resolve().parent / "logs"
    logs_dir.mkdir(parents=True, exist_ok=True)
    log_file_path = logs_dir / "failed_tests.log"

    from datetime import datetime
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    log_rule = "=" * 80
    entry_rule = "-" * 80
    with open(log_file_path, "w", encoding="utf-8") as f:
        f.write(log_rule + "\n")
        f.write(f"ENCODE-DECODE ROUND-TRIP TEST LOG - {timestamp}\n")
        f.write(log_rule + "\n")
        for line in stats:
            f.write(line + "\n")
        f.write(log_rule + "\n\n")

        if not failures:
            f.write("All tests passed successfully! No mismatches found.\n")
        else:
            f.write("DETAILED FAILURES:\n")
            f.write(entry_rule + "\n")
            for number, entry in enumerate(failures, start=1):
                f.write(f"Failure #{number} (Excel Row {entry['row']}):\n")
                for line in detail_lines(entry):
                    f.write(line + "\n")
                f.write(entry_rule + "\n")

    print(f"\nFailing tests and details written to: {log_file_path}")
|
|
145
|
+
|
|
146
|
+
if __name__ == "__main__":
|
|
147
|
+
run_decode_encode_test()
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
sys.path.append(str(Path(__file__).parent.parent))
|
|
4
|
+
|
|
5
|
+
from custom_tokenizer.word_generator import CustomWordGenerator
|
|
6
|
+
import logging
|
|
7
|
+
|
|
8
|
+
# Configure logging to show errors but avoid noise.
# zemberek's package __init__ attaches a handler to the root logger at import
# time, and basicConfig() does nothing once the root logger has handlers, so
# the root level is also set explicitly to make the ERROR threshold stick.
logging.basicConfig(level=logging.ERROR)
logging.getLogger().setLevel(logging.ERROR)
|
|
10
|
+
generator = CustomWordGenerator()
|
|
11
|
+
|
|
12
|
+
def test_generation(root, pos, suffix_ids, expected):
    """Generate a word with the module-level ``generator`` and print the outcome.

    The report line is printed in red when the generated word differs from
    ``expected``; nothing is asserted.
    """
    result = generator.generate_word(root, pos, suffix_ids)
    mismatch = result != expected

    report = f"Root: {root} ({pos}), Suffixes: {suffix_ids} -> Result: {result}, Expected: {expected}"
    if mismatch:
        report = "\033[31m" + report
    print(report)
    if mismatch:
        # reset the terminal colour without emitting another newline
        print("\033[0m", end="")
|
|
20
|
+
|
|
21
|
+
if __name__ == "__main__":
    print("Starting generation tests...")

    # Each entry is (root, part of speech, suffix ids, expected surface form);
    # the entries run in the order listed.
    cases = [
        # Noun Cases (Dictionary Match)
        ("elma", "Noun", ["A3pl", "Dat"], "elmalara"),
        ("Burun", "Noun", ["A3sg", "P1pl"], "Burnumuz"),
        ("buRun", "Noun", ["A3sg", "P1pl"], "buRunumuz"),
        ("hak", "Noun", ["Dat"], "hakka"),
        ("burun", "Noun", ["Gen"], "burnun"),

        # Verb Cases (Dictionary Match)
        ("kaç", "Verb", ["Prog1", "A1sg"], "kaçıyorum"),
        ("koş", "Verb", ["Aor", "A1pl"], "koşarız"),
        ("gel", "Verb", ["AorPart"], "gelir"),
        ("seyret", "Verb", ["Aor", "AsIf"], "seyredercesine"),

        # POS Inference
        ("at", "Noun", ["Dat"], "ata"),
        ("at", "Verb", ["Fut", "Narr", "A1sg"], "atacakmışım"),

        # Aggressive POS Inference
        ("elma", "Verb", ["Fut", "Narr", "A1sg"], "elmayacakmışım"),
        ("gel", "Noun", ["P1pl", "Gen"], "gelimizin"),

        # Pronoun
        ("biz", "Noun", ["A3pl"], "bizler"),

        # Named Entity (Proper Noun)
        ("Çıtçıt", "NamedEntity", ["Dat"], "Çıtçıt'a"),
        ("Bürokratistan", "NamedEntity", ["Loc", "Rel", "A3pl", "Gen"], "Bürokratistan'dakilerin"),
        ("Ahmet", "NamedEntity", ["Dat"], "Ahmet'e"),
        ("Ayşe", "Noun", ["Gen"], "Ayşenin"),
        ("Tüik", "NamedEntity", ["Dat"], "Tüik'e"),

        # Unknown Root
        ("bloop", "Noun", ["Dat"], "bloopa"),
        ("bloop", "Verb", ["Prog1"], "bloopuyor"),

        # Broken Generation
        ("kap", "Noun", ["Prog1", "Dim", "A3pl"], "kapıyorcuklar"),
        ("kitap", "Noun", ["Loc", "Rel", "Gen", "Aor", "Almost"], "kitaptakininireyaz"),

        # All Caps
        ("KAÇ", "Verb", ["Prog2", "A2pl"], "KAÇMAKTASINIZ"),
        ("HAK", "Noun", ["P2sg"], "HAKKIN"),
        ("TÜİK", "NamedEntity", ["P2sg"], "TÜİK'in"),
        ("TÜK", "NamedEntity", ["P2sg"], "TÜK'ün"),

        # Numbers
        ("11", "Noun", ["Loc"], "11'de"),
        ("12.00", "Noun", ["Loc"], "12.00'da"),
        ("örnek2", "Noun", ["Gen"], "örnek2'nin"),  # Not supported by zemberek analyzer
        ("3/4", "Noun", ["Gen"], "3/4'ün"),  # Not supported by zemberek analyzer

        # No vowels
        ("kg", "NamedEntity", ["Ness"], "kg'lik"),
        ("TDK", "NamedEntity", ["Acc"], "TDK'yı"),
        ("TMNB", "NamedEntity", ["Acc"], "TMNB'yi"),
        ("z", "NamedEntity", ["Acc"], "z'yi"),
        ("z", "Noun", ["Acc"], "zyi"),
        ("k", "Verb", ["Fut"], "kyacak"),
    ]
    for case in cases:
        test_generation(*case)

    print("\nAll tests completed!")
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
import logging
|
|
4
|
+
from typing import List
|
|
5
|
+
sys.path.append(str(Path(__file__).parent.parent))
|
|
6
|
+
import time
|
|
7
|
+
|
|
8
|
+
from custom_tokenizer.morph_tokenizer import MorphTokenizer
|
|
9
|
+
|
|
10
|
+
# zemberek's package __init__ attaches a handler to the root logger at import
# time, and basicConfig() does nothing once the root logger has handlers, so
# the root level is also set explicitly to make the ERROR threshold stick.
logging.basicConfig(level=logging.ERROR)
logging.getLogger().setLevel(logging.ERROR)
|
|
11
|
+
|
|
12
|
+
def test_generation(string: str, tokenizer: MorphTokenizer, expected: List[str]):
    """Tokenize ``string``, compare with ``expected`` and check the round trip.

    Prints one report containing the tokenization time, the produced and
    expected tokens (prefixed with red when they differ) and whether
    detokenizing the tokens reproduces ``string``. Nothing is asserted.
    """
    started = time.time()
    tokens = tokenizer.tokenize(string)
    elapsed = time.time() - started

    report = ["COMPLETED IN " + str(elapsed) + " SECONDS\n"]
    if tokens != expected:
        report.append("\033[31m")
    report.append(f"String: {string} ->\n Result: {tokens}\n Expected: {expected}\033[0m\n")

    # Round trip: the tokens must detokenize back to the input.
    restored = tokenizer.detokenize(tokens)
    if restored == string:
        report.append("\033[32mDetokenization successful matches original string!\033[0m\n")
    else:
        report.append("\033[31m")
        report.append(f"Detokenization failed!\nOriginal: {string!r}\nDetokenized: {restored!r}\033[0m\n")

    print("".join(report))
|
|
30
|
+
|
|
31
|
+
if __name__ == "__main__":
    tokenizer = MorphTokenizer("<|", "|>")
    print("initialized!")
    start_time = time.time()

    # Mixed-case, mixed-whitespace paragraph covering proper nouns, acronyms,
    # numbers and unknown words.
    paragraph = (
        "Ayşeyi, Ahmet'i Veli'yi ve ghim'i eve çağırmış."
        " burnu *Burnu havada kitapçımız 2 kglik yani kg'lik TR'li eşyayı Yerismi'ye getirirmiş."
        "\n\tYarın içinse AYŞE'Yİ ve ANNEM'i eVe çağırDI. "
        "TDK'ye, UNKNOWN'a, TÜİK'e ve ALİ'ye göre olan hjKŞFh şeyler diyorlar. "
        " Oysa Annem, BABAM ve kardeşlerime\ngöre\tdoğru olandır."
        "Onlar 11'de veya 12.00'da burada olur."
        "Sonra level2'nin 3/4'ü biter."
    )
    expected = [
        'A', 'y', 'ş', 'e', '<|Noun|>', '<|Acc|>', ',', ' ', 'A', 'h', 'm', 'e', 't', '<|NamedEntity|>', '<|Acc|>', ' ', 'V', 'e', 'l', 'i', '<|NamedEntity|>', '<|Acc|>', ' ', 'v', 'e', ' ', 'g', 'h', 'i', 'm', '<|NamedEntity|>', '<|Acc|>', ' ', 'e', 'v', '<|Noun|>', '<|Dat|>', ' ', 'ç', 'a', 'ğ', 'ı', 'r', '<|Verb|>', '<|Narr|>', '.',
        ' ', 'b', 'u', 'r', 'u', 'n', '<|Noun|>', '<|P3sg|>', ' ', '*', 'B', 'u', 'r', 'u', 'n', '<|Noun|>', '<|Acc|>', ' ', 'h', 'a', 'v', 'a', '<|Noun|>', '<|Loc|>', ' ', 'k', 'i', 't', 'a', 'p', '<|Noun|>', '<|Agt|>', '<|P1pl|>', ' ', '2', ' ', 'k', 'g', '<|Noun|>', '<|Ness|>', ' ', 'y', 'a', 'n', 'i', ' ', 'k', 'g', '<|NamedEntity|>', '<|Ness|>', ' ', 'T', 'R', '<|NamedEntity|>', '<|With|>', ' ', 'e', 'ş', 'y', 'a', '<|Noun|>', '<|Acc|>', ' ', 'Y', 'e', 'r', 'i', 's', 'm', 'i', '<|NamedEntity|>', '<|Dat|>', ' ', 'g', 'e', 't', 'i', 'r', '<|Verb|>', '<|Aor|>', '<|Narr|>', '.',
        '\n', '\t', 'Y', 'a', 'r', 'ı', 'n', ' ', 'i', 'ç', 'i', 'n', '<|Noun|>', '<|Cond|>', ' ', 'A', 'Y', 'Ş', 'E', "'", 'Y', 'İ', ' ', 'v', 'e', ' ', 'A', 'N', 'N', 'E', 'M', '<|NamedEntity|>', '<|Acc|>', ' ', 'e', 'V', 'e', ' ', 'ç', 'a', 'ğ', 'ı', 'r', 'D', 'I', '.', ' ', ' ', ' ',
        'T', 'D', 'K', "'", 'y', 'e', ',', ' ', 'U', 'N', 'K', 'N', 'O', 'W', 'N', '<|NamedEntity|>', '<|Dat|>', ',', ' ', 'T', 'Ü', 'İ', 'K', '<|NamedEntity|>', '<|Dat|>', ' ', 'v', 'e', ' ', 'A', 'L', 'İ', '<|NamedEntity|>', '<|Dat|>', ' ', 'g', 'ö', 'r', 'e', ' ', 'o', 'l', '<|Verb|>', '<|PresPart|>', ' ', 'h', 'j', 'K', 'Ş', 'F', 'h', ' ', 'ş', 'e', 'y', '<|Noun|>', '<|A3pl|>', ' ', 'd', 'e', '<|Verb|>', '<|Prog1|>', '<|A3pl|>', '.',
        ' ', ' ', ' ', 'O', 'y', 's', 'a', ' ', 'A', 'n', 'n', 'e', '<|Noun|>', '<|P1sg|>', ',', ' ', 'B', 'A', 'B', 'A', '<|Noun|>', '<|P1sg|>', ' ', 'v', 'e', ' ', ' ', ' ', 'k', 'a', 'r', 'd', 'e', 'ş', '<|Noun|>', '<|A3pl|>', '<|P1sg|>', '<|Dat|>', '\n', 'g', 'ö', 'r', 'e', '\t', 'd', 'o', 'ğ', 'r', 'u', ' ', 'o', 'l', '<|Verb|>', '<|PresPart|>', '<|Cop|>', '.',
        'O', '<|Noun|>', '<|A3pl|>', ' ', '1', '1', '<|Noun|>', '<|Loc|>', ' ', 'v', 'e', 'y', 'a', ' ', '1', '2', '.', '0', '0', '<|Noun|>', '<|Loc|>', ' ', 'b', 'u', 'r', 'a', '<|Noun|>', '<|Loc|>', ' ', 'o', 'l', '<|Verb|>', '<|Aor|>', '.',
        'S', 'o', 'n', 'r', 'a', ' ', 'l', 'e', 'v', 'e', 'l', '2', "'", 'n', 'i', 'n', ' ', '3', '/', '4', "'", 'ü', ' ', 'b', 'i', 't', '<|Verb|>', '<|Aor|>', '.'
    ]

    # (input text, expected tokens) pairs, run in the order listed. The story
    # is checked against an empty expectation, so its tokens are always shown
    # as a mismatch.
    cases = [
        (paragraph, expected),
        ("Jazz bir kediydi. Arkadaşları vardı: Pamuk, Minnoş ve Tekir. Onlar dans etmeyi çok severdi. Bir gün, zor bir dans öğrendiler. Her gün dans ettiler. Sabah, öğle ve akşam.\n\nİlk başlarda çok zorlandılar. Ayakları karıştı, düştüler ve güldüler. Ama pes etmediler. Her gün daha iyi oldular. Jazz, Pamuk, Minnoş ve Tekir birlikte çalıştılar.\n\nSonunda, dansı öğrendiler! Çok mutluydular. Şimdi dans etmeyi biliyorlardı. Dans ederken zıpladılar, döndüler ve kahkaha attılar.\n\nArtık her zaman dans ediyorlardı. Parkta, bahçede ve evde. Jazz ve arkadaşları dans etmeyi çok seviyorlardı!\n", []),
        ("haftasonu vakti", ['h', 'a', 'f', 't', 'a', 's', 'o', 'n', '<|Noun|>', '<|Acc|>', ' ', 'v', 'a', 'k', 'i', 't', '<|Noun|>', '<|Acc|>']),
        ("Geldiler. Ama pes etmediler.", ['G', 'e', 'l', '<|Verb|>', '<|Past|>', '<|A3pl|>', '.', ' ', 'A', 'm', 'a', ' ', 'p', 'e', 's', ' ', 'e', 't', '<|Verb|>', '<|Neg|>', '<|Past|>', '<|A3pl|>', '.']),
        (".burnumuzun ", ['.', 'b', 'u', 'r', 'u', 'n', '<|Noun|>', '<|P1pl|>', '<|Gen|>', ' ']),
    ]
    for text, wanted in cases:
        test_generation(text, tokenizer, wanted)

    print("All tests completed!")
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
import logging
|
|
4
|
+
from typing import List
|
|
5
|
+
sys.path.append(str(Path(__file__).parent.parent))
|
|
6
|
+
|
|
7
|
+
from custom_tokenizer.morph_tokenizer import MorphTokenizer
|
|
8
|
+
|
|
9
|
+
# zemberek's package __init__ attaches a handler to the root logger at import
# time, and basicConfig() does nothing once the root logger has handlers, so
# the root level is also set explicitly to make the ERROR threshold stick.
logging.basicConfig(level=logging.ERROR)
logging.getLogger().setLevel(logging.ERROR)
|
|
10
|
+
|
|
11
|
+
def test_generation(sentence: str, tokenizer: MorphTokenizer, expected: List[str]):
    """Tokenize one sentence with the tokenizer's private sentence routine and print the result.

    The report is prefixed with red when the tokens differ from ``expected``;
    nothing is asserted.
    """
    # Name-mangled access to MorphTokenizer's private __tokenize_sentence.
    tokenize_sentence = getattr(tokenizer, "_MorphTokenizer__tokenize_sentence")
    tokens = tokenize_sentence(sentence)

    colour = "\033[31m" if tokens != expected else ""
    print(colour + f"Sentence: {sentence} ->\n Result: {tokens}\n Expected: {expected}\033[0m")
|
|
19
|
+
|
|
20
|
+
if __name__ == "__main__":
    tokenizer = MorphTokenizer("<|", "|>")
    print("initialized!")

    # (sentence, expected tokens) pairs, run in the order listed.
    cases = [
        ("Ayşeyi, Ahmet'i Veli'yi ve ghim'i eve çağırdı.", ['A', 'y', 'ş', 'e', '<|Noun|>', '<|Acc|>', ',', ' ', 'A', 'h', 'm', 'e', 't', '<|NamedEntity|>', '<|Acc|>', ' ', 'V', 'e', 'l', 'i', '<|NamedEntity|>', '<|Acc|>', ' ', 'v', 'e', ' ', 'g', 'h', 'i', 'm', '<|NamedEntity|>', '<|Acc|>', ' ', 'e', 'v', '<|Noun|>', '<|Dat|>', ' ', 'ç', 'a', 'ğ', 'ı', 'r', '<|Verb|>', '<|Past|>', '.']),
        ("kitapçı 2 kglik yani kg'lik TR'li eşya getirirmiş", ['k', 'i', 't', 'a', 'p', '<|Noun|>', '<|Agt|>', ' ', '2', ' ', 'k', 'g', '<|Noun|>', '<|Ness|>', ' ', 'y', 'a', 'n', 'i', ' ', 'k', 'g', '<|NamedEntity|>', '<|Ness|>', ' ', 'T', 'R', '<|NamedEntity|>', '<|With|>', ' ', 'e', 'ş', 'y', 'a', ' ', 'g', 'e', 't', 'i', 'r', '<|Verb|>', '<|Aor|>', '<|Narr|>']),
        ("Burnu burnu havadadır onun.", ['B', 'u', 'r', 'u', 'n', '<|Noun|>', '<|P3sg|>', ' ', 'b', 'u', 'r', 'u', 'n', '<|Noun|>', '<|P3sg|>', ' ', 'h', 'a', 'v', 'a', '<|Noun|>', '<|Loc|>', '<|Cop|>', ' ', 'o', '<|Noun|>', '<|Gen|>', '.']),
        ("Yerisimi'ye gelirmiş.", ['Y', 'e', 'r', 'i', 's', 'i', 'm', 'i', '<|NamedEntity|>', '<|Dat|>', ' ', 'g', 'e', 'l', '<|Verb|>', '<|Aor|>', '<|Narr|>', '.']),
        ("AYŞE'Yİ ve ANNEM'i eVe çağırDI", ['A', 'Y', 'Ş', 'E', "'", 'Y', 'İ', ' ', 'v', 'e', ' ', 'A', 'N', 'N', 'E', 'M', '<|NamedEntity|>', '<|Acc|>', ' ', 'e', 'V', 'e', ' ', 'ç', 'a', 'ğ', 'ı', 'r', 'D', 'I']),
        ("TDK'ye, UNKNOWN'a, TÜİK'e ve ALİ'ye göre olan hjKŞFh şeyler", ['T', 'D', 'K', "'", 'y', 'e', ',', ' ', 'U', 'N', 'K', 'N', 'O', 'W', 'N', '<|NamedEntity|>', '<|Dat|>', ',', ' ', 'T', 'Ü', 'İ', 'K', '<|NamedEntity|>', '<|Dat|>', ' ', 'v', 'e', ' ', 'A', 'L', 'İ', '<|NamedEntity|>', '<|Dat|>', ' ', 'g', 'ö', 'r', 'e', ' ', 'o', 'l', '<|Verb|>', '<|PresPart|>', ' ', 'h', 'j', 'K', 'Ş', 'F', 'h', ' ', 'ş', 'e', 'y', '<|Noun|>', '<|A3pl|>']),
        ("Annem, BABAM ve kardeşlerime\ngöre\tdoğru olandır. ", ['A', 'n', 'n', 'e', '<|Noun|>', '<|P1sg|>', ',', ' ', 'B', 'A', 'B', 'A', '<|Noun|>', '<|P1sg|>', ' ', 'v', 'e', ' ', ' ', ' ', 'k', 'a', 'r', 'd', 'e', 'ş', '<|Noun|>', '<|A3pl|>', '<|P1sg|>', '<|Dat|>', '\n', 'g', 'ö', 'r', 'e', '\t', 'd', 'o', 'ğ', 'r', 'u', ' ', 'o', 'l', '<|Verb|>', '<|PresPart|>', '<|Cop|>', '.', ' ']),
        ("Annem 11'de veya 12.00'da burada olur.", ['A', 'n', 'n', 'e', '<|Noun|>', '<|P1sg|>', ' ', '1', '1', '<|Noun|>', '<|Loc|>', ' ', 'v', 'e', 'y', 'a', ' ', '1', '2', '.', '0', '0', '<|Noun|>', '<|Loc|>', ' ', 'b', 'u', 'r', 'a', '<|Noun|>', '<|Loc|>', ' ', 'o', 'l', '<|Verb|>', '<|Aor|>', '.']),
        ("18ini de alıp 6'lı ayırdık", ['1', '8', 'i', 'n', 'i', ' ', 'd', 'e', ' ', 'a', 'l', '<|Verb|>', '<|AfterDoingSo|>', ' ', '6', '<|Noun|>', '<|With|>', ' ', 'a', 'y', 'ı', 'r', '<|Verb|>', '<|Past|>', '<|A1pl|>']),

        # unsupported 👇
        ("Örnek2'nin 3/4'ü oldu.", ['Ö', 'r', 'n', 'e', 'k', '2', "'", 'n', 'i', 'n', ' ', '3', '/', '4', '<|Noun|>', '<|Acc|>', ' ', 'o', 'l', '<|Verb|>', '<|Past|>', '.']),
    ]
    for text, wanted in cases:
        test_generation(text, tokenizer, wanted)

    print("All tests completed!")
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
from .morphology import TurkishMorphology
|
|
2
|
+
from .normalization import TurkishSentenceNormalizer, TurkishSpellChecker
|
|
3
|
+
from .tokenization import TurkishSentenceExtractor, TurkishTokenizer
|
|
4
|
+
import warnings
|
|
5
|
+
|
|
6
|
+
# Silence the pkg_resources UserWarning (a deprecation notice) raised through zemberek's dependencies.
warnings.filterwarnings("ignore", category=UserWarning, module="pkg_resources")

import logging
import sys

__version__ = '0.2.3'

# Build the stdout handler first: INFO and above, one record per block with
# the message on its own "Msg:" line.
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s\nMsg: %(message)s\n')
handler = logging.StreamHandler(sys.stdout)
handler.setFormatter(formatter)
handler.setLevel(logging.INFO)

# Attach it to the root logger, which is itself set to INFO.
root = logging.getLogger()
root.addHandler(handler)
root.setLevel(logging.INFO)
|
|
22
|
+
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
import time
|
|
2
|
+
import logging
|
|
3
|
+
|
|
4
|
+
from zemberek import (
|
|
5
|
+
TurkishSpellChecker,
|
|
6
|
+
TurkishSentenceNormalizer,
|
|
7
|
+
TurkishSentenceExtractor,
|
|
8
|
+
TurkishMorphology,
|
|
9
|
+
TurkishTokenizer
|
|
10
|
+
)
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger(__name__)

# Informal, misspelled sentences fed to the sentence normalizer below.
examples = [
    "Yrn okua gidicem",
    "Tmm, yarin havuza giricem ve aksama kadar yaticam :)",
    "ah aynen ya annemde fark ettı siz evinizden cıkmayın diyo",
    "gercek mı bu? Yuh! Artık unutulması bile beklenmiyo",
    "Hayır hayat telaşm olmasa alacam buraları gökdelen dikicem.",
    "yok hocam kesınlıkle oyle birşey yok",
    "herseyi soyle hayatında olmaması gerek bence boyle ınsanların falan baskı yapıyosa",
    "email adresim zemberek_python@loodos.com",
    "Kredi başvrusu yapmk istiyrum.",
    "Bankanizin hesp blgilerini ogrenmek istyorum.",
]

# One morphology instance is shared by the normalizer, the spell checker and
# the analysis examples.
morphology = TurkishMorphology.create_with_defaults()

# SENTENCE NORMALIZATION
start = time.time()
normalizer = TurkishSentenceNormalizer(morphology)
logger.info(f"Normalization instance created in: {time.time() - start} s")

start = time.time()
for example in examples:
    print(example)
    print(normalizer.normalize(example), "\n")
logger.info(f"Sentences normalized in: {time.time() - start} s")

start = time.time()
sc = TurkishSpellChecker(morphology)
logger.info(f"Spell checker instance created in: {time.time() - start} s")


# SPELLING SUGGESTION
li = [
    "okuyablirim", "tartısıyor", "Ankar'ada", "knlıca", "yapablrim",
    "kıredi", "geldm", "geliyom", "aldm", "asln",
]
start = time.time()
for word in li:
    print(word + " = " + " ".join(sc.suggest_for_word(word)))
logger.info(f"Spells checked in: {time.time() - start} s")


# SENTENCE BOUNDARY DETECTION
start = time.time()
extractor = TurkishSentenceExtractor()
print("Extractor instance created in: ", time.time() - start, " s")

text = (
    "İnsanoğlu aslında ne para ne sevgi ne kariyer ne şöhret ne de çevre ile sonsuza dek mutlu olabilecek bir "
    "yapıya sahiptir. Dış kaynaklardan gelebilecek bu mutluluklar sadece belirli bir zaman için insanı mutlu "
    "kılıyor. Kişi bu kaynakları elde ettiği zaman belirli bir dönem için kendini iyi hissediyor, ancak alışma "
    "dönemine girdiği andan itibaren bu iyilik hali hızla tükeniyor. Mutlu olma sanatının özü bu değildir. Gerçek "
    "mutluluk, kişinin her türlü olaya ve duruma karşı kendini pozitif tutarak mutlu hissedebilmesi halidir. Bu "
    "davranış şeklini edinen insan, zor günlerde güçlü, mutlu günlerde zevk alan biri olur ve mutluluğu kalıcı "
    "kılar. "
)

start = time.time()
sentences = extractor.from_paragraph(text)
print(f"Sentences separated in {time.time() - start}s")

for sentence in sentences:
    print(sentence)
print("\n")

# SINGLE WORD MORPHOLOGICAL ANALYSIS
results = morphology.analyze("kalemin")
for result in results:
    print(result)
print("\n")

# SENTENCE ANALYSIS AND DISAMBIGUATION

sentence = "Yarın kar yağacak."
analysis = morphology.analyze_sentence(sentence)
after = morphology.disambiguate(sentence, analysis)

# Every candidate analysis of each word ...
print("\nBefore disambiguation")
for e in analysis:
    print(f"Word = {e.inp}")
    for s in e:
        print(s.format_string())

# ... versus the single analysis kept per word.
print("\nAfter disambiguation")
for s in after.best_analysis():
    print(s.format_string())

# TOKENIZATION
tokenizer = TurkishTokenizer.DEFAULT

tokens = tokenizer.tokenize("Saat 12:00.")
for token in tokens:
    print('Content = ', token.content)
    print('Type = ', token.type_.name)
    print('Start = ', token.start)
    print('Stop = ', token.end, '\n')
|