pre-bpe-morph-tr 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,32 @@
1
+ Python port of the framework,
2
+
3
+ Copyright 2020 Loodos
4
+
5
+ Licensed under the Apache License, Version 2.0 (the "License");
6
+ you may not use this file except in compliance with the License.
7
+ You may obtain a copy of the License at
8
+
9
+ http://www.apache.org/licenses/LICENSE-2.0
10
+
11
+ Unless required by applicable law or agreed to in writing, software
12
+ distributed under the License is distributed on an "AS IS" BASIS,
13
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ See the License for the specific language governing permissions and
15
+ limitations under the License.
16
+
17
+ =======================================================================
18
+ Original framework written in Java,
19
+
20
+ Copyright 2018 Ahmet A. Akın, Mehmet D. Akın
21
+
22
+ Licensed under the Apache License, Version 2.0 (the "License");
23
+ you may not use this file except in compliance with the License.
24
+ You may obtain a copy of the License at
25
+
26
+ http://www.apache.org/licenses/LICENSE-2.0
27
+
28
+ Unless required by applicable law or agreed to in writing, software
29
+ distributed under the License is distributed on an "AS IS" BASIS,
30
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
31
+ See the License for the specific language governing permissions and
32
+ limitations under the License.
@@ -0,0 +1,7 @@
1
+ Metadata-Version: 2.4
2
+ Name: pre_bpe_morph_tr
3
+ Version: 0.1.0
4
+ License-File: LICENSE.md
5
+ Requires-Dist: antlr4-python3-runtime==4.8
6
+ Requires-Dist: numpy>=1.19.0
7
+ Dynamic: license-file
@@ -0,0 +1,19 @@
1
+ This project is derived from the Python port of Zemberek by Loodos, which is itself based on the original Zemberek Java project by Ahmet A. Akın and Mehmet D. Akın.
2
+
3
+ # Goal of pre_bpe_morph
4
+ This package recognizes verbs, nouns, and named entities together with their suffixes. Before BPE is applied, it strips each suffix from the word and replaces it with a dedicated special token so that the suffix stays identifiable. It also inserts a word-type token (Verb/Noun/NamedEntity) in front of every run of suffix tokens. The goal is to simplify Turkish grammar rules for small language models.
5
+
6
+ ## Usage
7
+ ```python
8
+ from custom_tokenizer.morph_tokenizer import MorphTokenizer
9
+ tokenizer=MorphTokenizer("<|", "|>")
10
+
11
+ tokenizer.tokenize("gülüveriniz")
12
+ # response: ['g', 'ü', 'l', '<|Verb|>', '<|Hastily|>', '<|Req|>']
13
+
14
+ tokenizer.detokenize(['k', 'o', 'ş', '<|Verb|>', '<|Fut|>', '<|A1sg|>'])
15
+ # response: "koşacağım"
16
+ ```
17
+
18
+ ## Developed for language-model training
19
+ Since this is developed for LM training, I tried to avoid preprocessing text (like converting "hal" to "hâl", or lowercasing words). The text should remain unchanged when encoded and then decoded.
File without changes
@@ -0,0 +1,95 @@
1
+ from typing import List
2
+ from collections import OrderedDict
3
+
4
+ from custom_tokenizer.word_generator import CustomWordGenerator
5
+
6
class MorphDetokenizer:
    """Rebuild plain text from a stream of character and morpheme tokens.

    Plain tokens are copied through; alphanumeric ones are buffered as the
    current word root.  A run of special tokens (``tk_start + id + tk_end``)
    is handed, together with the buffered root, to ``CustomWordGenerator``
    which produces the inflected surface form.  Generated forms are kept in
    a small LRU cache keyed by ``(root, word_type, suffixes)``.
    """

    def __init__(self, tk_start: str = "<|", tk_end: str = "|>", cache_limit: int = 100):
        """Store the delimiters and create the generator and its LRU cache.

        Args:
            tk_start: Text that opens a special token.
            tk_end: Text that closes a special token.
            cache_limit: Maximum number of generated words kept in the cache.
        """
        self.tk_start = tk_start
        self.tk_end = tk_end
        self.tk_start_len = len(tk_start)
        self.tk_end_len = len(tk_end)
        self.pos_tags = {"Noun", "Verb", "NamedEntity"}
        self.generator = CustomWordGenerator()
        self.generation_cache = OrderedDict()
        self.cache_limit = cache_limit

    def _is_special_token(self, token: str) -> bool:
        """Return True when *token* is a delimiter-wrapped, non-empty id."""
        # The length guard rejects tokens in which the two delimiters would
        # overlap (e.g. "<|>") or wrap nothing (e.g. "<||>"); such tokens are
        # ordinary text, not morpheme markers.
        return (
            len(token) > self.tk_start_len + self.tk_end_len
            and token.startswith(self.tk_start)
            and token.endswith(self.tk_end)
        )

    def _strip_token(self, token: str) -> str:
        """Return the id between the delimiters of a special token."""
        # An explicit end index is required: a negative-length slice bound of
        # ``-0`` (empty end delimiter) would otherwise yield an empty string.
        return token[self.tk_start_len : len(token) - self.tk_end_len]

    def _is_word_char(self, token: str) -> bool:
        """Return True when *token* may be part of a word root."""
        return token.isalnum()

    def _collect_suffixes(self, tokens: List[str], start_idx: int) -> tuple[List[str], int]:
        """Collect all contiguous suffix tokens starting at start_idx.

        Returns:
            The stripped suffix ids and the index of the first token that was
            not consumed.  Collection stops at a non-special token or at a
            word-type tag, which belongs to the next word.
        """
        suffixes = []
        i = start_idx
        n = len(tokens)
        while i < n and self._is_special_token(tokens[i]):
            suffix_inner = self._strip_token(tokens[i])
            if suffix_inner in self.pos_tags:
                break
            suffixes.append(suffix_inner)
            i += 1
        return suffixes, i

    def _reconstruct_and_append(self, word_type: str, suffixes: List[str], current_chars: List[str], result_parts: List[str]):
        """Reconstruct the word from the root and suffixes and append it to result_parts."""
        root = "".join(current_chars)
        cache_key = (root, word_type, tuple(suffixes))

        if cache_key in self.generation_cache:
            reconstructed = self.generation_cache[cache_key]
            # Mark the entry as most recently used.
            self.generation_cache.move_to_end(cache_key)
        else:
            reconstructed = self.generator.generate_word(root, word_type, suffixes)
            self.generation_cache[cache_key] = reconstructed
            if len(self.generation_cache) > self.cache_limit:
                # Evict the least recently used entry.
                self.generation_cache.popitem(last=False)

        result_parts.append(reconstructed)

    def detokenize(self, tokens: List[str]) -> str:
        """Join *tokens* back into text, regenerating inflected words.

        Args:
            tokens: Single characters and special tokens in reading order.

        Returns:
            The reconstructed text.
        """
        result_parts = []
        current_chars = []

        i = 0
        n = len(tokens)
        while i < n:
            token = tokens[i]

            if self._is_special_token(token):
                inner = self._strip_token(token)
                # _collect_suffixes already advances i past what it consumed.
                suffixes, i = self._collect_suffixes(tokens, i + 1)
                if inner in self.pos_tags:
                    word_type = inner
                else:
                    # A suffix without a preceding word-type tag is treated
                    # as belonging to a noun.
                    word_type = "Noun"
                    suffixes.insert(0, inner)
                self._reconstruct_and_append(word_type, suffixes, current_chars, result_parts)
                current_chars = []
                continue

            if self._is_word_char(token):
                current_chars.append(token)
            else:
                # Whitespace or punctuation ends the word being buffered.
                if current_chars:
                    result_parts.append("".join(current_chars))
                    current_chars = []
                result_parts.append(token)
            i += 1

        # Append any remaining characters
        if current_chars:
            result_parts.append("".join(current_chars))

        return "".join(result_parts)
95
+
@@ -0,0 +1,130 @@
1
+ from zemberek.morphology.morphotactics.turkish_morphotactics import TurkishMorphotactics, get_morpheme_map
2
+
3
# Referencing the class is kept from the original module: it is meant to make
# sure the class definition (and whatever it registers) has been evaluated
# before the morpheme map is read.
_ = TurkishMorphotactics

morpheme_map = get_morpheme_map()

# Order matters: the position of an id in this list is its integer index.
idx2morpheme_id = [
    "Root", "Noun", "Adj", "Verb", "Pron", "Adv", "Conj",
    "Punc", "Ques", "Postp", "Det", "Num", "Dup", "Interj",
    "A1sg", "A2sg", "A3sg", "A1pl", "A2pl", "A3pl",
    "Pnon", "P1sg", "P2sg", "P3sg", "P1pl", "P2pl", "P3pl",
    "Nom", "Dat", "Acc", "Abl", "Loc", "Ins", "Gen", "Equ",
    "Dim", "Ness", "With", "Without", "Related", "JustLike",
    "Rel", "Agt", "Become", "Acquire", "Ly",
    "Caus", "Recip", "Reflex", "Able", "Pass",
    "Inf1", "Inf2", "Inf3", "ActOf",
    "PastPart", "NarrPart", "FutPart", "PresPart", "AorPart",
    "NotState", "FeelLike", "EverSince", "Repeat", "Almost",
    "Hastily", "Stay", "Start",
    "AsIf", "While", "When", "SinceDoingSo", "AsLongAs", "ByDoingSo",
    "Adamantly", "AfterDoingSo", "WithoutHavingDoneSo",
    "WithoutBeingAbleToHaveDoneSo",
    "Zero", "Cop", "Neg", "Unable",
    "Pres", "Past", "Narr", "Cond", "Prog1", "Prog2", "Aor", "Fut",
    "Imp", "Req", "Opt", "Desr", "Neces",
    "Intrj",
]

# Reverse lookup: morpheme id -> integer index.
morpheme_id2idx = dict(zip(idx2morpheme_id, range(len(idx2morpheme_id))))

# Integer index -> morpheme object taken from zemberek's morpheme map.
idx2morpheme = dict(enumerate(morpheme_map[name] for name in idx2morpheme_id))

# Ids that name a word class rather than a suffix.
wordPos_ids = {
    "Root", "Noun", "Adj", "Verb", "Pron", "Adv", "Conj",
    "Punc", "Ques", "Postp", "Det", "Num", "Dup", "Interj",
}
@@ -0,0 +1,144 @@
1
+ from typing import List
2
+ from zemberek.morphology.analysis.word_analysis import WordAnalysis
3
+ from zemberek import TurkishSentenceExtractor
4
+ from custom_tokenizer.utils import get_morphology, match_capitilization, is_morph_analysis_ok
5
+
6
+ from zemberek.core.turkish import PrimaryPos, SecondaryPos
7
+ from custom_tokenizer.detokenizer import MorphDetokenizer
8
+ from zemberek.core.turkish.root_attribute import RootAttribute
9
+
10
class MorphTokenizer:
    """Turn text into single characters plus special morpheme tokens.

    Each analysable word becomes the characters of its stem, optionally a
    word-type token (``Verb``/``Noun``/``NamedEntity``) and one special token
    per non-empty suffix morpheme.  Everything else (gaps between sentences
    and words, unanalysable words) is emitted character by character.
    """

    def __init__(self, tk_start, tk_end):
        """Store the special-token delimiters and load the zemberek helpers."""
        self.tk_start = tk_start
        self.tk_end = tk_end
        self.morphology = get_morphology()
        self.extractor = TurkishSentenceExtractor()
        self.special_token = lambda s: tk_start + s + tk_end
        # Created on first detokenize() call and reused afterwards so that
        # its word generator and LRU cache survive between calls.
        self._detokenizer = None

    @staticmethod
    def _consume(text: str, piece: str, pos: int):
        """Locate *piece* in *text* at or after *pos*.

        Returns:
            ``(gap, end)`` where *gap* is the text between *pos* and the
            start of *piece*, and *end* is the offset just past *piece*.

        ``str.find`` returns -1 on a miss; using that value directly as an
        offset would slice almost the whole text into the gap and move the
        cursor backwards.  On a miss (the reported surface differs from the
        raw text -- NOTE(review): presumably when the analyzer normalizes
        characters such as typographic apostrophes; confirm) the piece is
        assumed to start after any whitespace at *pos* and to span its own
        length.
        """
        start = text.find(piece, pos)
        if start == -1:
            start = pos
            while start < len(text) and text[start].isspace():
                start += 1
        end = min(start + len(piece), len(text))
        return text[pos:start], end

    def tokenize(self, text: str) -> List[str]:
        """Tokenize *text* sentence by sentence, preserving the gaps."""
        sentences = self.extractor.from_paragraph(text)

        all_tokens = []
        current_pos = 0

        for sentence_text in sentences:
            # Whatever precedes the sentence in the original text
            # (whitespace, newlines) is kept as individual characters.
            prefix, current_pos = self._consume(text, sentence_text, current_pos)
            all_tokens.extend(prefix)
            all_tokens.extend(self.__tokenize_sentence(sentence_text))

        # Text after the last sentence.
        all_tokens.extend(text[current_pos:])
        return all_tokens

    def detokenize(self, tokens: List[str]) -> str:
        """Rebuild text from *tokens* using a reusable MorphDetokenizer."""
        detokenizer = getattr(self, "_detokenizer", None)
        if (
            detokenizer is None
            or detokenizer.tk_start != self.tk_start
            or detokenizer.tk_end != self.tk_end
        ):
            detokenizer = MorphDetokenizer(tk_start=self.tk_start, tk_end=self.tk_end)
            self._detokenizer = detokenizer
        return detokenizer.detokenize(tokens)

    def __tokenize_sentence(self, sentence: str) -> List[str]:
        """Return the flat token list of one sentence."""
        after = self.morphology.analyze_sentence(sentence)
        whitespaces = self.__collect_whitespaces(sentence, after)
        words = [self.__get_word_tokens(word_analysis) for word_analysis in after]
        split_by_words = self.__reconstruct_sentence(words, whitespaces)
        return [token for word in split_by_words for token in word]

    def __get_word_type(self, item, original_surface: str) -> "str | None":
        """Map a dictionary item to "Verb", "Noun", "NamedEntity" or None."""
        if item.primary_pos == PrimaryPos.Verb:
            return "Verb"

        # Check for numeric words recognized by Zemberek
        numeric_secondary_pos = {
            SecondaryPos.Cardinal, SecondaryPos.Clock, SecondaryPos.Date,
            SecondaryPos.Ordinal, SecondaryPos.Percentage, SecondaryPos.Ratio,
            SecondaryPos.Real, SecondaryPos.Distribution, SecondaryPos.Range,
        }
        if item.secondary_pos in numeric_secondary_pos:
            return "Noun"

        # NamedEntity is anything with an apostrophe (that isn't a verb or numeric)
        if "'" in original_surface:
            return "NamedEntity"

        if item.primary_pos not in [PrimaryPos.Unknown, PrimaryPos.Punctuation]:
            return "Noun"
        return None

    def __get_word_tokens(self, word_analysis: "WordAnalysis") -> List[str]:
        """Return stem characters plus special tokens for one analysed word.

        Falls back to the plain characters of the input when the word fails
        ``is_morph_analysis_ok``, has no analysis, or has no non-empty suffix.
        """
        original_surface = word_analysis.inp

        if not is_morph_analysis_ok(original_surface) or not word_analysis.analysis_results:
            return list(original_surface)  # declare word as unknown

        best = word_analysis.analysis_results[0]
        item = best.item

        word_type = self.__get_word_type(item, original_surface)

        tokens = []
        suffixes = []
        for i, m_data in enumerate(best.morpheme_data_list):
            if i == 0:
                # Use the lemma/root form for the stem rather than the surface allomorph
                if (not item.is_unknown()) and (RootAttribute.CompoundP3sg not in item.attributes):
                    stem = item.normalized_lemma()
                else:
                    stem = m_data.surface
                # For apostrophe words the text before the apostrophe is the stem.
                if "'" in original_surface and not item.is_unknown():
                    stem = original_surface.split("'")[0]
                tokens.extend(list(match_capitilization(original_surface, stem)))
            elif len(m_data.surface) > 0:
                suffixes.append(self.special_token(m_data.morpheme.id_))

        if not suffixes:
            return list(original_surface)

        if word_type:
            tokens.append(self.special_token(word_type))
        tokens.extend(suffixes)
        return tokens

    def __reconstruct_sentence(self, words: List[List[str]], whitespaces: List[str]) -> List[List[str]]:
        """Interleave gaps and words: gap 0, word 0, gap 1, word 1, ...

        Empty gaps are dropped; non-empty gaps are split into characters.
        """
        sentence = []
        for i in range(len(words) + len(whitespaces)):
            slot = i // 2
            if i % 2 == 1:
                if slot < len(words):
                    sentence.append(words[slot])
            elif slot < len(whitespaces) and len(whitespaces[slot]) > 0:
                sentence.append(list(whitespaces[slot]))
        return sentence

    def __collect_whitespaces(self, sentence: str, analyses: "List[WordAnalysis]") -> List[str]:
        """Return the text between consecutive analysed words.

        The result has one entry before each word plus one for the text
        after the last word, i.e. ``len(analyses) + 1`` entries.
        """
        whitespaces = []
        current_pos = 0

        for wa in analyses:
            gap, current_pos = self._consume(sentence, wa.inp, current_pos)
            whitespaces.append(gap)

        # Add the trailing characters (if any)
        whitespaces.append(sentence[current_pos:])
        return whitespaces
@@ -0,0 +1,44 @@
1
+ from zemberek import TurkishMorphology
2
+ from zemberek.core.turkish.turkish_alphabet import TurkishAlphabet
3
+ import logging
4
+
5
# Module-level singletons, filled in lazily by the two getters below.
_morphotactics = None
_morphology = None


def get_morphotactics():
    """Return the shared TurkishMorphotactics, creating it on first use."""
    global _morphotactics
    if _morphotactics is not None:
        return _morphotactics

    # Imported here, as in the original, so the modules are only loaded when
    # the morphotactics object is actually requested.
    from zemberek.morphology.lexicon.root_lexicon import RootLexicon
    from zemberek.morphology.morphotactics.turkish_morphotactics import TurkishMorphotactics

    _morphotactics = TurkishMorphotactics(RootLexicon.get_default())
    return _morphotactics


def get_morphology():
    """Return the shared TurkishMorphology, creating it on first use."""
    global _morphology
    if _morphology is not None:
        return _morphology
    _morphology = TurkishMorphology.create_with_defaults()
    return _morphology
22
+
23
def match_capitilization(ref: str, target: str) -> str:
    """Return *target* with the capitalisation pattern of *ref*.

    * Either argument empty: *target* is returned untouched.
    * *ref* fully upper-case and longer than one character: all of *target*
      is upper-cased.
    * Otherwise only the first character of *target* is changed: upper-cased
      when *ref* starts with an upper-case letter, lower-cased when it does
      not.

    Case conversion goes through the Turkish alphabet's translation maps
    before the built-in ``upper``/``lower``.
    """
    if not (ref and target):
        return target

    alphabet = TurkishAlphabet.INSTANCE

    if len(ref) > 1 and ref.isupper():
        return target.translate(alphabet.upper_map).upper()

    head, tail = target[0], target[1:]
    if ref[0].isupper():
        head = head.translate(alphabet.upper_map).upper()
    else:
        head = head.translate(alphabet.lower_map).lower()
    return head + tail
36
+
37
def is_morph_analysis_ok(word: str) -> bool:
    """Decide from the surface shape of *word* whether to analyse it.

    The first character is ignored.  The rest is split on apostrophes and the
    word is rejected when:

    * it contains more than one apostrophe,
    * the whole word is upper-case and contains an apostrophe, or
    * the part after the last apostrophe (the whole remainder when there is
      none) contains an upper-case character.
    """
    pieces = word[1:].split("'")
    if len(pieces) > 2:
        return False
    if word.isupper():
        return len(pieces) == 1
    return not any(ch.isupper() for ch in pieces[-1])
@@ -0,0 +1,267 @@
1
+ from custom_tokenizer.utils import get_morphotactics, match_capitilization
2
+ from zemberek.morphology.morphotactics.morpheme import Morpheme
3
+ from zemberek.morphology.lexicon.dictionary_item import DictionaryItem
4
+ from zemberek.morphology.morphotactics.stem_transition import StemTransition
5
+ from zemberek.core.turkish.root_attribute import RootAttribute
6
+ from zemberek.core.turkish import PhoneticAttribute
7
+ from zemberek.morphology.analysis.attributes_helper import AttributesHelper
8
+ from zemberek.core.turkish.primary_pos import PrimaryPos
9
+ from zemberek.core.turkish.secondary_pos import SecondaryPos
10
+ from zemberek.core.turkish.turkish_alphabet import TurkishAlphabet
11
+ from zemberek.morphology.generator.word_generator import WordGenerator
12
+ from custom_tokenizer.dictionary import morpheme_map
13
+ from zemberek.morphology.analysis.tr.pronunciation_guesser import PronunciationGuesser
14
+ import logging
15
+ from typing import List, Union, Literal, Set
16
+
17
class CustomWordGenerator:
    """Build inflected word forms from a root and a list of suffix morphemes.

    Generation is first attempted through zemberek's ``WordGenerator`` using
    lexicon items that match the root (or a synthetic stem when none match).
    When that yields nothing, suffixes are forced onto the root one at a time.
    """

    # Suffix ids grouped by the primary POS a synthetic stem is tried with in
    # the forced fallback.  Defined once here instead of on every call.
    _NOUN_SUFFIX_IDS = frozenset({
        "Pnon", "P1sg", "P2sg", "P3sg", "P1pl", "P2pl", "P3pl", "Nom", "Dat", "Acc", "Abl", "Loc", "Ins", "Gen", "Equ",
        "Dim", "Ness", "With", "Without", "Related", "JustLike", "Rel", "Agt", "Become", "Acquire", "Zero", "Root",
        "A1sg", "A2sg", "A3sg", "A1pl", "A2pl", "A3pl", "Past", "Narr", "Cond", "Cop", "Noun",
    })
    _ADJECTIVE_SUFFIX_IDS = frozenset({
        "Ly", "AsIf", "Agt", "JustLike", "Become", "Acquire",
    })
    _VERB_SUFFIX_IDS = frozenset({
        "Caus", "Recip", "Reflex", "Able", "Pass", "Neg", "Unable", "Pres", "Prog1", "Prog2", "Aor", "Fut", "Imp", "Req",
        "Opt", "Desr", "Neces", "Inf1", "Inf2", "Inf3", "ActOf", "PastPart", "NarrPart", "FutPart", "PresPart", "AorPart",
        "NotState", "FeelLike", "EverSince", "Repeat", "Almost", "Hastily", "Stay", "Start", "AsIf", "While",
        "When", "SinceDoingSo", "AsLongAs", "ByDoingSo", "Adamantly", "AfterDoingSo", "WithoutHavingDoneSo",
        "WithoutBeingAbleToHaveDoneSo", "A1sg", "A2sg", "A3sg", "A1pl", "A2pl", "A3pl", "Past", "Narr", "Cond", "Cop", "Verb",
        "Intrj",
    })

    def __init__(self):
        self.guesser = PronunciationGuesser()
        self.alphabet = TurkishAlphabet.INSTANCE
        self.morphotactics = get_morphotactics()

    def generate_word(self, root: str, word_type: Literal["Noun", "Verb", "NamedEntity"], suffixes: Union[List[Morpheme], List[str]]) -> str:
        """Generate a word form using Zemberek's WordGenerator.

        Handles dictionary items, unknown words, and NamedEntity special logic.

        Args:
            root: Stem text; its capitalisation is re-applied to the result.
            word_type: Word class used to pick lexicon items and start states.
            suffixes: Morpheme objects or morpheme ids, in application order.

        Returns:
            The generated surface form, or *root* itself when *suffixes* is
            empty.

        Raises:
            KeyError: If a suffix id is not a key of ``morpheme_map``.
        """
        if not suffixes:
            return root

        suffix_objs = [s if isinstance(s, Morpheme) else morpheme_map[s] for s in suffixes]
        p_pos, s_pos = self._get_pos_enums(word_type)

        lexicon = self.morphotactics.get_root_lexicon()
        matching_items = self._find_lexicon_items(root, word_type, p_pos, lexicon)
        candidates = self._get_stem_candidates(root, matching_items, word_type, p_pos, s_pos)

        generator = WordGenerator(self.morphotactics)
        results = generator.generate(morphemes=tuple(suffix_objs), candidates=tuple(candidates))

        if results:
            return self._apply_post_processing(root, results[0].surface, word_type)

        # Regular generation produced nothing: attach suffixes one by one.
        forced_result = self.force_suffixes_on_word(root, word_type == "NamedEntity", suffix_objs)
        return match_capitilization(root, forced_result)

    def _get_pos_enums(self, word_type: str) -> tuple[PrimaryPos, SecondaryPos]:
        """Translate a word-type string into (PrimaryPos, SecondaryPos)."""
        if word_type in ["Noun", "NamedEntity"]:
            p_pos = PrimaryPos.Noun
        elif word_type == "Adj":
            p_pos = PrimaryPos.Adjective
        else:  # Verb and others
            p_pos = PrimaryPos.Verb

        s_pos = SecondaryPos.ProperNoun if word_type == "NamedEntity" else SecondaryPos.None_
        return p_pos, s_pos

    def _filter_lexicon_items(self, items: List[DictionaryItem], word_type: str, p_pos: PrimaryPos = None) -> List[DictionaryItem]:
        """Keep the items compatible with *word_type* and *p_pos*.

        With *p_pos* given only items of that primary POS survive; without it
        every non-verb item does.  "Noun" additionally drops proper nouns,
        "NamedEntity" prefers proper nouns when any exist.
        """
        if p_pos is not None:
            items = [item for item in items if item.primary_pos == p_pos]
        else:
            items = [item for item in items if item.primary_pos != PrimaryPos.Verb]

        if word_type == "Noun":
            items = [item for item in items if item.secondary_pos != SecondaryPos.ProperNoun]
        elif word_type == "NamedEntity":
            proper_items = [item for item in items if item.secondary_pos == SecondaryPos.ProperNoun]
            if proper_items:
                items = proper_items

        return items

    def _lexicon_key(self, root: str, word_type: str) -> str:
        """Return the lexicon lookup key for *root* (infinitive form for verbs)."""
        return self._add_Inf1_suffix(root) if word_type == "Verb" else root

    def _find_lexicon_items(self, root: str, word_type: str, p_pos: PrimaryPos, lexicon) -> List[DictionaryItem]:
        """Look *root* up in the lexicon, relaxing the match step by step.

        Order tried: exact key with *p_pos*, lower-cased key with *p_pos*
        (only for title-case or upper-case roots), then the same two keys
        with any non-verb POS.  The first non-empty result wins.
        """
        lex_key = self._lexicon_key(root, word_type)
        try_lowercase = root.istitle() or root.isupper()
        alt_lex_key = None

        items = []
        # Second pass (None) accepts any POS except Verb, to avoid applying
        # inappropriate morphological rules.
        for pos_filter in (p_pos, None):
            items = self._filter_lexicon_items(lexicon.item_map.get(lex_key, []), word_type, pos_filter)
            if not items and try_lowercase:
                if alt_lex_key is None:
                    alt_lex_key = self._lexicon_key(self._turkish_lower(root), word_type)
                items = self._filter_lexicon_items(lexicon.item_map.get(alt_lex_key, []), word_type, pos_filter)
            if items:
                return items
        return items

    def _get_stem_candidates(self, root: str, items: List[DictionaryItem], word_type: str, p_pos: PrimaryPos, s_pos: SecondaryPos) -> List[StemTransition]:
        """Return stem transitions for *items*, or a synthetic one if none."""
        candidates = []
        has_vowel = any(self.alphabet.is_vowel(c) for c in root)
        vowelless_entity = word_type == "NamedEntity" and not has_vowel
        for item in items:
            if vowelless_entity:
                # For vowel-less NamedEntities (abbreviations), ensure they have phonetic attributes
                # even if using a dictionary item, as some dictionary entries might be missing them.
                start_state = self.morphotactics.noun_S
                phonetic_attrs = self._get_phonetic_attributes(root)
                candidates.append(StemTransition(root, item, phonetic_attrs, start_state))
            else:
                candidates.extend(self.morphotactics.stem_transitions.get_transitions_for_item(item))

        if not candidates:
            candidates.append(self._create_stem_transition(root, p_pos, s_pos))
        return candidates

    def _turkish_lower(self, s: str) -> str:
        """Lower-case *s* through the Turkish alphabet's translation map."""
        return s.translate(self.alphabet.lower_map).lower()

    def _apply_post_processing(self, root: str, generated_surface: str, word_type: str) -> str:
        """Restore the root's spelling/capitalisation on a generated form.

        Named entities and roots containing digits get an apostrophe between
        the unchanged root and the generated suffix text.
        """
        is_named_entity = word_type == "NamedEntity"
        is_number = self.alphabet.contains_digit(root)
        root_lower = self._turkish_lower(root)
        generated_lower = self._turkish_lower(generated_surface)

        if (is_named_entity or is_number) and generated_lower != root_lower:
            if generated_lower.startswith(root_lower):
                suffix = generated_lower[len(root_lower):]
                return root + "'" + suffix

        return match_capitilization(root, generated_surface)

    def force_suffixes_on_word(self, root: str, is_named_entity: bool, suffixes: List[Morpheme]) -> str:
        """Attach *suffixes* to *root* one at a time as a fallback.

        Each suffix is generated against the surface built so far; a suffix
        that cannot be generated is logged and skipped.
        """
        logging.warning(
            f"Couldn't add suffixes: {[s.id_ for s in suffixes]} to \"{root}\" "
            f"via zemberek's own method. Deploying work around"
        )

        generator = WordGenerator(self.morphotactics)
        current_surface = root
        apostrophe_added = False

        for suffix in suffixes:
            if suffix.id_ == "Rel":
                # "Rel" is appended literally rather than generated.
                current_surface += "ki"
                continue

            generated_surface = self._try_force_generate_suffix(
                current_surface, suffix, is_named_entity, apostrophe_added, generator
            )

            if generated_surface:
                current_surface, apostrophe_added = self._update_forced_surface(
                    current_surface, generated_surface, is_named_entity, apostrophe_added
                )
            else:
                logging.error(f"Could not generate suffix {suffix.id_} for {current_surface}")

        return current_surface

    def _try_force_generate_suffix(self, current_surface: str, suffix: Morpheme, is_named_entity: bool, apostrophe_added: bool, generator: WordGenerator) -> Union[str, None]:
        """Try each plausible POS for *suffix*; return the first surface or None."""
        # Once the apostrophe is in place the stem is no longer treated as a
        # proper noun.
        s_pos = SecondaryPos.ProperNoun if is_named_entity and not apostrophe_added else SecondaryPos.None_
        for p_pos in self._get_primary_pos_for_suffix(suffix):
            candidate = self._create_stem_transition(current_surface, p_pos, s_pos)
            results = generator.generate(morphemes=(suffix,), candidates=(candidate,))
            if results:
                return results[0].surface
        return None

    def _update_forced_surface(self, current_surface: str, generated_surface: str, is_named_entity: bool, apostrophe_added: bool) -> tuple[str, bool]:
        """Return the new surface and whether an apostrophe is now present."""
        if is_named_entity and not apostrophe_added and generated_surface != current_surface:
            current_lower = self._turkish_lower(current_surface)
            generated_lower = self._turkish_lower(generated_surface)
            if generated_lower.startswith(current_lower):
                suffix_surface = generated_lower[len(current_lower):]
                return f"{current_surface}'{suffix_surface}", True
            return generated_surface, False
        return generated_surface, apostrophe_added

    def _get_primary_pos_for_suffix(self, morpheme: Morpheme) -> List[PrimaryPos]:
        """Return the primary POS values to try for *morpheme*.

        Order is Noun, Adjective, Verb; ``[PrimaryPos.Unknown]`` when the id
        is in none of the three groups.
        """
        m_id = morpheme.id_

        results = []
        if m_id in self._NOUN_SUFFIX_IDS:
            results.append(PrimaryPos.Noun)
        if m_id in self._ADJECTIVE_SUFFIX_IDS:
            results.append(PrimaryPos.Adjective)
        if m_id in self._VERB_SUFFIX_IDS:
            results.append(PrimaryPos.Verb)

        return results if results else [PrimaryPos.Unknown]

    def _is_single_syllable(self, word: str) -> bool:
        """Return True when *word* contains exactly one vowel."""
        vowel_count = sum(1 for char in word if self.alphabet.is_vowel(char))
        return vowel_count == 1

    def _add_Inf1_suffix(self, verb: str) -> str:
        """Append "mek" or "mak" depending on the last vowel of *verb*."""
        # Use the shared alphabet instead of constructing a new one per call.
        is_frontal = self.alphabet.get_last_vowel(verb).is_frontal()
        sfx = "mek" if is_frontal else "mak"
        return verb + sfx

    def _create_stem_transition(self, root: str, p_pos: PrimaryPos, s_pos: SecondaryPos = SecondaryPos.None_) -> StemTransition:
        """Build a StemTransition around a synthetic dictionary item for *root*."""
        attributes = set()
        if p_pos == PrimaryPos.Verb:
            if self._is_single_syllable(root):
                attributes.add(RootAttribute.Aorist_A)
            else:
                attributes.add(RootAttribute.Aorist_I)

        dummy_item = DictionaryItem(root, root, p_pos, s_pos, attributes=attributes)

        # Determine the appropriate start state based on PrimaryPos
        if p_pos == PrimaryPos.Verb:
            start_state = self._get_verb_root_state(root)
        elif p_pos == PrimaryPos.Adjective:
            start_state = self.morphotactics.adjectiveRoot_ST
        else:  # Noun and others
            start_state = self.morphotactics.noun_S

        phonetic_attrs = self._get_phonetic_attributes(root)
        return StemTransition(root, dummy_item, phonetic_attrs, start_state)

    def _get_verb_root_state(self, root: str):
        """Return the morphotactics start state for an unknown verb root."""
        if self._turkish_lower(root) in {"di", "yi", "de", "ye"}:
            return self.morphotactics.vDeYeRoot_S
        # Unknown verb roots should use the general verb root state.
        # Vowel-ending stems without dictionary evidence should not be assumed
        # to be vowel-drop verbs, otherwise suffixes like Fut fail.
        return self.morphotactics.verbRoot_S

    def _get_phonetic_attributes(self, root: str) -> Set[PhoneticAttribute]:
        """Return phonetic attributes for *root*.

        Vowel-less roots and roots containing digits are first converted to a
        pronunciation by the guesser; everything else is used as written.
        """
        has_vowel = any(self.alphabet.is_vowel(c) for c in root)

        if not has_vowel and len(root) > 0:
            # Letter names are usually defined for lowercase letters in the guesser
            pronunciation = self.guesser.to_turkish_letter_pronunciations(self._turkish_lower(root))
            if pronunciation:
                return AttributesHelper.get_morphemic_attributes(pronunciation)

        if self.alphabet.contains_digit(root):
            pronunciation = self.guesser.to_turkish_letter_pronunciation_with_digit(root)
            return AttributesHelper.get_morphemic_attributes(pronunciation)

        return AttributesHelper.get_morphemic_attributes(root)
@@ -0,0 +1,7 @@
1
+ Metadata-Version: 2.4
2
+ Name: pre_bpe_morph_tr
3
+ Version: 0.1.0
4
+ License-File: LICENSE.md
5
+ Requires-Dist: antlr4-python3-runtime==4.8
6
+ Requires-Dist: numpy>=1.19.0
7
+ Dynamic: license-file
@@ -0,0 +1,20 @@
1
+ LICENSE.md
2
+ README.md
3
+ pyproject.toml
4
+ custom_tokenizer/__init__.py
5
+ custom_tokenizer/detokenizer.py
6
+ custom_tokenizer/dictionary.py
7
+ custom_tokenizer/morph_tokenizer.py
8
+ custom_tokenizer/utils.py
9
+ custom_tokenizer/word_generator.py
10
+ pre_bpe_morph_tr.egg-info/PKG-INFO
11
+ pre_bpe_morph_tr.egg-info/SOURCES.txt
12
+ pre_bpe_morph_tr.egg-info/dependency_links.txt
13
+ pre_bpe_morph_tr.egg-info/requires.txt
14
+ pre_bpe_morph_tr.egg-info/top_level.txt
15
+ tests/test_decode_and_encode.py
16
+ tests/test_generation.py
17
+ tests/test_morph_tokenizer.py
18
+ tests/test_sentence_tokenizer.py
19
+ zemberek/__init__.py
20
+ zemberek/examples.py
@@ -0,0 +1,2 @@
1
+ antlr4-python3-runtime==4.8
2
+ numpy>=1.19.0
@@ -0,0 +1,2 @@
1
+ custom_tokenizer
2
+ zemberek
@@ -0,0 +1,14 @@
1
+ [build-system]
2
+ requires = ["setuptools<82.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "pre_bpe_morph_tr"
7
+ version = "0.1.0"
8
+ dependencies = [
9
+ "antlr4-python3-runtime==4.8",
10
+ "numpy>=1.19.0",
11
+ ]
12
+
13
+ [tool.setuptools]
14
+ packages = ["zemberek", "custom_tokenizer"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,147 @@
1
+ import sys
2
+ from pathlib import Path
3
+ import logging
4
+
5
+ # Add the parent directory to sys.path to allow importing from the root
6
+ sys.path.append(str(Path(__file__).resolve().parent.parent))
7
+
8
+ import pandas as pd
9
+ from custom_tokenizer.morph_tokenizer import MorphTokenizer
10
+
11
+ # Configure logging to hide noisy library messages
12
+ logging.basicConfig(level=logging.ERROR)
13
+
14
def run_decode_encode_test():
    """Round-trip every tweet of TurkishTweets.xlsx through the tokenizer.

    Each non-empty cell of the tweet column is tokenized and detokenized; a
    row counts as a success when the decoded text equals the original.  A
    summary is printed, up to ten failures are shown, and all failures are
    written to ``logs/failed_tests.log`` next to this file.  Exits with
    status 1 when the spreadsheet is missing or unreadable.
    """
    xlsx_path = Path(__file__).parent / "TurkishTweets.xlsx"

    if not xlsx_path.exists():
        print(f"Error: Could not find TurkishTweets.xlsx at {xlsx_path}")
        sys.exit(1)

    print(f"Reading Excel file: {xlsx_path.name}...")
    try:
        df = pd.read_excel(xlsx_path)
    except Exception as e:
        print(f"Error reading Excel file: {e}")
        print("Please ensure that 'pandas' and 'openpyxl' are installed in your environment.")
        print("To install them, run:")
        print(" ./.venv/bin/pip install pandas openpyxl")
        sys.exit(1)

    # First column whose name contains "tweet" (case-insensitive), if any.
    tweet_col = next((col for col in df.columns if "tweet" in str(col).lower()), None)

    if tweet_col is None:
        tweet_col = df.columns[0]
        print(f"Could not find a column named 'Tweets'. Using the first column: '{tweet_col}'")
    else:
        print(f"Using column: '{tweet_col}'")

    print("Initializing MorphTokenizer...")
    tokenizer = MorphTokenizer("<|", "|>")

    mismatches = []
    total_count = 0
    success_count = 0

    print(f"Processing {len(df)} rows for the encode-decode round-trip test...")

    for idx, row in df.iterrows():
        cell = row[tweet_col]
        if pd.isna(cell):
            continue

        original_tweet = str(cell)
        total_count += 1
        excel_row = idx + 2  # 1-based index, account for header row (+2)

        try:
            tokens = tokenizer.tokenize(original_tweet)
            decoded_tweet = tokenizer.detokenize(tokens)

            if decoded_tweet == original_tweet:
                success_count += 1
            else:
                mismatches.append({
                    "row": excel_row,
                    "original": original_tweet,
                    "decoded": decoded_tweet,
                    "tokens": tokens,
                })
        except Exception as e:
            # A crash inside the tokenizer is recorded as a failed row.
            mismatches.append({
                "row": excel_row,
                "original": original_tweet,
                "error": str(e),
            })

    failure_count = len(mismatches)
    have_rate = total_count > 0
    if have_rate:
        success_rate = (success_count / total_count) * 100

    # Output test summary
    print("\n" + "="*40)
    print(" TEST RESULTS SUMMARY ")
    print("="*40)
    print(f"Total tweets processed: {total_count}")
    print(f"Successful round-trips: {success_count}")
    print(f"Failed round-trips: {failure_count}")
    if have_rate:
        print(f"Success Rate: {success_rate:.2f}%")
    print("="*40)

    if not mismatches:
        print("\nAll tweets successfully passed the encode-decode round-trip test!")
    else:
        print(f"\nShowing up to the first 10 mismatches out of {failure_count} total failures:")
        for number, m in enumerate(mismatches[:10], start=1):
            print(f"\nMismatch #{number} (Excel Row {m['row']}):")
            print(f" Original: {m['original']!r}")
            if "error" in m:
                print(f" Error: {m['error']}")
            else:
                print(f" Decoded: {m['decoded']!r}")
                print(f" Tokens: {m['tokens']}")

    # Write all failures to log file
    logs_dir = Path(__file__).resolve().parent / "logs"
    logs_dir.mkdir(parents=True, exist_ok=True)
    log_file_path = logs_dir / "failed_tests.log"

    from datetime import datetime
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    wide_rule = "="*80 + "\n"
    thin_rule = "-"*80 + "\n"

    with open(log_file_path, "w", encoding="utf-8") as f:
        f.write(wide_rule)
        f.write(f"ENCODE-DECODE ROUND-TRIP TEST LOG - {timestamp}\n")
        f.write(wide_rule)
        f.write(f"Total tweets processed: {total_count}\n")
        f.write(f"Successful round-trips: {success_count}\n")
        f.write(f"Failed round-trips: {failure_count}\n")
        if have_rate:
            f.write(f"Success Rate: {success_rate:.2f}%\n")
        f.write(wide_rule + "\n")

        if not mismatches:
            f.write("All tests passed successfully! No mismatches found.\n")
        else:
            f.write("DETAILED FAILURES:\n")
            f.write(thin_rule)
            for number, m in enumerate(mismatches, start=1):
                f.write(f"Failure #{number} (Excel Row {m['row']}):\n")
                f.write(f" Original: {m['original']!r}\n")
                if "error" in m:
                    f.write(f" Error: {m['error']}\n")
                else:
                    f.write(f" Decoded: {m['decoded']!r}\n")
                    f.write(f" Tokens: {m['tokens']}\n")
                f.write(thin_rule)

    print(f"\nFailing tests and details written to: {log_file_path}")
145
+
146
+ if __name__ == "__main__":
147
+ run_decode_encode_test()
@@ -0,0 +1,83 @@
1
+ import sys
2
+ from pathlib import Path
3
+ sys.path.append(str(Path(__file__).parent.parent))
4
+
5
+ from custom_tokenizer.word_generator import CustomWordGenerator
6
+ import logging
7
+
8
+ # Configure logging to show errors but avoid noise
9
+ logging.basicConfig(level=logging.ERROR)
10
+ generator = CustomWordGenerator()
11
+
12
def test_generation(root, pos, suffix_ids, expected):
    """Generate a word with the module-level generator and print it beside the expectation.

    Args:
        root: Root passed to ``generator.generate_word``.
        pos: Part-of-speech label passed to ``generator.generate_word``.
        suffix_ids: Suffix identifiers passed to ``generator.generate_word``.
        expected: Value the generated result is compared against.

    The comparison is only reported on stdout; nothing is returned and a
    mismatch does not raise.
    """
    result = generator.generate_word(root, pos, suffix_ids)
    mismatch = result != expected

    # A mismatching line is prefixed with the ANSI "red" escape sequence.
    line = "\033[31m" if mismatch else ""
    line += f"Root: {root} ({pos}), Suffixes: {suffix_ids} -> Result: {result}, Expected: {expected}"
    print(line)

    if mismatch:
        # Reset the terminal colour; this is written after the newline that
        # the print above already emitted, without adding another one.
        print("\033[0m", end="")
20
+
21
if __name__ == "__main__":
    print("Starting generation tests...")

    # Each entry is (root, pos, suffix_ids, expected). Entries are executed
    # in the order listed, one test_generation call per entry.
    cases = [
        # Noun Cases (Dictionary Match)
        ("elma", "Noun", ["A3pl", "Dat"], "elmalara"),
        ("Burun", "Noun", ["A3sg", "P1pl"], "Burnumuz"),
        ("buRun", "Noun", ["A3sg", "P1pl"], "buRunumuz"),
        ("hak", "Noun", ["Dat"], "hakka"),
        ("burun", "Noun", ["Gen"], "burnun"),

        # Verb Cases (Dictionary Match)
        ("kaç", "Verb", ["Prog1", "A1sg"], "kaçıyorum"),
        ("koş", "Verb", ["Aor", "A1pl"], "koşarız"),
        ("gel", "Verb", ["AorPart"], "gelir"),
        ("seyret", "Verb", ["Aor", "AsIf"], "seyredercesine"),

        # POS Inference
        ("at", "Noun", ["Dat"], "ata"),
        ("at", "Verb", ["Fut", "Narr", "A1sg"], "atacakmışım"),

        # Aggressive POS Inference
        ("elma", "Verb", ["Fut", "Narr", "A1sg"], "elmayacakmışım"),
        ("gel", "Noun", ["P1pl", "Gen"], "gelimizin"),

        # Pronoun
        ("biz", "Noun", ["A3pl"], "bizler"),

        # Named Entity (Proper Noun)
        ("Çıtçıt", "NamedEntity", ["Dat"], "Çıtçıt'a"),
        ("Bürokratistan", "NamedEntity", ["Loc", "Rel", "A3pl", "Gen"], "Bürokratistan'dakilerin"),
        ("Ahmet", "NamedEntity", ["Dat"], "Ahmet'e"),
        ("Ayşe", "Noun", ["Gen"], "Ayşenin"),
        ("Tüik", "NamedEntity", ["Dat"], "Tüik'e"),

        # Unknown Root
        ("bloop", "Noun", ["Dat"], "bloopa"),
        ("bloop", "Verb", ["Prog1"], "bloopuyor"),

        # Broken Generation
        ("kap", "Noun", ["Prog1", "Dim", "A3pl"], "kapıyorcuklar"),
        ("kitap", "Noun", ["Loc", "Rel", "Gen", "Aor", "Almost"], "kitaptakininireyaz"),

        # All Caps
        ("KAÇ", "Verb", ["Prog2", "A2pl"], "KAÇMAKTASINIZ"),
        ("HAK", "Noun", ["P2sg"], "HAKKIN"),
        ("TÜİK", "NamedEntity", ["P2sg"], "TÜİK'in"),
        ("TÜK", "NamedEntity", ["P2sg"], "TÜK'ün"),

        # Numbers
        ("11", "Noun", ["Loc"], "11'de"),
        ("12.00", "Noun", ["Loc"], "12.00'da"),
        ("örnek2", "Noun", ["Gen"], "örnek2'nin"),  # Not supported by zemberek analyzer
        ("3/4", "Noun", ["Gen"], "3/4'ün"),  # Not supported by zemberek analyzer

        # No vowels
        ("kg", "NamedEntity", ["Ness"], "kg'lik"),
        ("TDK", "NamedEntity", ["Acc"], "TDK'yı"),
        ("TMNB", "NamedEntity", ["Acc"], "TMNB'yi"),
        ("z", "NamedEntity", ["Acc"], "z'yi"),
        ("z", "Noun", ["Acc"], "zyi"),
        ("k", "Verb", ["Fut"], "kyacak"),
    ]
    for case in cases:
        test_generation(*case)

    print("\nAll tests completed!")
@@ -0,0 +1,60 @@
1
+ import sys
2
+ from pathlib import Path
3
+ import logging
4
+ from typing import List
5
+ sys.path.append(str(Path(__file__).parent.parent))
6
+ import time
7
+
8
+ from custom_tokenizer.morph_tokenizer import MorphTokenizer
9
+
10
+ logging.basicConfig(level=logging.ERROR)
11
+
12
def test_generation(string: str, tokenizer: MorphTokenizer, expected: List[str]):
    """Tokenize ``string``, report timing and the token comparison, then check the round trip.

    Args:
        string: Text passed to ``tokenizer.tokenize``.
        tokenizer: Object providing ``tokenize`` and ``detokenize``.
        expected: Token list the tokenizer output is compared against.

    The whole report is printed in one ``print`` call. Nothing is returned
    and neither a token mismatch nor a failed round trip raises.
    """
    # perf_counter is the clock meant for measuring short intervals; unlike
    # time.time() it is monotonic and unaffected by system clock adjustments.
    started = time.perf_counter()
    tokens = tokenizer.tokenize(string)
    elapsed = time.perf_counter() - started

    report = ["COMPLETED IN " + str(elapsed) + " SECONDS\n"]

    # A token mismatch turns the detail line red.
    if tokens != expected:
        report.append("\033[31m")
    # NOTE(review): the listing this file was recovered from lost its
    # indentation, so it is unclear whether the detail line belonged inside
    # the mismatch branch. It is emitted unconditionally here, as in the
    # word-generator test helper of this package -- confirm.
    report.append(f"String: {string} ->\n Result: {tokens}\n Expected: {expected}\033[0m\n")

    # Round trip: detokenizing the produced tokens must give back the input.
    detokenized = tokenizer.detokenize(tokens)
    if detokenized != string:
        report.append("\033[31m")
        report.append(f"Detokenization failed!\nOriginal: {repr(string)}\nDetokenized: {repr(detokenized)}\033[0m\n")
    else:
        report.append("\033[32mDetokenization successful matches original string!\033[0m\n")

    print("".join(report))
30
+
31
if __name__ == "__main__":
    tokenizer = MorphTokenizer("<|", "|>")
    print("initialized!")

    # Multi-sentence input mixing casing, apostrophes, numbers and whitespace.
    # The runs of three spaces (after "çağırDI.", before "Oysa" and between
    # "ve" and "kardeşlerime") are deliberate: `expected` below contains three
    # consecutive ' ' tokens at exactly those positions, so the input must
    # carry the same whitespace for the comparison to be meaningful.
    paragraph = (
        "Ayşeyi, Ahmet'i Veli'yi ve ghim'i eve çağırmış."
        " burnu *Burnu havada kitapçımız 2 kglik yani kg'lik TR'li eşyayı Yerismi'ye getirirmiş."
        "\n\tYarın içinse AYŞE'Yİ ve ANNEM'i eVe çağırDI.   "
        "TDK'ye, UNKNOWN'a, TÜİK'e ve ALİ'ye göre olan hjKŞFh şeyler diyorlar. "
        "  Oysa Annem, BABAM ve   kardeşlerime\ngöre\tdoğru olandır."
        "Onlar 11'de veya 12.00'da burada olur."
        "Sonra level2'nin 3/4'ü biter."
    )
    # One row per sentence of `paragraph`, in order.
    expected = [
        'A', 'y', 'ş', 'e', '<|Noun|>', '<|Acc|>', ',', ' ', 'A', 'h', 'm', 'e', 't', '<|NamedEntity|>', '<|Acc|>', ' ', 'V', 'e', 'l', 'i', '<|NamedEntity|>', '<|Acc|>', ' ', 'v', 'e', ' ', 'g', 'h', 'i', 'm', '<|NamedEntity|>', '<|Acc|>', ' ', 'e', 'v', '<|Noun|>', '<|Dat|>', ' ', 'ç', 'a', 'ğ', 'ı', 'r', '<|Verb|>', '<|Narr|>', '.',
        ' ', 'b', 'u', 'r', 'u', 'n', '<|Noun|>', '<|P3sg|>', ' ', '*', 'B', 'u', 'r', 'u', 'n', '<|Noun|>', '<|Acc|>', ' ', 'h', 'a', 'v', 'a', '<|Noun|>', '<|Loc|>', ' ', 'k', 'i', 't', 'a', 'p', '<|Noun|>', '<|Agt|>', '<|P1pl|>', ' ', '2', ' ', 'k', 'g', '<|Noun|>', '<|Ness|>', ' ', 'y', 'a', 'n', 'i', ' ', 'k', 'g', '<|NamedEntity|>', '<|Ness|>', ' ', 'T', 'R', '<|NamedEntity|>', '<|With|>', ' ', 'e', 'ş', 'y', 'a', '<|Noun|>', '<|Acc|>', ' ', 'Y', 'e', 'r', 'i', 's', 'm', 'i', '<|NamedEntity|>', '<|Dat|>', ' ', 'g', 'e', 't', 'i', 'r', '<|Verb|>', '<|Aor|>', '<|Narr|>', '.',
        '\n', '\t', 'Y', 'a', 'r', 'ı', 'n', ' ', 'i', 'ç', 'i', 'n', '<|Noun|>', '<|Cond|>', ' ', 'A', 'Y', 'Ş', 'E', "'", 'Y', 'İ', ' ', 'v', 'e', ' ', 'A', 'N', 'N', 'E', 'M', '<|NamedEntity|>', '<|Acc|>', ' ', 'e', 'V', 'e', ' ', 'ç', 'a', 'ğ', 'ı', 'r', 'D', 'I', '.', ' ', ' ', ' ',
        'T', 'D', 'K', "'", 'y', 'e', ',', ' ', 'U', 'N', 'K', 'N', 'O', 'W', 'N', '<|NamedEntity|>', '<|Dat|>', ',', ' ', 'T', 'Ü', 'İ', 'K', '<|NamedEntity|>', '<|Dat|>', ' ', 'v', 'e', ' ', 'A', 'L', 'İ', '<|NamedEntity|>', '<|Dat|>', ' ', 'g', 'ö', 'r', 'e', ' ', 'o', 'l', '<|Verb|>', '<|PresPart|>', ' ', 'h', 'j', 'K', 'Ş', 'F', 'h', ' ', 'ş', 'e', 'y', '<|Noun|>', '<|A3pl|>', ' ', 'd', 'e', '<|Verb|>', '<|Prog1|>', '<|A3pl|>', '.',
        ' ', ' ', ' ', 'O', 'y', 's', 'a', ' ', 'A', 'n', 'n', 'e', '<|Noun|>', '<|P1sg|>', ',', ' ', 'B', 'A', 'B', 'A', '<|Noun|>', '<|P1sg|>', ' ', 'v', 'e', ' ', ' ', ' ', 'k', 'a', 'r', 'd', 'e', 'ş', '<|Noun|>', '<|A3pl|>', '<|P1sg|>', '<|Dat|>', '\n', 'g', 'ö', 'r', 'e', '\t', 'd', 'o', 'ğ', 'r', 'u', ' ', 'o', 'l', '<|Verb|>', '<|PresPart|>', '<|Cop|>', '.',
        'O', '<|Noun|>', '<|A3pl|>', ' ', '1', '1', '<|Noun|>', '<|Loc|>', ' ', 'v', 'e', 'y', 'a', ' ', '1', '2', '.', '0', '0', '<|Noun|>', '<|Loc|>', ' ', 'b', 'u', 'r', 'a', '<|Noun|>', '<|Loc|>', ' ', 'o', 'l', '<|Verb|>', '<|Aor|>', '.',
        'S', 'o', 'n', 'r', 'a', ' ', 'l', 'e', 'v', 'e', 'l', '2', "'", 'n', 'i', 'n', ' ', '3', '/', '4', "'", 'ü', ' ', 'b', 'i', 't', '<|Verb|>', '<|Aor|>', '.'
    ]
    test_generation(paragraph, tokenizer, expected)

    # The expected list is empty here, so the token comparison is reported as
    # a mismatch whenever any token is produced; the call still exercises the
    # detokenize round trip on a longer text.
    test_generation("Jazz bir kediydi. Arkadaşları vardı: Pamuk, Minnoş ve Tekir. Onlar dans etmeyi çok severdi. Bir gün, zor bir dans öğrendiler. Her gün dans ettiler. Sabah, öğle ve akşam.\n\nİlk başlarda çok zorlandılar. Ayakları karıştı, düştüler ve güldüler. Ama pes etmediler. Her gün daha iyi oldular. Jazz, Pamuk, Minnoş ve Tekir birlikte çalıştılar.\n\nSonunda, dansı öğrendiler! Çok mutluydular. Şimdi dans etmeyi biliyorlardı. Dans ederken zıpladılar, döndüler ve kahkaha attılar.\n\nArtık her zaman dans ediyorlardı. Parkta, bahçede ve evde. Jazz ve arkadaşları dans etmeyi çok seviyorlardı!\n", tokenizer, [])
    test_generation("haftasonu vakti", tokenizer, ['h', 'a', 'f', 't', 'a', 's', 'o', 'n', '<|Noun|>', '<|Acc|>', ' ', 'v', 'a', 'k', 'i', 't', '<|Noun|>', '<|Acc|>'])

    test_generation("Geldiler. Ama pes etmediler.", tokenizer, ['G', 'e', 'l', '<|Verb|>', '<|Past|>', '<|A3pl|>', '.', ' ', 'A', 'm', 'a', ' ', 'p', 'e', 's', ' ', 'e', 't', '<|Verb|>', '<|Neg|>', '<|Past|>', '<|A3pl|>', '.'])
    test_generation(".burnumuzun ", tokenizer, ['.', 'b', 'u', 'r', 'u', 'n', '<|Noun|>', '<|P1pl|>', '<|Gen|>', ' '])
    print("All tests completed!")
@@ -0,0 +1,35 @@
1
+ import sys
2
+ from pathlib import Path
3
+ import logging
4
+ from typing import List
5
+ sys.path.append(str(Path(__file__).parent.parent))
6
+
7
+ from custom_tokenizer.morph_tokenizer import MorphTokenizer
8
+
9
+ logging.basicConfig(level=logging.ERROR)
10
+
11
def test_generation(sentence: str, tokenizer: MorphTokenizer, expected: List[str]):
    """Tokenize a single sentence and print the result beside the expected tokens.

    Args:
        sentence: Text passed to the tokenizer's sentence-level routine.
        tokenizer: Tokenizer whose name-mangled private method
            ``_MorphTokenizer__tokenize_sentence`` is called directly.
        expected: Token list the result is compared against.

    The report is printed; nothing is returned and a mismatch does not raise.
    """
    tokens = tokenizer._MorphTokenizer__tokenize_sentence(sentence)

    # Only a mismatching report is prefixed with the ANSI "red" escape; the
    # reset sequence at the end of the detail line is always written.
    colour = "" if tokens == expected else "\033[31m"
    print(colour + f"Sentence: {sentence} ->\n Result: {tokens}\n Expected: {expected}\033[0m")
19
+
20
if __name__ == "__main__":
    tokenizer = MorphTokenizer("<|", "|>")
    print("initialized!")

    # Each call passes one sentence, the tokenizer, and the expected tokens.
    test_generation("Ayşeyi, Ahmet'i Veli'yi ve ghim'i eve çağırdı.", tokenizer, ['A', 'y', 'ş', 'e', '<|Noun|>', '<|Acc|>', ',', ' ', 'A', 'h', 'm', 'e', 't', '<|NamedEntity|>', '<|Acc|>', ' ', 'V', 'e', 'l', 'i', '<|NamedEntity|>', '<|Acc|>', ' ', 'v', 'e', ' ', 'g', 'h', 'i', 'm', '<|NamedEntity|>', '<|Acc|>', ' ', 'e', 'v', '<|Noun|>', '<|Dat|>', ' ', 'ç', 'a', 'ğ', 'ı', 'r', '<|Verb|>', '<|Past|>', '.'])
    test_generation("kitapçı 2 kglik yani kg'lik TR'li eşya getirirmiş", tokenizer, ['k', 'i', 't', 'a', 'p', '<|Noun|>', '<|Agt|>', ' ', '2', ' ', 'k', 'g', '<|Noun|>', '<|Ness|>', ' ', 'y', 'a', 'n', 'i', ' ', 'k', 'g', '<|NamedEntity|>', '<|Ness|>', ' ', 'T', 'R', '<|NamedEntity|>', '<|With|>', ' ', 'e', 'ş', 'y', 'a', ' ', 'g', 'e', 't', 'i', 'r', '<|Verb|>', '<|Aor|>', '<|Narr|>'])
    test_generation("Burnu burnu havadadır onun.", tokenizer, ['B', 'u', 'r', 'u', 'n', '<|Noun|>', '<|P3sg|>', ' ', 'b', 'u', 'r', 'u', 'n', '<|Noun|>', '<|P3sg|>', ' ', 'h', 'a', 'v', 'a', '<|Noun|>', '<|Loc|>', '<|Cop|>', ' ', 'o', '<|Noun|>', '<|Gen|>', '.'])
    test_generation("Yerisimi'ye gelirmiş.", tokenizer, ['Y', 'e', 'r', 'i', 's', 'i', 'm', 'i', '<|NamedEntity|>', '<|Dat|>', ' ', 'g', 'e', 'l', '<|Verb|>', '<|Aor|>', '<|Narr|>', '.'])
    test_generation("AYŞE'Yİ ve ANNEM'i eVe çağırDI", tokenizer, ['A', 'Y', 'Ş', 'E', "'", 'Y', 'İ', ' ', 'v', 'e', ' ', 'A', 'N', 'N', 'E', 'M', '<|NamedEntity|>', '<|Acc|>', ' ', 'e', 'V', 'e', ' ', 'ç', 'a', 'ğ', 'ı', 'r', 'D', 'I'])
    test_generation("TDK'ye, UNKNOWN'a, TÜİK'e ve ALİ'ye göre olan hjKŞFh şeyler", tokenizer, ['T', 'D', 'K', "'", 'y', 'e', ',', ' ', 'U', 'N', 'K', 'N', 'O', 'W', 'N', '<|NamedEntity|>', '<|Dat|>', ',', ' ', 'T', 'Ü', 'İ', 'K', '<|NamedEntity|>', '<|Dat|>', ' ', 'v', 'e', ' ', 'A', 'L', 'İ', '<|NamedEntity|>', '<|Dat|>', ' ', 'g', 'ö', 'r', 'e', ' ', 'o', 'l', '<|Verb|>', '<|PresPart|>', ' ', 'h', 'j', 'K', 'Ş', 'F', 'h', ' ', 'ş', 'e', 'y', '<|Noun|>', '<|A3pl|>'])
    # The three spaces between "ve" and "kardeşlerime" are deliberate: the
    # expected list contains three consecutive ' ' tokens at that position,
    # so the input has to carry the same whitespace.
    test_generation("Annem, BABAM ve   kardeşlerime\ngöre\tdoğru olandır. ", tokenizer, ['A', 'n', 'n', 'e', '<|Noun|>', '<|P1sg|>', ',', ' ', 'B', 'A', 'B', 'A', '<|Noun|>', '<|P1sg|>', ' ', 'v', 'e', ' ', ' ', ' ', 'k', 'a', 'r', 'd', 'e', 'ş', '<|Noun|>', '<|A3pl|>', '<|P1sg|>', '<|Dat|>', '\n', 'g', 'ö', 'r', 'e', '\t', 'd', 'o', 'ğ', 'r', 'u', ' ', 'o', 'l', '<|Verb|>', '<|PresPart|>', '<|Cop|>', '.', ' '])
    test_generation("Annem 11'de veya 12.00'da burada olur.", tokenizer, ['A', 'n', 'n', 'e', '<|Noun|>', '<|P1sg|>', ' ', '1', '1', '<|Noun|>', '<|Loc|>', ' ', 'v', 'e', 'y', 'a', ' ', '1', '2', '.', '0', '0', '<|Noun|>', '<|Loc|>', ' ', 'b', 'u', 'r', 'a', '<|Noun|>', '<|Loc|>', ' ', 'o', 'l', '<|Verb|>', '<|Aor|>', '.'])
    test_generation("18ini de alıp 6'lı ayırdık", tokenizer, ['1', '8', 'i', 'n', 'i', ' ', 'd', 'e', ' ', 'a', 'l', '<|Verb|>', '<|AfterDoingSo|>', ' ', '6', '<|Noun|>', '<|With|>', ' ', 'a', 'y', 'ı', 'r', '<|Verb|>', '<|Past|>', '<|A1pl|>'])

    # unsupported 👇
    test_generation("Örnek2'nin 3/4'ü oldu.", tokenizer, ['Ö', 'r', 'n', 'e', 'k', '2', "'", 'n', 'i', 'n', ' ', '3', '/', '4', '<|Noun|>', '<|Acc|>', ' ', 'o', 'l', '<|Verb|>', '<|Past|>', '.'])
    print("All tests completed!")
@@ -0,0 +1,22 @@
1
import warnings

# Suppress the pkg_resources deprecation warning from zemberek dependencies.
# The filter has to be registered BEFORE the subpackage imports below:
# a warning raised while those modules are being imported is emitted
# immediately, so a filter installed afterwards cannot hide it.
# NOTE(review): `module=` is matched against the module the warning is
# attributed to; confirm that this is "pkg_resources" for the warning in
# question rather than the importing zemberek module.
warnings.filterwarnings("ignore", category=UserWarning, module="pkg_resources")

import logging  # noqa: E402
import sys  # noqa: E402

from .morphology import TurkishMorphology  # noqa: E402
from .normalization import TurkishSentenceNormalizer, TurkishSpellChecker  # noqa: E402
from .tokenization import TurkishSentenceExtractor, TurkishTokenizer  # noqa: E402

__version__ = '0.2.3'

# Import-time logging setup: the ROOT logger is switched to INFO and gets a
# stdout handler, so importing this package affects logging process-wide.
# NOTE(review): kept unchanged because callers may rely on this side effect.
root = logging.getLogger()
root.setLevel(logging.INFO)

handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s\nMsg: %(message)s\n')
handler.setFormatter(formatter)
root.addHandler(handler)
22
+
@@ -0,0 +1,102 @@
1
+ import time
2
+ import logging
3
+
4
+ from zemberek import (
5
+ TurkishSpellChecker,
6
+ TurkishSentenceNormalizer,
7
+ TurkishSentenceExtractor,
8
+ TurkishMorphology,
9
+ TurkishTokenizer
10
+ )
11
+
12
# Example script: walks through normalization, spelling suggestion, sentence
# boundary detection, morphological analysis/disambiguation and tokenization,
# printing the results and timing several of the steps.

# Module-level logger used for the timing messages below.
logger = logging.getLogger(__name__)

# Informal / misspelled inputs for the sentence normalizer.
examples = ["Yrn okua gidicem",
            "Tmm, yarin havuza giricem ve aksama kadar yaticam :)",
            "ah aynen ya annemde fark ettı siz evinizden cıkmayın diyo",
            "gercek mı bu? Yuh! Artık unutulması bile beklenmiyo",
            "Hayır hayat telaşm olmasa alacam buraları gökdelen dikicem.",
            "yok hocam kesınlıkle oyle birşey yok",
            "herseyi soyle hayatında olmaması gerek bence boyle ınsanların falan baskı yapıyosa",
            "email adresim zemberek_python@loodos.com",
            "Kredi başvrusu yapmk istiyrum.",
            "Bankanizin hesp blgilerini ogrenmek istyorum."]

# One morphology instance, shared by the normalizer, the spell checker and
# the analysis sections further down.
morphology = TurkishMorphology.create_with_defaults()

# SENTENCE NORMALIZATION
# Time the construction of the normalizer separately from its use.
start = time.time()
normalizer = TurkishSentenceNormalizer(morphology)
logger.info(f"Normalization instance created in: {time.time() - start} s")

# Print each raw example followed by its normalized form.
start = time.time()
for example in examples:
    print(example)
    print(normalizer.normalize(example), "\n")
logger.info(f"Sentences normalized in: {time.time() - start} s")

start = time.time()
sc = TurkishSpellChecker(morphology)
logger.info(f"Spell checker instance created in: {time.time() - start} s")


# SPELLING SUGGESTION
# Words for which suggestions are requested one at a time.
li = ["okuyablirim", "tartısıyor", "Ankar'ada", "knlıca", "yapablrim", "kıredi", "geldm", "geliyom", "aldm", "asln"]
start = time.time()
for word in li:
    # Suggestions are joined with single spaces on one line per word.
    print(word + " = " + ' '.join(sc.suggest_for_word(word)))
logger.info(f"Spells checked in: {time.time() - start} s")


# SENTENCE BOUNDARY DETECTION
start = time.time()
extractor = TurkishSentenceExtractor()
print("Extractor instance created in: ", time.time() - start, " s")

# Multi-sentence paragraph, assembled from adjacent string literals.
text = "İnsanoğlu aslında ne para ne sevgi ne kariyer ne şöhret ne de çevre ile sonsuza dek mutlu olabilecek bir " \
       "yapıya sahiptir. Dış kaynaklardan gelebilecek bu mutluluklar sadece belirli bir zaman için insanı mutlu " \
       "kılıyor. Kişi bu kaynakları elde ettiği zaman belirli bir dönem için kendini iyi hissediyor, ancak alışma " \
       "dönemine girdiği andan itibaren bu iyilik hali hızla tükeniyor. Mutlu olma sanatının özü bu değildir. Gerçek " \
       "mutluluk, kişinin her türlü olaya ve duruma karşı kendini pozitif tutarak mutlu hissedebilmesi halidir. Bu " \
       "davranış şeklini edinen insan, zor günlerde güçlü, mutlu günlerde zevk alan biri olur ve mutluluğu kalıcı " \
       "kılar. "

start = time.time()
sentences = extractor.from_paragraph(text)
print(f"Sentences separated in {time.time() - start}s")

# NOTE(review): loop nesting in this script was reconstructed from a listing
# without indentation; the separator prints are placed after the loops.
for sentence in sentences:
    print(sentence)
print("\n")

# SINGLE WORD MORPHOLOGICAL ANALYSIS
# Print every analysis returned for the word.
results = morphology.analyze("kalemin")
for result in results:
    print(result)
print("\n")

# SENTENCE ANALYSIS AND DISAMBIGUATION

sentence = "Yarın kar yağacak."
analysis = morphology.analyze_sentence(sentence)
# Disambiguation takes both the sentence and its analysis.
after = morphology.disambiguate(sentence, analysis)

print("\nBefore disambiguation")
for e in analysis:
    # Each element exposes its input as `inp` and iterates over its analyses.
    print(f"Word = {e.inp}")
    for s in e:
        print(s.format_string())

print("\nAfter disambiguation")
for s in after.best_analysis():
    print(s.format_string())

# TOKENIZATION
tokenizer = TurkishTokenizer.DEFAULT

# Print content, type name and start/end attributes of every token.
tokens = tokenizer.tokenize("Saat 12:00.")
for token in tokens:
    print('Content = ', token.content)
    print('Type = ', token.type_.name)
    print('Start = ', token.start)
    print('Stop = ', token.end, '\n')