dalla-data-processing 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. dalla/__init__.py +27 -0
  2. dalla/cli.py +453 -0
  3. dalla/core/__init__.py +6 -0
  4. dalla/core/dataset.py +387 -0
  5. dalla/core/parallel.py +279 -0
  6. dalla/deduplication/__init__.py +370 -0
  7. dalla/deduplication/bin/.gitignore +1 -0
  8. dalla/deduplication/bin/onion-linux-x86_64 +0 -0
  9. dalla/deduplication/onion/COPYING +24 -0
  10. dalla/deduplication/onion/Makefile +21 -0
  11. dalla/deduplication/onion/Makefile.config +3 -0
  12. dalla/deduplication/onion/README.md +21 -0
  13. dalla/deduplication/onion/src/Makefile +22 -0
  14. dalla/deduplication/onion/src/Makefile.g +23 -0
  15. dalla/deduplication/onion/src/buzhash.c +325 -0
  16. dalla/deduplication/onion/src/buzhash.h +30 -0
  17. dalla/deduplication/onion/src/hashdup.c +172 -0
  18. dalla/deduplication/onion/src/hashgen.c +206 -0
  19. dalla/deduplication/onion/src/onion +0 -0
  20. dalla/deduplication/onion/src/onion.c +799 -0
  21. dalla/deduplication/onion/src/onion_dup.c +824 -0
  22. dalla/deduplication/onion/src/version.c +17 -0
  23. dalla/deduplication/onion/src/version.h +10 -0
  24. dalla/deduplication/onion/src_sc/Makefile +22 -0
  25. dalla/deduplication/onion/src_sc/Makefile.g +23 -0
  26. dalla/deduplication/onion/src_sc/buzhash.c +325 -0
  27. dalla/deduplication/onion/src_sc/buzhash.h +30 -0
  28. dalla/deduplication/onion/src_sc/hashdup +0 -0
  29. dalla/deduplication/onion/src_sc/hashdup.c +172 -0
  30. dalla/deduplication/onion/src_sc/hashgen +0 -0
  31. dalla/deduplication/onion/src_sc/hashgen.c +206 -0
  32. dalla/deduplication/onion/src_sc/onion.c +854 -0
  33. dalla/deduplication/onion/src_sc/onion_dup.c +824 -0
  34. dalla/deduplication/onion/src_sc/version.c +17 -0
  35. dalla/deduplication/onion/src_sc/version.h +10 -0
  36. dalla/deduplication/onion_wrapper.py +223 -0
  37. dalla/deduplication/postprocessing.py +216 -0
  38. dalla/deduplication/preprocessing.py +120 -0
  39. dalla/quality/__init__.py +5 -0
  40. dalla/quality/checker.py +354 -0
  41. dalla/readability/__init__.py +197 -0
  42. dalla/readability/ranking.py +165 -0
  43. dalla/readability/scorer.py +148 -0
  44. dalla/stemming/__init__.py +551 -0
  45. dalla/stemming/data/words_al.txt +3414 -0
  46. dalla/stemming/data/words_al_t.txt +885 -0
  47. dalla/stemming/data/words_t.txt +7 -0
  48. dalla/utils/__init__.py +10 -0
  49. dalla/utils/logger.py +128 -0
  50. dalla/utils/tokenize.py +89 -0
  51. dalla_data_processing-0.0.1.dist-info/METADATA +393 -0
  52. dalla_data_processing-0.0.1.dist-info/RECORD +55 -0
  53. dalla_data_processing-0.0.1.dist-info/WHEEL +5 -0
  54. dalla_data_processing-0.0.1.dist-info/entry_points.txt +2 -0
  55. dalla_data_processing-0.0.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,551 @@
1
+ """Stemming and morphological analysis module.
2
+
3
+ This module is completely self-contained and does not depend on external scripts.
4
+ All necessary functions are included here.
5
+ """
6
+
7
+ import os
8
+ import re
9
+ from collections import deque
10
+ from types import MethodType
11
+
12
+ from camel_tools.data.catalogue import Catalogue
13
+ from camel_tools.disambig.bert import BERTUnfactoredDisambiguator
14
+ from camel_tools.disambig.mle import MLEDisambiguator
15
+ from camel_tools.utils.dediac import dediac_ar
16
+ from datasets import Dataset
17
+
18
+ from dalla.utils.logger import get_logger
19
+ from dalla.utils.tokenize import simple_word_tokenize
20
+
21
# Module-level logger, created through the package's own logging helper.
logger = get_logger(__name__)
22
+
23
+
24
def _build_arabic_normalization_table():
    """Build the ``str.translate`` table used by ``normalize_arabic``.

    Code points mapped to ``None`` are deleted; the others are replaced by
    the given single character.
    """
    # Inclusive code point ranges that are removed outright.
    removed_ranges = (
        (0x0610, 0x061A),
        (0x064B, 0x065F),
        (0x0670, 0x0670),
        (0x06D6, 0x06DC),
        (0x06DF, 0x06E8),
        (0x06EA, 0x06ED),
        (0x08D3, 0x08FF),
        (0x0640, 0x0640),  # tatweel
    )
    table = {}
    for first, last in removed_ranges:
        for code_point in range(first, last + 1):
            table[code_point] = None

    # Alef with madda / hamza above / hamza below / wasla -> bare alef.
    for code_point in (0x0622, 0x0623, 0x0625, 0x0671):
        table[code_point] = "\u0627"
    table[0x0649] = "\u064a"  # alef maksura -> yeh
    table[0x0629] = "\u0647"  # teh marbuta -> heh
    return table


# Built once at import time instead of compiling five regexes on every call.
_ARABIC_NORMALIZATION_TABLE = _build_arabic_normalization_table()


def normalize_arabic(text: str) -> str:
    """Normalize Arabic text.

    In a single pass this removes diacritic/annotation marks and tatweel,
    folds the alef variants to bare alef, alef maksura to yeh, and
    teh marbuta to heh.

    Args:
        text: Text to normalize.

    Returns:
        The normalized text; characters outside the table are left untouched.
    """
    return text.translate(_ARABIC_NORMALIZATION_TABLE)
40
+
41
+
42
def has_diacritics(word):
    """Return True when at least one character of ``word`` is a diacritic.

    Only the nine marks listed below are treated as diacritics.
    """
    marks = frozenset(
        (
            "\u064b",
            "\u064c",
            "\u064d",
            "\u064e",
            "\u064f",
            "\u0650",
            "\u0651",
            "\u0652",
            "\u0670",
        )
    )
    # "Not disjoint" means the word shares at least one character with marks.
    return not marks.isdisjoint(word)
56
+
57
+
58
def apply_diacritics_to_segments_keep_markers(segments, diacritized_word, sep_token="<+>"):
    """Copy diacritics from a diacritized word onto its undiacritized segments.

    The segments are walked character by character alongside
    ``diacritized_word``. Whenever a segment character matches the next base
    character of the word, the diacritics that follow it in the word are
    appended after it. Separator markers are copied through unchanged.

    Args:
        segments: Segment strings; each may contain ``sep_token`` or be
            exactly ``sep_token``.
        diacritized_word: The original word carrying the diacritics.
        sep_token: Marker string that separates morphemes.

    Returns:
        A new list with one output string per input segment.
    """
    diacritic_marks = {
        "\u064b",
        "\u064c",
        "\u064d",
        "\u064e",
        "\u064f",
        "\u0650",
        "\u0651",
        "\u0652",
        "\u0670",
    }
    sep_len = len(sep_token)
    word_len = len(diacritized_word)

    # Diacritics that precede the first base character of the word.
    cursor = 0
    while cursor < word_len and diacritized_word[cursor] in diacritic_marks:
        cursor += 1
    leading_diacritics = list(diacritized_word[:cursor])

    result = []
    for segment_idx, segment in enumerate(segments):
        if segment == sep_token:
            result.append(segment)
            continue

        pieces = []
        if segment_idx == 0 and leading_diacritics:
            pieces.extend(leading_diacritics)

        pos = 0
        while pos < len(segment):
            # An empty sep_token must never match: it would consume zero
            # characters and this loop would never advance.
            if sep_token and segment.startswith(sep_token, pos):
                pieces.append(sep_token)
                pos += sep_len
                continue

            char = segment[pos]

            # Skip diacritics that are not attached to a matched character.
            while cursor < word_len and diacritized_word[cursor] in diacritic_marks:
                cursor += 1

            # The segment character is always kept, matched or not.
            pieces.append(char)
            if cursor < word_len and diacritized_word[cursor] == char:
                cursor += 1
                # Attach every diacritic that directly follows the match.
                while cursor < word_len and diacritized_word[cursor] in diacritic_marks:
                    pieces.append(diacritized_word[cursor])
                    cursor += 1

            pos += 1

        result.append("".join(pieces))

    return result
129
+
130
+
131
def read_and_dediacritize(file_name):
    """Load a UTF-8 word list and strip diacritics from every entry.

    Each line is stripped of surrounding whitespace and passed through
    ``dediac_ar``. One entry is returned per line, in file order.
    """
    with open(file_name, encoding="utf-8") as handle:
        return [dediac_ar(raw_line.strip()) for raw_line in handle]
140
+
141
+
142
def par_is_utf8_encoded(paragraph):
    """Report whether ``paragraph`` can be encoded as UTF-8.

    Returns False only when encoding raises ``UnicodeEncodeError``.
    """
    try:
        paragraph.encode("utf-8")
    except UnicodeEncodeError:
        return False
    else:
        return True
149
+
150
+
151
def tokenize(text):
    """Split ``text`` into tokens with ``simple_word_tokenize``.

    Returns None when the text cannot be encoded as UTF-8.
    """
    if not par_is_utf8_encoded(text):
        return None
    return simple_word_tokenize(text)
158
+
159
+
160
def merge_alef_and_alef_lam(input_list, sep_token="<+>"):
    """Merge an adjacent lam-prefix and alef-lam-prefix pair into one token.

    Wherever the token ``lam + sep_token`` is immediately followed by
    ``alef lam + sep_token``, the pair is replaced by the single token
    ``lam lam + sep_token``. All other tokens are copied unchanged.

    Args:
        input_list: Sequence of token strings.
        sep_token: Marker string that separates morphemes.

    Returns:
        A new list; ``input_list`` is not modified.
    """
    # Tokens are compared as str; encoding each one to bytes first added
    # work and could raise on text that is not encodable.
    lam_prefix = f"\u0644{sep_token}"
    alef_lam_prefix = f"\u0627\u0644{sep_token}"
    replacement = f"\u0644\u0644{sep_token}"

    merged = []
    total = len(input_list)
    idx = 0
    while idx < total:
        if (
            idx + 1 < total
            and input_list[idx] == lam_prefix
            and input_list[idx + 1] == alef_lam_prefix
        ):
            merged.append(replacement)
            idx += 2
        else:
            merged.append(input_list[idx])
            idx += 1

    return merged
182
+
183
+
184
def process_NOAN_word(list_al_t, list_al, list_t, word, sep_token="<+>"):
    """Segment a word that has no morphological analysis, using word lists.

    The first matching rule wins:

    1. ``word`` starts with alef-lam, ends with teh marbuta and is in
       ``list_al_t``: returns ``[prefix + sep, stem, sep + last char]``.
    2. ``word`` starts with alef-lam and is in ``list_al``:
       returns ``[prefix + sep, stem]``.
    3. ``word`` ends with teh marbuta and is in ``list_t``:
       returns ``[stem, sep + last char]``.
    4. Otherwise: returns ``[word]``.

    Args:
        list_al_t: Words that take both the prefix and the suffix split.
        list_al: Words that take the prefix split.
        list_t: Words that take the suffix split.
        word: The word to segment.
        sep_token: Marker string that separates morphemes.

    Returns:
        A list of one to three segment strings.
    """
    alef_lam = "\u0627\u0644"
    # Teh marbuta as the isolated presentation form (U+FE93) or the regular
    # letter (U+0629); str.endswith accepts a tuple of alternatives.
    taa_marbouta_forms = ("\ufe93", "\u0629")

    has_article = word.startswith(alef_lam)
    has_taa_marbouta = word.endswith(taa_marbouta_forms)

    if has_article and has_taa_marbouta and word in list_al_t:
        return [word[:2] + sep_token, word[2:-1], sep_token + word[-1]]

    if has_article and word in list_al:
        return [word[:2] + sep_token, word[2:]]

    if has_taa_marbouta and word in list_t:
        return [word[:-1], sep_token + word[-1]]

    return [word]
215
+
216
+
217
def merge_tokens(tokens, original_word, sep_token="<+>"):
    """Join segment tokens into one string with separator markers removed.

    For each token, the first matching rule applies: a token equal to
    ``sep_token`` becomes ``"_"``; otherwise a trailing ``sep_token``, a
    leading ``sep_token``, a trailing ``"+"`` or a leading ``"+"`` is removed
    (only one of them). ``original_word`` is accepted but not used.
    """
    marker_len = len(sep_token)

    def _bare(piece):
        # Return the piece with at most one marker removed.
        if piece == sep_token:
            return "_"
        if piece.endswith(sep_token):
            return piece[:-marker_len]
        if piece.startswith(sep_token):
            return piece[marker_len:]
        if piece.endswith("+"):
            return piece[:-1]
        if piece.startswith("+"):
            return piece[1:]
        return piece

    return "".join(_bare(piece) for piece in tokens)
241
+
242
+
243
def split_token_on_t(list_toks, sep_token="<+>"):
    """Split a trailing teh marbuta or heh off each token.

    A token that ends with teh marbuta (U+0629 or the isolated form U+FE93)
    or heh (U+0647) is replaced by two tokens: everything before the last
    character, then ``sep_token`` plus that last character. A token that is
    exactly heh becomes the single token ``sep_token`` + teh marbuta. Other
    tokens are copied unchanged.

    Args:
        list_toks: Iterable of token strings.
        sep_token: Marker string that separates morphemes.

    Returns:
        A new list of tokens.
    """
    taa_marbouta = "\u0629"
    haa = "\u0647"
    # str.endswith takes a tuple, so tokens need no byte encoding.
    split_endings = ("\ufe93", taa_marbouta, haa)

    new_list = []
    for token in list_toks:
        if not token.endswith(split_endings):
            new_list.append(token)
        elif token == haa:
            new_list.append(sep_token + taa_marbouta)
        else:
            new_list.append(token[:-1])
            new_list.append(sep_token + token[-1])

    return new_list
269
+
270
+
271
def replace_separator(toks, sep_token="<+>"):
    """Replace a leading and/or trailing ``+`` marker with ``sep_token``.

    Only a ``+`` at the very start or very end of a token is treated as a
    marker. A token with markers on both ends gets both replaced. A token
    that is exactly ``"+"`` becomes a single ``sep_token``.

    Args:
        toks: Iterable of token strings.
        sep_token: Marker string that replaces ``+``.

    Returns:
        A new list; ``toks`` is not modified.
    """
    result = []
    for tok in toks:
        if tok == "+":
            # One character is one marker, not a leading plus a trailing one.
            result.append(sep_token)
            continue

        # Both flags are read from the original token, so a sep_token that
        # itself ends in "+" cannot be mistaken for a trailing marker.
        leading = tok.startswith("+")
        trailing = tok.endswith("+")

        core = tok[1:] if leading else tok
        if trailing:
            core = core[:-1]

        prefix = sep_token if leading else ""
        suffix = sep_token if trailing else ""
        result.append(prefix + core + suffix)

    return result
281
+
282
+
283
def morph_tokenize(
    words, disambiguator, list_al_t, list_al, list_t, scheme="d3tok", split=True, sep_token="<+>"
):
    """Generate morphological tokens for a list of words.

    Each word is disambiguated and the first scored analysis is used. Up to
    three candidate segmentations are tried in order: the ``scheme`` field,
    then ``d3seg``, then ``bwtok``. A candidate is accepted only if it has
    more than one token and merging its tokens reproduces the dediacritized
    word. If none is accepted, the original word is emitted unchanged and the
    mismatch is recorded.

    Args:
        words: Word strings to process.
        disambiguator: Object whose ``disambiguate(words)`` yields one result
            per word, each with an ``analyses`` sequence.
        list_al_t: Word list passed to ``process_NOAN_word``.
        list_al: Word list passed to ``process_NOAN_word``.
        list_t: Word list passed to ``process_NOAN_word``.
        scheme: Key of the analysis field used as the primary tokenization.
        split: When False, no segmentation is emitted for analyzed words.
        sep_token: Marker string that separates morphemes.

    Returns:
        A tuple ``(tokens, err_disambig, err_camel, has_diacritics_in_par)``:
        the output tokens, the dediacritized words whose segmentation was
        rejected, the rejected merged forms, and whether any input word
        contained diacritics.
    """
    disambig_words = disambiguator.disambiguate(words)
    result = deque()
    err_disambig = []
    err_camel = []
    has_diacritics_in_par = False

    for original, disambig_word in zip(words, disambig_words, strict=False):
        scored_analyses = disambig_word.analyses
        original_word = original
        dediac_word = dediac_ar(original_word)

        if has_diacritics(original_word):
            has_diacritics_in_par = True

        # No analysis at all: pass the word through untouched.
        if not scored_analyses:
            result.append(original_word)
            continue

        # Only the first scored analysis is consulted.
        analysis = scored_analyses[0].analysis
        # NOTE(review): dediac_ar is applied before the `tok is None` check
        # further down; if dediac_ar rejects None, a missing field fails here
        # and that check can never see None. Confirm against camel_tools.
        tok = dediac_ar(analysis.get(scheme, None))
        tok_bw = dediac_ar(analysis.get("bwtok", None))
        seg_d3 = dediac_ar(analysis.get("d3seg", None))

        # UTF-8 bytes of U+FE93 (isolated form) and U+0629 (teh marbuta).
        taa_marbouta_detached = b"\xef\xba\x93"
        taa_marbouta_attached = b"\xd8\xa9"
        original_word_bytes = dediac_word.encode("utf-8")

        # Special path: the word ends with teh marbuta and the bwtok
        # segmentation contains one of the suffix markers tested below.
        if original_word_bytes.endswith(taa_marbouta_attached) or original_word_bytes.endswith(
            taa_marbouta_detached
        ):
            if "+ة_+" in tok_bw or "+ه" in tok_bw or "+ة" in tok_bw:
                # Same pipeline for all three candidates; split_token_on_t
                # is the extra step compared with the generic path below.
                toks = tok.split("_")
                toks = split_token_on_t(toks, sep_token)
                toks = replace_separator(toks, sep_token)
                toks = merge_alef_and_alef_lam(toks, sep_token)
                merged_toks = dediac_ar(merge_tokens(toks, dediac_word, sep_token))

                d3_seg_tok = seg_d3.split("_")
                d3_seg_tok = split_token_on_t(d3_seg_tok, sep_token)
                d3_seg_tok = replace_separator(d3_seg_tok, sep_token)
                d3_seg_tok = merge_alef_and_alef_lam(d3_seg_tok, sep_token)
                merged_toks_seg = dediac_ar(merge_tokens(d3_seg_tok, dediac_word, sep_token))

                bw_toks = tok_bw.split("_")
                bw_toks = split_token_on_t(bw_toks, sep_token)
                bw_toks = replace_separator(bw_toks, sep_token)
                bw_toks = merge_alef_and_alef_lam(bw_toks, sep_token)
                merged_toks_bw = dediac_ar(merge_tokens(bw_toks, dediac_word, sep_token))

                # Every branch of this chain ends in `continue`, so a word
                # handled here never reaches the generic path below.
                if merged_toks == dediac_word and len(toks) > 1:
                    if has_diacritics(original):
                        toks = apply_diacritics_to_segments_keep_markers(toks, original, sep_token)
                    result.extend(toks)
                    continue

                elif merged_toks_seg == dediac_word and len(d3_seg_tok) > 1:
                    if has_diacritics(original):
                        d3_seg_tok = apply_diacritics_to_segments_keep_markers(
                            d3_seg_tok, original, sep_token
                        )
                    result.extend(d3_seg_tok)
                    continue

                elif merged_toks_bw == dediac_word and len(bw_toks) > 1:
                    if has_diacritics(original):
                        bw_toks = apply_diacritics_to_segments_keep_markers(
                            bw_toks, original, sep_token
                        )
                    result.extend(bw_toks)
                    continue

                else:
                    # No candidate reproduced the word: keep it and record
                    # the primary candidate as the rejected form.
                    result.append(original_word)
                    err_disambig.append(dediac_word)
                    err_camel.append(merged_toks)
                    continue

        # Generic path.
        if tok is None or "NOAN" in tok:
            # No usable analysis: fall back to the word-list heuristic.
            tok = process_NOAN_word(list_al_t, list_al, list_t, dediac_word, sep_token)
            if has_diacritics(original):
                toks = apply_diacritics_to_segments_keep_markers(tok, original, sep_token)
            else:
                toks = tok
            result.extend(toks)

        elif split:
            tok = dediac_ar(tok)
            toks = tok.split("_")
            toks = replace_separator(toks, sep_token)
            toks = merge_alef_and_alef_lam(toks, sep_token)
            merged_toks = dediac_ar(merge_tokens(toks, dediac_word, sep_token))

            bw_toks = tok_bw.split("_")
            bw_toks = replace_separator(bw_toks, sep_token)
            bw_toks = merge_alef_and_alef_lam(bw_toks, sep_token)
            merged_toks_bw = dediac_ar(merge_tokens(bw_toks, dediac_word, sep_token))

            d3_seg_tok = seg_d3.split("_")
            d3_seg_tok = replace_separator(d3_seg_tok, sep_token)
            d3_seg_tok = merge_alef_and_alef_lam(d3_seg_tok, sep_token)
            merged_toks_seg = dediac_ar(merge_tokens(d3_seg_tok, dediac_word, sep_token))

            # Same acceptance order as above: scheme, then d3seg, then bwtok.
            if merged_toks == dediac_word and len(toks) > 1:
                if has_diacritics(original):
                    toks = apply_diacritics_to_segments_keep_markers(toks, original, sep_token)
                result.extend(toks)
            elif merged_toks_seg == dediac_word and len(d3_seg_tok) > 1:
                if has_diacritics(original):
                    d3_seg_tok = apply_diacritics_to_segments_keep_markers(
                        d3_seg_tok, original, sep_token
                    )
                result.extend(d3_seg_tok)
            elif merged_toks_bw == dediac_word and len(bw_toks) > 1:
                if has_diacritics(original):
                    bw_toks = apply_diacritics_to_segments_keep_markers(
                        bw_toks, original, sep_token
                    )
                result.extend(bw_toks)
            else:
                result.append(original_word)
                err_disambig.append(dediac_word)
                err_camel.append(merged_toks)

        else:
            # split is False: the original word is emitted either way; a
            # mismatch with the analysis is only recorded.
            tok = dediac_ar(tok)
            if tok == dediac_word:
                result.append(original_word)
            else:
                result.append(original_word)
                err_disambig.append(dediac_word)
                err_camel.append(tok)

    return list(result), err_disambig, err_camel, has_diacritics_in_par
420
+
421
+
422
def stem_dataset(
    dataset: Dataset,
    column: str = "text",
    sep_token: str = "<+>",
    normalize: bool = False,
    keep_diacritics: bool = True,
    num_proc: int | None = None,
    model: str = "mle",
    use_gpu: bool = False,
) -> Dataset:
    """
    Apply stemming and morphological analysis to dataset.

    Args:
        dataset: HuggingFace dataset
        column: Column to process
        sep_token: Separator token for morphological splits (default: '<+>')
        normalize: Apply Arabic normalization to the stemmed text (default: False)
        keep_diacritics: When True, also add a ``{column}_dediac`` column holding
            the dediacritized stemmed text (default: True)
        num_proc: Number of parallel processes, passed to ``dataset.map``
        model: Disambiguator model to use - "mle" or "bert" (default: "mle");
            matched case-insensitively
        use_gpu: Whether to use GPU for BERT model (default: False)

    Returns:
        Dataset with {column}_stemmed and optionally {column}_dediac columns

    Raises:
        ValueError: If ``model`` is neither "mle" nor "bert".

    Example:
        >>> # Stem with defaults (MLE, keeps diacritics)
        >>> stemmed = stem_dataset(dataset)
        >>> # Result has 'text_stemmed' and 'text_dediac' columns

        >>> # Stem using BERT with GPU
        >>> stemmed = stem_dataset(dataset, model="bert", use_gpu=True)

        >>> # Stem without keeping diacritics
        >>> stemmed = stem_dataset(dataset, keep_diacritics=False)
        >>> # Result has only 'text_stemmed' column
    """
    model = model.lower()
    if model not in ["mle", "bert"]:
        raise ValueError(f"Invalid model '{model}'. Must be 'mle' or 'bert'")

    logger.info(f"Starting stemming of {len(dataset)} examples")
    logger.info(
        f"Model: {model.upper()}, Column: {column}, Sep token: {sep_token}, Normalize: {normalize}"
    )
    logger.info(f"Keep diacritics: {keep_diacritics}, Workers: {num_proc or 'auto'}")
    if model == "bert":
        logger.info(f"GPU: {use_gpu}")

    logger.info("Checking CAMeL Tools data packages...")
    catalogue = Catalogue.load_catalogue()
    # Best effort: a failure here is logged as a warning and processing goes on.
    try:
        catalogue.download_package("morphology-db-msa-r13")
        if model == "mle":
            catalogue.download_package("disambig-mle-calima-msa-r13")
        # For BERT, let it download automatically when pretrained() is called
        logger.info("CAMeL Tools data packages ready")
    except Exception as e:
        logger.warning(f"Could not verify CAMeL packages: {e}")

    logger.info("Loading additional words lists...")
    # The word lists live in the "data" directory next to this module; they
    # are loaded as sets and passed on to morph_tokenize.
    words_dir = os.path.join(os.path.dirname(__file__), "data")
    list_al_t = set(read_and_dediacritize(os.path.join(words_dir, "words_al_t.txt")))
    list_al = set(read_and_dediacritize(os.path.join(words_dir, "words_al.txt")))
    list_t = set(read_and_dediacritize(os.path.join(words_dir, "words_t.txt")))
    logger.info("Loaded word list entries")

    logger.info(f"Initializing {model.upper()} disambiguator...")
    if model == "mle":
        disambiguator = MLEDisambiguator.pretrained("calima-msa-r13", cache_size=1_000_000)
    else:  # bert
        disambiguator = BERTUnfactoredDisambiguator.pretrained(use_gpu=use_gpu)
    logger.info("Disambiguator ready")

    # Replacement scoring method: return the cached result for a word if
    # there is one, otherwise compute it and store it.
    def new_scored_analysis(self, word_dd):
        if word_dd in self._cache:
            return self._cache[word_dd]
        result = self._scored_analyses(word_dd)
        self._cache[word_dd] = result
        return result

    # NOTE(review): this overrides private attributes and is applied for both
    # models; it assumes the disambiguator has `_cache` and `_scored_analyses`.
    # Confirm that holds for the BERT disambiguator as well.
    disambiguator._scored_analyses_cached = MethodType(new_scored_analysis, disambiguator)
    disambiguator._score_fn = disambiguator._scored_analyses_cached

    def process_row(row):
        # Per-row worker for dataset.map: fills in the output column(s).
        text = row.get(column, "")
        if not text:
            # Missing or empty text yields empty output columns.
            row[f"{column}_stemmed"] = ""
            if keep_diacritics:
                row[f"{column}_dediac"] = ""
            return row

        word_list = tokenize(text)
        if word_list is None:
            # tokenize() returned None: keep the text unsegmented.
            row[f"{column}_stemmed"] = text
            if keep_diacritics:
                row[f"{column}_dediac"] = dediac_ar(text)
            return row

        # The two error lists are discarded; has_diacs is not used below.
        tokenized, _, _, has_diacs = morph_tokenize(
            word_list, disambiguator, list_al_t, list_al, list_t, sep_token=sep_token
        )

        if tokenized is not None:
            tokenized = merge_alef_and_alef_lam(tokenized, sep_token)
            # Tokens are concatenated with no delimiter between them.
            stemmed = "".join(tokenized)

            if normalize:
                stemmed = normalize_arabic(stemmed)

            row[f"{column}_stemmed"] = stemmed

            if keep_diacritics:
                row[f"{column}_dediac"] = dediac_ar(stemmed)
        else:
            row[f"{column}_stemmed"] = text
            if keep_diacritics:
                row[f"{column}_dediac"] = dediac_ar(text)

        return row

    logger.info("Starting morphological tokenization...")
    result = dataset.map(process_row, num_proc=num_proc, desc="Stemming")

    logger.info(f"Stemming complete! Processed {len(result)} examples")
    return result
549
+
550
+
551
# Public API: only stem_dataset is exported by a star-import of this module.
__all__ = ["stem_dataset"]