araclean 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
araclean/__init__.py ADDED
@@ -0,0 +1,152 @@
1
+ """araclean — Arabic text normalization and cleaning."""
2
+
3
+ from importlib.metadata import PackageNotFoundError, version
4
+
5
+ from araclean.api import normalize
6
+ from araclean.config import NormalizeConfig, ProfileName
7
+ from araclean.offsets import OffsetMap
8
+ from araclean.pipeline import Pipeline
9
+ from araclean.profiles import CLASSICAL, LIGHT, ML, SEARCH, SOCIAL, Profile
10
+ from araclean.safety import SafetyClass, SafetyReport
11
+ from araclean.steps import (
12
+ AlignmentNotSupportedError,
13
+ CleanHashtags,
14
+ CleanHTML,
15
+ CleanMentions,
16
+ CleanMode,
17
+ CleanURLs,
18
+ CollapseWhitespace,
19
+ DigitTarget,
20
+ EmojiMode,
21
+ EmojiSupportNotInstalledError,
22
+ FoldAlef,
23
+ FoldAlefMaqsura,
24
+ FoldHamza,
25
+ FoldPresentationForms,
26
+ FoldTanweenAlef,
27
+ FoldTehMarbuta,
28
+ HandleEmoji,
29
+ HashtagMode,
30
+ MapDigits,
31
+ MapPunctuation,
32
+ MapQuotes,
33
+ MarkClass,
34
+ NormalizeUnicode,
35
+ ReduceElongation,
36
+ RemoveForeign,
37
+ RemovePunctuation,
38
+ RemoveStopwords,
39
+ RemoveTashkeel,
40
+ RemoveTatweel,
41
+ Step,
42
+ StripBidi,
43
+ TehMarbutaTarget,
44
+ Trim,
45
+ UnifyLookalikes,
46
+ clean_hashtags,
47
+ clean_html,
48
+ clean_mentions,
49
+ clean_urls,
50
+ collapse_whitespace,
51
+ fold_alef,
52
+ fold_alef_maqsura,
53
+ fold_hamza,
54
+ fold_presentation_forms,
55
+ fold_tanween_alef,
56
+ fold_teh_marbuta,
57
+ handle_emoji,
58
+ map_digits,
59
+ map_punctuation,
60
+ map_quotes,
61
+ normalize_unicode,
62
+ reduce_elongation,
63
+ remove_foreign,
64
+ remove_punctuation,
65
+ remove_stopwords,
66
+ remove_tashkeel,
67
+ remove_tatweel,
68
+ strip_bidi,
69
+ trim,
70
+ unify_lookalikes,
71
+ )
72
+
73
# Start from the placeholder used when no install metadata exists (e.g. running from a source
# tree), then replace it with the installed distribution's version when that can be looked up.
__version__: str = "0.0.0"
try:
    __version__ = version("araclean")
except PackageNotFoundError:  # pragma: no cover - source tree without install metadata
    pass
77
+
78
# Explicit public API: the names exported by `from araclean import *`. Every entry is either
# imported at the top of this module or, for `__version__`, defined in it.
__all__ = [
    # Profile presets (from `araclean.profiles`).
    "CLASSICAL",
    "LIGHT",
    "ML",
    "SEARCH",
    "SOCIAL",
    # CamelCase names.
    "AlignmentNotSupportedError",
    "CleanHTML",
    "CleanHashtags",
    "CleanMentions",
    "CleanMode",
    "CleanURLs",
    "CollapseWhitespace",
    "DigitTarget",
    "EmojiMode",
    "EmojiSupportNotInstalledError",
    "FoldAlef",
    "FoldAlefMaqsura",
    "FoldHamza",
    "FoldPresentationForms",
    "FoldTanweenAlef",
    "FoldTehMarbuta",
    "HandleEmoji",
    "HashtagMode",
    "MapDigits",
    "MapPunctuation",
    "MapQuotes",
    "MarkClass",
    "NormalizeConfig",
    "NormalizeUnicode",
    "OffsetMap",
    "Pipeline",
    "Profile",
    "ProfileName",
    "ReduceElongation",
    "RemoveForeign",
    "RemovePunctuation",
    "RemoveStopwords",
    "RemoveTashkeel",
    "RemoveTatweel",
    "SafetyClass",
    "SafetyReport",
    "Step",
    "StripBidi",
    "TehMarbutaTarget",
    "Trim",
    "UnifyLookalikes",
    # Package metadata.
    "__version__",
    # snake_case names (`normalize` plus the names imported from `araclean.steps`).
    "clean_hashtags",
    "clean_html",
    "clean_mentions",
    "clean_urls",
    "collapse_whitespace",
    "fold_alef",
    "fold_alef_maqsura",
    "fold_hamza",
    "fold_presentation_forms",
    "fold_tanween_alef",
    "fold_teh_marbuta",
    "handle_emoji",
    "map_digits",
    "map_punctuation",
    "map_quotes",
    "normalize",
    "normalize_unicode",
    "reduce_elongation",
    "remove_foreign",
    "remove_punctuation",
    "remove_stopwords",
    "remove_tashkeel",
    "remove_tatweel",
    "strip_bidi",
    "trim",
    "unify_lookalikes",
]
araclean/api.py ADDED
@@ -0,0 +1,69 @@
1
+ """Layer 3 — the one-call `normalize` facade (a thin adapter over `Pipeline`, ADR-0003).
2
+
3
+ This is the public trust boundary: `@validate_call` validates the call (the profile name, the
4
+ overrides) against `NormalizeConfig` before any work happens, so a bad option is rejected here with
5
+ a clear error and the validation-free core (`pipe(text)`, `pipe.batch()`, the bare step functions)
6
+ never validates per string (ADR-0003). The facade assembles the effective `Pipeline` once per call
7
+ and runs it.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from pydantic import validate_call
13
+
14
+ from araclean.config import NormalizeConfig
15
+ from araclean.pipeline import Pipeline
16
+ from araclean.profiles import LIGHT, Profile
17
+
18
+
19
@validate_call
def normalize(
    text: str,
    *,
    profile: str | Profile | None = None,
    config: NormalizeConfig | None = None,
    **overrides: object,
) -> str:
    """Clean Arabic text in one call and return the normalized string.

    What gets applied is chosen in one of three mutually exclusive ways:

    * ``config=NormalizeConfig(...)`` — a prebuilt configuration. It cannot be combined with
      `profile` or with any `**overrides`.
    * ``profile=<Profile object>`` — a fully custom pipeline. It cannot be combined with
      `**overrides`; put the steps you want into the `Profile` itself.
    * ``profile="<name>"`` or ``profile=None`` — a named preset, where `None` means `LIGHT`
      (lossless encoding repair). Per-knob `**overrides` tune the preset, for example
      ``normalize(text, profile="ml", map_digits=True)`` or
      ``normalize(text, profile="social", emoji="strip")``. The name and the overrides are
      validated against `NormalizeConfig`, so an unknown knob or a bad value is rejected before
      any text is processed.

    Raises:
        TypeError: if `config` is given together with `profile`/overrides, or if overrides are
            given together with a `Profile` object.
    """
    if config is not None:
        # A prebuilt config is already complete; anything else alongside it is ambiguous.
        if profile is not None or overrides:
            raise TypeError("normalize(): pass either `config=` or `profile=`/overrides, not both.")
        selected = config.resolve()
    elif isinstance(profile, Profile):
        # Overrides are knobs of a *named* profile; a Profile object has nothing to apply them to.
        if overrides:
            raise TypeError(
                "normalize(): **overrides require a profile name (a str), not a Profile object; "
                "build the Profile with the steps you want instead."
            )
        selected = profile
    else:
        # A name (or None -> LIGHT): `build_pipeline` pushes the untrusted name and overrides
        # through the pydantic boundary before any work, so the validation-free core never
        # receives an invalid option.
        return build_pipeline(profile, overrides)(text)
    return Pipeline.from_profile(selected)(text)
51
+
52
+
53
def build_pipeline(profile: str | None, overrides: dict[str, object]) -> Pipeline:
    """Turn an untrusted profile name and overrides into a ready-to-run `Pipeline`.

    This is the shared entry point behind the name-branch of `normalize` and the CLI/pandas/polars
    adapters (ADR-0003). The name and the overrides go through `NormalizeConfig.model_validate`
    before anything else happens, so the validation-free core (`pipe(text)`, the bare step
    functions) never sees an invalid option. Callers are expected to build the pipeline once and
    reuse it, rather than rebuilding per line/row/string, which keeps the hot path free of
    validation.

    Args:
        profile: Name of the preset to use; `None` selects `LIGHT`.
        overrides: Per-knob settings layered on top of the named profile.

    Returns:
        The `Pipeline` assembled from the validated, resolved configuration.

    Raises:
        ValidationError: for a bad option value, or an empty/unknown profile name.
        ValueError: for an override that does not apply to the profile (from `resolve()`).
        EmojiSupportNotInstalledError: for ``emoji="demojize"`` without the ``[emoji]`` extra.

    Each caller catches whichever of these it needs to.
    """
    profile_name = LIGHT.name if profile is None else profile
    # Seed the payload with the profile name, then layer the overrides on top of it.
    raw_options: dict[str, object] = {"profile": profile_name}
    raw_options.update(overrides)
    validated = NormalizeConfig.model_validate(raw_options)
    resolved_profile = validated.resolve()
    return Pipeline.from_profile(resolved_profile)