araclean 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- araclean/__init__.py +152 -0
- araclean/api.py +69 -0
- araclean/chars.py +742 -0
- araclean/cli.py +259 -0
- araclean/config.py +189 -0
- araclean/fusion.py +128 -0
- araclean/offsets.py +170 -0
- araclean/pandas.py +95 -0
- araclean/pipeline.py +206 -0
- araclean/polars.py +97 -0
- araclean/profiles.py +276 -0
- araclean/py.typed +0 -0
- araclean/registry.py +40 -0
- araclean/safety.py +48 -0
- araclean/steps.py +2013 -0
- araclean/stopwords.py +217 -0
- araclean-0.2.0.dist-info/METADATA +141 -0
- araclean-0.2.0.dist-info/RECORD +21 -0
- araclean-0.2.0.dist-info/WHEEL +4 -0
- araclean-0.2.0.dist-info/entry_points.txt +2 -0
- araclean-0.2.0.dist-info/licenses/LICENSE +21 -0
araclean/__init__.py
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
"""araclean — Arabic text normalization and cleaning."""
|
|
2
|
+
|
|
3
|
+
from importlib.metadata import PackageNotFoundError, version
|
|
4
|
+
|
|
5
|
+
from araclean.api import normalize
|
|
6
|
+
from araclean.config import NormalizeConfig, ProfileName
|
|
7
|
+
from araclean.offsets import OffsetMap
|
|
8
|
+
from araclean.pipeline import Pipeline
|
|
9
|
+
from araclean.profiles import CLASSICAL, LIGHT, ML, SEARCH, SOCIAL, Profile
|
|
10
|
+
from araclean.safety import SafetyClass, SafetyReport
|
|
11
|
+
from araclean.steps import (
|
|
12
|
+
AlignmentNotSupportedError,
|
|
13
|
+
CleanHashtags,
|
|
14
|
+
CleanHTML,
|
|
15
|
+
CleanMentions,
|
|
16
|
+
CleanMode,
|
|
17
|
+
CleanURLs,
|
|
18
|
+
CollapseWhitespace,
|
|
19
|
+
DigitTarget,
|
|
20
|
+
EmojiMode,
|
|
21
|
+
EmojiSupportNotInstalledError,
|
|
22
|
+
FoldAlef,
|
|
23
|
+
FoldAlefMaqsura,
|
|
24
|
+
FoldHamza,
|
|
25
|
+
FoldPresentationForms,
|
|
26
|
+
FoldTanweenAlef,
|
|
27
|
+
FoldTehMarbuta,
|
|
28
|
+
HandleEmoji,
|
|
29
|
+
HashtagMode,
|
|
30
|
+
MapDigits,
|
|
31
|
+
MapPunctuation,
|
|
32
|
+
MapQuotes,
|
|
33
|
+
MarkClass,
|
|
34
|
+
NormalizeUnicode,
|
|
35
|
+
ReduceElongation,
|
|
36
|
+
RemoveForeign,
|
|
37
|
+
RemovePunctuation,
|
|
38
|
+
RemoveStopwords,
|
|
39
|
+
RemoveTashkeel,
|
|
40
|
+
RemoveTatweel,
|
|
41
|
+
Step,
|
|
42
|
+
StripBidi,
|
|
43
|
+
TehMarbutaTarget,
|
|
44
|
+
Trim,
|
|
45
|
+
UnifyLookalikes,
|
|
46
|
+
clean_hashtags,
|
|
47
|
+
clean_html,
|
|
48
|
+
clean_mentions,
|
|
49
|
+
clean_urls,
|
|
50
|
+
collapse_whitespace,
|
|
51
|
+
fold_alef,
|
|
52
|
+
fold_alef_maqsura,
|
|
53
|
+
fold_hamza,
|
|
54
|
+
fold_presentation_forms,
|
|
55
|
+
fold_tanween_alef,
|
|
56
|
+
fold_teh_marbuta,
|
|
57
|
+
handle_emoji,
|
|
58
|
+
map_digits,
|
|
59
|
+
map_punctuation,
|
|
60
|
+
map_quotes,
|
|
61
|
+
normalize_unicode,
|
|
62
|
+
reduce_elongation,
|
|
63
|
+
remove_foreign,
|
|
64
|
+
remove_punctuation,
|
|
65
|
+
remove_stopwords,
|
|
66
|
+
remove_tashkeel,
|
|
67
|
+
remove_tatweel,
|
|
68
|
+
strip_bidi,
|
|
69
|
+
trim,
|
|
70
|
+
unify_lookalikes,
|
|
71
|
+
)
|
|
72
|
+
|
|
73
|
+
# Read the version from the installed distribution's metadata so it is declared in one place
# (the packaging metadata). `version()` raises `PackageNotFoundError` when no distribution named
# "araclean" is installed, in which case a placeholder version is used instead.
try:
    __version__: str = version("araclean")
except PackageNotFoundError:  # pragma: no cover - source tree without install metadata
    __version__ = "0.0.0"
|
|
77
|
+
|
|
78
|
+
__all__ = [
|
|
79
|
+
"CLASSICAL",
|
|
80
|
+
"LIGHT",
|
|
81
|
+
"ML",
|
|
82
|
+
"SEARCH",
|
|
83
|
+
"SOCIAL",
|
|
84
|
+
"AlignmentNotSupportedError",
|
|
85
|
+
"CleanHTML",
|
|
86
|
+
"CleanHashtags",
|
|
87
|
+
"CleanMentions",
|
|
88
|
+
"CleanMode",
|
|
89
|
+
"CleanURLs",
|
|
90
|
+
"CollapseWhitespace",
|
|
91
|
+
"DigitTarget",
|
|
92
|
+
"EmojiMode",
|
|
93
|
+
"EmojiSupportNotInstalledError",
|
|
94
|
+
"FoldAlef",
|
|
95
|
+
"FoldAlefMaqsura",
|
|
96
|
+
"FoldHamza",
|
|
97
|
+
"FoldPresentationForms",
|
|
98
|
+
"FoldTanweenAlef",
|
|
99
|
+
"FoldTehMarbuta",
|
|
100
|
+
"HandleEmoji",
|
|
101
|
+
"HashtagMode",
|
|
102
|
+
"MapDigits",
|
|
103
|
+
"MapPunctuation",
|
|
104
|
+
"MapQuotes",
|
|
105
|
+
"MarkClass",
|
|
106
|
+
"NormalizeConfig",
|
|
107
|
+
"NormalizeUnicode",
|
|
108
|
+
"OffsetMap",
|
|
109
|
+
"Pipeline",
|
|
110
|
+
"Profile",
|
|
111
|
+
"ProfileName",
|
|
112
|
+
"ReduceElongation",
|
|
113
|
+
"RemoveForeign",
|
|
114
|
+
"RemovePunctuation",
|
|
115
|
+
"RemoveStopwords",
|
|
116
|
+
"RemoveTashkeel",
|
|
117
|
+
"RemoveTatweel",
|
|
118
|
+
"SafetyClass",
|
|
119
|
+
"SafetyReport",
|
|
120
|
+
"Step",
|
|
121
|
+
"StripBidi",
|
|
122
|
+
"TehMarbutaTarget",
|
|
123
|
+
"Trim",
|
|
124
|
+
"UnifyLookalikes",
|
|
125
|
+
"__version__",
|
|
126
|
+
"clean_hashtags",
|
|
127
|
+
"clean_html",
|
|
128
|
+
"clean_mentions",
|
|
129
|
+
"clean_urls",
|
|
130
|
+
"collapse_whitespace",
|
|
131
|
+
"fold_alef",
|
|
132
|
+
"fold_alef_maqsura",
|
|
133
|
+
"fold_hamza",
|
|
134
|
+
"fold_presentation_forms",
|
|
135
|
+
"fold_tanween_alef",
|
|
136
|
+
"fold_teh_marbuta",
|
|
137
|
+
"handle_emoji",
|
|
138
|
+
"map_digits",
|
|
139
|
+
"map_punctuation",
|
|
140
|
+
"map_quotes",
|
|
141
|
+
"normalize",
|
|
142
|
+
"normalize_unicode",
|
|
143
|
+
"reduce_elongation",
|
|
144
|
+
"remove_foreign",
|
|
145
|
+
"remove_punctuation",
|
|
146
|
+
"remove_stopwords",
|
|
147
|
+
"remove_tashkeel",
|
|
148
|
+
"remove_tatweel",
|
|
149
|
+
"strip_bidi",
|
|
150
|
+
"trim",
|
|
151
|
+
"unify_lookalikes",
|
|
152
|
+
]
|
araclean/api.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
"""Layer 3 — the one-call `normalize` facade (a thin adapter over `Pipeline`, ADR-0003).
|
|
2
|
+
|
|
3
|
+
This is the public trust boundary: `@validate_call` validates the call (the profile name, the
|
|
4
|
+
overrides) against `NormalizeConfig` before any work happens, so a bad option is rejected here with
|
|
5
|
+
a clear error and the validation-free core (`pipe(text)`, `pipe.batch()`, the bare step functions)
|
|
6
|
+
never validates per string (ADR-0003). The facade assembles the effective `Pipeline` once per call
|
|
7
|
+
and runs it.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
from pydantic import validate_call
|
|
13
|
+
|
|
14
|
+
from araclean.config import NormalizeConfig
|
|
15
|
+
from araclean.pipeline import Pipeline
|
|
16
|
+
from araclean.profiles import LIGHT, Profile
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@validate_call
def normalize(
    text: str,
    *,
    profile: str | Profile | None = None,
    config: NormalizeConfig | None = None,
    **overrides: object,
) -> str:
    """Run `text` through the pipeline selected by `profile`, `config`, or per-knob overrides.

    Exactly one way of choosing the pipeline may be used per call:

    * ``config=NormalizeConfig(...)`` — a prebuilt config, resolved to a profile as-is. It cannot
      be combined with `profile` or `**overrides` (`TypeError`).
    * ``profile=<Profile object>`` — a fully custom pipeline. `**overrides` are not accepted here
      (`TypeError`); build the `Profile` with the steps you want instead.
    * ``profile="search"`` (or any other profile name) plus optional `**overrides` — a named
      preset tuned per knob, e.g. ``normalize(text, profile="ml", map_digits=True)`` or
      ``normalize(text, profile="social", emoji="strip")``. The name and the overrides are
      validated against `NormalizeConfig` by `build_pipeline`, so an unknown name, knob, or value
      is rejected before any text is processed.
    * nothing at all — ``profile=None`` falls back to `LIGHT`.

    Returns the normalized string produced by calling the assembled `Pipeline` on `text`.
    """
    if config is not None:
        # A prebuilt config is already a complete selection; anything else would be ambiguous.
        if not (profile is None and not overrides):
            raise TypeError("normalize(): pass either `config=` or `profile=`/overrides, not both.")
        pipeline = Pipeline.from_profile(config.resolve())
    elif isinstance(profile, Profile):
        # Overrides tune a *named* preset; a Profile object already fixes its own steps.
        if overrides:
            raise TypeError(
                "normalize(): **overrides require a profile name (a str), not a Profile object; "
                "build the Profile with the steps you want instead."
            )
        pipeline = Pipeline.from_profile(profile)
    else:
        # A profile name (or None -> LIGHT): build_pipeline validates the untrusted name and
        # overrides through NormalizeConfig before assembling, so the validation-free core never
        # sees an invalid option.
        pipeline = build_pipeline(profile, overrides)
    return pipeline(text)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def build_pipeline(profile: str | None, overrides: dict[str, object]) -> Pipeline:
    """Validate a profile name + overrides at the trust boundary and assemble the pipeline once.

    The single source of truth behind the `normalize` facade's name-branch and the CLI/pandas/polars
    adapters (ADR-0003): `model_validate` runs the full pydantic boundary over the untrusted name +
    overrides before any work happens, so the validation-free core (`pipe(text)`, the bare step
    functions) never sees an invalid option. `profile=None` selects `LIGHT`. Assembling once (not
    per line/row/string) keeps the hot path validation-free.

    `overrides` is a mapping of `NormalizeConfig` field names to values. It is merged on top of the
    profile name, so it must not carry a ``"profile"`` key that disagrees with an explicit `profile`
    argument — that combination is rejected rather than letting the override silently win.

    Raises `ValidationError` for a bad option value (or an empty/unknown profile name), `ValueError`
    for an override that does not apply to the profile (from `resolve()`) or for a ``"profile"``
    override that conflicts with the `profile` argument, and `EmojiSupportNotInstalledError` for
    ``emoji="demojize"`` without the ``[emoji]`` extra — each caller catches what it needs to.
    """
    # In `{"profile": name, **overrides}` a later "profile" key wins the merge, so an override
    # naming a different profile would silently discard the explicit argument. Reject only that
    # ambiguous case; `profile=None` and a matching value keep their previous behavior.
    if profile is not None and overrides.get("profile", profile) != profile:
        raise ValueError(
            "build_pipeline(): overrides must not carry a `profile` key that conflicts with the "
            f"`profile` argument ({overrides['profile']!r} != {profile!r})."
        )
    name = LIGHT.name if profile is None else profile
    config = NormalizeConfig.model_validate({"profile": name, **overrides})
    return Pipeline.from_profile(config.resolve())
|