araclean 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. araclean-0.2.0/.cspell/project-words.txt +95 -0
  2. araclean-0.2.0/.github/ISSUE_TEMPLATE/bug_report.md +41 -0
  3. araclean-0.2.0/.github/ISSUE_TEMPLATE/config.yml +13 -0
  4. araclean-0.2.0/.github/ISSUE_TEMPLATE/feature_request.md +36 -0
  5. araclean-0.2.0/.github/PULL_REQUEST_TEMPLATE.md +30 -0
  6. araclean-0.2.0/.github/workflows/ci.yml +171 -0
  7. araclean-0.2.0/.gitignore +27 -0
  8. araclean-0.2.0/.pre-commit-config.yaml +71 -0
  9. araclean-0.2.0/CHANGELOG.md +52 -0
  10. araclean-0.2.0/CODE_OF_CONDUCT.md +83 -0
  11. araclean-0.2.0/CONTEXT.md +95 -0
  12. araclean-0.2.0/CONTRIBUTING.md +120 -0
  13. araclean-0.2.0/GLOSSARY.md +74 -0
  14. araclean-0.2.0/LICENSE +21 -0
  15. araclean-0.2.0/PKG-INFO +141 -0
  16. araclean-0.2.0/PRD.md +410 -0
  17. araclean-0.2.0/README.md +104 -0
  18. araclean-0.2.0/SECURITY.md +29 -0
  19. araclean-0.2.0/asv.conf.json +18 -0
  20. araclean-0.2.0/benchmarks/README.md +78 -0
  21. araclean-0.2.0/benchmarks/__init__.py +8 -0
  22. araclean-0.2.0/benchmarks/bench_normalize.py +96 -0
  23. araclean-0.2.0/cspell.json +205 -0
  24. araclean-0.2.0/docs/adr/0001-v1-scope-normalization-core.md +18 -0
  25. araclean-0.2.0/docs/adr/0002-build-new-mit-library.md +18 -0
  26. araclean-0.2.0/docs/adr/0003-three-layer-api-validation-boundary.md +26 -0
  27. araclean-0.2.0/docs/adr/0004-non-destructive-by-default.md +21 -0
  28. araclean-0.2.0/docs/adr/0005-defer-offset-tracking.md +20 -0
  29. araclean-0.2.0/docs/adr/0006-pure-python-translate-engine.md +23 -0
  30. araclean-0.2.0/docs/adr/0007-arabic-primary-terminology.md +30 -0
  31. araclean-0.2.0/docs/adr/0008-commit-driven-versioning-commitizen.md +44 -0
  32. araclean-0.2.0/docs/adr/0009-canonical-equivalent-output.md +46 -0
  33. araclean-0.2.0/docs/adr/0010-preserve-line-structure-by-default.md +47 -0
  34. araclean-0.2.0/docs/adr/0011-cleaning-is-a-third-safety-class.md +46 -0
  35. araclean-0.2.0/docs/adr/0012-offset-preserving-normalization.md +97 -0
  36. araclean-0.2.0/docs/concepts/architecture.md +92 -0
  37. araclean-0.2.0/docs/concepts/safety.md +88 -0
  38. araclean-0.2.0/docs/concepts/why-araclean.md +84 -0
  39. araclean-0.2.0/docs/faq.md +85 -0
  40. araclean-0.2.0/docs/getting-started.md +83 -0
  41. araclean-0.2.0/docs/glossary.md +30 -0
  42. araclean-0.2.0/docs/guides/cli.md +76 -0
  43. araclean-0.2.0/docs/guides/composing-pipelines.md +128 -0
  44. araclean-0.2.0/docs/guides/custom-steps.md +113 -0
  45. araclean-0.2.0/docs/guides/dataframes.md +78 -0
  46. araclean-0.2.0/docs/guides/offset-preserving.md +125 -0
  47. araclean-0.2.0/docs/guides/reproducibility.md +108 -0
  48. araclean-0.2.0/docs/guides/stopwords.md +85 -0
  49. araclean-0.2.0/docs/guides/tuning-profiles.md +80 -0
  50. araclean-0.2.0/docs/includes/abbreviations.md +52 -0
  51. araclean-0.2.0/docs/index.md +90 -0
  52. araclean-0.2.0/docs/profiles/classical.md +19 -0
  53. araclean-0.2.0/docs/profiles/index.md +35 -0
  54. araclean-0.2.0/docs/profiles/light.md +19 -0
  55. araclean-0.2.0/docs/profiles/ml.md +23 -0
  56. araclean-0.2.0/docs/profiles/search.md +30 -0
  57. araclean-0.2.0/docs/profiles/social.md +28 -0
  58. araclean-0.2.0/docs/reference/cli.md +48 -0
  59. araclean-0.2.0/docs/reference.md +156 -0
  60. araclean-0.2.0/docs_gen.py +363 -0
  61. araclean-0.2.0/mkdocs.yml +119 -0
  62. araclean-0.2.0/pyproject.toml +203 -0
  63. araclean-0.2.0/src/araclean/__init__.py +152 -0
  64. araclean-0.2.0/src/araclean/api.py +69 -0
  65. araclean-0.2.0/src/araclean/chars.py +742 -0
  66. araclean-0.2.0/src/araclean/cli.py +259 -0
  67. araclean-0.2.0/src/araclean/config.py +189 -0
  68. araclean-0.2.0/src/araclean/fusion.py +128 -0
  69. araclean-0.2.0/src/araclean/offsets.py +170 -0
  70. araclean-0.2.0/src/araclean/pandas.py +95 -0
  71. araclean-0.2.0/src/araclean/pipeline.py +206 -0
  72. araclean-0.2.0/src/araclean/polars.py +97 -0
  73. araclean-0.2.0/src/araclean/profiles.py +276 -0
  74. araclean-0.2.0/src/araclean/py.typed +0 -0
  75. araclean-0.2.0/src/araclean/registry.py +40 -0
  76. araclean-0.2.0/src/araclean/safety.py +48 -0
  77. araclean-0.2.0/src/araclean/steps.py +2013 -0
  78. araclean-0.2.0/src/araclean/stopwords.py +217 -0
  79. araclean-0.2.0/tests/__snapshots__/test_pipeline.ambr +4 -0
  80. araclean-0.2.0/tests/__snapshots__/test_snapshots.ambr +98 -0
  81. araclean-0.2.0/tests/test_alignment.py +45 -0
  82. araclean-0.2.0/tests/test_api.py +288 -0
  83. araclean-0.2.0/tests/test_apply_aligned.py +519 -0
  84. araclean-0.2.0/tests/test_benchmarks_suite.py +55 -0
  85. araclean-0.2.0/tests/test_cli.py +181 -0
  86. araclean-0.2.0/tests/test_config.py +369 -0
  87. araclean-0.2.0/tests/test_cspell_config.py +45 -0
  88. araclean-0.2.0/tests/test_differential_oracles.py +191 -0
  89. araclean-0.2.0/tests/test_docs.py +192 -0
  90. araclean-0.2.0/tests/test_fused_engine.py +184 -0
  91. araclean-0.2.0/tests/test_offset_map.py +234 -0
  92. araclean-0.2.0/tests/test_oracle_benchmarks.py +158 -0
  93. araclean-0.2.0/tests/test_pandas.py +124 -0
  94. araclean-0.2.0/tests/test_pipeline.py +199 -0
  95. araclean-0.2.0/tests/test_polars.py +134 -0
  96. araclean-0.2.0/tests/test_profiles.py +781 -0
  97. araclean-0.2.0/tests/test_smoke.py +28 -0
  98. araclean-0.2.0/tests/test_snapshots.py +351 -0
  99. araclean-0.2.0/tests/test_steps.py +2136 -0
  100. araclean-0.2.0/tests/test_stopwords.py +96 -0
  101. araclean-0.2.0/uv.lock +1916 -0
@@ -0,0 +1,95 @@
1
+ # Project & tooling vocabulary that is not Arabic-domain terminology.
2
+ # The Arabic glossary lives in cspell.json `words` (ADR-0007); this file holds
3
+ # the non-domain words cspell's bundled dictionaries don't already know.
4
+ # One word per line; blank lines and `#` comments are ignored.
5
+
6
+ # Unicode code points referenced in prose (cf. FEFF/FDFF in the glossary)
7
+ FEFC
8
+
9
+ # Unicode Standard technical vocabulary (UAX #15 / UCD), not Arabic-domain terms
10
+ noncharacter
11
+ noncharacters
12
+ nonspacing
13
+ precomposed
14
+ doachashmee
15
+ chashmi
16
+ pepet
17
+
18
+ # Domain-adjacent proper nouns
19
+ Hijri
20
+ Uyghur
21
+ Jawi
22
+ Pegon
23
+
24
+ # ruff lint vocabulary (the ambiguous-unicode allow-list, RUF001-003)
25
+ confusables
26
+
27
+ # pytest / tooling config keys
28
+ addopts
29
+ testpaths
30
+ excinfo
31
+ capsys
32
+ optionflags
33
+ doctests
34
+
35
+ # pandas / polars vocabulary (the dataframe-accessor docs, issue 0021/0022)
36
+ dataframes
37
+ tolist
38
+
39
+ # Typer / CLI vocabulary (issue 0020): Typer's styled-echo helper
40
+ secho
41
+
42
+ # informal English used in the spec prose
43
+ grabbable
44
+ invisibles
45
+ precomputes
46
+ configurably
47
+ elongatable
48
+ disjointness
49
+ toolkits
50
+
51
+ # linguistics vocabulary (stopword handling, issue 0017) the bundled dicts lack
52
+ clitics
53
+ proclitics
54
+ enclitics
55
+ subordinators
56
+
57
+ # emoji handling (issue 0013) — the `emoji` library's verb and its inflections
58
+ demojize
59
+ demojizing
60
+
61
+ # release/versioning tooling (ADR-0008)
62
+ Commitizen
63
+ commitizen
64
+ SemVer
65
+ changelog
66
+
67
+ # Python / ML / packaging ecosystem referenced in the ADRs
68
+ stdlib
69
+ vectorizer
70
+ maturin
71
+ pypa
72
+ arabicstopwords
73
+
74
+ # mkdocs / docs-site tooling (issue 0023): mkdocs-material + pymdownx config keys and the
75
+ # pygments lexer name for an interactive-session ("pycon") code block.
76
+ pymdownx
77
+ superfences
78
+ linenums
79
+ crossrefs
80
+ pycon
81
+
82
+ # Unicode / regex technical vocabulary (roadmap 0.2/0.3 + Phase 1 steps)
83
+ keycap
84
+ keycaps
85
+ guillemets
86
+ guillemet
87
+ lookaheads
88
+ unflanked
89
+ carrierless
90
+ retargets
91
+
92
+ # Python API identifiers referenced in prose
93
+ maxunicode
94
+ omap
95
+ autojunk
@@ -0,0 +1,41 @@
1
+ ---
2
+ name: Bug report
3
+ about: A normalization step or the API produces the wrong output, or raises unexpectedly
4
+ title: "fix: "
5
+ labels: bug
6
+ assignees: ""
7
+ ---
8
+
9
+ ## What happened
10
+
11
+ A clear description of the bug.
12
+
13
+ ## Reproduction
14
+
15
+ The smallest input + configuration that reproduces it. Please paste **text**, not a screenshot, so
16
+ the exact code points are preserved.
17
+
18
+ ```python
19
+ from araclean import normalize # or the profile / step you used
20
+
21
+ text = "..." # the input string
22
+ result = normalize(text, profile="SEARCH") # the call you made
23
+ ```
24
+
25
+ ## Expected vs. actual output
26
+
27
+ - **Expected:** `...`
28
+ - **Actual:** `...`
29
+
30
+ If it raised instead, paste the full traceback.
31
+
32
+ ## Environment
33
+
34
+ - araclean version: <!-- python -c "import araclean; print(araclean.__version__)" -->
35
+ - Python version:
36
+ - OS:
37
+ - Extras installed (if any): `[cli]` / `[pandas]` / `[polars]` / `[emoji]` / none
38
+
39
+ ## Anything else
40
+
41
+ Optional: the Unicode code points involved (`U+0640` …), the linguistic intent, or links.
@@ -0,0 +1,13 @@
1
+ # Force contributors through the structured issue templates in this directory; the "blank issue" escape hatch is disabled
2
+ # so every report carries the input/config/version detail a normalization bug needs to be actionable.
3
+ blank_issues_enabled: false
4
+ contact_links:
5
+ - name: Question or discussion
6
+ url: https://github.com/MhdMartini/araclean/discussions
7
+ about: Usage questions, ideas, and anything that isn't a concrete bug or feature request.
8
+ - name: Security vulnerability
9
+ url: https://github.com/MhdMartini/araclean/security/advisories/new
10
+ about: Report a security issue privately — please do not open a public issue.
11
+ - name: Code of Conduct
12
+ url: https://github.com/MhdMartini/araclean/blob/main/CODE_OF_CONDUCT.md
13
+ about: How we expect everyone to behave, and how to report abuse.
@@ -0,0 +1,36 @@
1
+ ---
2
+ name: Feature request
3
+ about: Propose a new step, profile, or behavior
4
+ title: "feat: "
5
+ labels: enhancement
6
+ assignees: ""
7
+ ---
8
+
9
+ ## What's missing
10
+
11
+ The normalization/cleaning behavior you need and the use case behind it.
12
+
13
+ ## Proposed behavior
14
+
15
+ Concrete input → output examples. For a new transformation, show a few representative strings and the
16
+ output you'd expect.
17
+
18
+ | input | expected output |
19
+ | ----- | --------------- |
20
+ | `...` | `...` |
21
+
22
+ ## Safety class
23
+
24
+ Is this lossless (`ENCODING_REPAIR`) or lossy (`LINGUISTIC_FOLDING`)? Which profiles should include
25
+ it? (A lossless profile — `LIGHT`, `CLASSICAL` — may only contain `ENCODING_REPAIR` steps.)
26
+
27
+ ## Terminology (if it introduces a new concept)
28
+
29
+ araclean uses **Arabic-primary names** glossed to the English equivalent
30
+ ([ADR-0007](../../docs/adr/0007-arabic-primary-terminology.md)). If your proposal names a new
31
+ operation, suggest the established Arabic term and its English gloss so it can be added to
32
+ [`GLOSSARY.md`](../../GLOSSARY.md).
33
+
34
+ ## Alternatives considered
35
+
36
+ Existing steps/profiles you tried, and why they don't cover this.
@@ -0,0 +1,30 @@
1
+ <!--
2
+ Thanks for contributing to araclean! Keep the PR focused on one change.
3
+ The title must be a Conventional Commit (e.g. `feat(steps): add ReduceElongation`) — it feeds the
4
+ version bump and changelog (ADR-0008).
5
+ -->
6
+
7
+ ## What & why
8
+
9
+ What this changes and the motivation. Link any issue it closes (`Closes #NN`).
10
+
11
+ ## How it behaves
12
+
13
+ Input → output (or the error) that demonstrates the change, ideally mirrored by a new test.
14
+
15
+ ## Definition-of-done checklist
16
+
17
+ See [CONTRIBUTING.md](../CONTRIBUTING.md#the-test-bar-definition-of-done) for the full bar.
18
+
19
+ - [ ] Behavior test written **first** (TDD) and asserts **observable behavior through the public
20
+ interface** — not internal tables, private attributes, or step order.
21
+ - [ ] New/changed `Step`s declare their `safety` class; any lossless profile (`LIGHT`, `CLASSICAL`)
22
+ still contains only `ENCODING_REPAIR` steps.
23
+ - [ ] Steps precompute their `str.translate` table / compiled `re` at construction (no per-call setup
24
+ or validation).
25
+ - [ ] Gate is green locally: `uv run pre-commit run --all-files` (ruff, mypy --strict, pyright,
26
+ pytest, cspell).
27
+ - [ ] New Arabic term ⇒ added a [`GLOSSARY.md`](../GLOSSARY.md) row + docs abbreviation; public names
28
+ are Arabic-primary with an English gloss in the docstring (ADR-0007).
29
+ - [ ] Commits are Conventional Commits; `[project].version` is **not** hand-edited.
30
+ - [ ] Docs/ADRs updated if behavior, terminology, or a decision changed.
@@ -0,0 +1,171 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ tags: ["v*"] # a release tag (issue 0024) runs the gate, then publishes + deploys docs
7
+ pull_request:
8
+ workflow_dispatch: # manual end-to-end DRY RUN of the build/publish pipeline (0024 AC4)
9
+
10
+ concurrency:
11
+ group: ci-${{ github.ref }}
12
+ cancel-in-progress: true
13
+
14
+ jobs:
15
+ gate:
16
+ # The full quality gate (ruff, mypy, pyright, pytest, cspell) via pre-commit.
17
+ # OS/Python-independent, so it runs once.
18
+ name: quality gate
19
+ runs-on: ubuntu-latest
20
+ steps:
21
+ - uses: actions/checkout@v4
22
+ - uses: astral-sh/setup-uv@v6
23
+ with:
24
+ enable-cache: true
25
+ - run: uv sync
26
+ - uses: actions/setup-node@v4
27
+ with:
28
+ node-version: "20"
29
+ - name: pre-commit
30
+ run: uv run pre-commit run --all-files --show-diff-on-failure
31
+
32
+ commits:
33
+ # Conventional-Commit lint over the PR's commits (ADR-0008). The local
34
+ # commit-msg hook can be bypassed, so cz check runs server-side too; it reads
35
+ # the same rules cz bump uses to derive the version, so the changelog/bump
36
+ # can't be poisoned by a malformed message.
37
+ name: commit messages
38
+ if: github.event_name == 'pull_request'
39
+ runs-on: ubuntu-latest
40
+ steps:
41
+ - uses: actions/checkout@v4
42
+ with:
43
+ fetch-depth: 0
44
+ - uses: astral-sh/setup-uv@v6
45
+ with:
46
+ enable-cache: true
47
+ - run: uv sync
48
+ - name: commitizen check
49
+ run: uv run cz check --rev-range ${{ github.event.pull_request.base.sha }}..${{ github.event.pull_request.head.sha }}
50
+
51
+ docs:
52
+ # Build the docs site under --strict (issue 0023), so a broken link, a page missing from the
53
+ # nav, or a stale generated page fails CI. The `docs` dependency group is separate from `dev`,
54
+ # so only this job installs mkdocs-material; the docs guard tests (drift/cross-check/doctest/
55
+ # tooltips) run here with mkdocs present (they skip in the lean `gate`/`test` jobs).
56
+ name: docs site
57
+ runs-on: ubuntu-latest
58
+ steps:
59
+ - uses: actions/checkout@v4
60
+ - uses: astral-sh/setup-uv@v6
61
+ with:
62
+ enable-cache: true
63
+ - run: uv sync --group docs
64
+ - name: mkdocs build --strict
65
+ run: uv run mkdocs build --strict
66
+ - name: docs guards (drift, cross-check, doctest, tooltips, search)
67
+ run: uv run pytest tests/test_docs.py
68
+
69
+ test:
70
+ # Behavior + types across the supported Python versions (the OS axis is currently ubuntu-latest only; the wider matrix is commented out below).
71
+ name: test (py${{ matrix.python }} · ${{ matrix.os }})
72
+ strategy:
73
+ fail-fast: false
74
+ matrix:
75
+ # os: [ubuntu-latest, macos-latest, windows-latest]
76
+ os: [ubuntu-latest]
77
+ python: ["3.12", "3.13", "3.14"]
78
+ runs-on: ${{ matrix.os }}
79
+ steps:
80
+ - uses: actions/checkout@v4
81
+ - uses: astral-sh/setup-uv@v6
82
+ with:
83
+ enable-cache: true
84
+ python-version: ${{ matrix.python }}
85
+ - run: uv sync
86
+ - name: pytest
87
+ run: uv run pytest
88
+ - name: mypy --strict
89
+ run: uv run mypy
90
+ - name: pyright
91
+ run: uv run pyright
92
+
93
+ build:
94
+ # Build the sdist + wheel and validate the package metadata. Runs on every release tag AND on a
95
+ # manual workflow_dispatch — the dispatch path is the end-to-end DRY RUN (issue 0024, AC4): it
96
+ # produces and checks the exact artifacts a real publish would upload, without uploading them, so
97
+ # the packaging pipeline is proven before the first PyPI release. The built dist is handed to
98
+ # `publish` as an artifact, so the bytes that are checked are the bytes that ship.
99
+ name: build distribution
100
+ if: startsWith(github.ref, 'refs/tags/v') || github.event_name == 'workflow_dispatch'
101
+ runs-on: ubuntu-latest
102
+ steps:
103
+ - uses: actions/checkout@v4
104
+ - uses: astral-sh/setup-uv@v6
105
+ with:
106
+ enable-cache: true
107
+ - name: build sdist + wheel
108
+ # The version is static in pyproject.toml (cz owns it via `version_provider = "uv"`), so the
109
+ # build needs no git history — hatchling reads `[project].version` directly.
110
+ run: uv build
111
+ - name: check distribution metadata
112
+ run: uvx twine check --strict dist/*
113
+ - uses: actions/upload-artifact@v4
114
+ with:
115
+ name: dist
116
+ path: dist/
117
+
118
+ publish:
119
+ # Tag-triggered PyPI publish via Trusted Publishing (OIDC) — no stored token (ADR-0008, issue
120
+ # 0024). The OIDC trusted publisher registered on PyPI is bound to THIS workflow file (ci.yml), so the publish
121
+ # step must live here; moving it to another file would require re-doing the PyPI setup. Gated
122
+ # behind the full quality gate + the build/metadata check, so a tag only ships when everything is
123
+ # green. `id-token: write` lets the action mint the short-lived OIDC token it trades to PyPI; no
124
+ # other permission is granted. The `pypi` environment is a seam for adding a required-reviewer
125
+ # rule later (the publisher was registered with environment "any", so none is required today).
126
+ name: publish to PyPI
127
+ if: startsWith(github.ref, 'refs/tags/v')
128
+ needs: [gate, docs, test, build]
129
+ runs-on: ubuntu-latest
130
+ environment:
131
+ name: pypi
132
+ url: https://pypi.org/p/araclean
133
+ permissions:
134
+ id-token: write
135
+ steps:
136
+ - uses: actions/download-artifact@v4
137
+ with:
138
+ name: dist
139
+ path: dist/
140
+ - uses: pypa/gh-action-pypi-publish@release/v1
141
+
142
+ docs-deploy:
143
+ # Versioned docs (issue 0024, story 52): `mike` publishes the tag's docs as a pinned X.Y.Z version
144
+ # with a moving `latest` alias and makes `latest` the default, so the docs a reader lands on match
145
+ # the araclean version they installed — a reproducibility requirement (PRD), not a nicety. mike
146
+ # commits the rendered site to the `gh-pages` branch that GitHub Pages serves; `contents: write`
147
+ # authorizes that push and is the only permission granted. Runs after the gate + strict docs build
148
+ # are green, so the site can never be deployed broken.
149
+ name: deploy versioned docs
150
+ if: startsWith(github.ref, 'refs/tags/v')
151
+ needs: [gate, docs, test]
152
+ runs-on: ubuntu-latest
153
+ permissions:
154
+ contents: write
155
+ steps:
156
+ - uses: actions/checkout@v4
157
+ with:
158
+ fetch-depth: 0 # fetch full history so mike can append its commit to the existing gh-pages branch
159
+ - uses: astral-sh/setup-uv@v6
160
+ with:
161
+ enable-cache: true
162
+ - run: uv sync --group docs
163
+ - name: configure git identity for the gh-pages commit
164
+ run: |
165
+ git config user.name "github-actions[bot]"
166
+ git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
167
+ - name: mike deploy
168
+ run: |
169
+ version="${GITHUB_REF_NAME#v}"
170
+ uv run mike deploy --push --update-aliases "$version" latest
171
+ uv run mike set-default --push latest
@@ -0,0 +1,27 @@
1
+ .ralph
2
+ issues/
3
+ docs/competitive-landscape.md
4
+
5
+ # Python
6
+ .venv/
7
+ __pycache__/
8
+ *.py[cod]
9
+ .mypy_cache/
10
+ .ruff_cache/
11
+ .pytest_cache/
12
+ .hypothesis/
13
+ build/
14
+ dist/
15
+ *.egg-info/
16
+
17
+ # Benchmark outputs: asv's envs/results/html (issue 0019) and pytest-benchmark's saved runs. The
18
+ # benchmark *suites* (benchmarks/, asv.conf.json) are tracked; only their generated timings are not.
19
+ .asv/
20
+ .benchmarks/
21
+
22
+ # Docs site build output (issue 0023): `mkdocs build` writes to site/. The source pages (docs/) and
23
+ # the generated, committed pages are tracked; only the rendered HTML is not.
24
+ site/
25
+
26
+ # Node (cspell runs via npx)
27
+ node_modules/
@@ -0,0 +1,71 @@
1
+ # Quality gate (issue 0001). Every hook runs the project's pinned tool via
2
+ # `uv run` so there is a single source of tool versions (the uv lockfile);
3
+ # cSpell runs via npx against the project dictionary (cspell.json).
4
+ #
5
+ # uv run pre-commit install # wire the hooks
6
+ # uv run pre-commit run --all-files # run the whole gate
7
+ minimum_pre_commit_version: "3.7.0"
8
+ default_install_hook_types: [pre-commit, commit-msg]
9
+ # Hooks below run at pre-commit unless they opt into another stage (only the
10
+ # commitizen hook does, at commit-msg) — so cspell/pytest don't lint commit text.
11
+ default_stages: [pre-commit]
12
+
13
+ repos:
14
+ - repo: local
15
+ hooks:
16
+ - id: ruff-check
17
+ name: ruff (lint)
18
+ entry: uv run ruff check --force-exclude
19
+ language: system
20
+ types_or: [python, pyi]
21
+ require_serial: true
22
+
23
+ - id: ruff-format
24
+ name: ruff (format)
25
+ entry: uv run ruff format --force-exclude
26
+ language: system
27
+ types_or: [python, pyi]
28
+ require_serial: true
29
+
30
+ - id: mypy
31
+ name: mypy --strict
32
+ entry: uv run mypy
33
+ language: system
34
+ types: [python]
35
+ pass_filenames: false
36
+
37
+ - id: pyright
38
+ name: pyright
39
+ entry: uv run pyright
40
+ language: system
41
+ types: [python]
42
+ pass_filenames: false
43
+
44
+ - id: pytest
45
+ name: pytest
46
+ entry: uv run pytest
47
+ language: system
48
+ types: [python]
49
+ pass_filenames: false
50
+ always_run: true
51
+
52
+ - id: cspell
53
+ name: cspell (canonical terminology)
54
+ # --no-must-find-files: a commit can stage a file cspell has nothing to check (the generated
55
+ # uv.lock, which cspell skips); without this, cspell exits non-zero on "no files" when that
56
+ # file is checked alone, failing the commit even though nothing is misspelled. A real
57
+ # misspelling in a checked file still fails the hook.
58
+ entry: npx --yes cspell@9.8.0 --no-progress --no-summary --no-must-find-files
59
+ language: system
60
+ types: [text]
61
+ # npx's package cache is not concurrency-safe: pre-commit splits a long file list (CI's
62
+ # --all-files) into batches it runs IN PARALLEL, and concurrent `npx --yes` installs into
63
+ # a cold cache corrupt each other (seen in CI as a bogus SyntaxError from inside
64
+ # cspell-lib). One serial invocation installs once, then checks everything.
65
+ require_serial: true
66
+
67
+ - id: commitizen
68
+ name: commitizen (Conventional Commit message)
69
+ entry: uv run cz check --allow-abort --commit-msg-file
70
+ language: system
71
+ stages: [commit-msg]
@@ -0,0 +1,52 @@
1
+ ## v0.2.0 (2026-06-12)
2
+
3
+ ### BREAKING CHANGE
4
+
5
+ - remove the exclude_sukun option from RemoveTashkeel and
6
+ remove_tashkeel and drop it from the serialized config. Sukun is the
7
+ absence of a vowel (not a haraka) but always rides with HARAKAT —
8
+ stripping the vowels while leaving a bare sukun was never a use case, so
9
+ the flag added API surface for no real need. Sukun is still removed only
10
+ alongside HARAKAT, never on its own.
11
+
12
+ ### Feat
13
+
14
+ - implement offset-preserving normalization (Phase 2 Bet 1, ADR-0012)
15
+ - add the polars .araclean Series namespace (0022)
16
+ - add the pandas .araclean Series accessor (0021)
17
+ - add the araclean CLI — a thin Typer adapter over the facade (0020)
18
+ - add RemoveStopwords — a curated, versioned, negation-safe stopword list (0017)
19
+ - add the config trust boundary — NormalizeConfig, JSON Schema, safety audit (0016)
20
+ - add SOCIAL — the noisy-user-text profile that keeps the signal (0014)
21
+ - add CLASSICAL — the vocalization-preserving lossless profile (0015)
22
+ - add HandleEmoji — keep / strip / demojize (0013)
23
+ - add cleaning steps — CleanURLs, CleanMentions, CleanHTML (0012)
24
+ - add Pipeline ergonomics — batch, repr, select (0005)
25
+ - add ML, the conservative-on-letters profile (0011)
26
+ - add SEARCH, the maximal recall profile (0010)
27
+ - add ReduceElongation, the repeated-letter elongation cap (0009)
28
+ - add MapDigits + MapPunctuation, the digit/punctuation folds (0008)
29
+ - add the four opt-in letter folds (0007)
30
+ - add RemoveTashkeel, the first lossy step (0006)
31
+ - preserve line structure in CollapseWhitespace by default
32
+ - complete LIGHT with the four char-map steps (0004)
33
+ - fold Arabic presentation forms into LIGHT (0003)
34
+ - thread NormalizeUnicode through all three layers (0002)
35
+
36
+ ### Fix
37
+
38
+ - harden apply_aligned (ADR-0012 review) and repair the CI gate
39
+ - make `asv run` build araclean via pip instead of a missing wheel (0019)
40
+ - flatten line breaks in the SEARCH profile per ADR-0010 (0025)
41
+ - give every lossy profile the shared CollapseWhitespace+NFC closing tail
42
+ - **steps**: complete the alef/hamza letter-fold repertoire (0007)
43
+ - cover the full tashkeel repertoire; drop exclude_sukun (0006)
44
+ - guarantee canonical NFC output so normalize is idempotent
45
+
46
+ ### Refactor
47
+
48
+ - consolidate the duplicated _build_pipeline helper into one
49
+
50
+ ### Perf
51
+
52
+ - fuse a pipeline's consecutive str.translate steps into one pass (0018)
@@ -0,0 +1,83 @@
1
+ # Contributor Covenant Code of Conduct
2
+
3
+ ## Our Pledge
4
+
5
+ We as members, contributors, and leaders pledge to make participation in our community a harassment-free experience for everyone, regardless of age, body size, visible or invisible disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, caste, color, religion, or sexual identity and orientation.
6
+
7
+ We pledge to act and interact in ways that contribute to an open, welcoming, diverse, inclusive, and healthy community.
8
+
9
+ ## Our Standards
10
+
11
+ Examples of behavior that contributes to a positive environment for our community include:
12
+
13
+ * Demonstrating empathy and kindness toward other people
14
+ * Being respectful of differing opinions, viewpoints, and experiences
15
+ * Giving and gracefully accepting constructive feedback
16
+ * Accepting responsibility and apologizing to those affected by our mistakes, and learning from the experience
17
+ * Focusing on what is best not just for us as individuals, but for the overall community
18
+
19
+ Examples of unacceptable behavior include:
20
+
21
+ * The use of sexualized language or imagery, and sexual attention or advances of any kind
22
+ * Trolling, insulting or derogatory comments, and personal or political attacks
23
+ * Public or private harassment
24
+ * Publishing others' private information, such as a physical or email address, without their explicit permission
25
+ * Other conduct which could reasonably be considered inappropriate in a professional setting
26
+
27
+ ## Enforcement Responsibilities
28
+
29
+ Community leaders are responsible for clarifying and enforcing our standards of acceptable behavior and will take appropriate and fair corrective action in response to any behavior that they deem inappropriate, threatening, offensive, or harmful.
30
+
31
+ Community leaders have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, and will communicate reasons for moderation decisions when appropriate.
32
+
33
+ ## Scope
34
+
35
+ This Code of Conduct applies within all community spaces, and also applies when an individual is officially representing the community in public spaces. Examples of representing our community include using an official e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event.
36
+
37
+ ## Enforcement
38
+
39
+ Instances of abusive, harassing, or otherwise unacceptable behavior may be reported to the community leaders responsible for enforcement at [INSERT CONTACT METHOD]. All complaints will be reviewed and investigated promptly and fairly.
40
+
41
+ All community leaders are obligated to respect the privacy and security of the reporter of any incident.
42
+
43
+ ## Enforcement Guidelines
44
+
45
+ Community leaders will follow these Community Impact Guidelines in determining the consequences for any action they deem in violation of this Code of Conduct:
46
+
47
+ ### 1. Correction
48
+
49
+ **Community Impact**: Use of inappropriate language or other behavior deemed unprofessional or unwelcome in the community.
50
+
51
+ **Consequence**: A private, written warning from community leaders, providing clarity around the nature of the violation and an explanation of why the behavior was inappropriate. A public apology may be requested.
52
+
53
+ ### 2. Warning
54
+
55
+ **Community Impact**: A violation through a single incident or series of actions.
56
+
57
+ **Consequence**: A warning with consequences for continued behavior. No interaction with the people involved, including unsolicited interaction with those enforcing the Code of Conduct, for a specified period of time. This includes avoiding interactions in community spaces as well as external channels like social media. Violating these terms may lead to a temporary or permanent ban.
58
+
59
+ ### 3. Temporary Ban
60
+
61
+ **Community Impact**: A serious violation of community standards, including sustained inappropriate behavior.
62
+
63
+ **Consequence**: A temporary ban from any sort of interaction or public communication with the community for a specified period of time. No public or private interaction with the people involved, including unsolicited interaction with those enforcing the Code of Conduct, is allowed during this period. Violating these terms may lead to a permanent ban.
64
+
65
+ ### 4. Permanent Ban
66
+
67
+ **Community Impact**: Demonstrating a pattern of violation of community standards, including sustained inappropriate behavior, harassment of an individual, or aggression toward or disparagement of classes of individuals.
68
+
69
+ **Consequence**: A permanent ban from any sort of public interaction within the community.
70
+
71
+ ## Attribution
72
+
73
+ This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 2.1, available at [https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1].
74
+
75
+ Community Impact Guidelines were inspired by [Mozilla's code of conduct enforcement ladder][Mozilla CoC].
76
+
77
+ For answers to common questions about this code of conduct, see the FAQ at [https://www.contributor-covenant.org/faq][FAQ]. Translations are available at [https://www.contributor-covenant.org/translations][translations].
78
+
79
+ [homepage]: https://www.contributor-covenant.org
80
+ [v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html
81
+ [Mozilla CoC]: https://github.com/mozilla/diversity
82
+ [FAQ]: https://www.contributor-covenant.org/faq
83
+ [translations]: https://www.contributor-covenant.org/translations