gladia-normalization 0.1.0a1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105) hide show
  1. gladia_normalization-0.1.0a1/.commitlintrc.json +3 -0
  2. gladia_normalization-0.1.0a1/.github/pull_request_template.md +43 -0
  3. gladia_normalization-0.1.0a1/.github/workflows/cd.yml +49 -0
  4. gladia_normalization-0.1.0a1/.github/workflows/ci.yml +53 -0
  5. gladia_normalization-0.1.0a1/.gitignore +19 -0
  6. gladia_normalization-0.1.0a1/.pre-commit-config.yaml +28 -0
  7. gladia_normalization-0.1.0a1/.python-version +1 -0
  8. gladia_normalization-0.1.0a1/AGENTS.md +160 -0
  9. gladia_normalization-0.1.0a1/CLAUDE.md +1 -0
  10. gladia_normalization-0.1.0a1/CONTRIBUTING.md +217 -0
  11. gladia_normalization-0.1.0a1/LICENSE +21 -0
  12. gladia_normalization-0.1.0a1/PKG-INFO +204 -0
  13. gladia_normalization-0.1.0a1/README.md +161 -0
  14. gladia_normalization-0.1.0a1/docs/steps.md +439 -0
  15. gladia_normalization-0.1.0a1/normalization/__init__.py +4 -0
  16. gladia_normalization-0.1.0a1/normalization/constants/__init__.py +3 -0
  17. gladia_normalization-0.1.0a1/normalization/constants/protectors.py +24 -0
  18. gladia_normalization-0.1.0a1/normalization/languages/__init__.py +7 -0
  19. gladia_normalization-0.1.0a1/normalization/languages/base/__init__.py +7 -0
  20. gladia_normalization-0.1.0a1/normalization/languages/base/language_config.py +83 -0
  21. gladia_normalization-0.1.0a1/normalization/languages/base/language_operator.py +65 -0
  22. gladia_normalization-0.1.0a1/normalization/languages/english/__init__.py +7 -0
  23. gladia_normalization-0.1.0a1/normalization/languages/english/number_normalizer.py +433 -0
  24. gladia_normalization-0.1.0a1/normalization/languages/english/operators.py +199 -0
  25. gladia_normalization-0.1.0a1/normalization/languages/english/replacements.py +1774 -0
  26. gladia_normalization-0.1.0a1/normalization/languages/english/sentence_replacements.py +3 -0
  27. gladia_normalization-0.1.0a1/normalization/languages/french/__init__.py +7 -0
  28. gladia_normalization-0.1.0a1/normalization/languages/french/operators.py +38 -0
  29. gladia_normalization-0.1.0a1/normalization/languages/french/replacements.py +1 -0
  30. gladia_normalization-0.1.0a1/normalization/languages/registery.py +17 -0
  31. gladia_normalization-0.1.0a1/normalization/pipeline/__init__.py +0 -0
  32. gladia_normalization-0.1.0a1/normalization/pipeline/base.py +120 -0
  33. gladia_normalization-0.1.0a1/normalization/pipeline/loader.py +70 -0
  34. gladia_normalization-0.1.0a1/normalization/pipeline/replacer.py +38 -0
  35. gladia_normalization-0.1.0a1/normalization/presets/gladia-3.yaml +119 -0
  36. gladia_normalization-0.1.0a1/normalization/steps/__init__.py +4 -0
  37. gladia_normalization-0.1.0a1/normalization/steps/base/__init__.py +6 -0
  38. gladia_normalization-0.1.0a1/normalization/steps/base/protect_step.py +24 -0
  39. gladia_normalization-0.1.0a1/normalization/steps/base/restore_step.py +23 -0
  40. gladia_normalization-0.1.0a1/normalization/steps/base/text_step.py +11 -0
  41. gladia_normalization-0.1.0a1/normalization/steps/base/word_step.py +11 -0
  42. gladia_normalization-0.1.0a1/normalization/steps/registery.py +26 -0
  43. gladia_normalization-0.1.0a1/normalization/steps/text/__init__.py +79 -0
  44. gladia_normalization-0.1.0a1/normalization/steps/text/apply_sentence_level_replacements.py +29 -0
  45. gladia_normalization-0.1.0a1/normalization/steps/text/casefold_text.py +13 -0
  46. gladia_normalization-0.1.0a1/normalization/steps/text/convert_comparison_operators_to_words.py +26 -0
  47. gladia_normalization-0.1.0a1/normalization/steps/text/convert_decimal_periods_to_decimal_word.py +30 -0
  48. gladia_normalization-0.1.0a1/normalization/steps/text/convert_degree_symbols_to_words.py +26 -0
  49. gladia_normalization-0.1.0a1/normalization/steps/text/convert_digit_word_sequences_to_digits.py +41 -0
  50. gladia_normalization-0.1.0a1/normalization/steps/text/convert_dots_to_words_in_technical_contexts.py +33 -0
  51. gladia_normalization-0.1.0a1/normalization/steps/text/convert_oclock_to_numeric_time.py +34 -0
  52. gladia_normalization-0.1.0a1/normalization/steps/text/convert_roman_numerals_to_digits.py +41 -0
  53. gladia_normalization-0.1.0a1/normalization/steps/text/convert_word_based_time_patterns.py +70 -0
  54. gladia_normalization-0.1.0a1/normalization/steps/text/expand_alphanumeric_codes.py +64 -0
  55. gladia_normalization-0.1.0a1/normalization/steps/text/expand_contractions.py +16 -0
  56. gladia_normalization-0.1.0a1/normalization/steps/text/expand_written_numbers_to_digits.py +13 -0
  57. gladia_normalization-0.1.0a1/normalization/steps/text/expand_www_abbreviation.py +15 -0
  58. gladia_normalization-0.1.0a1/normalization/steps/text/fix_ampm_letter_spacing.py +25 -0
  59. gladia_normalization-0.1.0a1/normalization/steps/text/fix_dot_adjacent_number_words.py +37 -0
  60. gladia_normalization-0.1.0a1/normalization/steps/text/fix_one_word_in_numeric_contexts.py +16 -0
  61. gladia_normalization-0.1.0a1/normalization/steps/text/fix_version_number_v_prefix.py +15 -0
  62. gladia_normalization-0.1.0a1/normalization/steps/text/format_time_patterns_with_ampm.py +53 -0
  63. gladia_normalization-0.1.0a1/normalization/steps/text/normalize_numeric_time_formats.py +16 -0
  64. gladia_normalization-0.1.0a1/normalization/steps/text/normalize_punctuation_between_number_words.py +30 -0
  65. gladia_normalization-0.1.0a1/normalization/steps/text/placeholders.py +367 -0
  66. gladia_normalization-0.1.0a1/normalization/steps/text/protect_plus_word_before_digit_words.py +37 -0
  67. gladia_normalization-0.1.0a1/normalization/steps/text/remove_acronym_periods.py +17 -0
  68. gladia_normalization-0.1.0a1/normalization/steps/text/remove_diacritics.py +40 -0
  69. gladia_normalization-0.1.0a1/normalization/steps/text/remove_filler_words.py +19 -0
  70. gladia_normalization-0.1.0a1/normalization/steps/text/remove_hash_before_numbers.py +15 -0
  71. gladia_normalization-0.1.0a1/normalization/steps/text/remove_non_numeric_trailing_dots.py +15 -0
  72. gladia_normalization-0.1.0a1/normalization/steps/text/remove_spaces_between_adjacent_digits.py +54 -0
  73. gladia_normalization-0.1.0a1/normalization/steps/text/remove_standalone_currency_symbols.py +35 -0
  74. gladia_normalization-0.1.0a1/normalization/steps/text/remove_symbols.py +24 -0
  75. gladia_normalization-0.1.0a1/normalization/steps/text/remove_thousand_separators.py +24 -0
  76. gladia_normalization-0.1.0a1/normalization/steps/text/remove_trailing_apostrophe_space.py +15 -0
  77. gladia_normalization-0.1.0a1/normalization/steps/text/remove_trailing_dot_word_from_emails.py +22 -0
  78. gladia_normalization-0.1.0a1/normalization/steps/text/remove_trailing_period.py +15 -0
  79. gladia_normalization-0.1.0a1/normalization/steps/text/remove_zero_minutes_from_time.py +30 -0
  80. gladia_normalization-0.1.0a1/normalization/steps/text/replace_currency.py +31 -0
  81. gladia_normalization-0.1.0a1/normalization/steps/word/__init__.py +3 -0
  82. gladia_normalization-0.1.0a1/normalization/steps/word/apply_word_replacements.py +33 -0
  83. gladia_normalization-0.1.0a1/pyproject.toml +59 -0
  84. gladia_normalization-0.1.0a1/scripts/generate_step_docs.py +78 -0
  85. gladia_normalization-0.1.0a1/tests/__init__.py +0 -0
  86. gladia_normalization-0.1.0a1/tests/e2e/__init__.py +0 -0
  87. gladia_normalization-0.1.0a1/tests/e2e/default_pipeline_test.py +42 -0
  88. gladia_normalization-0.1.0a1/tests/e2e/files/gladia-3.csv +126 -0
  89. gladia_normalization-0.1.0a1/tests/e2e/normalization_test.py +68 -0
  90. gladia_normalization-0.1.0a1/tests/unit/languages/__init__.py +0 -0
  91. gladia_normalization-0.1.0a1/tests/unit/languages/english_registry_test.py +33 -0
  92. gladia_normalization-0.1.0a1/tests/unit/languages/symbols_to_words_test.py +29 -0
  93. gladia_normalization-0.1.0a1/tests/unit/languages/word_replacement_test.py +21 -0
  94. gladia_normalization-0.1.0a1/tests/unit/steps/__init__.py +0 -0
  95. gladia_normalization-0.1.0a1/tests/unit/steps/text/__init__.py +0 -0
  96. gladia_normalization-0.1.0a1/tests/unit/steps/text/conftest.py +22 -0
  97. gladia_normalization-0.1.0a1/tests/unit/steps/text/convert_dots_to_words_in_technical_contexts_test.py +54 -0
  98. gladia_normalization-0.1.0a1/tests/unit/steps/text/convert_oclock_to_numeric_time_test.py +32 -0
  99. gladia_normalization-0.1.0a1/tests/unit/steps/text/convert_roman_numerals_to_digits_test.py +72 -0
  100. gladia_normalization-0.1.0a1/tests/unit/steps/text/protect_decimal_separator_test.py +40 -0
  101. gladia_normalization-0.1.0a1/tests/unit/steps/text/remove_diacritics_test.py +17 -0
  102. gladia_normalization-0.1.0a1/tests/unit/steps/text/remove_zero_minutes_from_time_test.py +21 -0
  103. gladia_normalization-0.1.0a1/tests/unit/steps/text/replace_currency_test.py +28 -0
  104. gladia_normalization-0.1.0a1/tests/unit/steps/text/restore_decimal_separator_with_word_test.py +21 -0
  105. gladia_normalization-0.1.0a1/uv.lock +315 -0
@@ -0,0 +1,3 @@
1
+ {
2
+ "extends": ["@commitlint/config-conventional"]
3
+ }
@@ -0,0 +1,43 @@
1
+ ## What does this PR do?
2
+
3
+ <!-- One-sentence summary of the change. -->
4
+
5
+ ## Type of change
6
+
7
+ - [ ] New language (`languages/{lang}/`)
8
+ - [ ] New step (`steps/text/` or `steps/word/`)
9
+ - [ ] New preset version (`presets/`)
10
+ - [ ] Bug fix
11
+ - [ ] Refactor / internal cleanup
12
+ - [ ] Docs / CI
13
+
14
+ ## Checklist
15
+
16
+ ### New language
17
+
18
+ - [ ] Created `languages/{lang}/` with `operators.py`, `replacements.py`, `__init__.py`
19
+ - [ ] All word-level substitutions are in `replacements.py`, not inline in `operators.py`
20
+ - [ ] Decorated operators class with `@register_language`
21
+ - [ ] Added one import line to `languages/__init__.py`
22
+ - [ ] Added unit tests in `tests/unit/languages/`
23
+ - [ ] Added e2e test rows in `tests/e2e/files/`
24
+
25
+ ### New step
26
+
27
+ - [ ] `name` class attribute is unique and matches the YAML key
28
+ - [ ] Decorated with `@register_step`
29
+ - [ ] Added one import line to `steps/text/__init__.py` or `steps/word/__init__.py`
30
+ - [ ] Algorithm reads data from `operators.config.*`, no hardcoded language-specific values
31
+ - [ ] Optional config fields are guarded with `if operators.config.field is None: return text`
32
+ - [ ] Placeholder protect/restore pairs are both in `steps/text/placeholders.py` and `pipeline/base.py`'s `validate()` is updated
33
+ - [ ] Added unit tests in `tests/unit/steps/`
34
+ - [ ] Added step name to relevant preset YAMLs (new preset file if existing presets are affected)
35
+ - [ ] If the class docstring was added or changed, ran `uv run scripts/generate_step_docs.py` to regenerate `docs/steps.md`
36
+
37
+ ### Preset change
38
+
39
+ - [ ] Existing preset files are not modified — new behavior uses a new preset version file
40
+
41
+ ## Tests
42
+
43
+ <!-- Describe what was tested and how. -->
@@ -0,0 +1,49 @@
1
+ name: CD
2
+
3
+ on:
4
+ push:
5
+ tags:
6
+ # PEP 440 versioning
7
+ - v[0-9]+.[0-9]+.[0-9]+
8
+ - v[0-9]+.[0-9]+.[0-9]+a[0-9]+
9
+ - v[0-9]+.[0-9]+.[0-9]+b[0-9]+
10
+ - v[0-9]+.[0-9]+.[0-9]+rc[0-9]+
11
+
12
+ jobs:
13
+ publish:
14
+ name: Build and publish
15
+ runs-on: ubuntu-latest
16
+ environment: pypi
17
+ permissions:
18
+ contents: read
19
+ id-token: write # required for Trusted Publisher (OIDC)
20
+
21
+ steps:
22
+ - uses: actions/checkout@v6
23
+
24
+ - uses: astral-sh/setup-uv@v7
25
+ with:
26
+ python-version: "3.13"
27
+
28
+ - name: Validate tag format
29
+ run: |
30
+ if [[ ! "${{ github.ref_name }}" =~ ^v[0-9]+\.[0-9]+\.[0-9]+(a|b|rc)?[0-9]*$ ]]; then
31
+ echo "Error: Tag must follow PEP 440 versioning format (vMAJOR.MINOR.PATCH with optional pre-release suffix)"
32
+ echo "Examples: v1.2.3, v1.2.3a1, v1.2.3b2, v1.2.3rc1"
33
+ echo "Got: ${{ github.ref_name }}"
34
+ exit 1
35
+ fi
36
+
37
+ - name: Extract version from tag
38
+ id: version
39
+ run: echo "version=${GITHUB_REF_NAME#v}" >> "$GITHUB_OUTPUT"
40
+
41
+ - name: Update version in pyproject.toml
42
+ run: |
43
+ sed -i 's/^version = ".*"/version = "${{ steps.version.outputs.version }}"/' pyproject.toml
44
+
45
+ - name: Build
46
+ run: uv build
47
+
48
+ - name: Publish to PyPI
49
+ run: uv publish
@@ -0,0 +1,53 @@
1
+ name: CI
2
+
3
+ on:
4
+ pull_request:
5
+ branches: [main]
6
+ types: [opened, synchronize, reopened, labeled]
7
+ workflow_dispatch:
8
+
9
+ jobs:
10
+ commitlint:
11
+ name: Lint commit messages
12
+ runs-on: ubuntu-latest
13
+ permissions:
14
+ contents: read
15
+ pull-requests: read
16
+ steps:
17
+ - uses: actions/checkout@v6
18
+ with:
19
+ fetch-depth: 0
20
+ - uses: wagoid/commitlint-github-action@v6
21
+ with:
22
+ failOnWarnings: false
23
+
24
+ lint:
25
+ name: Lint (ruff)
26
+ runs-on: ubuntu-latest
27
+ steps:
28
+ - uses: actions/checkout@v6
29
+ - uses: astral-sh/setup-uv@v7
30
+ - run: uvx ruff check .
31
+ - run: uvx ruff format --check .
32
+
33
+ typecheck:
34
+ name: Type check (ty)
35
+ runs-on: ubuntu-latest
36
+ steps:
37
+ - uses: actions/checkout@v6
38
+ - uses: astral-sh/setup-uv@v7
39
+ with:
40
+ python-version: "3.13"
41
+ - run: uv sync --group dev
42
+ - run: uv run ty check .
43
+
44
+ test:
45
+ name: Tests (pytest)
46
+ runs-on: ubuntu-latest
47
+ steps:
48
+ - uses: actions/checkout@v6
49
+ - uses: astral-sh/setup-uv@v7
50
+ with:
51
+ python-version: "3.13"
52
+ - run: uv sync --group dev
53
+ - run: uv run pytest
@@ -0,0 +1,19 @@
1
+ # Python-generated files
2
+ __pycache__/
3
+ *.py[oc]
4
+ build/
5
+ dist/
6
+ wheels/
7
+ *.egg-info
8
+ .ruff_cache/
9
+ .pytest_cache/
10
+
11
+
12
+ # Virtual environments
13
+ .venv
14
+
15
+ # IDE
16
+ .vscode/
17
+ .idea/
18
+ .cursor/
19
+ .claude/
@@ -0,0 +1,28 @@
1
+ default_install_hook_types:
2
+ - pre-commit
3
+ - commit-msg
4
+
5
+ repos:
6
+ - repo: https://github.com/alessandrojcm/commitlint-pre-commit-hook
7
+ rev: v9.24.0
8
+ hooks:
9
+ - id: commitlint
10
+ stages: [commit-msg]
11
+ additional_dependencies: ["@commitlint/config-conventional"]
12
+ verbose: true
13
+ - repo: https://github.com/astral-sh/ruff-pre-commit
14
+ rev: v0.15.2
15
+ hooks:
16
+ - id: ruff-check
17
+ args: [--fix]
18
+ - id: ruff-format
19
+
20
+ # Remove this once ty pre-commit hook is released
21
+ - repo: local
22
+ hooks:
23
+ - id: ty
24
+ name: ty check
25
+ entry: uvx ty check .
26
+ language: system
27
+ pass_filenames: false
28
+ always_run: true
@@ -0,0 +1 @@
1
+ 3.13
@@ -0,0 +1,160 @@
1
+ # text_normalizers — Agent Guidelines
2
+
3
+ This document describes the architecture, conventions, and rules for contributing to `normalization`. Read it fully before making any change.
4
+
5
+ ---
6
+
7
+ ## What this project is
8
+
9
+ A Python library for normalizing speech-to-text transcription output and ground truth to enable fair Word Error Rate (WER) comparison across STT engines. It converts surface-form variations (currency symbols, written numbers, abbreviations, punctuation, fillers) into a canonical text representation so that semantically equivalent transcriptions are treated as identical.
10
+ The repository uses uv as its package manager.
11
+
12
+ ---
13
+
14
+ ## Architecture overview
15
+
16
+ The pipeline has exactly three stages, always in this order:
17
+
18
+ 1. **Text pre-processing** — full-text transformations before word splitting (e.g. placeholder protection, symbol conversion, contraction expansion)
19
+ 2. **Word processing** — per-token transformations after splitting on spaces (e.g. replacements, email detection)
20
+ 3. **Text post-processing** — full-text cleanup after rejoining words (e.g. placeholder restoration, digit collapsing)
21
+
22
+ This 3-stage structure is a hard constraint, not a suggestion. Steps have implicit ordering dependencies (a placeholder must be protected before symbols are removed, and restored after). Never flatten stages or allow steps to run out of order.
23
+
24
+ ### Stage responsibilities
25
+
26
+ **text_pre_steps** — full text before word splitting.
27
+ Protect patterns (decimals, email symbols, slashes), expand multi-word forms (contractions, numbers, acronyms), convert symbols to words (currency, degrees, operators), apply character-level transforms (casefold, diacritics, punctuation removal), normalize whitespace.
28
+
29
+ **word_steps** — individual tokens after splitting, no neighbor context.
30
+ Skip special tokens (emails), apply single-word replacements (`vs` → `versus`), remove bracketed noise (`[inaudible]`).
31
+
32
+ **text_post_steps** — full text after word joining.
33
+ Restore placeholders to their final form (characters or words), format multi-word patterns (time, numbers), collapse digit sequences, normalize whitespace.
34
+
35
+ Pipelines are defined in YAML. The YAML lists which steps run in each stage. Step classes register themselves automatically via a decorator — the YAML name maps directly to the registered step.
36
+
37
+ ---
38
+
39
+ ## Project structure — key rules
40
+
41
+ ### `languages/`
42
+
43
+ Each supported language is a **self-contained folder** (e.g. `languages/english/`). Every language folder follows the same structure:
44
+
45
+ - `operators.py` — subclass of `LanguageOperators`, holds the language config instance and any language-specific _behavioral_ method overrides
46
+ - `replacements.py` — a plain `dict[str, str]` of **all** word-level substitutions for this language. Every word replacement goes here — never add inline entries in `operators.py`. An empty dict is valid for languages with no replacements yet.
47
+ - `__init__.py` — exports the operators class and the replacements dict, nothing else. Do not re-export sentence replacements, number normalizers, or any other internal symbols.
48
+
49
+ **`languages/base/`** is a package that defines the full language contract. It contains two files:
50
+
51
+ - `language_config.py` — `LanguageConfig` dataclass: all language-specific _data_ (separators, currency words, filler words, digit words, time word maps, sentence replacements, etc.). Most fields have sensible defaults (empty dicts/lists, `None` for optional fields); steps that read them skip gracefully when `None`.
52
+ - `language_operator.py` — `LanguageOperators`: the base class and language-neutral fallback. Directly instantiable with no arguments — uses a minimal `LanguageConfig(code="default")` with empty symbol/currency mappings and all optional fields set to `None`. Registered in the language registry under `"default"` so it serves as the automatic fallback when no language is specified or the language is unsupported. All methods are no-ops. Only methods where the algorithm itself varies by language should be overridden in subclasses. Methods that are purely data-driven (i.e. the step owns the algorithm and only reads config values) do **not** belong here.
53
+
54
+ Both symbols are re-exported from `languages/base/__init__.py`.
55
+
56
+ Additional files beyond the required three (e.g. `number_normalizer.py`, `sentence_replacements.py`) are allowed when a language needs them, but they must never be empty. Number-related _data_ (digit words, number words) belongs in `LanguageConfig`. Only create a `number_normalizer.py` when the expansion _algorithm_ is complex enough to warrant its own module (see `languages/english/number_normalizer.py`).
57
+
58
+ When adding a new language:
59
+
60
+ 1. Create a new folder under `languages/` with `operators.py`, `replacements.py`, and `__init__.py`
61
+ 2. Decorate the operators class with `@register_language` — registration is automatic
62
+ 3. Add one import line to `languages/__init__.py` to trigger the decorator at import time
63
+
64
+ ### `steps/`
65
+
66
+ Steps are **atomic, stateless, single-responsibility** transformations. Each step class:
67
+
68
+ - Has a `name` class attribute (the string used in YAML)
69
+ - Is decorated with `@register_step` — this auto-registers it, no manual registry update needed
70
+ - Receives `(text, operators)` for text steps, or `(word, operators)` for word steps
71
+ - **Owns the algorithm** — the `__call__` method contains the transformation logic
72
+ - **Reads data from `operators.config.*`** — never hardcodes language-specific values
73
+
74
+ Steps are organized into `steps/text/` and `steps/word/` by stage. Protect/restore placeholder pairs always live in the **same file** (`steps/text/placeholders.py`) to keep their dependency explicit and co-located.
75
+
76
+ When adding a new step:
77
+
78
+ 1. Create or add to the appropriate file under `steps/text/` or `steps/word/`
79
+ 2. Decorate with `@register_step`
80
+ 3. Add one import line to `steps/text/__init__.py` or `steps/word/__init__.py`
81
+ 4. Add the step name to the relevant YAML preset(s) if it should run by default
82
+
83
+ ### `pipeline/`
84
+
85
+ - `base.py` — `NormalizationPipeline`: the orchestrator. Holds the three ordered step lists, runs them, exposes `.describe()` and `.validate()`.
86
+ - `loader.py` — reads a YAML preset, resolves step names from the step registry, instantiates operators from the language registry, returns a ready-to-use pipeline.
87
+ - `replacer.py` — stateful compiled-regex engine used by the word replacement step. Lives here because it is infrastructure, not a step itself.
88
+
89
+ ### `presets/`
90
+
91
+ Versioned YAML files shipped with the library. **Once published, a preset must never be modified** — benchmark reproducibility depends on it. New behavior means a new preset file with a new version name.
92
+
93
+ ---
94
+
95
+ ## Core conventions
96
+
97
+ ### Auto-registration, not manual registries
98
+
99
+ Never manually maintain a dict mapping names to classes. Use the `@register_step` and `@register_language` decorators defined in `steps/registery.py` and `languages/registery.py`. The only manual work is adding an import line to the relevant `__init__.py` so the decorator runs at import time.
100
+
101
+ ### Language data vs. language behavior
102
+
103
+ This is the central design rule. There are two distinct places for language-specific things:
104
+
105
+ **`LanguageConfig` (data)** — everything that can be expressed as a value: strings, lists, dicts. This includes separator characters, currency words, filler words, digit words, number words, and data-driven mappings like `time_words`, `sentence_replacements`, etc. Optional fields use `TypeAlias | None = None`; a `None` value means the step that reads it must skip gracefully. Semantic `TypeAlias` definitions (`TimeWords`, `DigitWords`, `SentenceReplacements`, etc.) are defined in `language_config.py` to make the contract self-documenting.
106
+
107
+ **`LanguageOperators` (behavior)** — only methods where the _algorithm itself_ varies by language. Examples: `expand_contractions` (uses an external library + custom regexes), `expand_written_numbers` (English uses a complex Whisper-derived normalizer), `normalize_numeric_time_formats` (am/pm regex structure), `fix_one_word_in_numeric_contexts` (language-specific digit-adjacent pattern), `get_compound_minutes` (English combines tens+ones with hyphen/space; other languages form these differently or not at all). If the algorithm is generic and only the _data_ differs, the data goes in `LanguageConfig` and the algorithm goes in the step — not in the operator.
108
+
109
+ Decision rule: ask "does the _logic_ change by language, or just the _values_?" If only values change → `LanguageConfig`. If the logic changes → `LanguageOperators` method override.
110
+
111
+ ### Placeholder protection is ordered and paired
112
+
113
+ Any step that protects a character with a placeholder token must have a corresponding restore step. These must always be in `steps/text/placeholders.py`. The protect step must run in Stage 1 before `RemoveSymbolsStep`. The restore step must run in Stage 3. `pipeline.validate()` enforces this — do not bypass it. `loader.py` calls `validate()` automatically after constructing the pipeline.
114
+
115
+ When implementing placeholder steps, use the base classes where they fit:
116
+
117
+ - **`ProtectStep`** — use when the pattern has exactly two capture groups and emits a single placeholder (template: `\1{placeholder}\2`). Implement `_pattern(operators)`.
118
+ - **`RestoreStep`** — use when restoration is a plain string replacement of a single placeholder. Implement `_replacement(operators)`.
119
+ - **`TextStep`** directly — use when neither contract fits (multiple placeholders in one pass, zero-width patterns, per-match fan-out, marker deletion, post-replace logic). In that case, document why in the class docstring.
120
+
121
+ ### Steps are language-agnostic
122
+
123
+ A step must not contain any language-specific logic or string literals. If the algorithm differs by language, add a method to `LanguageOperators` (with a no-op default in the base) and call `operators.that_method(text)` from the step. If only data differs, read it from `operators.config.*`. English-only helpers (e.g. `EnglishNumberNormalizer`) live inside `languages/english/`, not in `steps/`.
124
+
125
+ ### Language folders are self-contained
126
+
127
+ Everything specific to a language lives inside its folder. If you find yourself adding a helper that only one language uses, it goes in that language's folder as an additional file — not in `steps/`, not in `pipeline/`. The English number normalizer (`languages/english/number_normalizer.py`) is the canonical example of this pattern.
128
+
129
+ ### Presets are the reproducibility contract
130
+
131
+ Never modify a published preset YAML. Never let a preset reference a step that has changed its behavior under the same name. If a step's behavior changes, create a new step with a new name and update the relevant presets accordingly.
132
+
133
+ ---
134
+
135
+ ## Adding a new language — checklist
136
+
137
+ - [ ] Create `languages/{lang}/` with `operators.py`, `replacements.py`, `__init__.py`
138
+ - [ ] Put all word-level substitutions in `replacements.py`; do not add inline entries in `operators.py`
139
+ - [ ] Instantiate a `LanguageConfig` in `operators.py`, filling in all required fields and any optional dict fields your language needs (`time_words`, `sentence_replacements`, etc.)
140
+ - [ ] Subclass `LanguageOperators`, overriding only methods where the _algorithm_ differs (not just the data)
141
+ - [ ] If the language has digit words, populate `digit_words` in `LanguageConfig`
142
+ - [ ] If the language uses spoken time patterns, populate `time_words` with all needed word→digit mappings (clock hours 1-12 and minute-worth values up to 50); if it also uses compound minute expressions (e.g. "twenty-one"), override `get_compound_minutes()` to generate them — do **not** put this in config
143
+ - [ ] If number expansion is needed and the algorithm is complex, implement it in a `number_normalizer.py` file and override `expand_written_numbers`; otherwise do not create the file
144
+ - [ ] Decorate the class with `@register_language`
145
+ - [ ] Add one import to `languages/__init__.py`
146
+ - [ ] Add tests in `tests/unit/languages/`
147
+ - [ ] Add test rows to `tests/e2e/files/` for the new language
148
+
149
+ ## Adding a new step — checklist
150
+
151
+ - [ ] Add the class to the appropriate file in `steps/text/` or `steps/word/`
152
+ - [ ] Set a unique `name` class attribute
153
+ - [ ] Decorate with `@register_step`
154
+ - [ ] Add one import to `steps/text/__init__.py` or `steps/word/__init__.py`
155
+ - [ ] Place the algorithm in `__call__`; read language data from `operators.config.*`; call operator methods only for genuinely behavioral differences
156
+ - [ ] If the step reads an optional `LanguageConfig` field, guard with `if operators.config.field is None: return text` and add a TODO comment
157
+ - [ ] Add unit tests in `tests/unit/steps/`
158
+ - [ ] If it involves placeholder protection, add both protect and restore to `steps/text/placeholders.py` and update `pipeline/base.py`'s `validate()` accordingly; use `ProtectStep`/`RestoreStep` base classes where the contract fits, otherwise use `TextStep` directly and document why in the docstring
159
+ - [ ] Add the step name to relevant preset YAMLs if needed (new preset version if existing presets are affected)
160
+ - [ ] If you added or changed the class docstring, run `uv run scripts/generate_step_docs.py` to regenerate `docs/steps.md`
@@ -0,0 +1 @@
1
+ AGENTS.md
@@ -0,0 +1,217 @@
1
+ # Contributing
2
+
3
+ Thanks for your interest in `gladia-normalization`! Here's how to get involved.
4
+
5
+ ## Reporting bugs
6
+
7
+ Open an issue with steps to reproduce, expected vs actual behavior, and your environment (Python version, OS, package version).
8
+
9
+ ## Submitting changes
10
+
11
+ 1. **Fork the repo and create a branch**: `git checkout -b feat/my-feature`
12
+ 2. **Make your changes and add tests**
13
+ 3. **Run the checks**:
14
+ ```bash
15
+ uv run pytest # run tests
16
+ uv run ruff check . # lint
17
+ uv run ruff format . # format
18
+ uv run ty check # type-check
19
+ ```
20
 + 4. **Push your branch**: `git push origin feat/my-feature`
21
+ 5. **Create a PR**: Go to GitHub and create a pull request
22
+ 6. **Fill out the PR template**: Provide clear description of changes
23
+ 7. **Wait for review**: Maintainers will review and provide feedback
24
+ 8. **Address feedback**: Make requested changes and push updates
25
+ 9. **Merge**: Once approved, your PR will be merged!
26
+
27
+ ### Pre-commit hooks
28
+
29
+ The project uses [pre-commit](https://pre-commit.com/) to enforce linting, formatting, and commit message conventions automatically. Install the hooks once after cloning:
30
+
31
+ ```bash
32
+ uv run pre-commit install --install-hooks
33
+ ```
34
+
35
+ This will run Ruff (lint + format) and ty (type-check) on every commit, and validate your commit message on `commit-msg`.
36
+
37
+ ## Commit style
38
+
39
 + We use [Conventional Commits](https://www.conventionalcommits.org/): prefix your commit with `feat:`, `fix:`, `docs:`, `chore:`, etc.
40
+
41
+ ## Architecture at a glance
42
+
43
+ Every pipeline runs exactly **three stages**, always in this order:
44
+
45
+ 1. **Text pre-processing** — full-text transforms before word splitting (placeholder protection, symbol conversion, contraction expansion, …)
46
+ 2. **Word processing** — per-token transforms after splitting on spaces (replacements, filler removal, …)
47
+ 3. **Text post-processing** — full-text cleanup after rejoining words (placeholder restoration, digit collapsing, …)
48
+
49
+ This ordering is a hard constraint — some steps depend on earlier steps having run. See the [README](./README.md) for more detail.
50
+
51
+ ## Adding a new step
52
+
53
+ 1. Create or extend a file under `normalization/steps/text/` or `normalization/steps/word/`.
54
+ 2. Decorate the class with `@register_step` and set a unique `name` attribute.
55
+ 3. Add an import to `steps/text/__init__.py` or `steps/word/__init__.py`.
56
+ 4. Add unit tests under `tests/unit/steps/`.
57
+ 5. Add the step name to the relevant preset YAML, or create a new preset version.
58
+ 6. If you added or changed the class docstring, regenerate `docs/steps.md` by running `uv run scripts/generate_step_docs.py`.
59
+
60
+ ### Choosing a base class
61
+
62
+ There are four base classes. Pick the narrowest one that fits your step.
63
+
64
+ **`WordStep`** — use when your transformation operates on a single token in isolation, with no knowledge of neighboring words. This is the only base class for Stage 2 steps.
65
+
66
+ ```python
67
+ @register_step
68
+ class MyWordStep(WordStep):
69
+ name = "my_word_step"
70
+
71
+ def __call__(self, word: str, operators: LanguageOperators) -> str:
72
+ ...
73
+ ```
74
+
75
+ **`TextStep`** — the general-purpose base for Stage 1 and Stage 3. Use it when your transformation needs to see the full string, or when none of the more specific bases below fit.
76
+
77
+ ```python
78
+ @register_step
79
+ class MyTextStep(TextStep):
80
+ name = "my_text_step"
81
+
82
+ def __call__(self, text: str, operators: LanguageOperators) -> str:
83
+ ...
84
+ ```
85
+
86
+ **`ProtectStep`** — a specialization of `TextStep` for the common case of replacing a character with a placeholder token. You only implement `_pattern`, which returns a compiled regex with **exactly two capture groups** (what comes before and after the character being replaced). The `__call__` is fixed: it applies the pattern as `\1{placeholder}\2`.
87
+
88
+ ```python
89
+ @register_step
90
+ class MyProtectStep(ProtectStep):
91
+ name = "my_protect_step"
92
+ placeholder = ProtectPlaceholder.MY_PLACEHOLDER
93
+
94
+ def _pattern(self, operators: LanguageOperators) -> re.Pattern:
95
+ return re.compile(r"(\d+)X(\d+)") # two capture groups required
96
+ ```
97
+
98
+ Use `ProtectStep` when: one regex pattern maps to exactly one placeholder substitution.
99
+
100
+ Use `TextStep` directly instead when: a single pass must protect two different symbols (like email `@` and `.`), the replacement needs to absorb surrounding whitespace with `\s*`, or the replacement is a per-match function rather than a fixed template.
101
+
102
+ **`RestoreStep`** — a specialization of `TextStep` for restoring a placeholder back to a string. You only implement `_replacement`, which returns the string to substitute in. The `__call__` does a plain `str.replace` of the placeholder (and its case-folded form).
103
+
104
+ ```python
105
+ @register_step
106
+ class MyRestoreStep(RestoreStep):
107
+ name = "my_restore_step"
108
+ placeholder = ProtectPlaceholder.MY_PLACEHOLDER
109
+
110
+ def _replacement(self, operators: LanguageOperators) -> str:
111
+ return operators.config.some_word or " "
112
+ ```
113
+
114
+ Use `RestoreStep` when: restoration is a straight token swap with no surrounding whitespace to absorb and no additional logic needed.
115
+
116
+ Use `TextStep` directly instead when: the placeholder was inserted with spaces around it (requiring `re.sub` with `\s*` to avoid double spaces), the marker should be deleted entirely rather than replaced, or post-replacement cleanup is needed.
117
+
118
+ ## Writing tests
119
+
120
+ ### Unit tests for a step
121
+
122
+ Unit tests live under `tests/unit/steps/text/` or `tests/unit/steps/word/`, mirroring the step file structure.
123
+
124
+ The `tests/unit/steps/text/conftest.py` provides two fixtures and a helper:
125
+
126
+ - `operators` — a bare `LanguageOperators()` instance (language-agnostic)
127
+ - `english_operators` — an `EnglishOperators()` instance
128
+ - `assert_text_step_registered(step_cls)` — verifies the step is in the registry under its name
129
+
130
+ Every test file for a step should at minimum:
131
+
132
+ 1. Assert the step is registered.
133
+ 2. Instantiate the step with `MyStep()` and call it directly: `MyStep()(text, operators)`.
134
+ 3. Mutate `operators.config` fields in-place to cover different language configurations without creating a full language.
135
+
136
+ ```python
137
+ # tests/unit/steps/text/my_step_test.py
138
+ from normalization.languages.base import LanguageOperators
139
+ from normalization.steps.text.my_module import MyStep
140
+
141
+ from .conftest import assert_text_step_registered
142
+
143
+
144
+ def test_step_is_registered():
145
+ assert_text_step_registered(MyStep)
146
+
147
+
148
+ def test_my_step_basic(operators: LanguageOperators):
149
+ result = MyStep()("some input", operators)
150
+ assert result == "expected output"
151
+
152
+
153
+ def test_my_step_with_config(operators: LanguageOperators):
154
+ operators.config.some_field = "custom_value"
155
+ result = MyStep()("some input", operators)
156
+ assert result == "expected output with custom value"
157
+
158
+
159
+ def test_my_step_with_english(english_operators):
160
+ result = MyStep()("some input", english_operators)
161
+ assert result == "english-specific output"
162
+ ```
163
+
164
+ ### E2E tests for a preset
165
+
166
+ E2E tests validate the full pipeline (preset + language) against a CSV fixture. The test runner lives in `tests/e2e/normalization_test.py` and CSV files go in `tests/e2e/files/`.
167
+
168
+ **CSV format** — three columns, no quoting needed unless the value contains a comma:
169
+
170
+ ```
171
+ input,expected,language
172
 + "$1,000,000",1000000 dollars,en
173
+ hello world,hello world,fr
174
+ ```
175
+
176
+ Each row is one test case. The `language` column must match a registered language code (or `default`).
177
+
178
+ **Registering a new CSV** — add a block to `normalization_test.py` following the existing pattern:
179
+
180
+ ```python
181
+ _MY_PRESET_CSV = _FILES_DIR / "my-preset.csv"
182
+ _MY_PRESET_TESTS = _load_tests_from_csv(_MY_PRESET_CSV) if _MY_PRESET_CSV.exists() else []
183
+ _MY_PRESET_PIPELINES: dict[str, NormalizationPipeline] = {}
184
+
185
+
186
+ @pytest.mark.parametrize(
187
+ "test",
188
+ _MY_PRESET_TESTS,
189
+ ids=_case_ids(_MY_PRESET_TESTS),
190
+ )
191
+ def test_my_preset(test: NormalizationTest) -> None:
192
+ pipeline = _load_pipeline("my-preset", test.language)
193
+ result = pipeline.normalize(test.input)
194
+ assert result == test.expected, (
195
+ f"\n input: {test.input!r}"
196
+ f"\n expected: {test.expected!r}"
197
+ f"\n got: {result!r}"
198
+ )
199
+ ```
200
+
201
+ Pipelines are cached per language inside `_MY_PRESET_PIPELINES` to avoid reloading for each parametrized case — follow the `_load_pipeline` helper pattern already in the file.
202
+
203
+ Steps must be **language-agnostic** — delegate all language-specific logic to the `operators` argument or read data from `operators.config.*`.
204
+
205
+ ## Adding a new language
206
+
207
+ 1. Create `normalization/languages/{lang}/` with `operators.py`, `replacements.py`, and `__init__.py`.
208
+ 2. Put all word-level substitutions in `replacements.py`.
209
+ 3. Instantiate a `LanguageConfig` and subclass `LanguageOperators` in `operators.py`.
210
+ 4. Decorate with `@register_language` and add one import to `normalization/languages/__init__.py`.
211
+ 5. Add tests under `tests/unit/languages/` and e2e fixture rows in `tests/e2e/files/`.
212
+
213
+ ## Key design rules
214
+
215
+ - **Data vs. behavior**: if only the _values_ change by language, put them in `LanguageConfig`. If the _algorithm_ changes, override a method in `LanguageOperators`.
216
+ - **Presets are immutable**: never modify a published preset YAML — new behavior means a new preset file.
217
+ - **Placeholder pairs**: every `protect_*` step in Stage 1 must have a matching `restore_*` in Stage 3. The pipeline validates this at load time.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Gladia
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.