oracc-parser 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. oracc_parser-0.1.0/LICENSE +21 -0
  2. oracc_parser-0.1.0/PKG-INFO +166 -0
  3. oracc_parser-0.1.0/README.md +130 -0
  4. oracc_parser-0.1.0/oracc_parser/__init__.py +34 -0
  5. oracc_parser-0.1.0/oracc_parser/cache.py +251 -0
  6. oracc_parser-0.1.0/oracc_parser/cli.py +201 -0
  7. oracc_parser-0.1.0/oracc_parser/constants.py +104 -0
  8. oracc_parser-0.1.0/oracc_parser/download/__init__.py +1 -0
  9. oracc_parser-0.1.0/oracc_parser/download/extract_jsons.py +87 -0
  10. oracc_parser-0.1.0/oracc_parser/download/fetch_data.py +298 -0
  11. oracc_parser-0.1.0/oracc_parser/download/oracc_download.py +270 -0
  12. oracc_parser-0.1.0/oracc_parser/download/pleiades.py +174 -0
  13. oracc_parser-0.1.0/oracc_parser/enriched_data/__init__.py +1 -0
  14. oracc_parser-0.1.0/oracc_parser/enriched_data/grouped_oracc_metadata_columns.csv +338 -0
  15. oracc_parser-0.1.0/oracc_parser/enriched_data/languages.csv +36 -0
  16. oracc_parser-0.1.0/oracc_parser/enriched_data/period_mapping.csv +26 -0
  17. oracc_parser-0.1.0/oracc_parser/enriched_data/pos_tags.csv +50 -0
  18. oracc_parser-0.1.0/oracc_parser/enriched_data/projects_metadata.csv +223 -0
  19. oracc_parser-0.1.0/oracc_parser/enriched_data/provenience.csv +337 -0
  20. oracc_parser-0.1.0/oracc_parser/enriched_data/raw_archive_values.csv +713 -0
  21. oracc_parser-0.1.0/oracc_parser/enriched_data/sign_readings.csv +8903 -0
  22. oracc_parser-0.1.0/oracc_parser/enriched_data/state_supergroup_mapping.csv +57 -0
  23. oracc_parser-0.1.0/oracc_parser/export/__init__.py +1 -0
  24. oracc_parser-0.1.0/oracc_parser/export/to_jsonl.py +161 -0
  25. oracc_parser-0.1.0/oracc_parser/io/__init__.py +2 -0
  26. oracc_parser-0.1.0/oracc_parser/io/word_csv.py +467 -0
  27. oracc_parser-0.1.0/oracc_parser/metadata/__init__.py +1 -0
  28. oracc_parser-0.1.0/oracc_parser/metadata/archive.py +399 -0
  29. oracc_parser-0.1.0/oracc_parser/metadata/populate.py +564 -0
  30. oracc_parser-0.1.0/oracc_parser/models/__init__.py +1 -0
  31. oracc_parser-0.1.0/oracc_parser/models/config.py +114 -0
  32. oracc_parser-0.1.0/oracc_parser/models/tablet.py +237 -0
  33. oracc_parser-0.1.0/oracc_parser/parsing/__init__.py +1 -0
  34. oracc_parser-0.1.0/oracc_parser/parsing/parse_content.py +174 -0
  35. oracc_parser-0.1.0/oracc_parser/parsing/parse_signs.py +219 -0
  36. oracc_parser-0.1.0/oracc_parser/parsing/parse_words.py +177 -0
  37. oracc_parser-0.1.0/oracc_parser/parsing/text_builder.py +175 -0
  38. oracc_parser-0.1.0/oracc_parser/parsing/translation.py +91 -0
  39. oracc_parser-0.1.0/oracc_parser/pipeline.py +535 -0
  40. oracc_parser-0.1.0/oracc_parser/settings.py +120 -0
  41. oracc_parser-0.1.0/oracc_parser/utils/__init__.py +1 -0
  42. oracc_parser-0.1.0/oracc_parser/utils/logger.py +32 -0
  43. oracc_parser-0.1.0/oracc_parser/utils/paths.py +519 -0
  44. oracc_parser-0.1.0/oracc_parser/utils/unicode.py +109 -0
  45. oracc_parser-0.1.0/oracc_parser.egg-info/PKG-INFO +166 -0
  46. oracc_parser-0.1.0/oracc_parser.egg-info/SOURCES.txt +57 -0
  47. oracc_parser-0.1.0/oracc_parser.egg-info/dependency_links.txt +1 -0
  48. oracc_parser-0.1.0/oracc_parser.egg-info/entry_points.txt +2 -0
  49. oracc_parser-0.1.0/oracc_parser.egg-info/requires.txt +18 -0
  50. oracc_parser-0.1.0/oracc_parser.egg-info/top_level.txt +1 -0
  51. oracc_parser-0.1.0/pyproject.toml +65 -0
  52. oracc_parser-0.1.0/setup.cfg +4 -0
  53. oracc_parser-0.1.0/tests/test_cache.py +205 -0
  54. oracc_parser-0.1.0/tests/test_constants.py +73 -0
  55. oracc_parser-0.1.0/tests/test_export.py +64 -0
  56. oracc_parser-0.1.0/tests/test_models.py +86 -0
  57. oracc_parser-0.1.0/tests/test_parsing.py +52 -0
  58. oracc_parser-0.1.0/tests/test_pipeline.py +116 -0
  59. oracc_parser-0.1.0/tests/test_settings.py +41 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Shahar Spencer
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,166 @@
1
+ Metadata-Version: 2.4
2
+ Name: oracc-parser
3
+ Version: 0.1.0
4
+ Summary: Download and parse ORACC cuneiform text projects into ML-ready formats
5
+ Author: Avital Romach, Shahar Spencer, Claude Sonnet 4.6
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/shaharspencer/oracc-parser
8
+ Project-URL: Issues, https://github.com/shaharspencer/oracc-parser/issues
9
+ Keywords: oracc,cuneiform,akkadian,nlp,digital-humanities
10
+ Classifier: Development Status :: 3 - Alpha
11
+ Classifier: Intended Audience :: Science/Research
12
+ Classifier: Topic :: Text Processing :: Linguistic
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Requires-Python: >=3.10
18
+ Description-Content-Type: text/markdown
19
+ License-File: LICENSE
20
+ Requires-Dist: requests>=2.31
21
+ Requires-Dist: pandas>=2.0
22
+ Requires-Dist: pydantic>=2.0
23
+ Requires-Dist: tqdm>=4.65
24
+ Requires-Dist: beautifulsoup4>=4.12
25
+ Provides-Extra: dev
26
+ Requires-Dist: pytest>=7.0; extra == "dev"
27
+ Requires-Dist: pytest-cov>=4.0; extra == "dev"
28
+ Requires-Dist: httpx>=0.24; extra == "dev"
29
+ Provides-Extra: server
30
+ Requires-Dist: fastapi>=0.100; extra == "server"
31
+ Requires-Dist: uvicorn>=0.22; extra == "server"
32
+ Provides-Extra: notebooks
33
+ Requires-Dist: jupyter; extra == "notebooks"
34
+ Requires-Dist: matplotlib; extra == "notebooks"
35
+ Dynamic: license-file
36
+
37
+ # oracc-parser
38
+
39
+ A Python tool to download and parse [ORACC](http://oracc.museum.upenn.edu/) cuneiform text projects into machine-learning-ready formats (JSONL, CSV, pandas DataFrames).
40
+
41
+ ## Features
42
+
43
+ - **Download** — Fetch project ZIPs directly from ORACC or Zenodo
44
+ - **Parse** — Convert raw ORACC JSON into structured data
45
+ - **Export** — Save datasets as JSONL, CSV, or pandas DataFrames
46
+ - **Configure** — Control handling of broken signs and POS masking using `RunConfig`
47
+
48
+ ## Installation
49
+
50
+
51
+ ```bash
52
+ git clone https://github.com/shaharspencer/oracc-parser.git
53
+ cd oracc-parser
54
+ pip install -e ".[dev]"
55
+ ```
56
+
57
+ ## Getting Started — Notebooks
58
+
59
+ The easiest way to explore oracc-parser is through the interactive notebooks.
60
+ Start with notebook 01 — it downloads all the data you need from Zenodo automatically.
61
+
62
+ | Notebook | What you'll learn |
63
+ |---|---|
64
+ | [`01_quickstart.ipynb`](notebooks/01_quickstart.ipynb) | Download the dataset → parse a project from pre-processed CSVs → explore transliterations, translations, and metadata → export |
65
+ | [`02_reference_data.ipynb`](notebooks/02_reference_data.ipynb) | Browse all projects in the dataset, query catalogues, explore bundled reference data (provenance, periods, sign list, POS tags) |
66
+ | [`03_configure_and_export.ipynb`](notebooks/03_configure_and_export.ipynb) | All `RunConfig` options — word-level and sign-level break filtering, POS masking — combining multiple projects and exporting datasets |
67
+ | [`04_oracc_json_processing.ipynb`](notebooks/04_oracc_json_processing.ipynb) | Advanced: understand the raw ORACC JSON structure, the JSON → TabletRecord → CSV pipeline, and how to download and parse projects not in the dataset |
68
+
69
+ ```bash
70
+ pip install "oracc-parser[notebooks]"
71
+ jupyter notebook notebooks/
72
+ ```
73
+
74
+ ## Quick Example
75
+
76
+ ```python
77
+ from oracc_parser import parse_project, RunConfig, get_full_flat_table
78
+
79
+ # Parse 5 tablets from SAA 01 (Neo-Assyrian royal letters)
80
+ records = parse_project("saao/saa01", config=RunConfig(limit=5))
81
+
82
+ # Get a flat DataFrame — no nesting, ready for analysis
83
+ df = get_full_flat_table(records)
84
+ df.to_json("dataset.jsonl", orient="records", lines=True)
85
+ ```
86
+
87
+ ## Configuration
88
+
89
+ You can customize the parsing process using `RunConfig`:
90
+
91
+ ```python
92
+ from oracc_parser import parse_project, RunConfig
93
+
94
+ records = parse_project("saao/saa01", config=RunConfig(
95
+ limit=10,
96
+ max_break_fraction=0.5, # word-level: replace words that are >50% broken with X
97
+ drop_missing=True, # sign-level: drop [x] signs from Unicode output
98
+ drop_damaged=False, # sign-level: keep ⸢x⸣ signs in Unicode output
99
+ mask_pos=["PN", "DN"], # replace personal/divine names with tag
100
+ ))
101
+ ```
102
+
103
+ ### Two independent levels of break filtering
104
+
105
+ `RunConfig` provides two distinct ways to handle damaged or missing text,
106
+ operating at different granularities and affecting different outputs:
107
+
108
+ | Parameter | Level | Affects | How it works |
109
+ |---|---|---|---|
110
+ | `max_break_fraction` | **Word** | Transliteration, normalization, lemmatization | Each word has a `break_perc` (fraction of its signs that are broken). Words exceeding this threshold are replaced with `X`. Default `1.0` keeps all words. |
111
+ | `drop_missing` | **Sign** | Unicode cuneiform only | Drops individual signs marked `[x]` (completely lost). |
112
+ | `drop_damaged` | **Sign** | Unicode cuneiform only | Drops individual signs marked `⸢x⸣` (partially legible). |
113
+
114
+ > **Note:** Because word-level and sign-level filtering use different thresholds
115
+ > and different granularities, **the text outputs and the Unicode cuneiform output
116
+ > are not necessarily aligned**. A word kept in the transliteration (because its
117
+ > fraction of broken signs does not exceed `max_break_fraction`) may still have individual signs
118
+ > dropped from the Unicode output if `drop_missing` / `drop_damaged` are enabled.
119
+
120
+ ### Other options
121
+
122
+ | Parameter | Default | Description |
123
+ |---|---|---|
124
+ | `limit` | `None` | Only parse the first N texts (useful for testing) |
125
+ | `keep_word_segmentation` | `True` | Preserve word boundaries in Unicode cuneiform output |
126
+ | `mask_pos` | `[]` | Replace words of certain POS tags with the tag name |
127
+ | `languages` | `["Akkadian"]` | Which languages to include when downloading projects |
128
+ | `use_cache` | `True` | Use cached results if available |
129
+
130
+ All reference data is bundled with the package, so you don't need to configure external paths unless you are customizing `oracc_parser.settings`.
131
+
132
+ ## CLI
133
+
134
+ ```bash
135
+ oracc-parser download --project saao/saa01
136
+ oracc-parser parse --project saao/saa01 --limit 5 --format jsonl --output saa01.jsonl
137
+ ```
138
+
139
+ ## Heavy Data (Zenodo)
140
+
141
+ Large data files (ORACC ZIPs, cached translations, Pleiades data) are on Zenodo:
142
+
143
+ [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.18643122.svg)](https://doi.org/10.5281/zenodo.18643122)
144
+
145
+ ```bash
146
+ python scripts/download_zenodo_data.py
147
+ ```
148
+
149
+ ## Running Tests
150
+
151
+ ```bash
152
+ pytest tests/ -v # 98 tests
153
+ ```
154
+
155
+ ## Known Limitations
156
+
157
+ - **Chronology**: Period-to-year normalization is optimized for the **1st Millennium BCE**.
158
+ - **Language**: Parsing is primarily validated on **Akkadian** projects.
159
+
160
+ ## License
161
+
162
+ MIT — see [LICENSE](LICENSE).
163
+
164
+ ## Credits
165
+
166
+ Based on code by Niek Veldhuis ([Compass](https://github.com/niekveldhuis/compass)) and adapted for the BEn Project.
@@ -0,0 +1,130 @@
1
+ # oracc-parser
2
+
3
+ A Python tool to download and parse [ORACC](http://oracc.museum.upenn.edu/) cuneiform text projects into machine-learning-ready formats (JSONL, CSV, pandas DataFrames).
4
+
5
+ ## Features
6
+
7
+ - **Download** — Fetch project ZIPs directly from ORACC or Zenodo
8
+ - **Parse** — Convert raw ORACC JSON into structured data
9
+ - **Export** — Save datasets as JSONL, CSV, or pandas DataFrames
10
+ - **Configure** — Control handling of broken signs and POS masking using `RunConfig`
11
+
12
+ ## Installation
13
+
14
+
15
+ ```bash
16
+ git clone https://github.com/shaharspencer/oracc-parser.git
17
+ cd oracc-parser
18
+ pip install -e ".[dev]"
19
+ ```
20
+
21
+ ## Getting Started — Notebooks
22
+
23
+ The easiest way to explore oracc-parser is through the interactive notebooks.
24
+ Start with notebook 01 — it downloads all the data you need from Zenodo automatically.
25
+
26
+ | Notebook | What you'll learn |
27
+ |---|---|
28
+ | [`01_quickstart.ipynb`](notebooks/01_quickstart.ipynb) | Download the dataset → parse a project from pre-processed CSVs → explore transliterations, translations, and metadata → export |
29
+ | [`02_reference_data.ipynb`](notebooks/02_reference_data.ipynb) | Browse all projects in the dataset, query catalogues, explore bundled reference data (provenance, periods, sign list, POS tags) |
30
+ | [`03_configure_and_export.ipynb`](notebooks/03_configure_and_export.ipynb) | All `RunConfig` options — word-level and sign-level break filtering, POS masking — combining multiple projects and exporting datasets |
31
+ | [`04_oracc_json_processing.ipynb`](notebooks/04_oracc_json_processing.ipynb) | Advanced: understand the raw ORACC JSON structure, the JSON → TabletRecord → CSV pipeline, and how to download and parse projects not in the dataset |
32
+
33
+ ```bash
34
+ pip install "oracc-parser[notebooks]"
35
+ jupyter notebook notebooks/
36
+ ```
37
+
38
+ ## Quick Example
39
+
40
+ ```python
41
+ from oracc_parser import parse_project, RunConfig, get_full_flat_table
42
+
43
+ # Parse 5 tablets from SAA 01 (Neo-Assyrian royal letters)
44
+ records = parse_project("saao/saa01", config=RunConfig(limit=5))
45
+
46
+ # Get a flat DataFrame — no nesting, ready for analysis
47
+ df = get_full_flat_table(records)
48
+ df.to_json("dataset.jsonl", orient="records", lines=True)
49
+ ```
50
+
51
+ ## Configuration
52
+
53
+ You can customize the parsing process using `RunConfig`:
54
+
55
+ ```python
56
+ from oracc_parser import parse_project, RunConfig
57
+
58
+ records = parse_project("saao/saa01", config=RunConfig(
59
+ limit=10,
60
+ max_break_fraction=0.5, # word-level: replace words that are >50% broken with X
61
+ drop_missing=True, # sign-level: drop [x] signs from Unicode output
62
+ drop_damaged=False, # sign-level: keep ⸢x⸣ signs in Unicode output
63
+ mask_pos=["PN", "DN"], # replace personal/divine names with tag
64
+ ))
65
+ ```
66
+
67
+ ### Two independent levels of break filtering
68
+
69
+ `RunConfig` provides two distinct ways to handle damaged or missing text,
70
+ operating at different granularities and affecting different outputs:
71
+
72
+ | Parameter | Level | Affects | How it works |
73
+ |---|---|---|---|
74
+ | `max_break_fraction` | **Word** | Transliteration, normalization, lemmatization | Each word has a `break_perc` (fraction of its signs that are broken). Words exceeding this threshold are replaced with `X`. Default `1.0` keeps all words. |
75
+ | `drop_missing` | **Sign** | Unicode cuneiform only | Drops individual signs marked `[x]` (completely lost). |
76
+ | `drop_damaged` | **Sign** | Unicode cuneiform only | Drops individual signs marked `⸢x⸣` (partially legible). |
77
+
78
+ > **Note:** Because word-level and sign-level filtering use different thresholds
79
+ > and different granularities, **the text outputs and the Unicode cuneiform output
80
+ > are not necessarily aligned**. A word kept in the transliteration (because its
81
+ > fraction of broken signs does not exceed `max_break_fraction`) may still have individual signs
82
+ > dropped from the Unicode output if `drop_missing` / `drop_damaged` are enabled.
83
+
84
+ ### Other options
85
+
86
+ | Parameter | Default | Description |
87
+ |---|---|---|
88
+ | `limit` | `None` | Only parse the first N texts (useful for testing) |
89
+ | `keep_word_segmentation` | `True` | Preserve word boundaries in Unicode cuneiform output |
90
+ | `mask_pos` | `[]` | Replace words of certain POS tags with the tag name |
91
+ | `languages` | `["Akkadian"]` | Which languages to include when downloading projects |
92
+ | `use_cache` | `True` | Use cached results if available |
93
+
94
+ All reference data is bundled with the package, so you don't need to configure external paths unless you are customizing `oracc_parser.settings`.
95
+
96
+ ## CLI
97
+
98
+ ```bash
99
+ oracc-parser download --project saao/saa01
100
+ oracc-parser parse --project saao/saa01 --limit 5 --format jsonl --output saa01.jsonl
101
+ ```
102
+
103
+ ## Heavy Data (Zenodo)
104
+
105
+ Large data files (ORACC ZIPs, cached translations, Pleiades data) are on Zenodo:
106
+
107
+ [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.18643122.svg)](https://doi.org/10.5281/zenodo.18643122)
108
+
109
+ ```bash
110
+ python scripts/download_zenodo_data.py
111
+ ```
112
+
113
+ ## Running Tests
114
+
115
+ ```bash
116
+ pytest tests/ -v # 98 tests
117
+ ```
118
+
119
+ ## Known Limitations
120
+
121
+ - **Chronology**: Period-to-year normalization is optimized for the **1st Millennium BCE**.
122
+ - **Language**: Parsing is primarily validated on **Akkadian** projects.
123
+
124
+ ## License
125
+
126
+ MIT — see [LICENSE](LICENSE).
127
+
128
+ ## Credits
129
+
130
+ Based on code by Niek Veldhuis ([Compass](https://github.com/niekveldhuis/compass)) and adapted for the BEn Project.
@@ -0,0 +1,34 @@
1
+ """
2
+ oracc-parser: Download and parse ORACC cuneiform text projects.
3
+ """
4
+ from __future__ import annotations
5
+
6
+ __version__ = "0.1.0"
7
+
8
+ # Public re-exports for convenience
9
+ from oracc_parser.pipeline import ( # noqa: F401
10
+ export_to_csv,
11
+ export_to_jsonl,
12
+ parse_project,
13
+ parse_project_from_word_csvs,
14
+ records_to_word_dataframes,
15
+ save_project_catalogue,
16
+ load_project_catalogue,
17
+ reference_data,
18
+ get_metadata_table,
19
+ get_transliterations,
20
+ get_normalizations,
21
+ get_lemmatizations,
22
+ get_unicode_texts,
23
+ get_translations,
24
+ get_full_flat_table,
25
+ )
26
+ from oracc_parser.io.word_csv import ( # noqa: F401
27
+ load_word_csvs_from_dir,
28
+ load_word_csvs_from_zenodo,
29
+ save_word_csv,
30
+ )
31
+ from oracc_parser.models.config import RunConfig # noqa: F401
32
+ from oracc_parser.metadata.populate import enrich_catalogue_df # noqa: F401
33
+ from oracc_parser.download.pleiades import PleiadesData # noqa: F401
34
+
@@ -0,0 +1,251 @@
1
+ """
2
+ JSON caching for parsed TabletRecord objects.
3
+
4
+ Parsed tablets are expensive to produce (long runtimes due to CDL tree
5
+ traversal, sign parsing, and translation downloads). This module caches
6
+ the full result including a **config fingerprint**.
7
+
8
+ On reload:
9
+ - If the current config matches the cached fingerprint → **instant return**
10
+ (everything is reused, including string representations)
11
+ - If the config differs → the cached **words** are reused and string
12
+ representations are rebuilt (cheap, no re-parsing needed)
13
+ - If not cached at all → full parse from scratch
14
+
15
+ Cache layout::
16
+
17
+ {cache_dir}/tablets/{project}/{text_id}.json
18
+
19
+ Each file is a JSON wrapper::
20
+
21
+ {"config_fingerprint": "a1b2c3d4", "record": { ... TabletRecord ... }}
22
+ """
23
+ from __future__ import annotations
24
+
25
+ import hashlib
26
+ import json
27
+ from pathlib import Path
28
+
29
+ from oracc_parser.utils.logger import get_logger
30
+
31
+ logger = get_logger()
32
+
33
+
34
+ # ---------------------------------------------------------------------------
35
+ # Config fingerprinting
36
+ # ---------------------------------------------------------------------------
37
+
# These RunConfig fields affect the parsed output and therefore make up the
# fingerprint.  The remaining options (use_cache, CACHE_DIR, limit,
# languages) are deliberately excluded.
_OUTPUT_AFFECTING_FIELDS = (
    "drop_missing",
    "drop_damaged",
    "keep_word_segmentation",
    "mask_pos",
    "max_break_fraction",
)


def config_fingerprint(config) -> str:
    """Compute a short, stable hash of the output-affecting config options.

    Only the attributes named in ``_OUTPUT_AFFECTING_FIELDS`` are read.
    Collection values (``list``, ``tuple``, ``set``, ``frozenset``) are
    sorted first, so the fingerprint does not depend on element order and
    unordered collections can be serialised at all.

    Args:
        config: A ``RunConfig`` instance (any object exposing the listed
            attributes works).

    Returns:
        8-char hex string (e.g. ``"a1b2c3d4"``): the first eight hex digits
        of the SHA-256 of the JSON-encoded options.

    Raises:
        AttributeError: If ``config`` lacks one of the fingerprinted fields.
    """
    key = {}
    for field in _OUTPUT_AFFECTING_FIELDS:
        val = getattr(config, field)
        # Canonicalise every collection type, not just lists: sets are not
        # JSON-serialisable and tuples would otherwise stay order-dependent.
        if isinstance(val, (list, tuple, set, frozenset)):
            val = sorted(val)
        key[field] = val

    # sort_keys makes the encoding independent of dict insertion order.
    raw = json.dumps(key, sort_keys=True)
    return hashlib.sha256(raw.encode()).hexdigest()[:8]
67
+
68
+
69
+ # ---------------------------------------------------------------------------
70
+ # Path helpers
71
+ # ---------------------------------------------------------------------------
72
+
73
+
74
+ def _resolve_cache_dir(cache_dir: str | None = None) -> Path:
75
+ """Return the base cache directory."""
76
+ if cache_dir:
77
+ return Path(cache_dir)
78
+ from oracc_parser.settings import CACHE_DIR as settings_CACHE_DIR
79
+ return settings_CACHE_DIR
80
+
81
+
def _tablet_path(
    project: str,
    text_id: str,
    cache_dir: str | None = None,
) -> Path:
    """Build the location of the cache file for a single tablet.

    The file lives at ``<cache root>/tablets/<project>/<text_id>.json``,
    where every ``/`` in the project path is replaced by ``-`` so that each
    project maps to one flat directory.

    Args:
        project: ORACC project path, e.g. ``"saao/saa01"``.
        text_id: Text identifier used as the file stem.
        cache_dir: Custom cache directory (overrides settings).

    Returns:
        Path of the tablet's JSON cache file (not required to exist).
    """
    tablets_root = _resolve_cache_dir(cache_dir) / "tablets"
    flat_project = project.replace("/", "-")
    return tablets_root.joinpath(flat_project, f"{text_id}.json")
91
+
92
+
93
+ # ---------------------------------------------------------------------------
94
+ # Load / Save
95
+ # ---------------------------------------------------------------------------
96
+
97
+
def load_cached_tablet(
    project: str,
    text_id: str,
    config,
    cache_dir: str | None = None,
) -> "TabletRecord | None":
    """Load a cached tablet, rebuilding string reps only if config changed.

    Two fast paths:

    1. **Config match** — the cached fingerprint equals the fingerprint of
       ``config``.  The stored record is returned as-is.

    2. **Config mismatch** (or a legacy file without a fingerprint) — the
       cached record is reused, but its word-level and Unicode string
       representations are rebuilt with the current config.

    Any exception while reading, validating or rebuilding is treated as a
    corrupt cache entry: a warning is logged, the file is removed on a
    best-effort basis, and ``None`` is returned so the caller re-parses.

    Args:
        project: ORACC project path, e.g. ``"saao/saa01"``.
        text_id: Text identifier, e.g. ``"P334189"``.
        config: ``RunConfig`` instance.
        cache_dir: Custom cache directory (overrides settings).

    Returns:
        The TabletRecord (possibly with rebuilt strings), or ``None`` when
        there is no usable cache entry.
    """
    from oracc_parser.models.tablet import TabletRecord
    from oracc_parser.parsing.parse_content import (
        _add_word_level_representations,
        _add_unicode_representation,
    )

    path = _tablet_path(project, text_id, cache_dir)
    if not path.exists():
        return None

    try:
        raw = path.read_text(encoding="utf-8")
        wrapper = json.loads(raw)

        # Handle both new wrapper format and legacy bare-record format
        if "record" in wrapper and "config_fingerprint" in wrapper:
            cached_fp = wrapper["config_fingerprint"]
            record = TabletRecord.model_validate(wrapper["record"])
        else:
            # Legacy format (bare TabletRecord JSON) — always rebuild:
            # None never equals the string fingerprint computed below.
            cached_fp = None
            record = TabletRecord.model_validate(wrapper)

        current_fp = config_fingerprint(config)

        if cached_fp == current_fp:
            # Fast path: config matches → everything is valid
            return record

        # Config changed → rebuild string representations from cached words
        record.content = _add_word_level_representations(
            record.content, config.mask_pos, config.max_break_fraction
        )
        record.content = _add_unicode_representation(
            record.content,
            drop_missing=config.drop_missing,
            drop_damaged=config.drop_damaged,
            keep_segmentation=config.keep_word_segmentation,
        )
        return record

    except Exception as e:
        logger.warning(f"Corrupt cache file {path}, will re-parse: {e}")
        # Removing the bad entry is best-effort: a failure here (read-only
        # or locked cache) must not defeat the fall-back to re-parsing.
        try:
            path.unlink(missing_ok=True)
        except OSError as unlink_err:
            logger.warning(f"Could not remove cache file {path}: {unlink_err}")
        return None
170
+
171
+
def save_tablet_to_cache(
    record: "TabletRecord",
    project: str,
    text_id: str,
    config,
    cache_dir: str | None = None,
) -> None:
    """Persist a TabletRecord to the JSON cache with a config fingerprint.

    The saved file is a wrapper holding the fingerprint of ``config`` next
    to the dumped record, so that a later load can tell whether the string
    representations were built with the same options.

    Writing is best-effort: failure to create the directory or to write the
    file is logged as a warning and otherwise ignored.

    Args:
        record: The parsed tablet to cache.
        project: ORACC project path.
        text_id: Text identifier.
        config: ``RunConfig`` instance (its fingerprint is stored).
        cache_dir: Custom cache directory.
    """
    path = _tablet_path(project, text_id, cache_dir)

    wrapper = {
        "config_fingerprint": config_fingerprint(config),
        "record": record.model_dump(mode="python"),
    }

    try:
        # Directory creation belongs to the same best-effort step as the
        # write: an unwritable cache location must not abort the caller.
        path.parent.mkdir(parents=True, exist_ok=True)
        path.write_text(
            # default=str stringifies any value json cannot encode natively.
            json.dumps(wrapper, indent=1, default=str, ensure_ascii=False),
            encoding="utf-8",
        )
    except Exception as e:
        logger.warning(f"Failed to write cache file {path}: {e}")
206
+
207
+
208
+ # ---------------------------------------------------------------------------
209
+ # Clear
210
+ # ---------------------------------------------------------------------------
211
+
212
+
def clear_project_cache(
    project: str | None = None,
    cache_dir: str | None = None,
) -> int:
    """Remove cached tablet JSON files for one project, or for all of them.

    Every ``*.json`` file below the target directory is deleted, then any
    directories left empty are removed.  The ``tablets`` root itself is kept.

    Args:
        project: ORACC project path.  A falsy value clears every project.
        cache_dir: Custom cache directory.

    Returns:
        Number of tablet JSON files deleted (``0`` if nothing was cached).
    """
    tablets_root = _resolve_cache_dir(cache_dir) / "tablets"
    if not tablets_root.exists():
        return 0

    target = tablets_root / project.replace("/", "-") if project else tablets_root
    if not target.exists():
        return 0

    count = 0
    for count, cached_file in enumerate(target.rglob("*.json"), start=1):
        cached_file.unlink()

    # Reverse-sorted paths list children before their parents, so nested
    # empty directories are removed bottom-up.
    for entry in sorted(target.rglob("*"), reverse=True):
        if not entry.is_dir():
            continue
        if any(entry.iterdir()):
            continue
        entry.rmdir()

    # A project's own directory is removed too once it is empty.
    if project and target.exists() and not any(target.iterdir()):
        target.rmdir()

    logger.info(f"Cleared {count} cached tablet(s)")
    return count