oracc-parser 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- oracc_parser-0.1.0/LICENSE +21 -0
- oracc_parser-0.1.0/PKG-INFO +166 -0
- oracc_parser-0.1.0/README.md +130 -0
- oracc_parser-0.1.0/oracc_parser/__init__.py +34 -0
- oracc_parser-0.1.0/oracc_parser/cache.py +251 -0
- oracc_parser-0.1.0/oracc_parser/cli.py +201 -0
- oracc_parser-0.1.0/oracc_parser/constants.py +104 -0
- oracc_parser-0.1.0/oracc_parser/download/__init__.py +1 -0
- oracc_parser-0.1.0/oracc_parser/download/extract_jsons.py +87 -0
- oracc_parser-0.1.0/oracc_parser/download/fetch_data.py +298 -0
- oracc_parser-0.1.0/oracc_parser/download/oracc_download.py +270 -0
- oracc_parser-0.1.0/oracc_parser/download/pleiades.py +174 -0
- oracc_parser-0.1.0/oracc_parser/enriched_data/__init__.py +1 -0
- oracc_parser-0.1.0/oracc_parser/enriched_data/grouped_oracc_metadata_columns.csv +338 -0
- oracc_parser-0.1.0/oracc_parser/enriched_data/languages.csv +36 -0
- oracc_parser-0.1.0/oracc_parser/enriched_data/period_mapping.csv +26 -0
- oracc_parser-0.1.0/oracc_parser/enriched_data/pos_tags.csv +50 -0
- oracc_parser-0.1.0/oracc_parser/enriched_data/projects_metadata.csv +223 -0
- oracc_parser-0.1.0/oracc_parser/enriched_data/provenience.csv +337 -0
- oracc_parser-0.1.0/oracc_parser/enriched_data/raw_archive_values.csv +713 -0
- oracc_parser-0.1.0/oracc_parser/enriched_data/sign_readings.csv +8903 -0
- oracc_parser-0.1.0/oracc_parser/enriched_data/state_supergroup_mapping.csv +57 -0
- oracc_parser-0.1.0/oracc_parser/export/__init__.py +1 -0
- oracc_parser-0.1.0/oracc_parser/export/to_jsonl.py +161 -0
- oracc_parser-0.1.0/oracc_parser/io/__init__.py +2 -0
- oracc_parser-0.1.0/oracc_parser/io/word_csv.py +467 -0
- oracc_parser-0.1.0/oracc_parser/metadata/__init__.py +1 -0
- oracc_parser-0.1.0/oracc_parser/metadata/archive.py +399 -0
- oracc_parser-0.1.0/oracc_parser/metadata/populate.py +564 -0
- oracc_parser-0.1.0/oracc_parser/models/__init__.py +1 -0
- oracc_parser-0.1.0/oracc_parser/models/config.py +114 -0
- oracc_parser-0.1.0/oracc_parser/models/tablet.py +237 -0
- oracc_parser-0.1.0/oracc_parser/parsing/__init__.py +1 -0
- oracc_parser-0.1.0/oracc_parser/parsing/parse_content.py +174 -0
- oracc_parser-0.1.0/oracc_parser/parsing/parse_signs.py +219 -0
- oracc_parser-0.1.0/oracc_parser/parsing/parse_words.py +177 -0
- oracc_parser-0.1.0/oracc_parser/parsing/text_builder.py +175 -0
- oracc_parser-0.1.0/oracc_parser/parsing/translation.py +91 -0
- oracc_parser-0.1.0/oracc_parser/pipeline.py +535 -0
- oracc_parser-0.1.0/oracc_parser/settings.py +120 -0
- oracc_parser-0.1.0/oracc_parser/utils/__init__.py +1 -0
- oracc_parser-0.1.0/oracc_parser/utils/logger.py +32 -0
- oracc_parser-0.1.0/oracc_parser/utils/paths.py +519 -0
- oracc_parser-0.1.0/oracc_parser/utils/unicode.py +109 -0
- oracc_parser-0.1.0/oracc_parser.egg-info/PKG-INFO +166 -0
- oracc_parser-0.1.0/oracc_parser.egg-info/SOURCES.txt +57 -0
- oracc_parser-0.1.0/oracc_parser.egg-info/dependency_links.txt +1 -0
- oracc_parser-0.1.0/oracc_parser.egg-info/entry_points.txt +2 -0
- oracc_parser-0.1.0/oracc_parser.egg-info/requires.txt +18 -0
- oracc_parser-0.1.0/oracc_parser.egg-info/top_level.txt +1 -0
- oracc_parser-0.1.0/pyproject.toml +65 -0
- oracc_parser-0.1.0/setup.cfg +4 -0
- oracc_parser-0.1.0/tests/test_cache.py +205 -0
- oracc_parser-0.1.0/tests/test_constants.py +73 -0
- oracc_parser-0.1.0/tests/test_export.py +64 -0
- oracc_parser-0.1.0/tests/test_models.py +86 -0
- oracc_parser-0.1.0/tests/test_parsing.py +52 -0
- oracc_parser-0.1.0/tests/test_pipeline.py +116 -0
- oracc_parser-0.1.0/tests/test_settings.py +41 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Shahar Spencer
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: oracc-parser
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Download and parse ORACC cuneiform text projects into ML-ready formats
|
|
5
|
+
Author: Avital Romach, Shahar Spencer, Claude Sonnet 4.6
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/shaharspencer/oracc-parser
|
|
8
|
+
Project-URL: Issues, https://github.com/shaharspencer/oracc-parser/issues
|
|
9
|
+
Keywords: oracc,cuneiform,akkadian,nlp,digital-humanities
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Intended Audience :: Science/Research
|
|
12
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Requires-Python: >=3.10
|
|
18
|
+
Description-Content-Type: text/markdown
|
|
19
|
+
License-File: LICENSE
|
|
20
|
+
Requires-Dist: requests>=2.31
|
|
21
|
+
Requires-Dist: pandas>=2.0
|
|
22
|
+
Requires-Dist: pydantic>=2.0
|
|
23
|
+
Requires-Dist: tqdm>=4.65
|
|
24
|
+
Requires-Dist: beautifulsoup4>=4.12
|
|
25
|
+
Provides-Extra: dev
|
|
26
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
27
|
+
Requires-Dist: pytest-cov>=4.0; extra == "dev"
|
|
28
|
+
Requires-Dist: httpx>=0.24; extra == "dev"
|
|
29
|
+
Provides-Extra: server
|
|
30
|
+
Requires-Dist: fastapi>=0.100; extra == "server"
|
|
31
|
+
Requires-Dist: uvicorn>=0.22; extra == "server"
|
|
32
|
+
Provides-Extra: notebooks
|
|
33
|
+
Requires-Dist: jupyter; extra == "notebooks"
|
|
34
|
+
Requires-Dist: matplotlib; extra == "notebooks"
|
|
35
|
+
Dynamic: license-file
|
|
36
|
+
|
|
37
|
+
# oracc-parser
|
|
38
|
+
|
|
39
|
+
A Python tool to download and parse [ORACC](http://oracc.museum.upenn.edu/) cuneiform text projects into machine-learning-ready formats (JSONL, CSV, pandas DataFrames).
|
|
40
|
+
|
|
41
|
+
## Features
|
|
42
|
+
|
|
43
|
+
- **Download** — Fetch project ZIPs directly from ORACC or Zenodo
|
|
44
|
+
- **Parse** — Convert raw ORACC JSON into structured data
|
|
45
|
+
- **Export** — Save datasets as JSONL, CSV, or pandas DataFrames
|
|
46
|
+
- **Configure** — Control handling of broken signs and POS masking using `RunConfig`
|
|
47
|
+
|
|
48
|
+
## Installation
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
git clone https://github.com/shaharspencer/oracc-parser.git
|
|
53
|
+
cd oracc-parser
|
|
54
|
+
pip install -e ".[dev]"
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
## Getting Started — Notebooks
|
|
58
|
+
|
|
59
|
+
The easiest way to explore oracc-parser is through the interactive notebooks.
|
|
60
|
+
Start with notebook 01 — it downloads all the data you need from Zenodo automatically.
|
|
61
|
+
|
|
62
|
+
| Notebook | What you'll learn |
|
|
63
|
+
|---|---|
|
|
64
|
+
| [`01_quickstart.ipynb`](notebooks/01_quickstart.ipynb) | Download the dataset → parse a project from pre-processed CSVs → explore transliterations, translations, and metadata → export |
|
|
65
|
+
| [`02_reference_data.ipynb`](notebooks/02_reference_data.ipynb) | Browse all projects in the dataset, query catalogues, explore bundled reference data (provenance, periods, sign list, POS tags) |
|
|
66
|
+
| [`03_configure_and_export.ipynb`](notebooks/03_configure_and_export.ipynb) | All `RunConfig` options — word-level and sign-level break filtering, POS masking — combining multiple projects and exporting datasets |
|
|
67
|
+
| [`04_oracc_json_processing.ipynb`](notebooks/04_oracc_json_processing.ipynb) | Advanced: understand the raw ORACC JSON structure, the JSON → TabletRecord → CSV pipeline, and how to download and parse projects not in the dataset |
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
pip install "oracc-parser[notebooks]"
|
|
71
|
+
jupyter notebook notebooks/
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
## Quick Example
|
|
75
|
+
|
|
76
|
+
```python
|
|
77
|
+
from oracc_parser import parse_project, RunConfig, get_full_flat_table
|
|
78
|
+
|
|
79
|
+
# Parse 5 tablets from SAA 01 (Neo-Assyrian royal letters)
|
|
80
|
+
records = parse_project("saao/saa01", config=RunConfig(limit=5))
|
|
81
|
+
|
|
82
|
+
# Get a flat DataFrame — no nesting, ready for analysis
|
|
83
|
+
df = get_full_flat_table(records)
|
|
84
|
+
df.to_json("dataset.jsonl", orient="records", lines=True)
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
## Configuration
|
|
88
|
+
|
|
89
|
+
You can customize the parsing process using `RunConfig`:
|
|
90
|
+
|
|
91
|
+
```python
|
|
92
|
+
from oracc_parser import parse_project, RunConfig
|
|
93
|
+
|
|
94
|
+
records = parse_project("saao/saa01", config=RunConfig(
|
|
95
|
+
limit=10,
|
|
96
|
+
max_break_fraction=0.5, # word-level: words that are >50% broken are replaced with X
|
|
97
|
+
drop_missing=True, # sign-level: drop [x] signs from Unicode output
|
|
98
|
+
drop_damaged=False, # sign-level: keep ⸢x⸣ signs in Unicode output
|
|
99
|
+
mask_pos=["PN", "DN"], # replace personal/divine names with tag
|
|
100
|
+
))
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
### Two independent levels of break filtering
|
|
104
|
+
|
|
105
|
+
`RunConfig` provides two distinct ways to handle damaged or missing text,
|
|
106
|
+
operating at different granularities and affecting different outputs:
|
|
107
|
+
|
|
108
|
+
| Parameter | Level | Affects | How it works |
|
|
109
|
+
|---|---|---|---|
|
|
110
|
+
| `max_break_fraction` | **Word** | Transliteration, normalization, lemmatization | Each word has a `break_perc` (fraction of its signs that are broken). Words exceeding this threshold are replaced with `X`. Default `1.0` keeps all words. |
|
|
111
|
+
| `drop_missing` | **Sign** | Unicode cuneiform only | Drops individual signs marked `[x]` (completely lost). |
|
|
112
|
+
| `drop_damaged` | **Sign** | Unicode cuneiform only | Drops individual signs marked `⸢x⸣` (partially legible). |
|
|
113
|
+
|
|
114
|
+
> **Note:** Because word-level and sign-level filtering use different thresholds
|
|
115
|
+
> and different granularities, **the text outputs and the Unicode cuneiform output
|
|
116
|
+
> are not necessarily aligned**. A word kept in the transliteration (because its
|
|
117
|
+
> average damage is below `max_break_fraction`) may still have individual signs
|
|
118
|
+
> dropped from the Unicode output if `drop_missing` / `drop_damaged` are enabled.
|
|
119
|
+
|
|
120
|
+
### Other options
|
|
121
|
+
|
|
122
|
+
| Parameter | Default | Description |
|
|
123
|
+
|---|---|---|
|
|
124
|
+
| `limit` | `None` | Only parse the first N texts (useful for testing) |
|
|
125
|
+
| `keep_word_segmentation` | `True` | Preserve word boundaries in Unicode cuneiform output |
|
|
126
|
+
| `mask_pos` | `[]` | Replace words of certain POS tags with the tag name |
|
|
127
|
+
| `languages` | `["Akkadian"]` | Which languages to include when downloading projects |
|
|
128
|
+
| `use_cache` | `True` | Use cached results if available |
|
|
129
|
+
|
|
130
|
+
All reference data is bundled with the package, so you don't need to configure external paths unless you are customizing `oracc_parser.settings`.
|
|
131
|
+
|
|
132
|
+
## CLI
|
|
133
|
+
|
|
134
|
+
```bash
|
|
135
|
+
oracc-parser download --project saao/saa01
|
|
136
|
+
oracc-parser parse --project saao/saa01 --limit 5 --format jsonl --output saa01.jsonl
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
## Heavy Data (Zenodo)
|
|
140
|
+
|
|
141
|
+
Large data files (ORACC ZIPs, cached translations, Pleiades data) are on Zenodo:
|
|
142
|
+
|
|
143
|
+
[DOI: 10.5281/zenodo.18643122](https://doi.org/10.5281/zenodo.18643122)
|
|
144
|
+
|
|
145
|
+
```bash
|
|
146
|
+
python scripts/download_zenodo_data.py
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
## Running Tests
|
|
150
|
+
|
|
151
|
+
```bash
|
|
152
|
+
pytest tests/ -v # 98 tests
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
## Known Limitations
|
|
156
|
+
|
|
157
|
+
- **Chronology**: Period-to-year normalization is optimized for the **1st Millennium BCE**.
|
|
158
|
+
- **Language**: Parsing is primarily validated on **Akkadian** projects.
|
|
159
|
+
|
|
160
|
+
## License
|
|
161
|
+
|
|
162
|
+
MIT — see [LICENSE](LICENSE).
|
|
163
|
+
|
|
164
|
+
## Credits
|
|
165
|
+
|
|
166
|
+
Based on code by Niek Veldhuis ([Compass](https://github.com/niekveldhuis/compass)) and adapted for the BEn Project.
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
# oracc-parser
|
|
2
|
+
|
|
3
|
+
A Python tool to download and parse [ORACC](http://oracc.museum.upenn.edu/) cuneiform text projects into machine-learning-ready formats (JSONL, CSV, pandas DataFrames).
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- **Download** — Fetch project ZIPs directly from ORACC or Zenodo
|
|
8
|
+
- **Parse** — Convert raw ORACC JSON into structured data
|
|
9
|
+
- **Export** — Save datasets as JSONL, CSV, or pandas DataFrames
|
|
10
|
+
- **Configure** — Control handling of broken signs and POS masking using `RunConfig`
|
|
11
|
+
|
|
12
|
+
## Installation
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
git clone https://github.com/shaharspencer/oracc-parser.git
|
|
17
|
+
cd oracc-parser
|
|
18
|
+
pip install -e ".[dev]"
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
## Getting Started — Notebooks
|
|
22
|
+
|
|
23
|
+
The easiest way to explore oracc-parser is through the interactive notebooks.
|
|
24
|
+
Start with notebook 01 — it downloads all the data you need from Zenodo automatically.
|
|
25
|
+
|
|
26
|
+
| Notebook | What you'll learn |
|
|
27
|
+
|---|---|
|
|
28
|
+
| [`01_quickstart.ipynb`](notebooks/01_quickstart.ipynb) | Download the dataset → parse a project from pre-processed CSVs → explore transliterations, translations, and metadata → export |
|
|
29
|
+
| [`02_reference_data.ipynb`](notebooks/02_reference_data.ipynb) | Browse all projects in the dataset, query catalogues, explore bundled reference data (provenance, periods, sign list, POS tags) |
|
|
30
|
+
| [`03_configure_and_export.ipynb`](notebooks/03_configure_and_export.ipynb) | All `RunConfig` options — word-level and sign-level break filtering, POS masking — combining multiple projects and exporting datasets |
|
|
31
|
+
| [`04_oracc_json_processing.ipynb`](notebooks/04_oracc_json_processing.ipynb) | Advanced: understand the raw ORACC JSON structure, the JSON → TabletRecord → CSV pipeline, and how to download and parse projects not in the dataset |
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
pip install "oracc-parser[notebooks]"
|
|
35
|
+
jupyter notebook notebooks/
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## Quick Example
|
|
39
|
+
|
|
40
|
+
```python
|
|
41
|
+
from oracc_parser import parse_project, RunConfig, get_full_flat_table
|
|
42
|
+
|
|
43
|
+
# Parse 5 tablets from SAA 01 (Neo-Assyrian royal letters)
|
|
44
|
+
records = parse_project("saao/saa01", config=RunConfig(limit=5))
|
|
45
|
+
|
|
46
|
+
# Get a flat DataFrame — no nesting, ready for analysis
|
|
47
|
+
df = get_full_flat_table(records)
|
|
48
|
+
df.to_json("dataset.jsonl", orient="records", lines=True)
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
## Configuration
|
|
52
|
+
|
|
53
|
+
You can customize the parsing process using `RunConfig`:
|
|
54
|
+
|
|
55
|
+
```python
|
|
56
|
+
from oracc_parser import parse_project, RunConfig
|
|
57
|
+
|
|
58
|
+
records = parse_project("saao/saa01", config=RunConfig(
|
|
59
|
+
limit=10,
|
|
60
|
+
max_break_fraction=0.5, # word-level: words that are >50% broken are replaced with X
|
|
61
|
+
drop_missing=True, # sign-level: drop [x] signs from Unicode output
|
|
62
|
+
drop_damaged=False, # sign-level: keep ⸢x⸣ signs in Unicode output
|
|
63
|
+
mask_pos=["PN", "DN"], # replace personal/divine names with tag
|
|
64
|
+
))
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
### Two independent levels of break filtering
|
|
68
|
+
|
|
69
|
+
`RunConfig` provides two distinct ways to handle damaged or missing text,
|
|
70
|
+
operating at different granularities and affecting different outputs:
|
|
71
|
+
|
|
72
|
+
| Parameter | Level | Affects | How it works |
|
|
73
|
+
|---|---|---|---|
|
|
74
|
+
| `max_break_fraction` | **Word** | Transliteration, normalization, lemmatization | Each word has a `break_perc` (fraction of its signs that are broken). Words exceeding this threshold are replaced with `X`. Default `1.0` keeps all words. |
|
|
75
|
+
| `drop_missing` | **Sign** | Unicode cuneiform only | Drops individual signs marked `[x]` (completely lost). |
|
|
76
|
+
| `drop_damaged` | **Sign** | Unicode cuneiform only | Drops individual signs marked `⸢x⸣` (partially legible). |
|
|
77
|
+
|
|
78
|
+
> **Note:** Because word-level and sign-level filtering use different thresholds
|
|
79
|
+
> and different granularities, **the text outputs and the Unicode cuneiform output
|
|
80
|
+
> are not necessarily aligned**. A word kept in the transliteration (because its
|
|
81
|
+
> average damage is below `max_break_fraction`) may still have individual signs
|
|
82
|
+
> dropped from the Unicode output if `drop_missing` / `drop_damaged` are enabled.
|
|
83
|
+
|
|
84
|
+
### Other options
|
|
85
|
+
|
|
86
|
+
| Parameter | Default | Description |
|
|
87
|
+
|---|---|---|
|
|
88
|
+
| `limit` | `None` | Only parse the first N texts (useful for testing) |
|
|
89
|
+
| `keep_word_segmentation` | `True` | Preserve word boundaries in Unicode cuneiform output |
|
|
90
|
+
| `mask_pos` | `[]` | Replace words of certain POS tags with the tag name |
|
|
91
|
+
| `languages` | `["Akkadian"]` | Which languages to include when downloading projects |
|
|
92
|
+
| `use_cache` | `True` | Use cached results if available |
|
|
93
|
+
|
|
94
|
+
All reference data is bundled with the package, so you don't need to configure external paths unless you are customizing `oracc_parser.settings`.
|
|
95
|
+
|
|
96
|
+
## CLI
|
|
97
|
+
|
|
98
|
+
```bash
|
|
99
|
+
oracc-parser download --project saao/saa01
|
|
100
|
+
oracc-parser parse --project saao/saa01 --limit 5 --format jsonl --output saa01.jsonl
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
## Heavy Data (Zenodo)
|
|
104
|
+
|
|
105
|
+
Large data files (ORACC ZIPs, cached translations, Pleiades data) are on Zenodo:
|
|
106
|
+
|
|
107
|
+
[DOI: 10.5281/zenodo.18643122](https://doi.org/10.5281/zenodo.18643122)
|
|
108
|
+
|
|
109
|
+
```bash
|
|
110
|
+
python scripts/download_zenodo_data.py
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
## Running Tests
|
|
114
|
+
|
|
115
|
+
```bash
|
|
116
|
+
pytest tests/ -v # 98 tests
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
## Known Limitations
|
|
120
|
+
|
|
121
|
+
- **Chronology**: Period-to-year normalization is optimized for the **1st Millennium BCE**.
|
|
122
|
+
- **Language**: Parsing is primarily validated on **Akkadian** projects.
|
|
123
|
+
|
|
124
|
+
## License
|
|
125
|
+
|
|
126
|
+
MIT — see [LICENSE](LICENSE).
|
|
127
|
+
|
|
128
|
+
## Credits
|
|
129
|
+
|
|
130
|
+
Based on code by Niek Veldhuis ([Compass](https://github.com/niekveldhuis/compass)) and adapted for the BEn Project.
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""
|
|
2
|
+
oracc-parser: Download and parse ORACC cuneiform text projects.
|
|
3
|
+
"""
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
6
|
+
__version__ = "0.1.0"
|
|
7
|
+
|
|
8
|
+
# Public re-exports for convenience
|
|
9
|
+
from oracc_parser.pipeline import ( # noqa: F401
|
|
10
|
+
export_to_csv,
|
|
11
|
+
export_to_jsonl,
|
|
12
|
+
parse_project,
|
|
13
|
+
parse_project_from_word_csvs,
|
|
14
|
+
records_to_word_dataframes,
|
|
15
|
+
save_project_catalogue,
|
|
16
|
+
load_project_catalogue,
|
|
17
|
+
reference_data,
|
|
18
|
+
get_metadata_table,
|
|
19
|
+
get_transliterations,
|
|
20
|
+
get_normalizations,
|
|
21
|
+
get_lemmatizations,
|
|
22
|
+
get_unicode_texts,
|
|
23
|
+
get_translations,
|
|
24
|
+
get_full_flat_table,
|
|
25
|
+
)
|
|
26
|
+
from oracc_parser.io.word_csv import ( # noqa: F401
|
|
27
|
+
load_word_csvs_from_dir,
|
|
28
|
+
load_word_csvs_from_zenodo,
|
|
29
|
+
save_word_csv,
|
|
30
|
+
)
|
|
31
|
+
from oracc_parser.models.config import RunConfig # noqa: F401
|
|
32
|
+
from oracc_parser.metadata.populate import enrich_catalogue_df # noqa: F401
|
|
33
|
+
from oracc_parser.download.pleiades import PleiadesData # noqa: F401
|
|
34
|
+
|
|
@@ -0,0 +1,251 @@
|
|
|
1
|
+
"""
|
|
2
|
+
JSON caching for parsed TabletRecord objects.
|
|
3
|
+
|
|
4
|
+
Parsed tablets are expensive to produce (long runtimes due to CDL tree
|
|
5
|
+
traversal, sign parsing, and translation downloads). This module caches
|
|
6
|
+
the full result including a **config fingerprint**.
|
|
7
|
+
|
|
8
|
+
On reload:
|
|
9
|
+
- If the current config matches the cached fingerprint → **instant return**
|
|
10
|
+
(everything is reused, including string representations)
|
|
11
|
+
- If the config differs → the cached **words** are reused and string
|
|
12
|
+
representations are rebuilt (cheap, no re-parsing needed)
|
|
13
|
+
- If not cached at all → full parse from scratch
|
|
14
|
+
|
|
15
|
+
Cache layout::
|
|
16
|
+
|
|
17
|
+
{cache_dir}/tablets/{project}/{text_id}.json
|
|
18
|
+
|
|
19
|
+
Each file is a JSON wrapper::
|
|
20
|
+
|
|
21
|
+
{"config_fingerprint": "a1b2c3d4", "record": { ... TabletRecord ... }}
|
|
22
|
+
"""
|
|
23
|
+
from __future__ import annotations
|
|
24
|
+
|
|
25
|
+
import hashlib
|
|
26
|
+
import json
|
|
27
|
+
from pathlib import Path
|
|
28
|
+
|
|
29
|
+
from oracc_parser.utils.logger import get_logger
|
|
30
|
+
|
|
31
|
+
logger = get_logger()
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
# ---------------------------------------------------------------------------
|
|
35
|
+
# Config fingerprinting
|
|
36
|
+
# ---------------------------------------------------------------------------
|
|
37
|
+
|
|
38
|
+
# These RunConfig fields affect the parsed output.
|
|
39
|
+
# Everything else (USE_CACHE, CACHE_DIR, limit, languages) does NOT.
|
|
40
|
+
_OUTPUT_AFFECTING_FIELDS = (
    "drop_missing",
    "drop_damaged",
    "keep_word_segmentation",
    "mask_pos",
    "max_break_fraction",
)


def config_fingerprint(config) -> str:
    """Hash the output-affecting options of *config* into a short token.

    Only the attributes named in ``_OUTPUT_AFFECTING_FIELDS`` are read.
    List values are sorted first, so the order of their items does not
    change the result.

    Args:
        config: Object exposing every attribute named in
            ``_OUTPUT_AFFECTING_FIELDS`` (e.g. a ``RunConfig``).

    Returns:
        The first 8 hex characters of the SHA-256 digest of the
        JSON-serialised options (e.g. ``"a1b2c3d4"``).

    Raises:
        AttributeError: If *config* lacks one of the expected attributes.
    """
    snapshot = {}
    for name in _OUTPUT_AFFECTING_FIELDS:
        value = getattr(config, name)
        # Normalise list order so ["PN", "DN"] and ["DN", "PN"] hash alike.
        snapshot[name] = sorted(value) if isinstance(value, list) else value

    # sort_keys makes the serialisation independent of insertion order.
    serialized = json.dumps(snapshot, sort_keys=True)
    digest = hashlib.sha256(serialized.encode())
    return digest.hexdigest()[:8]
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
# ---------------------------------------------------------------------------
|
|
70
|
+
# Path helpers
|
|
71
|
+
# ---------------------------------------------------------------------------
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _resolve_cache_dir(cache_dir: str | None = None) -> Path:
    """Pick the base cache directory.

    Args:
        cache_dir: Explicit cache location. Any falsy value (``None`` or
            ``""``) selects the package default instead.

    Returns:
        ``Path(cache_dir)`` when a location was supplied, otherwise
        ``oracc_parser.settings.CACHE_DIR`` as-is.
    """
    if not cache_dir:
        # The settings module is imported at call time, not at module import.
        from oracc_parser.settings import CACHE_DIR as default_cache_dir

        return default_cache_dir
    return Path(cache_dir)
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def _tablet_path(
    project: str,
    text_id: str,
    cache_dir: str | None = None,
) -> Path:
    """Build the location of the cache file for one tablet.

    The result has the form
    ``{cache_dir}/tablets/{project with "/" replaced by "-"}/{text_id}.json``.

    Args:
        project: ORACC project path, e.g. ``"saao/saa01"``.
        text_id: Text identifier, e.g. ``"P334189"``.
        cache_dir: Custom cache directory; falls back to the default
            chosen by ``_resolve_cache_dir``.

    Returns:
        Path of the tablet's JSON cache file (the file may not exist).
    """
    tablets_root = _resolve_cache_dir(cache_dir) / "tablets"
    # Flatten the project path so each project maps to a single directory.
    flat_project = "-".join(project.split("/"))
    return tablets_root.joinpath(flat_project, f"{text_id}.json")
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
# ---------------------------------------------------------------------------
|
|
94
|
+
# Load / Save
|
|
95
|
+
# ---------------------------------------------------------------------------
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def load_cached_tablet(
    project: str,
    text_id: str,
    config,
    cache_dir: str | None = None,
) -> "TabletRecord | None":
    """Fetch a tablet from the JSON cache, refreshing its strings if needed.

    The cache file is either a wrapper
    ``{"config_fingerprint": ..., "record": ...}`` or a legacy bare
    ``TabletRecord`` dump. What happens next depends on the fingerprint:

    * **Fingerprint matches** the current config: the record is returned
      exactly as stored.
    * **Fingerprint differs** (always the case for legacy files, which have
      none): the word-level and Unicode representations in
      ``record.content`` are rebuilt from the cached content using the
      current config, then the record is returned.

    Any exception raised while reading, validating or rebuilding is treated
    as a corrupt cache entry: a warning is logged, the file is deleted and
    ``None`` is returned so the caller can re-parse.

    Args:
        project: ORACC project path, e.g. ``"saao/saa01"``.
        text_id: Text identifier, e.g. ``"P334189"``.
        config: ``RunConfig`` instance.
        cache_dir: Custom cache directory (overrides settings).

    Returns:
        The cached ``TabletRecord``, or ``None`` when there is no usable
        cache entry.
    """
    from oracc_parser.models.tablet import TabletRecord
    from oracc_parser.parsing.parse_content import (
        _add_word_level_representations,
        _add_unicode_representation,
    )

    path = _tablet_path(project, text_id, cache_dir)
    if not path.exists():
        return None

    try:
        payload = json.loads(path.read_text(encoding="utf-8"))

        is_wrapped = "record" in payload and "config_fingerprint" in payload
        if is_wrapped:
            stored_fp = payload["config_fingerprint"]
            record = TabletRecord.model_validate(payload["record"])
        else:
            # Legacy bare-record file: no fingerprint, so it never matches.
            stored_fp = None
            record = TabletRecord.model_validate(payload)

        if stored_fp != config_fingerprint(config):
            # Config changed: regenerate the strings from the cached content.
            record.content = _add_word_level_representations(
                record.content, config.mask_pos, config.max_break_fraction
            )
            record.content = _add_unicode_representation(
                record.content,
                drop_missing=config.drop_missing,
                drop_damaged=config.drop_damaged,
                keep_segmentation=config.keep_word_segmentation,
            )

        return record

    except Exception as e:
        logger.warning(f"Corrupt cache file {path}, will re-parse: {e}")
        path.unlink(missing_ok=True)
        return None
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def save_tablet_to_cache(
    record: "TabletRecord",
    project: str,
    text_id: str,
    config,
    cache_dir: str | None = None,
) -> None:
    """Persist a TabletRecord to the JSON cache with a config fingerprint.

    The saved file includes the config fingerprint so that on reload
    we can skip string rebuilding when the config hasn't changed.

    Saving is best-effort: if the cache directory cannot be created or the
    file cannot be written, a warning is logged and the function returns
    normally.

    Args:
        record: The parsed tablet to cache.
        project: ORACC project path.
        text_id: Text identifier.
        config: ``RunConfig`` instance (its fingerprint is stored).
        cache_dir: Custom cache directory.
    """
    path = _tablet_path(project, text_id, cache_dir)

    wrapper = {
        "config_fingerprint": config_fingerprint(config),
        "record": record.model_dump(mode="python"),
    }

    try:
        # Directory creation shares the handler with the write, so an
        # unwritable cache location is logged instead of raised.
        path.parent.mkdir(parents=True, exist_ok=True)
        path.write_text(
            # default=str stringifies values json cannot encode natively;
            # ensure_ascii=False keeps non-ASCII characters unescaped.
            json.dumps(wrapper, indent=1, default=str, ensure_ascii=False),
            encoding="utf-8",
        )
    except Exception as e:
        logger.warning(f"Failed to write cache file {path}: {e}")
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
# ---------------------------------------------------------------------------
|
|
209
|
+
# Clear
|
|
210
|
+
# ---------------------------------------------------------------------------
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
def clear_project_cache(
    project: str | None = None,
    cache_dir: str | None = None,
) -> int:
    """Remove cached tablet JSON files for one project, or for all of them.

    After the files are deleted, directories below the target that have
    become empty are removed. The project's own directory is removed too
    when it ends up empty; the shared ``tablets`` root is left in place.

    Args:
        project: ORACC project path. A falsy value clears every project.
        cache_dir: Custom cache directory.

    Returns:
        Number of ``*.json`` files deleted (``0`` if nothing was cached).
    """
    tablets_root = _resolve_cache_dir(cache_dir) / "tablets"
    if not tablets_root.exists():
        return 0

    target = (tablets_root / project.replace("/", "-")) if project else tablets_root
    if not target.exists():
        return 0

    count = 0
    for json_file in target.rglob("*.json"):
        json_file.unlink()
        count += 1

    # Reverse-sorted paths list children before their parents, so nested
    # empty directories are removed bottom-up.
    for entry in sorted(target.rglob("*"), reverse=True):
        if not entry.is_dir():
            continue
        if not any(entry.iterdir()):
            entry.rmdir()

    project_dir_is_empty = (
        project and target.exists() and not any(target.iterdir())
    )
    if project_dir_is_empty:
        target.rmdir()

    logger.info(f"Cleared {count} cached tablet(s)")
    return count
|