oracc-parser 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. oracc_parser/__init__.py +34 -0
  2. oracc_parser/cache.py +251 -0
  3. oracc_parser/cli.py +201 -0
  4. oracc_parser/constants.py +104 -0
  5. oracc_parser/download/__init__.py +1 -0
  6. oracc_parser/download/extract_jsons.py +87 -0
  7. oracc_parser/download/fetch_data.py +298 -0
  8. oracc_parser/download/oracc_download.py +270 -0
  9. oracc_parser/download/pleiades.py +174 -0
  10. oracc_parser/enriched_data/__init__.py +1 -0
  11. oracc_parser/enriched_data/grouped_oracc_metadata_columns.csv +338 -0
  12. oracc_parser/enriched_data/languages.csv +36 -0
  13. oracc_parser/enriched_data/period_mapping.csv +26 -0
  14. oracc_parser/enriched_data/pos_tags.csv +50 -0
  15. oracc_parser/enriched_data/projects_metadata.csv +223 -0
  16. oracc_parser/enriched_data/provenience.csv +337 -0
  17. oracc_parser/enriched_data/raw_archive_values.csv +713 -0
  18. oracc_parser/enriched_data/sign_readings.csv +8903 -0
  19. oracc_parser/enriched_data/state_supergroup_mapping.csv +57 -0
  20. oracc_parser/export/__init__.py +1 -0
  21. oracc_parser/export/to_jsonl.py +161 -0
  22. oracc_parser/io/__init__.py +2 -0
  23. oracc_parser/io/word_csv.py +467 -0
  24. oracc_parser/metadata/__init__.py +1 -0
  25. oracc_parser/metadata/archive.py +399 -0
  26. oracc_parser/metadata/populate.py +564 -0
  27. oracc_parser/models/__init__.py +1 -0
  28. oracc_parser/models/config.py +114 -0
  29. oracc_parser/models/tablet.py +237 -0
  30. oracc_parser/parsing/__init__.py +1 -0
  31. oracc_parser/parsing/parse_content.py +174 -0
  32. oracc_parser/parsing/parse_signs.py +219 -0
  33. oracc_parser/parsing/parse_words.py +177 -0
  34. oracc_parser/parsing/text_builder.py +175 -0
  35. oracc_parser/parsing/translation.py +91 -0
  36. oracc_parser/pipeline.py +535 -0
  37. oracc_parser/settings.py +120 -0
  38. oracc_parser/utils/__init__.py +1 -0
  39. oracc_parser/utils/logger.py +32 -0
  40. oracc_parser/utils/paths.py +519 -0
  41. oracc_parser/utils/unicode.py +109 -0
  42. oracc_parser-0.1.0.dist-info/METADATA +166 -0
  43. oracc_parser-0.1.0.dist-info/RECORD +47 -0
  44. oracc_parser-0.1.0.dist-info/WHEEL +5 -0
  45. oracc_parser-0.1.0.dist-info/entry_points.txt +2 -0
  46. oracc_parser-0.1.0.dist-info/licenses/LICENSE +21 -0
  47. oracc_parser-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,34 @@
1
+ """
2
+ oracc-parser: Download and parse ORACC cuneiform text projects.
3
+ """
4
+ from __future__ import annotations
5
+
6
+ __version__ = "0.1.0"
7
+
8
+ # Public re-exports for convenience
9
+ from oracc_parser.pipeline import ( # noqa: F401
10
+ export_to_csv,
11
+ export_to_jsonl,
12
+ parse_project,
13
+ parse_project_from_word_csvs,
14
+ records_to_word_dataframes,
15
+ save_project_catalogue,
16
+ load_project_catalogue,
17
+ reference_data,
18
+ get_metadata_table,
19
+ get_transliterations,
20
+ get_normalizations,
21
+ get_lemmatizations,
22
+ get_unicode_texts,
23
+ get_translations,
24
+ get_full_flat_table,
25
+ )
26
+ from oracc_parser.io.word_csv import ( # noqa: F401
27
+ load_word_csvs_from_dir,
28
+ load_word_csvs_from_zenodo,
29
+ save_word_csv,
30
+ )
31
+ from oracc_parser.models.config import RunConfig # noqa: F401
32
+ from oracc_parser.metadata.populate import enrich_catalogue_df # noqa: F401
33
+ from oracc_parser.download.pleiades import PleiadesData # noqa: F401
34
+
oracc_parser/cache.py ADDED
@@ -0,0 +1,251 @@
1
+ """
2
+ JSON caching for parsed TabletRecord objects.
3
+
4
+ Parsed tablets are expensive to produce (long runtimes due to CDL tree
5
+ traversal, sign parsing, and translation downloads). This module caches
6
+ the full result including a **config fingerprint**.
7
+
8
+ On reload:
9
+ - If the current config matches the cached fingerprint → **instant return**
10
+ (everything is reused, including string representations)
11
+ - If the config differs → the cached **words** are reused and string
12
+ representations are rebuilt (cheap, no re-parsing needed)
13
+ - If not cached at all → full parse from scratch
14
+
15
+ Cache layout::
16
+
17
+ {cache_dir}/tablets/{project}/{text_id}.json
18
+
19
+ Each file is a JSON wrapper::
20
+
21
+ {"config_fingerprint": "a1b2c3d4", "record": { ... TabletRecord ... }}
22
+ """
23
+ from __future__ import annotations
24
+
25
+ import hashlib
26
+ import json
27
+ from pathlib import Path
28
+
29
+ from oracc_parser.utils.logger import get_logger
30
+
31
+ logger = get_logger()
32
+
33
+
34
+ # ---------------------------------------------------------------------------
35
+ # Config fingerprinting
36
+ # ---------------------------------------------------------------------------
37
+
38
+ # These RunConfig fields affect the parsed output.
39
+ # Everything else (USE_CACHE, CACHE_DIR, limit, languages) does NOT.
40
_OUTPUT_AFFECTING_FIELDS = (
    "drop_missing",
    "drop_damaged",
    "keep_word_segmentation",
    "mask_pos",
    "max_break_fraction",
)


def config_fingerprint(config) -> str:
    """Hash the config options that influence parsed output.

    Only the attributes named in ``_OUTPUT_AFFECTING_FIELDS`` are read, so
    two configs that differ in any other attribute share a fingerprint.
    List values are sorted first, which makes the result independent of
    the order in which list items were supplied.

    Args:
        config: Object exposing every attribute listed in
            ``_OUTPUT_AFFECTING_FIELDS`` (a ``RunConfig`` instance).

    Returns:
        The first 8 hex characters of a SHA-256 digest
        (e.g. ``"a1b2c3d4"``).
    """

    def _normalised(value):
        # Sort lists so item order cannot change the digest.
        return sorted(value) if isinstance(value, list) else value

    relevant = {
        name: _normalised(getattr(config, name))
        for name in _OUTPUT_AFFECTING_FIELDS
    }
    # sort_keys keeps the serialised form independent of dict ordering.
    serialised = json.dumps(relevant, sort_keys=True)
    digest = hashlib.sha256(serialised.encode())
    return digest.hexdigest()[:8]
67
+
68
+
69
+ # ---------------------------------------------------------------------------
70
+ # Path helpers
71
+ # ---------------------------------------------------------------------------
72
+
73
+
74
+ def _resolve_cache_dir(cache_dir: str | None = None) -> Path:
75
+ """Return the base cache directory."""
76
+ if cache_dir:
77
+ return Path(cache_dir)
78
+ from oracc_parser.settings import CACHE_DIR as settings_CACHE_DIR
79
+ return settings_CACHE_DIR
80
+
81
+
82
def _tablet_path(
    project: str,
    text_id: str,
    cache_dir: str | None = None,
) -> Path:
    """Build the location of the cache file for a single tablet.

    The result has the shape
    ``<cache base>/tablets/<project with "/" replaced by "-">/<text_id>.json``.

    Args:
        project: ORACC project path, e.g. ``"saao/saa01"``.
        text_id: Text identifier, used as the file stem.
        cache_dir: Optional override for the cache base directory.

    Returns:
        Path of the tablet's JSON cache file (which may not exist yet).
    """
    tablets_root = _resolve_cache_dir(cache_dir) / "tablets"
    # Flatten the project path so each project maps to one directory level.
    flattened_project = project.replace("/", "-")
    file_name = f"{text_id}.json"
    return tablets_root / flattened_project / file_name
91
+
92
+
93
+ # ---------------------------------------------------------------------------
94
+ # Load / Save
95
+ # ---------------------------------------------------------------------------
96
+
97
+
98
def load_cached_tablet(
    project: str,
    text_id: str,
    config,
    cache_dir: str | None = None,
) -> "TabletRecord | None":
    """Return a tablet from the JSON cache, refreshing strings if needed.

    Outcomes:

    * No cache file exists: ``None`` is returned.
    * The stored fingerprint equals ``config_fingerprint(config)``: the
      validated record is returned untouched.
    * The fingerprint differs, or the file is a legacy bare record with
      no fingerprint: the word-level and Unicode representations of
      ``record.content`` are recomputed with the current config before
      the record is returned.
    * Anything raises while reading, validating or rebuilding: a warning
      is logged, the cache file is deleted and ``None`` is returned.

    Args:
        project: ORACC project path, e.g. ``"saao/saa01"``.
        text_id: Text identifier, e.g. ``"P334189"``.
        config: ``RunConfig`` instance.
        cache_dir: Custom cache directory (overrides settings).

    Returns:
        The cached ``TabletRecord`` (possibly with rebuilt strings), or
        ``None`` when nothing usable is cached.
    """
    from oracc_parser.models.tablet import TabletRecord
    from oracc_parser.parsing.parse_content import (
        _add_word_level_representations,
        _add_unicode_representation,
    )

    path = _tablet_path(project, text_id, cache_dir)
    if not path.exists():
        return None

    try:
        payload = json.loads(path.read_text(encoding="utf-8"))

        # New files wrap the record next to its fingerprint; legacy files
        # are a bare record and therefore never match the current config.
        is_wrapped = "record" in payload and "config_fingerprint" in payload
        if is_wrapped:
            stored_fp = payload["config_fingerprint"]
            tablet = TabletRecord.model_validate(payload["record"])
        else:
            stored_fp = None
            tablet = TabletRecord.model_validate(payload)

        if stored_fp != config_fingerprint(config):
            # Config changed: recompute the derived string representations.
            tablet.content = _add_word_level_representations(
                tablet.content, config.mask_pos, config.max_break_fraction
            )
            tablet.content = _add_unicode_representation(
                tablet.content,
                drop_missing=config.drop_missing,
                drop_damaged=config.drop_damaged,
                keep_segmentation=config.keep_word_segmentation,
            )
        return tablet

    except Exception as e:
        logger.warning(f"Corrupt cache file {path}, will re-parse: {e}")
        # Drop the unusable file so the next run starts from a clean slate.
        path.unlink(missing_ok=True)
        return None
170
+
171
+
172
def save_tablet_to_cache(
    record: "TabletRecord",
    project: str,
    text_id: str,
    config,
    cache_dir: str | None = None,
) -> None:
    """Persist a TabletRecord to the JSON cache with a config fingerprint.

    The file written is ``{"config_fingerprint": ..., "record": ...}``.
    Storing the fingerprint lets a later load detect whether the record
    was produced with the same output-affecting options.

    Writing the cache is best-effort: a failure to create the project
    directory or to write the file is logged as a warning and swallowed,
    so an unwritable cache location cannot abort the caller.

    Args:
        record: The parsed tablet to cache.
        project: ORACC project path.
        text_id: Text identifier.
        config: ``RunConfig`` instance (its fingerprint is stored).
        cache_dir: Custom cache directory.
    """
    path = _tablet_path(project, text_id, cache_dir)

    wrapper = {
        "config_fingerprint": config_fingerprint(config),
        "record": record.model_dump(mode="python"),
    }

    try:
        # Directory creation belongs to the guarded section: it fails for
        # the same reasons as the write (permissions, read-only media).
        path.parent.mkdir(parents=True, exist_ok=True)
        path.write_text(
            # default=str stringifies values json cannot encode natively.
            json.dumps(wrapper, indent=1, default=str, ensure_ascii=False),
            encoding="utf-8",
        )
    except Exception as e:
        logger.warning(f"Failed to write cache file {path}: {e}")
206
+
207
+
208
+ # ---------------------------------------------------------------------------
209
+ # Clear
210
+ # ---------------------------------------------------------------------------
211
+
212
+
213
def clear_project_cache(
    project: str | None = None,
    cache_dir: str | None = None,
) -> int:
    """Remove cached tablet JSON files for one project or for all of them.

    After the files are deleted, directories left empty beneath the
    target are removed as well. The project directory itself is removed
    only when a specific project was requested; the shared ``tablets``
    directory is always kept.

    Args:
        project: ORACC project path. A falsy value clears every project.
        cache_dir: Custom cache directory.

    Returns:
        Number of ``*.json`` files deleted (``0`` when the cache or the
        project directory does not exist).
    """
    tablets_root = _resolve_cache_dir(cache_dir) / "tablets"
    if not tablets_root.exists():
        return 0

    target = (
        tablets_root / project.replace("/", "-") if project else tablets_root
    )
    if not target.exists():
        return 0

    count = 0
    for cached_file in target.rglob("*.json"):
        cached_file.unlink()
        count += 1

    # Reverse-sorted order visits children before their parents, so nested
    # directories empty out from the bottom up.
    for entry in reversed(sorted(target.rglob("*"))):
        if entry.is_dir() and not any(entry.iterdir()):
            entry.rmdir()
    if project and target.exists() and not any(target.iterdir()):
        target.rmdir()

    logger.info(f"Cleared {count} cached tablet(s)")
    return count
oracc_parser/cli.py ADDED
@@ -0,0 +1,201 @@
1
+ """
2
+ Command-line interface for oracc-parser.
3
+
4
+ Usage:
5
+ oracc-parser download --project saao/saa01
6
+ oracc-parser download --lang akkadian
7
+ oracc-parser parse --project saao/saa01 --format jsonl --output data.jsonl
8
+ oracc-parser parse --project saao/saa01 --limit 5 --format csv --output data.csv
9
+ oracc-parser clear-cache # clear all cached tablets
10
+ oracc-parser clear-cache --project saao # clear one project's cache
11
+ """
12
+ from __future__ import annotations
13
+
14
+ import argparse
15
+ import sys
16
+
17
+ from oracc_parser.pipeline import export_to_csv, export_to_jsonl, parse_project
18
+ from oracc_parser.download.oracc_download import download_projects
19
+ from oracc_parser.models.config import RunConfig
20
+ from oracc_parser.utils.logger import get_logger
21
+
22
+ logger = get_logger()
23
+
24
+
25
def main(argv: list[str] | None = None):
    """Entry point for the oracc-parser CLI.

    Builds the argument parser, parses ``argv`` and dispatches to the
    handler of the selected sub-command. When no sub-command is given the
    help text is printed and the process exits with status 1.

    For ``parse``, the output path defaults to ``output.<format>`` so the
    default file extension always matches the chosen ``--format``.

    Args:
        argv: Arguments to parse. ``None`` lets argparse read
            ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(
        prog="oracc-parser",
        description="Download and parse ORACC cuneiform text projects.",
    )
    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    # --------------- download ---------------
    dl = subparsers.add_parser("download", help="Download ORACC project ZIPs")
    dl.add_argument("--project", "-p", help="Project path, e.g. saao/saa01")
    dl.add_argument(
        "--lang",
        "-l",
        default="Akkadian",
        help="Language filter for bulk download (default: Akkadian)",
    )
    dl.add_argument("--limit", "-n", type=int, help="Download only first N projects")

    # --------------- parse ---------------
    ps = subparsers.add_parser("parse", help="Parse a project and export results")
    ps.add_argument("--project", "-p", required=True, help="Project path")
    ps.add_argument(
        "--format",
        "-f",
        choices=["jsonl", "csv"],
        default="jsonl",
        help="Output format (default: jsonl)",
    )
    ps.add_argument(
        "--output",
        "-o",
        # Resolved after parsing so the default extension follows --format.
        default=None,
        help="Output file path (default: output.<format>)",
    )
    ps.add_argument("--limit", "-n", type=int, help="Parse only first N texts")
    ps.add_argument(
        "--drop-missing",
        action="store_true",
        help="Drop entirely missing signs [x]",
    )
    ps.add_argument(
        "--drop-damaged",
        action="store_true",
        help="Drop damaged signs ⸢x⸣",
    )
    ps.add_argument(
        "--mask-pos",
        nargs="*",
        default=[],
        help="POS tags to mask (e.g. PN DN GN)",
    )
    ps.add_argument("--no-cache", action="store_true", help="Disable caching")
    ps.add_argument("--no-download", action="store_true", help="Skip download step")

    # --------------- fetch-data ---------------
    fd = subparsers.add_parser("fetch-data", help="Download pre-processed data from Zenodo")
    fd.add_argument("--url", "-u", default=None, help="Zenodo record URL")
    fd.add_argument("--output", "-o", default=None, help="Destination directory")

    # --------------- info ---------------
    subparsers.add_parser("info", help="Show bundled reference data summary")

    # --------------- clear-cache ---------------
    cc = subparsers.add_parser("clear-cache", help="Delete cached parsed tablets")
    cc.add_argument(
        "--project", "-p",
        help="Only clear cache for this project (default: clear all)",
    )

    args = parser.parse_args(argv)

    # A fixed "output.jsonl" default would put CSV data into a .jsonl file
    # whenever --format csv is used without an explicit --output.
    if args.command == "parse" and args.output is None:
        args.output = f"output.{args.format}"

    if args.command == "download":
        _cmd_download(args)
    elif args.command == "fetch-data":
        _cmd_fetch_data(args)
    elif args.command == "parse":
        _cmd_parse(args)
    elif args.command == "info":
        _cmd_info()
    elif args.command == "clear-cache":
        _cmd_clear_cache(args)
    else:
        # No sub-command supplied.
        parser.print_help()
        sys.exit(1)
107
+
108
+
109
def _cmd_fetch_data(args):
    """Run the ``fetch-data`` sub-command.

    Forwards ``args.url`` and ``args.output`` to ``fetch_data``. The
    destination is passed as a ``Path`` when given, otherwise as ``None``.

    Args:
        args: Parsed CLI namespace with ``url`` and ``output`` attributes.
    """
    from pathlib import Path
    from oracc_parser.download.fetch_data import fetch_data

    destination = None
    if args.output:
        destination = Path(args.output)

    fetch_data(url=args.url, dest=destination)
118
+
119
+
120
def _cmd_download(args):
    """Run the ``download`` sub-command.

    With ``--project`` a single project ZIP is downloaded; a falsy result
    from ``download_zip`` prints an error to stderr and exits with
    status 1. Without it, ``download_projects`` is called with a config
    built from ``--lang`` and ``--limit``.

    Args:
        args: Parsed CLI namespace with ``project``, ``lang`` and ``limit``.
    """
    languages = [args.lang] if args.lang else ["Akkadian"]
    config = RunConfig(languages=languages, limit=args.limit)

    if not args.project:
        # Bulk mode: the config decides which projects are fetched.
        paths = download_projects(config=config)
        print(f"Downloaded {len(paths)} project(s).")
        return

    from oracc_parser.download.oracc_download import download_zip

    path = download_zip(args.project)
    if not path:
        print("Download failed.", file=sys.stderr)
        sys.exit(1)
    print(f"Downloaded: {path}")
139
+
140
+
141
def _cmd_parse(args):
    """Run the ``parse`` sub-command.

    Parses ``args.project`` with a config built from the CLI flags, then
    exports the records with the exporter matching ``args.format``. When
    no records come back, an error is printed to stderr and the process
    exits with status 1.

    Args:
        args: Parsed CLI namespace of the ``parse`` sub-command.
    """
    config = RunConfig(
        drop_missing=args.drop_missing,
        drop_damaged=args.drop_damaged,
        mask_pos=args.mask_pos,
        use_cache=not args.no_cache,
        limit=args.limit,
    )
    should_download = not args.no_download
    records = parse_project(args.project, config=config, download=should_download)

    if not records:
        print("No records parsed.", file=sys.stderr)
        sys.exit(1)

    # argparse restricts --format to "jsonl" or "csv".
    exporter = export_to_jsonl if args.format == "jsonl" else export_to_csv
    path = exporter(records, args.output)

    print(f"Exported {len(records)} records to {path}")
165
+
166
+
167
def _cmd_info():
    """Print a one-line summary for each bundled reference dataset.

    Every loader is called in turn. Its row count and column names are
    printed to stdout; if a loader raises, the error is reported on
    stderr and the remaining datasets are still processed.
    """
    from oracc_parser.pipeline import reference_data

    labelled_loaders = (
        ("Provenance", reference_data.get_provenance),
        ("Period mapping", reference_data.get_period_mapping),
        ("Sign list", reference_data.get_sign_list),
        ("POS tags", reference_data.get_pos_tags),
        ("Languages", reference_data.get_languages),
        ("Projects metadata", reference_data.get_projects_metadata),
    )

    for name, loader in labelled_loaders:
        try:
            df = loader()
            print(f" {name}: {len(df)} rows, columns: {list(df.columns)}")
        except Exception as e:
            # One failing dataset must not hide the others.
            print(f" {name}: Error loading - {e}", file=sys.stderr)
186
+
187
+
188
def _cmd_clear_cache(args):
    """Run the ``clear-cache`` sub-command.

    Deletes cached tablets for ``args.project`` (or for every project
    when it is not set) and reports how many files were removed.

    Args:
        args: Parsed CLI namespace with a ``project`` attribute.
    """
    from oracc_parser.cache import clear_project_cache

    count = clear_project_cache(project=args.project)
    if not count:
        print("No cached tablets found.")
        return

    scope = "all projects"
    if args.project:
        scope = f"project '{args.project}'"
    print(f"Cleared {count} cached tablet(s) for {scope}.")
198
+
199
+
200
+ if __name__ == "__main__":
201
+ main()
@@ -0,0 +1,104 @@
1
+ """
2
+ Standardized sentinel values and warning messages used across oracc-parser.
3
+
4
+ All "unknown" / "not found" / "unmapped" states are centralized here
5
+ so the user sees consistent, informative messages rather than ad-hoc strings.
6
+ """
7
+ from __future__ import annotations
8
+
9
+ # ---------------------------------------------------------------------------
10
+ # Sentinel values — used as field defaults when real data is unavailable
11
+ # ---------------------------------------------------------------------------
12
+
13
+ # Geography
14
+ CITY_UNKNOWN = "unknown"
15
+ """Provenance city could not be determined from ORACC catalogue."""
16
+
17
+ STATE_UNMAPPED = "unmapped"
18
+ """Project has not been mapped to a state/empire grouping in our reference data."""
19
+
20
+ # Chronology
21
+ PERIOD_UNKNOWN = "unknown"
22
+ """Historical period could not be determined."""
23
+
24
+ YEAR_UNKNOWN = None
25
+ """Year could not be resolved from the period mapping (represented as None)."""
26
+
27
+ # POS / Language
28
+ POS_NOT_PROVIDED = "NOT_PROVIDED"
29
+ """Part-of-speech tag was absent from the ORACC data for this word."""
30
+
31
+ LANGUAGE_UNKNOWN = "unknown"
32
+ """Language code could not be mapped to a known language."""
33
+
34
+ # Content
35
+ TRANSLATION_UNAVAILABLE = ""
36
+ """English translation was not available on the ORACC web interface."""
37
+
38
+ SIGN_UNICODE_FALLBACK = "U"
39
+ """Sign reading could not be converted to a Unicode cuneiform character."""
40
+
41
+ SIGN_BROKEN = "X"
42
+ """Sign is entirely missing / broken beyond recognition."""
43
+
44
+
45
+ # ---------------------------------------------------------------------------
46
+ # Warning messages — logged when edge cases are encountered
47
+ # ---------------------------------------------------------------------------
48
+
49
def warn_unmapped_city(project: str, raw_prov: str) -> str:
    """Build the warning for a provenance value missing from reference data.

    Args:
        project: ORACC project path, used as the message prefix.
        raw_prov: Provenance value as it was found.

    Returns:
        The formatted warning text. It names the bundled reference file
        by its location inside the package so the hint can be followed.
    """
    return (
        f"[{project}] Provenance '{raw_prov}' not found in reference data. "
        f"City set to '{CITY_UNKNOWN}'. "
        # The package ships this file under enriched_data/, not data/.
        f"Consider adding this city to "
        f"oracc_parser/enriched_data/provenience.csv."
    )
56
+
57
+
58
def warn_unmapped_state(project: str) -> str:
    """Build the warning for a project without a state/empire grouping.

    Args:
        project: ORACC project path, used as the message prefix.

    Returns:
        The formatted warning text, which mentions ``STATE_UNMAPPED``.
    """
    sentences = [
        f"[{project}] Project not mapped to a state/empire grouping.",
        f"State set to '{STATE_UNMAPPED}'.",
        "This project may need manual classification.",
    ]
    return " ".join(sentences)
65
+
66
+
67
def warn_unmapped_period(project: str, period: str) -> str:
    """Build the warning for a period missing from the period mapping.

    Args:
        project: ORACC project path, used as the message prefix.
        period: Period name that could not be found.

    Returns:
        The formatted warning text.
    """
    problem = f"[{project}] Period '{period}' not found in period_mapping.csv."
    consequence = "Year range could not be resolved."
    return f"{problem} {consequence}"
73
+
74
+
75
def warn_unmapped_pos(raw_pos: str) -> str:
    """Build the warning for a POS tag missing from the reference table.

    Args:
        raw_pos: POS tag as it was found.

    Returns:
        The formatted warning text, which mentions ``POS_NOT_PROVIDED``.
    """
    problem = f"POS tag '{raw_pos}' not found in pos_tags.csv."
    consequence = f"Normalized POS set to '{POS_NOT_PROVIDED}'."
    return " ".join((problem, consequence))
81
+
82
+
83
def warn_unmapped_language(lang_code: str) -> str:
    """Build the warning for a language code that cannot be normalized.

    Args:
        lang_code: Language code as it was found.

    Returns:
        The formatted warning text, which mentions ``LANGUAGE_UNKNOWN``.
    """
    problem = f"Language code '{lang_code}' not found in languages.csv."
    consequence = f"Language set to '{LANGUAGE_UNKNOWN}'."
    return f"{problem} {consequence}"
89
+
90
+
91
def warn_no_catalogue_entry(project: str, text_id: str) -> str:
    """Build the warning for a text absent from the project catalogue.

    Args:
        project: ORACC project path.
        text_id: Text identifier; joined to the project as the prefix.

    Returns:
        The formatted warning text.
    """
    prefix = f"[{project}/{text_id}]"
    sentences = (
        "No catalogue entry found.",
        "Metadata will use default/unknown values.",
    )
    return " ".join((prefix, *sentences))
97
+
98
+
99
def warn_unicode_fallback(reading: str, cleaned: str) -> str:
    """Build the warning for a sign reading without a Unicode mapping.

    Args:
        reading: Sign reading as it was found.
        cleaned: The cleaned form of that reading.

    Returns:
        The formatted warning text, which mentions
        ``SIGN_UNICODE_FALLBACK``.
    """
    problem = (
        f"Sign reading '{reading}' (cleaned: '{cleaned}') has no Unicode mapping."
    )
    consequence = f"Stored as '{SIGN_UNICODE_FALLBACK}'."
    return " ".join([problem, consequence])
@@ -0,0 +1 @@
1
+ from __future__ import annotations
@@ -0,0 +1,87 @@
1
+ """
2
+ Extract JSON text files and catalogue from downloaded ORACC project ZIPs.
3
+ """
4
+ from __future__ import annotations
5
+
6
+ import json
7
+ import os
8
+ import zipfile
9
+ from typing import Optional
10
+
11
+ from pydantic import BaseModel, Field
12
+
13
+ from oracc_parser.utils.logger import get_logger
14
+ from oracc_parser.utils.paths import get_zip_dir
15
+
16
+ logger = get_logger()
17
+
18
+
19
class ProjectData(BaseModel):
    """Container for JSON text files and catalogue extracted from a ZIP.

    A fresh instance is empty: no text files and no catalogue.
    ``extract_from_zip`` fills it in and returns it, including in the
    cases where the archive is missing or unreadable.
    """

    # Parsed JSON documents, one per corpus member that decoded cleanly.
    json_files: list[dict] = Field(default_factory=list)
    # Parsed catalogue; stays None when none was found or parsing failed.
    project_catalogue: Optional[dict] = None
24
+
25
+
26
def extract_from_zip(project: str, zip_dir=None) -> ProjectData:
    """Read corpus JSON files and the catalogue out of a project ZIP.

    The archive is expected at ``<zip_dir>/<project with "/" -> "-">.zip``.
    Every member whose name contains ``"corpusjson"`` and ends in
    ``".json"`` is decoded as UTF-8 JSON; members that fail are logged
    and skipped. The first member whose name ends in ``"catalogue.json"``
    becomes the catalogue.

    No exception escapes: a missing, empty or malformed archive is logged
    and whatever was collected so far is returned.

    Args:
        project: Project path, e.g. ``"saao/saa01"``.
        zip_dir: Directory containing the ZIPs. Defaults to ``get_zip_dir()``.

    Returns:
        ProjectData with the parsed JSON dicts and catalogue (possibly empty).
    """
    if zip_dir is None:
        zip_dir = get_zip_dir()

    result = ProjectData()
    zipf = os.path.join(str(zip_dir), f"{project.replace('/', '-')}.zip")

    if not os.path.exists(zipf):
        logger.error(f"ZIP file not found: {zipf}")
        return result

    try:
        with zipfile.ZipFile(zipf) as z:
            members = z.namelist()
            if not members:
                logger.error(f"ZIP file is empty: {zipf}")
                return result

            # Corpus texts: one JSON document per matching member.
            for fn in members:
                if "corpusjson" not in fn or not fn.endswith(".json"):
                    continue
                try:
                    result.json_files.append(json.loads(z.read(fn).decode("utf-8")))
                except (json.JSONDecodeError, UnicodeDecodeError) as e:
                    logger.error(f"Error reading {fn}: {e}")
                except Exception as e:
                    logger.error(f"Unexpected error with {fn}: {e}")

            # Catalogue: only the first matching member is used.
            catalogue_member = next(
                (name for name in members if name.endswith("catalogue.json")),
                None,
            )
            if catalogue_member is None:
                logger.warning(f"catalogue.json not found in {zipf}")
            else:
                try:
                    cat_raw = z.read(catalogue_member).decode("utf-8")
                    result.project_catalogue = json.loads(cat_raw)
                except Exception as e:
                    logger.error(f"Failed to parse catalogue.json: {e}")

    except zipfile.BadZipFile as e:
        logger.error(f"Malformed ZIP file {zipf}: {e}")
    except Exception as e:
        logger.error(f"Unexpected error opening {zipf}: {e}")

    return result