PyPI - oracc-parser - Versions diffs - 0.1.0__py3-none-any.whl - Mend

oracc-parser 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (47) hide show

oracc_parser/__init__.py +34 -0
oracc_parser/cache.py +251 -0
oracc_parser/cli.py +201 -0
oracc_parser/constants.py +104 -0
oracc_parser/download/__init__.py +1 -0
oracc_parser/download/extract_jsons.py +87 -0
oracc_parser/download/fetch_data.py +298 -0
oracc_parser/download/oracc_download.py +270 -0
oracc_parser/download/pleiades.py +174 -0
oracc_parser/enriched_data/__init__.py +1 -0
oracc_parser/enriched_data/grouped_oracc_metadata_columns.csv +338 -0
oracc_parser/enriched_data/languages.csv +36 -0
oracc_parser/enriched_data/period_mapping.csv +26 -0
oracc_parser/enriched_data/pos_tags.csv +50 -0
oracc_parser/enriched_data/projects_metadata.csv +223 -0
oracc_parser/enriched_data/provenience.csv +337 -0
oracc_parser/enriched_data/raw_archive_values.csv +713 -0
oracc_parser/enriched_data/sign_readings.csv +8903 -0
oracc_parser/enriched_data/state_supergroup_mapping.csv +57 -0
oracc_parser/export/__init__.py +1 -0
oracc_parser/export/to_jsonl.py +161 -0
oracc_parser/io/__init__.py +2 -0
oracc_parser/io/word_csv.py +467 -0
oracc_parser/metadata/__init__.py +1 -0
oracc_parser/metadata/archive.py +399 -0
oracc_parser/metadata/populate.py +564 -0
oracc_parser/models/__init__.py +1 -0
oracc_parser/models/config.py +114 -0
oracc_parser/models/tablet.py +237 -0
oracc_parser/parsing/__init__.py +1 -0
oracc_parser/parsing/parse_content.py +174 -0
oracc_parser/parsing/parse_signs.py +219 -0
oracc_parser/parsing/parse_words.py +177 -0
oracc_parser/parsing/text_builder.py +175 -0
oracc_parser/parsing/translation.py +91 -0
oracc_parser/pipeline.py +535 -0
oracc_parser/settings.py +120 -0
oracc_parser/utils/__init__.py +1 -0
oracc_parser/utils/logger.py +32 -0
oracc_parser/utils/paths.py +519 -0
oracc_parser/utils/unicode.py +109 -0
oracc_parser-0.1.0.dist-info/METADATA +166 -0
oracc_parser-0.1.0.dist-info/RECORD +47 -0
oracc_parser-0.1.0.dist-info/WHEEL +5 -0
oracc_parser-0.1.0.dist-info/entry_points.txt +2 -0
oracc_parser-0.1.0.dist-info/licenses/LICENSE +21 -0
oracc_parser-0.1.0.dist-info/top_level.txt +1 -0

oracc_parser/__init__.py ADDED Viewed

@@ -0,0 +1,34 @@
+"""
+oracc-parser: Download and parse ORACC cuneiform text projects.
+"""
+from __future__ import annotations
+__version__ = "0.1.0"
+# Public re-exports for convenience
+from oracc_parser.pipeline import (  # noqa: F401
+    export_to_csv,
+    export_to_jsonl,
+    parse_project,
+    parse_project_from_word_csvs,
+    records_to_word_dataframes,
+    save_project_catalogue,
+    load_project_catalogue,
+    reference_data,
+    get_metadata_table,
+    get_transliterations,
+    get_normalizations,
+    get_lemmatizations,
+    get_unicode_texts,
+    get_translations,
+    get_full_flat_table,
+)
+from oracc_parser.io.word_csv import (  # noqa: F401
+    load_word_csvs_from_dir,
+    load_word_csvs_from_zenodo,
+    save_word_csv,
+)
+from oracc_parser.models.config import RunConfig  # noqa: F401
+from oracc_parser.metadata.populate import enrich_catalogue_df  # noqa: F401
+from oracc_parser.download.pleiades import PleiadesData  # noqa: F401

oracc_parser/cache.py ADDED Viewed

@@ -0,0 +1,251 @@
+"""
+JSON caching for parsed TabletRecord objects.
+Parsed tablets are expensive to produce (long runtimes due to CDL tree
+traversal, sign parsing, and translation downloads).  This module caches
+the full result including a **config fingerprint**.
+On reload:
+- If the current config matches the cached fingerprint → **instant return**
+  (everything is reused, including string representations)
+- If the config differs → the cached **words** are reused and string
+  representations are rebuilt (cheap, no re-parsing needed)
+- If not cached at all → full parse from scratch
+Cache layout::
+    {cache_dir}/tablets/{project}/{text_id}.json
+Each file is a JSON wrapper::
+    {"config_fingerprint": "a1b2c3d4", "record": { ... TabletRecord ... }}
+"""
+from __future__ import annotations
+import hashlib
+import json
+from pathlib import Path
+from oracc_parser.utils.logger import get_logger
+logger = get_logger()
+# ---------------------------------------------------------------------------
+# Config fingerprinting
+# ---------------------------------------------------------------------------
+# Names of the RunConfig attributes that feed the cache fingerprint:
+# config_fingerprint() reads each of them with getattr() and hashes the values.
+# These RunConfig fields affect the parsed output.
+# Everything else (USE_CACHE, CACHE_DIR, limit, languages) does NOT.
+_OUTPUT_AFFECTING_FIELDS = (
+    "drop_missing",
+    "drop_damaged",
+    "keep_word_segmentation",
+    "mask_pos",
+    "max_break_fraction",
+)
+def config_fingerprint(config) -> str:
+    """Compute a short, stable hash of the output-affecting config options.
+    Args:
+        config: A ``RunConfig`` instance.
+    Returns:
+        8-char hex string (e.g. ``"a1b2c3d4"``).
+    """
+    key = {}
+    for field in _OUTPUT_AFFECTING_FIELDS:
+        val = getattr(config, field)
+        if isinstance(val, list):
+            val = sorted(val)
+        key[field] = val
+    raw = json.dumps(key, sort_keys=True)
+    return hashlib.sha256(raw.encode()).hexdigest()[:8]
+# ---------------------------------------------------------------------------
+# Path helpers
+# ---------------------------------------------------------------------------
+def _resolve_cache_dir(cache_dir: str | None = None) -> Path:
+    """Return the base cache directory."""
+    if cache_dir:
+        return Path(cache_dir)
+    from oracc_parser.settings import CACHE_DIR as settings_CACHE_DIR
+    return settings_CACHE_DIR
+def _tablet_path(
+    project: str,
+    text_id: str,
+    cache_dir: str | None = None,
+) -> Path:
+    """Return the JSON file path for a cached tablet."""
+    base = _resolve_cache_dir(cache_dir) / "tablets"
+    project_dir = project.replace("/", "-")
+    return base / project_dir / f"{text_id}.json"
+# ---------------------------------------------------------------------------
+# Load / Save
+# ---------------------------------------------------------------------------
+def load_cached_tablet(
+    project: str,
+    text_id: str,
+    config,
+    cache_dir: str | None = None,
+) -> "TabletRecord | None":
+    """Load a cached tablet, rebuilding string reps only if config changed.
+    Two fast paths:
+    1. **Config match** — the cached fingerprint matches the current config.
+       The full record (including string representations) is returned as-is.
+       This is the fastest path.
+    2. **Config mismatch** — the words and metadata are reused, but string
+       representations are rebuilt with the current config.  This avoids
+       the expensive CDL parsing + translation download.
+    Args:
+        project: ORACC project path, e.g. ``"saao/saa01"``.
+        text_id: Text identifier, e.g. ``"P334189"``.
+        config: ``RunConfig`` instance.
+        cache_dir: Custom cache directory (overrides settings).
+    Returns:
+        The TabletRecord (possibly with rebuilt strings), or ``None``.
+    """
+    from oracc_parser.models.tablet import TabletRecord
+    from oracc_parser.parsing.parse_content import (
+        _add_word_level_representations,
+        _add_unicode_representation,
+    )
+    path = _tablet_path(project, text_id, cache_dir)
+    if not path.exists():
+        return None
+    try:
+        raw = path.read_text(encoding="utf-8")
+        wrapper = json.loads(raw)
+        # Handle both new wrapper format and legacy bare-record format
+        if "record" in wrapper and "config_fingerprint" in wrapper:
+            cached_fp = wrapper["config_fingerprint"]
+            record = TabletRecord.model_validate(wrapper["record"])
+        else:
+            # Legacy format (bare TabletRecord JSON) — always rebuild
+            cached_fp = None
+            record = TabletRecord.model_validate(wrapper)
+        current_fp = config_fingerprint(config)
+        if cached_fp == current_fp:
+            # Fast path: config matches → everything is valid
+            return record
+        # Config changed → rebuild string representations from cached words
+        record.content = _add_word_level_representations(
+            record.content, config.mask_pos, config.max_break_fraction
+        )
+        record.content = _add_unicode_representation(
+            record.content,
+            drop_missing=config.drop_missing,
+            drop_damaged=config.drop_damaged,
+            keep_segmentation=config.keep_word_segmentation,
+        )
+        return record
+    except Exception as e:
+        logger.warning(f"Corrupt cache file {path}, will re-parse: {e}")
+        path.unlink(missing_ok=True)
+        return None
+def save_tablet_to_cache(
+    record: "TabletRecord",
+    project: str,
+    text_id: str,
+    config,
+    cache_dir: str | None = None,
+) -> None:
+    """Persist a TabletRecord to the JSON cache with a config fingerprint.
+    The saved file includes the config fingerprint so that on reload
+    we can skip string rebuilding when the config hasn't changed.
+    Args:
+        record: The parsed tablet to cache.
+        project: ORACC project path.
+        text_id: Text identifier.
+        config: ``RunConfig`` instance (its fingerprint is stored).
+        cache_dir: Custom cache directory.
+    """
+    path = _tablet_path(project, text_id, cache_dir)
+    path.parent.mkdir(parents=True, exist_ok=True)
+    wrapper = {
+        "config_fingerprint": config_fingerprint(config),
+        "record": record.model_dump(mode="python"),
+    }
+    try:
+        path.write_text(
+            json.dumps(wrapper, indent=1, default=str, ensure_ascii=False),
+            encoding="utf-8",
+        )
+    except Exception as e:
+        logger.warning(f"Failed to write cache file {path}: {e}")
+# ---------------------------------------------------------------------------
+# Clear
+# ---------------------------------------------------------------------------
+def clear_project_cache(
+    project: str | None = None,
+    cache_dir: str | None = None,
+) -> int:
+    """Delete cached JSON files for a project (or all projects).
+    Args:
+        project: ORACC project path.  ``None`` = clear everything.
+        cache_dir: Custom cache directory.
+    Returns:
+        Number of tablet JSON files deleted.
+    """
+    base = _resolve_cache_dir(cache_dir) / "tablets"
+    if not base.exists():
+        return 0
+    if project:
+        target = base / project.replace("/", "-")
+    else:
+        target = base
+    if not target.exists():
+        return 0
+    count = 0
+    for f in target.rglob("*.json"):
+        f.unlink()
+        count += 1
+    # Clean up empty directories (bottom-up)
+    for d in sorted(target.rglob("*"), reverse=True):
+        if d.is_dir() and not any(d.iterdir()):
+            d.rmdir()
+    if project and target.exists() and not any(target.iterdir()):
+        target.rmdir()
+    logger.info(f"Cleared {count} cached tablet(s)")
+    return count

oracc_parser/cli.py ADDED Viewed

@@ -0,0 +1,201 @@
+"""
+Command-line interface for oracc-parser.
+Usage:
+    oracc-parser download --project saao/saa01
+    oracc-parser download --lang akkadian
+    oracc-parser parse --project saao/saa01 --format jsonl --output data.jsonl
+    oracc-parser parse --project saao/saa01 --limit 5 --format csv --output data.csv
+    oracc-parser clear-cache                    # clear all cached tablets
+    oracc-parser clear-cache --project saao     # clear one project's cache
+"""
+from __future__ import annotations
+import argparse
+import sys
+from oracc_parser.pipeline import export_to_csv, export_to_jsonl, parse_project
+from oracc_parser.download.oracc_download import download_projects
+from oracc_parser.models.config import RunConfig
+from oracc_parser.utils.logger import get_logger
+logger = get_logger()
+def main(argv: list[str] | None = None):
+    """Entry point for the oracc-parser CLI."""
+    parser = argparse.ArgumentParser(
+        prog="oracc-parser",
+        description="Download and parse ORACC cuneiform text projects.",
+    )
+    subparsers = parser.add_subparsers(dest="command", help="Available commands")
+    # --------------- download ---------------
+    dl = subparsers.add_parser("download", help="Download ORACC project ZIPs")
+    dl.add_argument("--project", "-p", help="Project path, e.g. saao/saa01")
+    dl.add_argument(
+        "--lang",
+        "-l",
+        default="Akkadian",
+        help="Language filter for bulk download (default: Akkadian)",
+    )
+    dl.add_argument("--limit", "-n", type=int, help="Download only first N projects")
+    # --------------- parse ---------------
+    ps = subparsers.add_parser("parse", help="Parse a project and export results")
+    ps.add_argument("--project", "-p", required=True, help="Project path")
+    ps.add_argument(
+        "--format",
+        "-f",
+        choices=["jsonl", "csv"],
+        default="jsonl",
+        help="Output format (default: jsonl)",
+    )
+    ps.add_argument(
+        "--output", "-o", default="output.jsonl", help="Output file path"
+    )
+    ps.add_argument("--limit", "-n", type=int, help="Parse only first N texts")
+    ps.add_argument(
+        "--drop-missing",
+        action="store_true",
+        help="Drop entirely missing signs [x]",
+    )
+    ps.add_argument(
+        "--drop-damaged",
+        action="store_true",
+        help="Drop damaged signs ⸢x⸣",
+    )
+    ps.add_argument(
+        "--mask-pos",
+        nargs="*",
+        default=[],
+        help="POS tags to mask (e.g. PN DN GN)",
+    )
+    ps.add_argument("--no-cache", action="store_true", help="Disable caching")
+    ps.add_argument("--no-download", action="store_true", help="Skip download step")
+    # --------------- fetch-data ---------------
+    fd = subparsers.add_parser("fetch-data", help="Download pre-processed data from Zenodo")
+    fd.add_argument("--url", "-u", default=None, help="Zenodo record URL")
+    fd.add_argument("--output", "-o", default=None, help="Destination directory")
+    # --------------- info ---------------
+    subparsers.add_parser("info", help="Show bundled reference data summary")
+    # --------------- clear-cache ---------------
+    cc = subparsers.add_parser("clear-cache", help="Delete cached parsed tablets")
+    cc.add_argument(
+        "--project", "-p",
+        help="Only clear cache for this project (default: clear all)",
+    )
+    args = parser.parse_args(argv)
+    if args.command == "download":
+        _cmd_download(args)
+    elif args.command == "fetch-data":
+        _cmd_fetch_data(args)
+    elif args.command == "parse":
+        _cmd_parse(args)
+    elif args.command == "info":
+        _cmd_info()
+    elif args.command == "clear-cache":
+        _cmd_clear_cache(args)
+    else:
+        parser.print_help()
+        sys.exit(1)
+def _cmd_fetch_data(args):
+    """Handle the fetch-data command."""
+    from pathlib import Path
+    from oracc_parser.download.fetch_data import fetch_data
+    fetch_data(
+        url=args.url,
+        dest=Path(args.output) if args.output else None,
+    )
+def _cmd_download(args):
+    """Handle the download command."""
+    config = RunConfig(
+        languages=[args.lang] if args.lang else ["Akkadian"],
+        limit=args.limit,
+    )
+    if args.project:
+        from oracc_parser.download.oracc_download import download_zip
+        path = download_zip(args.project)
+        if path:
+            print(f"Downloaded: {path}")
+        else:
+            print("Download failed.", file=sys.stderr)
+            sys.exit(1)
+    else:
+        paths = download_projects(config=config)
+        print(f"Downloaded {len(paths)} project(s).")
+def _cmd_parse(args):
+    """Handle the parse command."""
+    config = RunConfig(
+        drop_missing=args.drop_missing,
+        drop_damaged=args.drop_damaged,
+        mask_pos=args.mask_pos,
+        use_cache=not args.no_cache,
+        limit=args.limit,
+    )
+    records = parse_project(
+        args.project, config=config, download=not args.no_download
+    )
+    if not records:
+        print("No records parsed.", file=sys.stderr)
+        sys.exit(1)
+    if args.format == "jsonl":
+        path = export_to_jsonl(records, args.output)
+    else:
+        path = export_to_csv(records, args.output)
+    print(f"Exported {len(records)} records to {path}")
+def _cmd_info():
+    """Show summary of bundled reference data."""
+    from oracc_parser.pipeline import reference_data
+    datasets = {
+        "Provenance": reference_data.get_provenance,
+        "Period mapping": reference_data.get_period_mapping,
+        "Sign list": reference_data.get_sign_list,
+        "POS tags": reference_data.get_pos_tags,
+        "Languages": reference_data.get_languages,
+        "Projects metadata": reference_data.get_projects_metadata,
+    }
+    for name, loader in datasets.items():
+        try:
+            df = loader()
+            print(f"  {name}: {len(df)} rows, columns: {list(df.columns)}")
+        except Exception as e:
+            print(f"  {name}: Error loading - {e}", file=sys.stderr)
+def _cmd_clear_cache(args):
+    """Handle the clear-cache command."""
+    from oracc_parser.cache import clear_project_cache
+    count = clear_project_cache(project=args.project)
+    if count:
+        scope = f"project '{args.project}'" if args.project else "all projects"
+        print(f"Cleared {count} cached tablet(s) for {scope}.")
+    else:
+        print("No cached tablets found.")
+if __name__ == "__main__":
+    main()

oracc_parser/constants.py ADDED Viewed

@@ -0,0 +1,104 @@
+"""
+Standardized sentinel values and warning messages used across oracc-parser.
+All "unknown" / "not found" / "unmapped" states are centralized here
+so the user sees consistent, informative messages rather than ad-hoc strings.
+"""
+from __future__ import annotations
+# ---------------------------------------------------------------------------
+# Sentinel values — used as field defaults when real data is unavailable
+# ---------------------------------------------------------------------------
+# Geography
+# NOTE: the bare string literal following each assignment in this section
+# documents the constant directly above it.
+CITY_UNKNOWN = "unknown"
+"""Provenance city could not be determined from ORACC catalogue."""
+STATE_UNMAPPED = "unmapped"
+"""Project has not been mapped to a state/empire grouping in our reference data."""
+# Chronology
+PERIOD_UNKNOWN = "unknown"
+"""Historical period could not be determined."""
+# Unlike the string sentinels, the year sentinel is None.
+YEAR_UNKNOWN = None
+"""Year could not be resolved from the period mapping (represented as None)."""
+# POS / Language
+POS_NOT_PROVIDED = "NOT_PROVIDED"
+"""Part-of-speech tag was absent from the ORACC data for this word."""
+LANGUAGE_UNKNOWN = "unknown"
+"""Language code could not be mapped to a known language."""
+# Content
+# The "unavailable translation" sentinel is the empty string.
+TRANSLATION_UNAVAILABLE = ""
+"""English translation was not available on the ORACC web interface."""
+SIGN_UNICODE_FALLBACK = "U"
+"""Sign reading could not be converted to a Unicode cuneiform character."""
+SIGN_BROKEN = "X"
+"""Sign is entirely missing / broken beyond recognition."""
+# ---------------------------------------------------------------------------
+# Warning messages — logged when edge cases are encountered
+# ---------------------------------------------------------------------------
+def warn_unmapped_city(project: str, raw_prov: str) -> str:
+    """Warning when a provenance value can't be matched to our reference table."""
+    return (
+        f"[{project}] Provenance '{raw_prov}' not found in reference data. "
+        f"City set to '{CITY_UNKNOWN}'. "
+        f"Consider adding this city to data/provenience.csv."
+    )
+def warn_unmapped_state(project: str) -> str:
+    """Warning when a project hasn't been mapped to a state grouping."""
+    return (
+        f"[{project}] Project not mapped to a state/empire grouping. "
+        f"State set to '{STATE_UNMAPPED}'. "
+        f"This project may need manual classification."
+    )
+def warn_unmapped_period(project: str, period: str) -> str:
+    """Warning when a period name isn't in the period-to-year mapping."""
+    return (
+        f"[{project}] Period '{period}' not found in period_mapping.csv. "
+        f"Year range could not be resolved."
+    )
+def warn_unmapped_pos(raw_pos: str) -> str:
+    """Warning when a POS tag isn't in the reference table."""
+    return (
+        f"POS tag '{raw_pos}' not found in pos_tags.csv. "
+        f"Normalized POS set to '{POS_NOT_PROVIDED}'."
+    )
+def warn_unmapped_language(lang_code: str) -> str:
+    """Warning when a language code can't be normalized."""
+    return (
+        f"Language code '{lang_code}' not found in languages.csv. "
+        f"Language set to '{LANGUAGE_UNKNOWN}'."
+    )
+def warn_no_catalogue_entry(project: str, text_id: str) -> str:
+    """Warning when a text has no entry in the project catalogue."""
+    return (
+        f"[{project}/{text_id}] No catalogue entry found. "
+        f"Metadata will use default/unknown values."
+    )
+def warn_unicode_fallback(reading: str, cleaned: str) -> str:
+    """Warning when a sign reading can't be mapped to Unicode."""
+    return (
+        f"Sign reading '{reading}' (cleaned: '{cleaned}') has no Unicode mapping. "
+        f"Stored as '{SIGN_UNICODE_FALLBACK}'."
+    )

oracc_parser/download/__init__.py ADDED Viewed

@@ -0,0 +1 @@
+from __future__ import annotations

oracc_parser/download/extract_jsons.py ADDED Viewed

@@ -0,0 +1,87 @@
+"""
+Extract JSON text files and catalogue from downloaded ORACC project ZIPs.
+"""
+from __future__ import annotations
+import json
+import os
+import zipfile
+from typing import Optional
+from pydantic import BaseModel, Field
+from oracc_parser.utils.logger import get_logger
+from oracc_parser.utils.paths import get_zip_dir
+logger = get_logger()
+class ProjectData(BaseModel):
+    """Container for JSON text files and catalogue extracted from a ZIP."""
+    # Parsed corpus JSON documents, one dict per successfully decoded ZIP
+    # member; default_factory gives every instance its own empty list.
+    json_files: list[dict] = Field(default_factory=list)
+    # Parsed contents of the project's catalogue.json; None is the default
+    # when no catalogue has been set.
+    project_catalogue: Optional[dict] = None
+def extract_from_zip(project: str, zip_dir=None) -> ProjectData:
+    """Extract all corpus JSONs and the catalogue from a project ZIP.
+    Args:
+        project: Project path, e.g. ``"saao/saa01"``.
+        zip_dir: Directory containing the ZIPs. Defaults to ``get_zip_dir()``.
+    Returns:
+        ProjectData with parsed JSON dicts and catalogue.
+    """
+    if zip_dir is None:
+        zip_dir = get_zip_dir()
+    result = ProjectData()
+    zipf = os.path.join(str(zip_dir), f"{project.replace('/', '-')}.zip")
+    if not os.path.exists(zipf):
+        logger.error(f"ZIP file not found: {zipf}")
+        return result
+    try:
+        with zipfile.ZipFile(zipf) as z:
+            if not z.namelist():
+                logger.error(f"ZIP file is empty: {zipf}")
+                return result
+            # Extract corpus JSON files
+            json_files = [
+                name for name in z.namelist()
+                if "corpusjson" in name and name.endswith(".json")
+            ]
+            for fn in json_files:
+                try:
+                    raw = z.read(fn).decode("utf-8")
+                    data = json.loads(raw)
+                    result.json_files.append(data)
+                except (json.JSONDecodeError, UnicodeDecodeError) as e:
+                    logger.error(f"Error reading {fn}: {e}")
+                except Exception as e:
+                    logger.error(f"Unexpected error with {fn}: {e}")
+            # Extract catalogue
+            catalogue_files = [
+                name for name in z.namelist()
+                if name.endswith("catalogue.json")
+            ]
+            if catalogue_files:
+                try:
+                    cat_raw = z.read(catalogue_files[0]).decode("utf-8")
+                    result.project_catalogue = json.loads(cat_raw)
+                except Exception as e:
+                    logger.error(f"Failed to parse catalogue.json: {e}")
+            else:
+                logger.warning(f"catalogue.json not found in {zipf}")
+    except zipfile.BadZipFile as e:
+        logger.error(f"Malformed ZIP file {zipf}: {e}")
+    except Exception as e:
+        logger.error(f"Unexpected error opening {zipf}: {e}")
+    return result