PyPI - media-intelligence - Versions diffs - 0.1.0__py3-none-any.whl - Mend

media-intelligence 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

media_intelligence/__init__.py +158 -0
media_intelligence/_lazy.py +100 -0
media_intelligence/documents.py +119 -0
media_intelligence/enrich.py +94 -0
media_intelligence/ingest.py +74 -0
media_intelligence/ocr.py +65 -0
media_intelligence/persist/__init__.py +16 -0
media_intelligence/persist/base.py +41 -0
media_intelligence/persist/filestore.py +86 -0
media_intelligence/persist/pgstore.py +50 -0
media_intelligence/pipeline.py +281 -0
media_intelligence/publish.py +72 -0
media_intelligence/py.typed +0 -0
media_intelligence/schemas.py +163 -0
media_intelligence/structure.py +114 -0
media_intelligence/transcribe.py +37 -0
media_intelligence/video.py +59 -0
media_intelligence-0.1.0.dist-info/METADATA +146 -0
media_intelligence-0.1.0.dist-info/RECORD +21 -0
media_intelligence-0.1.0.dist-info/WHEEL +5 -0
media_intelligence-0.1.0.dist-info/top_level.txt +1 -0

media_intelligence/__init__.py ADDED Viewed

@@ -0,0 +1,158 @@
+"""media_intelligence — the Abstract Intelligence Platform facade.
+A unified, layered facade that turns raw media (PDFs, images, video) into
+structured, searchable, SEO-ready data. It does not reimplement any engine; it
+selects the *best* function of each sibling package and exposes it behind one
+clean, lazy API.
+Two ways to use it
+------------------
+1. Direct namespace access — grab one tool::
+       import media_intelligence as mi
+       text  = mi.ocr.image_to_text("page.png")
+       kw    = mi.enrich.keywords(text)
+       mi.documents.process_pdf("doc.pdf")
+2. The orchestrated, idempotent/resumable pipeline::
+       from media_intelligence import MediaPipeline
+       pipe = MediaPipeline("https://site.com/video.mp4", out_root="/data")
+       pipe.ingest().extract().structure().enrich().persist().publish()
+       print(pipe.report.summary)
+       #  ... or just:  pipe.run()
+Layers map one-to-one onto canonical owning packages:
+    ingest    -> abstract_webtools   (scrape + yt-dlp video download)
+    ocr       -> abstract_ocr        (layout-aware multi-engine OCR)
+    documents -> abstract_pdfs       (PDF decomposition + HTML)
+    video     -> abstract_videos     (registry pipeline: download/frames/SEO)
+    transcribe-> hugpy               (Whisper ASR; abstract_ocr fallback)
+    enrich    -> hugpy               (summaries, keywords, vision, SEO)
+    persist   -> filesystem now, DB-pluggable interface
+    publish   -> abstract_react + abstract_nginx (SEO/OG + static HTML)
+Every backing package is imported lazily, so ``import media_intelligence`` is
+cheap and a missing optional package only errors when that layer is used.
+"""
+from __future__ import annotations
+import importlib
+import importlib.util
+from typing import TYPE_CHECKING
+from ._lazy import MediaIntelligenceError, MissingDependency
+from .schemas import (
+    MediaItem,
+    MediaKind,
+    PipelineReport,
+    Stage,
+    StageResult,
+    detect_media_kind,
+)
+# Version string of the media_intelligence facade package.
+__version__ = "0.1.0"
+# Submodules exposed as lazy namespaces via module __getattr__ below.
+# Kept as a set: membership is what __getattr__ tests, and __all__ sorts it.
+_LAZY_SUBMODULES = {
+    "ingest",
+    "ocr",
+    "documents",
+    "video",
+    "transcribe",
+    "enrich",
+    "structure",
+    "persist",
+    "publish",
+}
+# Which backing package(s) each layer needs. An empty tuple means no backing
+# package is required (``structure``, ``persist``), so the layer is always
+# reported as available. Several entries are any-of: ``transcribe`` works if
+# either hugpy or abstract_ocr is present.
+_LAYER_PACKAGES = {
+    "ingest": ("abstract_webtools",),
+    "ocr": ("abstract_ocr",),
+    "documents": ("abstract_pdfs",),
+    "video": ("abstract_videos",),
+    "transcribe": ("hugpy", "abstract_ocr"),   # any-of
+    "enrich": ("hugpy",),
+    "structure": (),
+    "persist": (),
+    "publish": ("abstract_react",),            # nginx HTML is an additional option
+}
+# Public API: the eager re-exports, the lazily resolved ``MediaPipeline``, and
+# the lazy submodule names appended in sorted order.
+__all__ = [
+    "MediaPipeline",
+    "MediaItem",
+    "MediaKind",
+    "Stage",
+    "StageResult",
+    "PipelineReport",
+    "detect_media_kind",
+    "available",
+    "MediaIntelligenceError",
+    "MissingDependency",
+    "__version__",
+    *sorted(_LAZY_SUBMODULES),
+]
+def _installed(package: str) -> bool:
+    """Whether ``package`` is importable — without importing it."""
+    try:
+        return importlib.util.find_spec(package) is not None
+    except (ImportError, ValueError):
+        return False
+def available(layer: str | None = None):
+    """Report which layers are usable in this environment, without importing them.
+    >>> import media_intelligence as mi
+    >>> mi.available()            # {'ingest': True, 'ocr': True, 'publish': False, ...}
+    >>> mi.available("enrich")    # True / False
+    A layer is available if (any of) its backing package(s) are installed. The
+    pure-stdlib layers (``structure``, ``persist``) are always available.
+    """
+    def _ok(needed: tuple) -> bool:
+        return True if not needed else any(_installed(p) for p in needed)
+    if layer is not None:
+        if layer not in _LAYER_PACKAGES:
+            raise ValueError(f"unknown layer {layer!r}; choose from {sorted(_LAYER_PACKAGES)}")
+        return _ok(_LAYER_PACKAGES[layer])
+    return {name: _ok(pkgs) for name, pkgs in _LAYER_PACKAGES.items()}
+if TYPE_CHECKING:  # for type checkers / IDEs only — no runtime import cost
+    from . import (  # noqa: F401
+        documents,
+        enrich,
+        ingest,
+        ocr,
+        persist,
+        publish,
+        structure,
+        transcribe,
+        video,
+    )
+    from .pipeline import MediaPipeline  # noqa: F401
+def __getattr__(name: str):
+    """PEP 562 lazy attribute access.
+    Keeps the import graph flat: namespaces and the (heavier) pipeline module
+    are only imported when first referenced.
+    """
+    if name == "MediaPipeline":
+        module = importlib.import_module(".pipeline", __name__)
+        return module.MediaPipeline
+    if name in _LAZY_SUBMODULES:
+        return importlib.import_module(f".{name}", __name__)
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+def __dir__():
+    return sorted(set(__all__) | set(globals()))

media_intelligence/_lazy.py ADDED Viewed

@@ -0,0 +1,100 @@
+"""Lazy / soft import plumbing for the media_intelligence facade.
+The whole point of this package is to be a *thin* unified access layer over a
+set of heavy sibling packages (paddleocr, torch, yt-dlp, whisper, ...). Importing
+``media_intelligence`` must stay cheap, so every sibling package is imported
+lazily — at first *use*, never at module import time — and the result is cached.
+If an optional layer's backing package is not installed, we raise a single,
+actionable :class:`MissingDependency` error that names the extra to install
+rather than leaking a raw ``ModuleNotFoundError`` from deep inside a submodule.
+"""
+from __future__ import annotations
+import functools
+import importlib
+from types import ModuleType
+from typing import Any, Callable
+# Public names of this module: the two exception types and the lazy-import helpers.
+__all__ = [
+    "MediaIntelligenceError",
+    "MissingDependency",
+    "soft_import",
+    "require",
+    "lazy_namespace",
+]
+class MediaIntelligenceError(RuntimeError):
+    """Base error for the media_intelligence facade.
+    Subclasses ``RuntimeError``. ``MissingDependency`` derives from it, and
+    ``require`` raises it directly when an upstream symbol cannot be found.
+    """
+class MissingDependency(MediaIntelligenceError):
+    """A layer was used but its backing package is not installed.
+    Raised by ``soft_import`` in place of ``ModuleNotFoundError``; the message
+    carries a ``pip install`` hint.
+    """
+# Which pip extra installs which backing package — used to build helpful errors.
+# Keys are top-level import names. The value ``"(core)"`` is a marker, not an
+# extra: soft_import suggests installing such a package directly.
+_EXTRA_FOR_PACKAGE = {
+    "abstract_essentials": "(core)",
+    "abstract_webtools": "ingest",
+    "abstract_ocr": "ocr",
+    "abstract_pdfs": "documents",
+    "abstract_videos": "video",
+    "hugpy": "enrich",
+    "abstract_react": "publish",
+    "abstract_nginx": "publish",
+}
+# Modules already imported by soft_import, keyed by the dotted name that was requested.
+_MODULE_CACHE: dict[str, ModuleType] = {}
+def soft_import(package: str, *, layer: str | None = None) -> ModuleType:
+    """Import ``package`` lazily, caching the module.
+    Raises :class:`MissingDependency` (not ``ModuleNotFoundError``) with an
+    install hint if the package is absent.
+    """
+    cached = _MODULE_CACHE.get(package)
+    if cached is not None:
+        return cached
+    try:
+        module = importlib.import_module(package)
+    except ModuleNotFoundError as exc:
+        # Only translate a *missing backing package*; a genuine sub-import error
+        # inside an installed package should surface unchanged.
+        if exc.name and (exc.name == package or package.startswith(exc.name + ".")):
+            # Resolve the install hint against the *top-level* package, so a
+            # missing submodule (e.g. "abstract_nginx.generate_htmls") still
+            # points at the right extra.
+            top = package.split(".", 1)[0]
+            extra = _EXTRA_FOR_PACKAGE.get(top, top)
+            hint = f'pip install "media_intelligence[{extra}]"' if extra and extra != "(core)" \
+                else f"pip install {top}"
+            raise MissingDependency(
+                f"The '{layer or package}' layer needs '{package}', which is not "
+                f"installed. Install it with:  {hint}"
+            ) from exc
+        raise
+    _MODULE_CACHE[package] = module
+    return module
+def require(package: str, attr: str, *, layer: str | None = None) -> Any:
+    """Return ``attr`` from a soft-imported ``package``.
+    Raises a clear error if the package is installed but the symbol is gone
+    (e.g. an upstream rename) so failures point at the facade, not the user.
+    """
+    module = soft_import(package, layer=layer)
+    try:
+        return getattr(module, attr)
+    except AttributeError as exc:
+        raise MediaIntelligenceError(
+            f"'{package}.{attr}' is not available — the upstream API may have "
+            f"changed. The media_intelligence '{layer or package}' layer needs updating."
+        ) from exc
+def lazy_namespace(loader: Callable[[], ModuleType]) -> Callable[[], ModuleType]:
+    """Wrap a submodule loader so the import happens once and is memoised."""
+    return functools.lru_cache(maxsize=1)(loader)

media_intelligence/documents.py ADDED Viewed

@@ -0,0 +1,119 @@
+"""Extraction + structuring layer (documents) — PDFs.
+Canonical owner: ``abstract_pdfs``. Page-level decomposition (text + images),
+manifest generation, OCR (delegating to ``abstract_ocr``), enrichment, and
+static HTML (viewer + gallery).
+"""
+from __future__ import annotations
+import json
+import os
+from typing import Any, Optional
+from ._lazy import require, soft_import
+# Backing package and layer label handed to require()/soft_import() by the wrappers below.
+_PKG = "abstract_pdfs"
+_LAYER = "documents"
+# Public API of the documents layer.
+__all__ = [
+    "process_pdf",
+    "process_pdfs",
+    "process_all_pdfs",
+    "generate_pdf",
+    "pdf_pages",
+    "DocumentPipeline",
+    "SliceManager",
+]
+def process_pdf(pdf_path: str, **kwargs: Any) -> dict:
+    """Process every page of one PDF (image→text→info→metadata→html + gallery)."""
+    fn = require(_PKG, "process_pdf", layer=_LAYER)
+    return fn(pdf_path, **kwargs)
+def process_pdfs(pdf_paths: list[str], **kwargs: Any) -> list:
+    """Batch process many PDFs with two-level parallelism (PDFs × pages)."""
+    fn = require(_PKG, "process_pdfs", layer=_LAYER)
+    return fn(pdf_paths, **kwargs)
+def process_all_pdfs(directory: str, **kwargs: Any):
+    """Discover and process every ``.pdf`` under ``directory``."""
+    fn = require(_PKG, "process_all_pdfs", layer=_LAYER)
+    return fn(directory, **kwargs)
+def generate_pdf(pdf_path: str, **kwargs: Any) -> dict:
+    """One-call end-to-end: slice + OCR + enriched manifests + viewer HTML."""
+    mod = soft_import(_PKG + ".pipeline", layer=_LAYER)
+    fn = getattr(mod, "generate_pdf", None) or require(_PKG, "generate_pdf", layer=_LAYER)
+    return fn(pdf_path, **kwargs)
+def _resolve_pdf_dir(pdf_path: str) -> Optional[str]:
+    """Find the directory holding ``pages/`` for a processed PDF.
+    ``process_pdf`` relocates ``<dir>/foo.pdf`` into ``<dir>/foo/foo.pdf`` and
+    writes pages under ``<dir>/foo/pages/``. We check the relocated dir first,
+    then the original dir, so this works whether or not relocation happened.
+    """
+    p = os.path.abspath(pdf_path)
+    parent = os.path.dirname(p)
+    stem = os.path.splitext(os.path.basename(p))[0]
+    for candidate in (os.path.join(parent, stem), parent):
+        if os.path.isdir(os.path.join(candidate, "pages")):
+            return candidate
+    return None
+def pdf_pages(pdf_path: str) -> tuple[list[dict[str, Any]], Optional[str]]:
+    """Read back the per-page OCR'd text/info that ``process_pdf`` wrote to disk.
+    Returns ``(pages, full_text)`` where ``pages`` is a list of
+    ``{"index", "page", "text", "info"}`` dicts in page order, and ``full_text``
+    is the pages joined. Reads the cached ``pages/NNNN/text.txt`` + ``info.json``
+    layout directly — no re-OCR, no fragile deep imports. ``([], None)`` if the
+    PDF hasn't been processed yet.
+    """
+    base = _resolve_pdf_dir(pdf_path)
+    if base is None:
+        return [], None
+    pages_dir = os.path.join(base, "pages")
+    # zero-padded names (0001, 0002, ...) so lexical sort == page order
+    names = sorted(
+        d for d in os.listdir(pages_dir) if os.path.isdir(os.path.join(pages_dir, d))
+    )
+    pages: list[dict[str, Any]] = []
+    for i, name in enumerate(names):
+        pdir = os.path.join(pages_dir, name)
+        text_path = os.path.join(pdir, "text.txt")
+        info_path = os.path.join(pdir, "info.json")
+        text = ""
+        if os.path.isfile(text_path):
+            with open(text_path, "r", encoding="utf-8", errors="replace") as fh:
+                text = fh.read().strip()
+        info: dict[str, Any] = {}
+        if os.path.isfile(info_path):
+            try:
+                with open(info_path, "r", encoding="utf-8") as fh:
+                    info = json.load(fh)
+            except Exception:
+                info = {}
+        pages.append(
+            {"index": int(name) if name.isdigit() else i, "page": name, "text": text, "info": info}
+        )
+    full_text = "\n\n".join(p["text"] for p in pages if p["text"]).strip() or None
+    return pages, full_text
+def DocumentPipeline(*args: Any, **kwargs: Any):
+    """Construct the per-PDF ``DocumentPipeline`` orchestrator."""
+    cls = require(_PKG, "DocumentPipeline", layer=_LAYER)
+    return cls(*args, **kwargs)
+def SliceManager(*args: Any, **kwargs: Any):
+    """Construct the slice-aware multi-engine column OCR ``SliceManager``."""
+    cls = require(_PKG, "SliceManager", layer=_LAYER)
+    return cls(*args, **kwargs)

media_intelligence/enrich.py ADDED Viewed

@@ -0,0 +1,94 @@
+"""Enrichment layer — content understanding.
+Canonical owner: ``hugpy`` (the latest ML/NLP namespace, its own project). One
+seam for summarization, keyword extraction/refinement, image captioning (vision),
+and the higher-level ``analyze*`` helpers that produce SEO-ready metadata.
+"""
+from __future__ import annotations
+from typing import Any, Optional
+from ._lazy import require
+# Backing package and layer label handed to require() by every wrapper below.
+_PKG = "hugpy"
+_LAYER = "enrich"
+# Public API of the enrich layer.
+__all__ = [
+    "summarize",
+    "summarize_image",
+    "keywords",
+    "refine_keywords",
+    "keyword_density",
+    "caption",
+    "analyze",
+    "analyze_pdf",
+    "analyze_video",
+    "execute",
+]
+def summarize(text: Optional[str] = None, **kwargs: Any) -> str:
+    """Abstractive summary of ``text`` (chunked + consolidated)."""
+    fn = require(_PKG, "summarize", layer=_LAYER)
+    return fn(text, **kwargs) if text is not None else fn(**kwargs)
+def summarize_image(image_path: str, **kwargs: Any) -> str:
+    """Summarize/describe an image (vision model)."""
+    fn = require(_PKG, "summarize_image", layer=_LAYER)
+    return fn(image_path, **kwargs)
+def keywords(text: str, **kwargs: Any):
+    """Extract keyphrases (KeyBERT + spaCy) → list[(phrase, score)]."""
+    fn = require(_PKG, "extract_keywords", layer=_LAYER)
+    return fn(text, **kwargs)
+def refine_keywords(text: str, **kwargs: Any):
+    """SEO-refined, density-filtered keywords → ``RefinedResult``."""
+    fn = require(_PKG, "refine_keywords", layer=_LAYER)
+    return fn(text, **kwargs)
+def keyword_density(text: str, kws: Any) -> dict:
+    """Keyword density map for ``text`` against ``kws``."""
+    fn = require(_PKG, "keyword_density", layer=_LAYER)
+    return fn(text, kws)
+def caption(image_path: str, prompt: str = "please describe this image", **kwargs: Any) -> str:
+    """Caption / visually analyse an image via the cached vision coder."""
+    get_vision_coder = require(_PKG, "get_vision_coder", layer=_LAYER)
+    coder = get_vision_coder(**{k: v for k, v in kwargs.items() if k in
+                                {"model_key", "torch_dtype", "max_tokens", "min_tokens"}})
+    analyze_kwargs = {k: v for k, v in kwargs.items() if k in {"max_new_tokens", "max_tokens"}}
+    return coder.analyze_image(image_path, prompt=prompt, **analyze_kwargs)
+def analyze(text: str, **kwargs: Any):
+    """Full text analysis → summary + keywords + metadata bundle."""
+    fn = require(_PKG, "analyze", layer=_LAYER)
+    return fn(text, **kwargs)
+def analyze_pdf(pdf_path: str, **kwargs: Any):
+    """Analyze an already-extracted PDF's text into SEO/metadata."""
+    fn = require(_PKG, "analyze_pdf", layer=_LAYER)
+    return fn(pdf_path, **kwargs)
+def analyze_video(*args: Any, **kwargs: Any):
+    """Frame-by-frame video understanding (vision over extracted frames)."""
+    fn = require(_PKG, "analyze_video", layer=_LAYER)
+    return fn(*args, **kwargs)
+def execute(*args: Any, **kwargs: Any):
+    """Escape hatch: hugpy's unified dispatch (``execute_prompt``).
+    Routes any ``(framework, task)`` — chat, vision, ASR, summarize, embed,
+    image-gen, keywords — through the model registry.
+    """
+    fn = require(_PKG, "execute_prompt", layer=_LAYER)
+    return fn(*args, **kwargs)

media_intelligence/ingest.py ADDED Viewed

@@ -0,0 +1,74 @@
+"""Ingestion layer — bring raw media in from the web.
+Canonical owner: ``abstract_webtools``. This is the single seam for *acquiring*
+media: scraping/parsing a page and downloading video (yt-dlp + ffmpeg). The
+``abstract_videos`` package also downloads video, but for the platform we make
+webtools the one owner of "fetch from a URL" and let the video pipeline consume
+the local file.
+"""
+from __future__ import annotations
+from typing import Any, Optional
+from ._lazy import require, soft_import
+# Backing package and layer label handed to require() by the wrappers below.
+_PKG = "abstract_webtools"
+_LAYER = "ingest"
+# Public API of the ingest layer.
+__all__ = [
+    "scrape",
+    "soup",
+    "page_text",
+    "links",
+    "download_video",
+    "video_info",
+    "video_id",
+]
+def scrape(url: str, **kwargs: Any):
+    """Return a ``UnifiedWebManager`` for ``url`` (lazy ``.soup``/``.link_mgr``/...)."""
+    UnifiedWebManager = require(_PKG, "UnifiedWebManager", layer=_LAYER)
+    return UnifiedWebManager(url=url, **kwargs)
+def soup(url: str, **kwargs: Any):
+    """Return a parsed BeautifulSoup-backed soup manager for ``url``."""
+    get_soup = require(_PKG, "get_soup", layer=_LAYER)
+    return get_soup(url, **kwargs)
+def page_text(url: str) -> str:
+    """Return the visible text of a page."""
+    get_soup_text = require(_PKG, "get_soup_text", layer=_LAYER)
+    return get_soup_text(url)
+def links(url: str, **kwargs: Any):
+    """Discover links/images on a page via the link manager."""
+    mgr = scrape(url, **kwargs)
+    return mgr.link_mgr
+def download_video(url: str, download_directory: Optional[str] = None, **kwargs: Any):
+    """Download a video from ``url`` (yt-dlp/ffmpeg/m3u8) and return the manager.
+    The manager exposes the resolved local path(s) and metadata.
+    """
+    get_video_mgr = require(_PKG, "get_video_mgr", layer=_LAYER)
+    if download_directory is not None:
+        kwargs.setdefault("download_directory", download_directory)
+    kwargs.setdefault("download_video", True)
+    return get_video_mgr(url, **kwargs)
+def video_info(url: str, **kwargs: Any) -> dict:
+    """Resolve video metadata for ``url`` without downloading."""
+    get_video_info = require(_PKG, "get_video_info", layer=_LAYER)
+    return get_video_info(url, **kwargs)
+def video_id(url: str) -> str:
+    """Return a stable video id for ``url``."""
+    get_video_id = require(_PKG, "get_video_id", layer=_LAYER)
+    return get_video_id(url)

media_intelligence/ocr.py ADDED Viewed

@@ -0,0 +1,65 @@
+"""Extraction layer (images) — OCR.
+Canonical owner: ``abstract_ocr``. Multi-engine, layout-aware OCR plus the
+per-frame video-OCR helpers. ``abstract_videos`` re-implements a slice of these;
+the platform routes everyone through ``abstract_ocr`` so there is one OCR owner.
+"""
+from __future__ import annotations
+from typing import Any, Optional
+from ._lazy import require, soft_import
+# Backing package and layer label handed to require()/soft_import() by the wrappers below.
+_PKG = "abstract_ocr"
+_LAYER = "ocr"
+# Public API of the ocr layer.
+__all__ = [
+    "image_to_text",
+    "image_to_text_layout",
+    "directory_to_texts",
+    "video_frames",
+    "video_text",
+    "best_thumbnail",
+]
+def image_to_text(image_path: str, preprocess: bool = True) -> str:
+    """OCR a single image to text (Paddle-first with Tesseract fallback)."""
+    fn = require(_PKG, "convert_image_to_text", layer=_LAYER)
+    return fn(image_path, preprocess=preprocess)
+def image_to_text_layout(image_path: str, config: Any = None, chain: Optional[list] = None):
+    """Run the modern layout-aware OCR pipeline → ``PipelineReport``.
+    Reading-order aware, with column detection and region segmentation.
+    """
+    mod = soft_import(_PKG + ".layout_ocr.pipeline", layer=_LAYER)
+    run_on_image = getattr(mod, "run_on_image", None)
+    if run_on_image is None:  # fall back to top-level export in some builds
+        run_on_image = require(_PKG, "run_on_image", layer=_LAYER)
+    return run_on_image(image_path, config=config, chain=chain)
+def directory_to_texts(directory: str, **kwargs: Any):
+    """Batch-OCR every image in a directory."""
+    fn = require(_PKG, "extract_image_texts_from_directory", layer=_LAYER)
+    return fn(directory, **kwargs)
+def video_frames(video_path: str, directory: str, **kwargs: Any):
+    """Sample frames from a video into ``directory``."""
+    fn = require(_PKG, "extract_video_frames", layer=_LAYER)
+    return fn(video_path, directory, **kwargs)
+def video_text(video_path: str, **kwargs: Any):
+    """Extract on-screen text across a video's frames."""
+    fn = require(_PKG, "analyze_video_text", layer=_LAYER)
+    return fn(video_path, **kwargs)
+def best_thumbnail(video_text_or_whisper: Any, keywords: Any, directory: str, **kwargs: Any):
+    """Pick the most representative thumbnail frame using transcript keywords."""
+    fn = require(_PKG, "pick_optimal_thumbnail", layer=_LAYER)
+    return fn(video_text_or_whisper, keywords, directory, **kwargs)

media_intelligence/persist/__init__.py ADDED Viewed

@@ -0,0 +1,16 @@
+"""Persistence layer — filesystem now, DB-pluggable interface.
+::
+    store = mi.persist.FileStore("/data")
+    store.save_manifest(item.media_id, manifest)
+    # later, same interface:
+    # store = mi.persist.PgStore(dsn=...)   # JSONB backend (planned)
+"""
+from __future__ import annotations
+from .base import Store
+from .filestore import FileStore
+from .pgstore import PgStore
+__all__ = ["Store", "FileStore", "PgStore"]

media_intelligence/persist/base.py ADDED Viewed

@@ -0,0 +1,41 @@
+"""Persistence interface — one contract, swappable backends.
+Vision principle: *local-first, cloud-optional*. v1 ships a filesystem store;
+the same interface admits a Postgres/JSONB store later (``abstract_database``)
+with no change to the pipeline. Stores are addressed by ``media_id`` and persist
+the typed manifest plus a pointer to the on-disk asset collection.
+"""
+from __future__ import annotations
+from typing import Any, Optional, Protocol, runtime_checkable
+__all__ = ["Store"]
+@runtime_checkable
+class Store(Protocol):
+    """The persistence contract every backend implements."""
+    def save_manifest(self, media_id: str, manifest: dict[str, Any]) -> str:
+        """Persist the lean index ``manifest``; return a locator (path/row id)."""
+        ...
+    def load_manifest(self, media_id: str) -> Optional[dict[str, Any]]:
+        """Return the stored manifest for ``media_id`` or ``None``."""
+        ...
+    def save_document(self, media_id: str, document: dict[str, Any]) -> str:
+        """Persist the full content ``document`` (text/pages/transcript)."""
+        ...
+    def load_document(self, media_id: str) -> Optional[dict[str, Any]]:
+        """Return the full content document for ``media_id`` or ``None``."""
+        ...
+    def exists(self, media_id: str) -> bool:
+        """Whether a manifest already exists (drives idempotent resume)."""
+        ...
+    def collection_dir(self, media_id: str) -> str:
+        """Return the directory that holds this item's asset collection."""
+        ...