media-intelligence 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,158 @@
1
+ """media_intelligence — the Abstract Intelligence Platform facade.
2
+
3
+ A unified, layered access layer that turns raw media (PDFs, images, video) into
4
+ structured, searchable, SEO-ready data. It does not reimplement any engine; it
5
+ selects the *best* function of each sibling package and exposes it behind one
6
+ clean, lazy API.
7
+
8
+ Two ways to use it
9
+ ------------------
10
+
11
+ 1. Direct namespace access — grab one tool::
12
+
13
+ import media_intelligence as mi
14
+ text = mi.ocr.image_to_text("page.png")
15
+ kw = mi.enrich.keywords(text)
16
+ mi.documents.process_pdf("doc.pdf")
17
+
18
+ 2. The orchestrated, idempotent/resumable pipeline::
19
+
20
+ from media_intelligence import MediaPipeline
21
+ pipe = MediaPipeline("https://site.com/video.mp4", out_root="/data")
22
+ pipe.ingest().extract().structure().enrich().persist().publish()
23
+ print(pipe.report.summary)
24
+ # ... or just: pipe.run()
25
+
26
+ Layers map one-to-one onto canonical owning packages:
27
+
28
+ ingest -> abstract_webtools (scrape + yt-dlp video download)
29
+ ocr -> abstract_ocr (layout-aware multi-engine OCR)
30
+ documents -> abstract_pdfs (PDF decomposition + HTML)
31
+ video -> abstract_videos (registry pipeline: download/frames/SEO)
32
+ transcribe-> hugpy (Whisper ASR; abstract_ocr fallback)
33
+ enrich -> hugpy (summaries, keywords, vision, SEO)
34
+ persist -> filesystem now, DB-pluggable interface
35
+ publish -> abstract_react + abstract_nginx (SEO/OG + static HTML)
36
+
37
+ Every backing package is imported lazily, so ``import media_intelligence`` is
38
+ cheap and a missing optional package only errors when that layer is used.
39
+ """
40
+ from __future__ import annotations
41
+
42
+ import importlib
43
+ import importlib.util
44
+ from typing import TYPE_CHECKING
45
+
46
+ from ._lazy import MediaIntelligenceError, MissingDependency
47
+ from .schemas import (
48
+ MediaItem,
49
+ MediaKind,
50
+ PipelineReport,
51
+ Stage,
52
+ StageResult,
53
+ detect_media_kind,
54
+ )
55
+
56
__version__ = "0.1.0"

# Backing package(s) behind each layer. An empty tuple marks a layer with no
# backing package (``structure`` and ``persist`` are pure-stdlib);
# ``transcribe`` lists alternatives and works when either one is present.
_LAYER_PACKAGES = dict(
    ingest=("abstract_webtools",),
    ocr=("abstract_ocr",),
    documents=("abstract_pdfs",),
    video=("abstract_videos",),
    transcribe=("hugpy", "abstract_ocr"),  # any-of
    enrich=("hugpy",),
    structure=(),
    persist=(),
    publish=("abstract_react",),  # nginx HTML is an additional option
)

# Submodules served lazily by the module-level ``__getattr__`` below; each
# layer name above doubles as the name of its submodule.
_LAZY_SUBMODULES = set(_LAYER_PACKAGES)

# Public names: the eagerly imported schema/error symbols, the lazily resolved
# ``MediaPipeline``, and every lazy layer namespace.
__all__ = [
    "MediaPipeline",
    "MediaItem",
    "MediaKind",
    "Stage",
    "StageResult",
    "PipelineReport",
    "detect_media_kind",
    "available",
    "MediaIntelligenceError",
    "MissingDependency",
    "__version__",
    *sorted(_LAZY_SUBMODULES),
]
99
+
100
+
101
def _installed(package: str) -> bool:
    """Tell whether an import spec can be found for ``package``.

    The package is located through ``importlib.util.find_spec`` rather than
    imported. Any ``ImportError`` or ``ValueError`` raised by the lookup is
    treated as "not installed".
    """
    try:
        spec = importlib.util.find_spec(package)
    except (ImportError, ValueError):
        return False
    return spec is not None
107
+
108
+
109
def available(layer: str | None = None):
    """Tell which layers can be used here, without importing any of them.

    Example::

        import media_intelligence as mi
        mi.available()          # {'ingest': True, 'ocr': True, 'publish': False, ...}
        mi.available("enrich")  # True / False

    With no argument, returns a ``{layer_name: bool}`` dict covering every
    known layer. With a layer name, returns that layer's single ``bool``.

    A layer counts as available when at least one of its backing packages is
    installed; layers with no backing package (``structure``, ``persist``) are
    always available.

    Raises ``ValueError`` when ``layer`` is not a known layer name.
    """
    def _usable(packages: tuple) -> bool:
        # Nothing required means nothing can be missing.
        if not packages:
            return True
        for candidate in packages:
            if _installed(candidate):
                return True
        return False

    if layer is None:
        return {name: _usable(pkgs) for name, pkgs in _LAYER_PACKAGES.items()}
    if layer not in _LAYER_PACKAGES:
        raise ValueError(f"unknown layer {layer!r}; choose from {sorted(_LAYER_PACKAGES)}")
    return _usable(_LAYER_PACKAGES[layer])
127
+
128
+ if TYPE_CHECKING: # for type checkers / IDEs only — no runtime import cost
129
+ from . import ( # noqa: F401
130
+ documents,
131
+ enrich,
132
+ ingest,
133
+ ocr,
134
+ persist,
135
+ publish,
136
+ structure,
137
+ transcribe,
138
+ video,
139
+ )
140
+ from .pipeline import MediaPipeline # noqa: F401
141
+
142
+
143
def __getattr__(name: str):
    """Resolve lazily provided module attributes (PEP 562).

    A layer namespace listed in ``_LAZY_SUBMODULES`` is imported as the
    same-named submodule on first access, and ``MediaPipeline`` is fetched
    from the ``.pipeline`` submodule. Any other name raises ``AttributeError``,
    as a normal missing module attribute would.
    """
    if name in _LAZY_SUBMODULES:
        return importlib.import_module("." + name, __name__)
    if name != "MediaPipeline":
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
    pipeline_module = importlib.import_module(".pipeline", __name__)
    return pipeline_module.MediaPipeline
155
+
156
+
157
def __dir__():
    """List module attributes, including the lazy names declared in ``__all__``."""
    names = set(globals())
    names.update(__all__)
    return sorted(names)
@@ -0,0 +1,100 @@
1
+ """Lazy / soft import plumbing for the media_intelligence facade.
2
+
3
+ The whole point of this package is to be a *thin* unified access layer over a
4
+ set of heavy sibling packages (paddleocr, torch, yt-dlp, whisper, ...). Importing
5
+ ``media_intelligence`` must stay cheap, so every sibling package is imported
6
+ lazily — at first *use*, never at module import time — and the result is cached.
7
+
8
+ If an optional layer's backing package is not installed, we raise a single,
9
+ actionable :class:`MissingDependency` error that names the extra to install
10
+ rather than leaking a raw ``ModuleNotFoundError`` from deep inside a submodule.
11
+ """
12
+ from __future__ import annotations
13
+
14
+ import functools
15
+ import importlib
16
+ from types import ModuleType
17
+ from typing import Any, Callable
18
+
19
+ __all__ = [
20
+ "MediaIntelligenceError",
21
+ "MissingDependency",
22
+ "soft_import",
23
+ "require",
24
+ "lazy_namespace",
25
+ ]
26
+
27
+
28
class MediaIntelligenceError(RuntimeError):
    """Base class for every error raised by the media_intelligence facade."""


class MissingDependency(MediaIntelligenceError):
    """Raised when a layer is used but its backing package is not installed."""


# Top-level import name of a backing package -> the pip extra that installs it.
# Used only to build the install hint in MissingDependency messages. The
# "(core)" marker means "no extra": the hint names the package itself.
_EXTRA_FOR_PACKAGE = {
    "abstract_essentials": "(core)",
    "abstract_webtools": "ingest",
    "abstract_ocr": "ocr",
    "abstract_pdfs": "documents",
    "abstract_videos": "video",
    "hugpy": "enrich",
    "abstract_react": "publish",
    "abstract_nginx": "publish",
}

# Modules already imported through soft_import, keyed by the dotted name that
# was requested.
_MODULE_CACHE: dict[str, ModuleType] = {}


def soft_import(package: str, *, layer: str | None = None) -> ModuleType:
    """Import ``package`` on demand and cache the resulting module.

    Args:
        package: Dotted module name to import (top-level package or submodule).
        layer: Optional layer name used in the error message; the message
            falls back to ``package`` when it is omitted.

    Returns:
        The imported module. Repeated calls with the same ``package`` return
        the cached module object.

    Raises:
        MissingDependency: ``package`` itself, or one of its parent packages,
            cannot be found. The message carries an install hint.
        ModuleNotFoundError: some *other* module failed to import while
            ``package`` was loading; that error is re-raised unchanged.
    """
    cached = _MODULE_CACHE.get(package)
    if cached is not None:
        return cached
    try:
        module = importlib.import_module(package)
    except ModuleNotFoundError as exc:
        # Only translate a *missing backing package*; a genuine sub-import
        # error inside an installed package should surface unchanged.
        missing = exc.name
        if not missing or not (missing == package or package.startswith(missing + ".")):
            raise
        # Resolve the install hint against the *top-level* package, so a
        # missing submodule (e.g. "abstract_nginx.generate_htmls") still
        # points at the right extra.
        top = package.split(".", 1)[0]
        # No default here: a package without a registered extra must not be
        # advertised as "media_intelligence[<package>]", which does not exist.
        extra = _EXTRA_FOR_PACKAGE.get(top)
        if extra and extra != "(core)":
            hint = f'pip install "media_intelligence[{extra}]"'
        else:
            hint = f"pip install {top}"
        raise MissingDependency(
            f"The '{layer or package}' layer needs '{package}', which is not "
            f"installed. Install it with: {hint}"
        ) from exc
    _MODULE_CACHE[package] = module
    return module
80
+
81
+
82
def require(package: str, attr: str, *, layer: str | None = None) -> Any:
    """Fetch ``attr`` from ``package``, importing the package via ``soft_import``.

    Errors from ``soft_import`` propagate unchanged. If the module imports but
    does not expose ``attr`` (for instance after an upstream rename), a
    ``MediaIntelligenceError`` is raised, chained to the original
    ``AttributeError``, so the failure points at the facade rather than at the
    caller.
    """
    module = soft_import(package, layer=layer)
    try:
        value = getattr(module, attr)
    except AttributeError as missing:
        message = (
            f"'{package}.{attr}' is not available — the upstream API may have "
            f"changed. The media_intelligence '{layer or package}' layer needs updating."
        )
        raise MediaIntelligenceError(message) from missing
    return value
96
+
97
+
98
def lazy_namespace(loader: Callable[[], ModuleType]) -> Callable[[], ModuleType]:
    """Return a memoised version of ``loader``.

    The wrapper is a single-slot ``functools.lru_cache``: the first call runs
    ``loader`` and later calls return the cached result.
    """
    memoise = functools.lru_cache(maxsize=1)
    return memoise(loader)
@@ -0,0 +1,119 @@
1
+ """Extraction + structuring layer (documents) — PDFs.
2
+
3
+ Canonical owner: ``abstract_pdfs``. Page-level decomposition (text + images),
4
+ manifest generation, OCR (delegating to ``abstract_ocr``), enrichment, and
5
+ static HTML (viewer + gallery).
6
+ """
7
+ from __future__ import annotations
8
+
9
+ import json
10
+ import os
11
+ from typing import Any, Optional
12
+
13
+ from ._lazy import require, soft_import
14
+
15
+ _PKG = "abstract_pdfs"
16
+ _LAYER = "documents"
17
+
18
+ __all__ = [
19
+ "process_pdf",
20
+ "process_pdfs",
21
+ "process_all_pdfs",
22
+ "generate_pdf",
23
+ "pdf_pages",
24
+ "DocumentPipeline",
25
+ "SliceManager",
26
+ ]
27
+
28
+
29
def process_pdf(pdf_path: str, **kwargs: Any) -> dict:
    """Run the full per-page processing of a single PDF.

    Delegates to ``abstract_pdfs.process_pdf`` (image→text→info→metadata→html
    + gallery), forwarding ``kwargs`` untouched.
    """
    return require(_PKG, "process_pdf", layer=_LAYER)(pdf_path, **kwargs)
33
+
34
+
35
def process_pdfs(pdf_paths: list[str], **kwargs: Any) -> list:
    """Process a batch of PDFs with two-level parallelism (PDFs × pages).

    Delegates to ``abstract_pdfs.process_pdfs``, forwarding ``kwargs`` untouched.
    """
    return require(_PKG, "process_pdfs", layer=_LAYER)(pdf_paths, **kwargs)
39
+
40
+
41
def process_all_pdfs(directory: str, **kwargs: Any):
    """Find every ``.pdf`` under ``directory`` and process each one.

    Delegates to ``abstract_pdfs.process_all_pdfs``, forwarding ``kwargs``
    untouched.
    """
    return require(_PKG, "process_all_pdfs", layer=_LAYER)(directory, **kwargs)
45
+
46
+
47
def generate_pdf(pdf_path: str, **kwargs: Any) -> dict:
    """End-to-end in one call: slice + OCR + enriched manifests + viewer HTML.

    ``generate_pdf`` is taken from ``abstract_pdfs.pipeline`` when that module
    exposes a truthy attribute of that name; otherwise it is looked up on the
    top-level ``abstract_pdfs`` package.
    """
    pipeline_module = soft_import(f"{_PKG}.pipeline", layer=_LAYER)
    impl = getattr(pipeline_module, "generate_pdf", None)
    if not impl:
        impl = require(_PKG, "generate_pdf", layer=_LAYER)
    return impl(pdf_path, **kwargs)
52
+
53
+
54
def _resolve_pdf_dir(pdf_path: str) -> Optional[str]:
    """Find the directory holding ``pages/`` for a processed PDF.

    ``process_pdf`` relocates ``<dir>/foo.pdf`` into ``<dir>/foo/foo.pdf`` and
    writes pages under ``<dir>/foo/pages/``. The relocated directory is checked
    first, then the original one, so this works whether or not relocation
    happened. Returns ``None`` when neither contains a ``pages`` directory.
    """
    parent, filename = os.path.split(os.path.abspath(pdf_path))
    stem = os.path.splitext(filename)[0]
    for candidate in (os.path.join(parent, stem), parent):
        if os.path.isdir(os.path.join(candidate, "pages")):
            return candidate
    return None


def pdf_pages(pdf_path: str) -> tuple[list[dict[str, Any]], Optional[str]]:
    """Read back the per-page text/info that ``process_pdf`` wrote to disk.

    Reads the cached ``pages/<name>/text.txt`` and ``info.json`` files
    directly — no re-OCR and no imports from the backing package.

    Returns:
        ``(pages, full_text)``. ``pages`` is a list of
        ``{"index", "page", "text", "info"}`` dicts in sorted directory-name
        order. ``index`` is the directory name as an integer when the name is
        all decimal digits, otherwise the entry's position in the sorted list.
        ``full_text`` is the non-empty page texts joined by blank lines, or
        ``None`` when there is no text. ``([], None)`` is returned when no
        ``pages`` directory can be found for ``pdf_path``.
    """
    base = _resolve_pdf_dir(pdf_path)
    if base is None:
        return [], None
    pages_dir = os.path.join(base, "pages")
    # zero-padded names (0001, 0002, ...) so lexical sort == page order
    names = sorted(
        entry for entry in os.listdir(pages_dir)
        if os.path.isdir(os.path.join(pages_dir, entry))
    )
    pages: list[dict[str, Any]] = []
    for position, name in enumerate(names):
        page_dir = os.path.join(pages_dir, name)
        text_path = os.path.join(page_dir, "text.txt")
        info_path = os.path.join(page_dir, "info.json")

        text = ""
        if os.path.isfile(text_path):
            with open(text_path, "r", encoding="utf-8", errors="replace") as fh:
                text = fh.read().strip()

        info: dict[str, Any] = {}
        if os.path.isfile(info_path):
            try:
                with open(info_path, "r", encoding="utf-8") as fh:
                    info = json.load(fh)
            except Exception:
                # Deliberately best-effort: an unreadable or malformed
                # info.json must not prevent the page text from being returned.
                info = {}

        # isdecimal(), not isdigit(): isdigit() also accepts characters such as
        # "²" that int() rejects, which would abort the whole read-back.
        index = int(name) if name.isdecimal() else position
        pages.append({"index": index, "page": name, "text": text, "info": info})

    full_text = "\n\n".join(p["text"] for p in pages if p["text"]).strip() or None
    return pages, full_text
108
+
109
+
110
def DocumentPipeline(*args: Any, **kwargs: Any):
    """Build a per-PDF ``DocumentPipeline`` orchestrator.

    The class is resolved from ``abstract_pdfs`` at call time and instantiated
    with the given arguments.
    """
    pipeline_cls = require(_PKG, "DocumentPipeline", layer=_LAYER)
    instance = pipeline_cls(*args, **kwargs)
    return instance
114
+
115
+
116
def SliceManager(*args: Any, **kwargs: Any):
    """Build the slice-aware multi-engine column OCR ``SliceManager``.

    The class is resolved from ``abstract_pdfs`` at call time and instantiated
    with the given arguments.
    """
    manager_cls = require(_PKG, "SliceManager", layer=_LAYER)
    instance = manager_cls(*args, **kwargs)
    return instance
@@ -0,0 +1,94 @@
1
+ """Enrichment layer — content understanding.
2
+
3
+ Canonical owner: ``hugpy`` (the latest ML/NLP namespace, its own project). One
4
+ seam for summarization, keyword extraction/refinement, image captioning (vision),
5
+ and the higher-level ``analyze*`` helpers that produce SEO-ready metadata.
6
+ """
7
+ from __future__ import annotations
8
+
9
+ from typing import Any, Optional
10
+
11
+ from ._lazy import require
12
+
13
+ _PKG = "hugpy"
14
+ _LAYER = "enrich"
15
+
16
+ __all__ = [
17
+ "summarize",
18
+ "summarize_image",
19
+ "keywords",
20
+ "refine_keywords",
21
+ "keyword_density",
22
+ "caption",
23
+ "analyze",
24
+ "analyze_pdf",
25
+ "analyze_video",
26
+ "execute",
27
+ ]
28
+
29
+
30
def summarize(text: Optional[str] = None, **kwargs: Any) -> str:
    """Produce an abstractive summary of ``text`` (chunked + consolidated).

    Delegates to ``hugpy.summarize``. When ``text`` is ``None`` the positional
    argument is omitted and only ``kwargs`` are forwarded.
    """
    impl = require(_PKG, "summarize", layer=_LAYER)
    if text is None:
        return impl(**kwargs)
    return impl(text, **kwargs)
34
+
35
+
36
def summarize_image(image_path: str, **kwargs: Any) -> str:
    """Describe/summarize an image with a vision model.

    Delegates to ``hugpy.summarize_image``.
    """
    return require(_PKG, "summarize_image", layer=_LAYER)(image_path, **kwargs)
40
+
41
+
42
def keywords(text: str, **kwargs: Any):
    """Extract keyphrases (KeyBERT + spaCy) → list[(phrase, score)].

    Delegates to ``hugpy.extract_keywords``.
    """
    return require(_PKG, "extract_keywords", layer=_LAYER)(text, **kwargs)
46
+
47
+
48
def refine_keywords(text: str, **kwargs: Any):
    """Return SEO-refined, density-filtered keywords → ``RefinedResult``.

    Delegates to ``hugpy.refine_keywords``.
    """
    return require(_PKG, "refine_keywords", layer=_LAYER)(text, **kwargs)
52
+
53
+
54
def keyword_density(text: str, kws: Any) -> dict:
    """Compute the keyword density map of ``text`` against ``kws``.

    Delegates to ``hugpy.keyword_density``.
    """
    return require(_PKG, "keyword_density", layer=_LAYER)(text, kws)
58
+
59
+
60
def caption(image_path: str, prompt: str = "please describe this image", **kwargs: Any) -> str:
    """Caption / visually analyse an image via the cached vision coder.

    ``kwargs`` are split by name: ``model_key``, ``torch_dtype``,
    ``max_tokens`` and ``min_tokens`` go to ``hugpy.get_vision_coder``;
    ``max_new_tokens`` and ``max_tokens`` go to the coder's ``analyze_image``.
    ``max_tokens`` is therefore passed to both, and any other keyword is
    dropped.
    """
    coder_keys = {"model_key", "torch_dtype", "max_tokens", "min_tokens"}
    analyze_keys = {"max_new_tokens", "max_tokens"}

    make_coder = require(_PKG, "get_vision_coder", layer=_LAYER)
    coder_options = {key: value for key, value in kwargs.items() if key in coder_keys}
    coder = make_coder(**coder_options)

    analyze_options = {key: value for key, value in kwargs.items() if key in analyze_keys}
    return coder.analyze_image(image_path, prompt=prompt, **analyze_options)
67
+
68
+
69
def analyze(text: str, **kwargs: Any):
    """Run full text analysis → summary + keywords + metadata bundle.

    Delegates to ``hugpy.analyze``.
    """
    return require(_PKG, "analyze", layer=_LAYER)(text, **kwargs)
73
+
74
+
75
def analyze_pdf(pdf_path: str, **kwargs: Any):
    """Turn an already-extracted PDF's text into SEO/metadata.

    Delegates to ``hugpy.analyze_pdf``.
    """
    return require(_PKG, "analyze_pdf", layer=_LAYER)(pdf_path, **kwargs)
79
+
80
+
81
def analyze_video(*args: Any, **kwargs: Any):
    """Frame-by-frame video understanding (vision over extracted frames).

    Delegates to ``hugpy.analyze_video``, forwarding every argument unchanged.
    """
    return require(_PKG, "analyze_video", layer=_LAYER)(*args, **kwargs)
85
+
86
+
87
def execute(*args: Any, **kwargs: Any):
    """Escape hatch onto hugpy's unified dispatch (``execute_prompt``).

    Any ``(framework, task)`` — chat, vision, ASR, summarize, embed,
    image-gen, keywords — is routed through the model registry. All arguments
    are forwarded unchanged.
    """
    dispatch = require(_PKG, "execute_prompt", layer=_LAYER)
    return dispatch(*args, **kwargs)
@@ -0,0 +1,74 @@
1
+ """Ingestion layer — bring raw media in from the web.
2
+
3
+ Canonical owner: ``abstract_webtools``. This is the single seam for *acquiring*
4
+ media: scraping/parsing a page and downloading video (yt-dlp + ffmpeg). The
5
+ ``abstract_videos`` package also downloads video, but for the platform we make
6
+ webtools the one owner of "fetch from a URL" and let the video pipeline consume
7
+ the local file.
8
+ """
9
+ from __future__ import annotations
10
+
11
+ from typing import Any, Optional
12
+
13
+ from ._lazy import require, soft_import
14
+
15
+ _PKG = "abstract_webtools"
16
+ _LAYER = "ingest"
17
+
18
+ __all__ = [
19
+ "scrape",
20
+ "soup",
21
+ "page_text",
22
+ "links",
23
+ "download_video",
24
+ "video_info",
25
+ "video_id",
26
+ ]
27
+
28
+
29
def scrape(url: str, **kwargs: Any):
    """Build a ``UnifiedWebManager`` for ``url`` (lazy ``.soup``/``.link_mgr``/...).

    The class is resolved from ``abstract_webtools`` at call time; ``url`` is
    passed by keyword and ``kwargs`` are forwarded.
    """
    manager_cls = require(_PKG, "UnifiedWebManager", layer=_LAYER)
    manager = manager_cls(url=url, **kwargs)
    return manager
33
+
34
+
35
def soup(url: str, **kwargs: Any):
    """Return a parsed BeautifulSoup-backed soup manager for ``url``.

    Delegates to ``abstract_webtools.get_soup``.
    """
    return require(_PKG, "get_soup", layer=_LAYER)(url, **kwargs)
39
+
40
+
41
def page_text(url: str) -> str:
    """Return the visible text of the page at ``url``.

    Delegates to ``abstract_webtools.get_soup_text``.
    """
    return require(_PKG, "get_soup_text", layer=_LAYER)(url)
45
+
46
+
47
def links(url: str, **kwargs: Any):
    """Discover links/images on a page via the link manager.

    Builds the page manager with ``scrape`` and returns its ``link_mgr``
    attribute.
    """
    return scrape(url, **kwargs).link_mgr
51
+
52
+
53
def download_video(url: str, download_directory: Optional[str] = None, **kwargs: Any):
    """Download the video at ``url`` (yt-dlp/ffmpeg/m3u8) and return its manager.

    Delegates to ``abstract_webtools.get_video_mgr``. ``download_directory`` is
    forwarded only when it is not ``None``, and ``download_video=True`` is
    supplied unless the caller passed that keyword explicitly.

    The manager exposes the resolved local path(s) and metadata.
    """
    fetch = require(_PKG, "get_video_mgr", layer=_LAYER)
    options = dict(kwargs)
    if download_directory is not None:
        options.setdefault("download_directory", download_directory)
    if "download_video" not in options:
        options["download_video"] = True
    return fetch(url, **options)
63
+
64
+
65
def video_info(url: str, **kwargs: Any) -> dict:
    """Resolve video metadata for ``url`` without downloading.

    Delegates to ``abstract_webtools.get_video_info``.
    """
    return require(_PKG, "get_video_info", layer=_LAYER)(url, **kwargs)
69
+
70
+
71
def video_id(url: str) -> str:
    """Return a stable video id for ``url``.

    Delegates to ``abstract_webtools.get_video_id``.
    """
    return require(_PKG, "get_video_id", layer=_LAYER)(url)
@@ -0,0 +1,65 @@
1
+ """Extraction layer (images) — OCR.
2
+
3
+ Canonical owner: ``abstract_ocr``. Multi-engine, layout-aware OCR plus the
4
+ per-frame video-OCR helpers. ``abstract_videos`` re-implements a slice of these;
5
+ the platform routes everyone through ``abstract_ocr`` so there is one OCR owner.
6
+ """
7
+ from __future__ import annotations
8
+
9
+ from typing import Any, Optional
10
+
11
+ from ._lazy import require, soft_import
12
+
13
+ _PKG = "abstract_ocr"
14
+ _LAYER = "ocr"
15
+
16
+ __all__ = [
17
+ "image_to_text",
18
+ "image_to_text_layout",
19
+ "directory_to_texts",
20
+ "video_frames",
21
+ "video_text",
22
+ "best_thumbnail",
23
+ ]
24
+
25
+
26
def image_to_text(image_path: str, preprocess: bool = True) -> str:
    """OCR one image to text (Paddle-first with Tesseract fallback).

    Delegates to ``abstract_ocr.convert_image_to_text``, passing ``preprocess``
    by keyword.
    """
    convert = require(_PKG, "convert_image_to_text", layer=_LAYER)
    return convert(image_path, preprocess=preprocess)
30
+
31
+
32
def image_to_text_layout(image_path: str, config: Any = None, chain: Optional[list] = None):
    """Run the modern layout-aware OCR pipeline → ``PipelineReport``.

    Reading-order aware, with column detection and region segmentation.

    ``run_on_image`` is taken from ``abstract_ocr.layout_ocr.pipeline``; when
    that module has no such attribute (or it is ``None``) the top-level
    ``abstract_ocr`` export is used instead.
    """
    pipeline_module = soft_import(f"{_PKG}.layout_ocr.pipeline", layer=_LAYER)
    runner = getattr(pipeline_module, "run_on_image", None)
    # fall back to top-level export in some builds
    runner = require(_PKG, "run_on_image", layer=_LAYER) if runner is None else runner
    return runner(image_path, config=config, chain=chain)
42
+
43
+
44
def directory_to_texts(directory: str, **kwargs: Any):
    """Batch-OCR every image in ``directory``.

    Delegates to ``abstract_ocr.extract_image_texts_from_directory``.
    """
    return require(_PKG, "extract_image_texts_from_directory", layer=_LAYER)(directory, **kwargs)
48
+
49
+
50
def video_frames(video_path: str, directory: str, **kwargs: Any):
    """Sample frames from ``video_path`` into ``directory``.

    Delegates to ``abstract_ocr.extract_video_frames``.
    """
    return require(_PKG, "extract_video_frames", layer=_LAYER)(video_path, directory, **kwargs)
54
+
55
+
56
def video_text(video_path: str, **kwargs: Any):
    """Extract on-screen text across a video's frames.

    Delegates to ``abstract_ocr.analyze_video_text``.
    """
    return require(_PKG, "analyze_video_text", layer=_LAYER)(video_path, **kwargs)
60
+
61
+
62
def best_thumbnail(video_text_or_whisper: Any, keywords: Any, directory: str, **kwargs: Any):
    """Choose the most representative thumbnail frame using transcript keywords.

    Delegates to ``abstract_ocr.pick_optimal_thumbnail``, passing the three
    positional arguments in order.
    """
    pick = require(_PKG, "pick_optimal_thumbnail", layer=_LAYER)
    return pick(video_text_or_whisper, keywords, directory, **kwargs)
@@ -0,0 +1,16 @@
1
+ """Persistence layer — filesystem now, DB-pluggable interface.
2
+
3
+ ::
4
+
5
+ store = mi.persist.FileStore("/data")
6
+ store.save_manifest(item.media_id, manifest)
7
+ # later, same interface:
8
+ # store = mi.persist.PgStore(dsn=...) # JSONB backend (planned)
9
+ """
10
+ from __future__ import annotations
11
+
12
+ from .base import Store
13
+ from .filestore import FileStore
14
+ from .pgstore import PgStore
15
+
16
+ __all__ = ["Store", "FileStore", "PgStore"]
@@ -0,0 +1,41 @@
1
+ """Persistence interface — one contract, swappable backends.
2
+
3
+ Vision principle: *local-first, cloud-optional*. v1 ships a filesystem store;
4
+ the same interface admits a Postgres/JSONB store later (``abstract_database``)
5
+ with no change to the pipeline. Stores are addressed by ``media_id`` and persist
6
+ the typed manifest plus a pointer to the on-disk asset collection.
7
+ """
8
+ from __future__ import annotations
9
+
10
+ from typing import Any, Optional, Protocol, runtime_checkable
11
+
12
+ __all__ = ["Store"]
13
+
14
+
15
@runtime_checkable
class Store(Protocol):
    """Structural contract shared by all persistence backends.

    Every operation is keyed by ``media_id``. Because the protocol is
    ``runtime_checkable``, ``isinstance(obj, Store)`` succeeds for any object
    that provides all six methods.
    """

    def save_manifest(self, media_id: str, manifest: dict[str, Any]) -> str:
        """Store the lean index ``manifest`` and return a locator (path/row id)."""

    def load_manifest(self, media_id: str) -> Optional[dict[str, Any]]:
        """Fetch the manifest stored for ``media_id``, or ``None``."""

    def save_document(self, media_id: str, document: dict[str, Any]) -> str:
        """Store the full content ``document`` (text/pages/transcript)."""

    def load_document(self, media_id: str) -> Optional[dict[str, Any]]:
        """Fetch the full content document for ``media_id``, or ``None``."""

    def exists(self, media_id: str) -> bool:
        """Tell whether a manifest already exists (drives idempotent resume)."""

    def collection_dir(self, media_id: str) -> str:
        """Return the directory holding this item's asset collection."""