media-intelligence 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26) hide show
  1. media_intelligence-0.1.0/PKG-INFO +146 -0
  2. media_intelligence-0.1.0/README.md +112 -0
  3. media_intelligence-0.1.0/pyproject.toml +46 -0
  4. media_intelligence-0.1.0/setup.cfg +4 -0
  5. media_intelligence-0.1.0/src/media_intelligence/__init__.py +158 -0
  6. media_intelligence-0.1.0/src/media_intelligence/_lazy.py +100 -0
  7. media_intelligence-0.1.0/src/media_intelligence/documents.py +119 -0
  8. media_intelligence-0.1.0/src/media_intelligence/enrich.py +94 -0
  9. media_intelligence-0.1.0/src/media_intelligence/ingest.py +74 -0
  10. media_intelligence-0.1.0/src/media_intelligence/ocr.py +65 -0
  11. media_intelligence-0.1.0/src/media_intelligence/persist/__init__.py +16 -0
  12. media_intelligence-0.1.0/src/media_intelligence/persist/base.py +41 -0
  13. media_intelligence-0.1.0/src/media_intelligence/persist/filestore.py +86 -0
  14. media_intelligence-0.1.0/src/media_intelligence/persist/pgstore.py +50 -0
  15. media_intelligence-0.1.0/src/media_intelligence/pipeline.py +281 -0
  16. media_intelligence-0.1.0/src/media_intelligence/publish.py +72 -0
  17. media_intelligence-0.1.0/src/media_intelligence/py.typed +0 -0
  18. media_intelligence-0.1.0/src/media_intelligence/schemas.py +163 -0
  19. media_intelligence-0.1.0/src/media_intelligence/structure.py +114 -0
  20. media_intelligence-0.1.0/src/media_intelligence/transcribe.py +37 -0
  21. media_intelligence-0.1.0/src/media_intelligence/video.py +59 -0
  22. media_intelligence-0.1.0/src/media_intelligence.egg-info/PKG-INFO +146 -0
  23. media_intelligence-0.1.0/src/media_intelligence.egg-info/SOURCES.txt +24 -0
  24. media_intelligence-0.1.0/src/media_intelligence.egg-info/dependency_links.txt +1 -0
  25. media_intelligence-0.1.0/src/media_intelligence.egg-info/requires.txt +34 -0
  26. media_intelligence-0.1.0/src/media_intelligence.egg-info/top_level.txt +1 -0
@@ -0,0 +1,146 @@
1
+ Metadata-Version: 2.4
2
+ Name: media_intelligence
3
+ Version: 0.1.0
4
+ Summary: Abstract Intelligence Platform — a unified, layered pipeline that turns raw media (PDFs, images, video) into structured, searchable, SEO-ready data.
5
+ Author: AbstractEndeavors
6
+ Keywords: ocr,pdf,video,transcription,summarization,seo,media,pipeline
7
+ Requires-Python: >=3.10
8
+ Description-Content-Type: text/markdown
9
+ Provides-Extra: core
10
+ Requires-Dist: abstract_essentials; extra == "core"
11
+ Provides-Extra: ingest
12
+ Requires-Dist: abstract_webtools; extra == "ingest"
13
+ Provides-Extra: ocr
14
+ Requires-Dist: abstract_ocr; extra == "ocr"
15
+ Provides-Extra: documents
16
+ Requires-Dist: abstract_pdfs; extra == "documents"
17
+ Provides-Extra: video
18
+ Requires-Dist: abstract_videos; extra == "video"
19
+ Provides-Extra: transcribe
20
+ Requires-Dist: hugpy; extra == "transcribe"
21
+ Provides-Extra: enrich
22
+ Requires-Dist: hugpy; extra == "enrich"
23
+ Provides-Extra: publish
24
+ Requires-Dist: abstract_react; extra == "publish"
25
+ Requires-Dist: abstract_nginx; extra == "publish"
26
+ Provides-Extra: all
27
+ Requires-Dist: abstract_webtools; extra == "all"
28
+ Requires-Dist: abstract_ocr; extra == "all"
29
+ Requires-Dist: abstract_pdfs; extra == "all"
30
+ Requires-Dist: abstract_videos; extra == "all"
31
+ Requires-Dist: hugpy; extra == "all"
32
+ Requires-Dist: abstract_react; extra == "all"
33
+ Requires-Dist: abstract_nginx; extra == "all"
34
+
35
+ # media_intelligence — Abstract Intelligence Platform
36
+
37
+ A unified, layered facade that turns raw media — **PDFs, images, and video** —
38
+ into **structured, searchable, SEO-ready data**. It does not reimplement any
39
+ engine: it selects the *best* function of each sibling package and exposes it
40
+ behind one clean, lazy API, plus an orchestrated pipeline.
41
+
42
+ ```text
43
+ Raw Media (PDF / Image / Video / URL)
44
+
45
+
46
+ ingest → extract → structure → enrich → persist → publish
47
+ (webtools) (ocr/ (typed (hugpy) (FS / DB) (react/
48
+ pdfs/ metadata) nginx)
49
+ videos)
50
+ ```
51
+
52
+ ## Layers → canonical owners
53
+
54
+ | Layer | Owner package | What it does |
55
+ |--------------|----------------------|------------------------------------------------|
56
+ | `ingest` | `abstract_webtools` | scrape pages, download video (yt-dlp/ffmpeg) |
57
+ | `ocr` | `abstract_ocr` | layout-aware, multi-engine OCR |
58
+ | `documents` | `abstract_pdfs` | PDF decomposition + manifests + HTML |
59
+ | `video` | `abstract_videos` | registry pipeline: download/frames/transcribe |
60
+ | `transcribe` | `hugpy` (→ `abstract_ocr` fallback) | Whisper speech-to-text |
61
+ | `enrich` | `hugpy` | summaries, keywords, vision captioning, SEO |
62
+ | `persist` | filesystem (DB-pluggable) | typed JSON/JSONB manifests |
63
+ | `publish` | `abstract_react` + `abstract_nginx` | SEO/OG metadata + static HTML |
64
+
65
+ Overlapping capabilities are resolved to **one owner** (Whisper → `hugpy`;
66
+ video download → `webtools`; summarize/keywords → `hugpy`).
67
+
68
+ ## Install
69
+
70
+ `media_intelligence` is *just this `src/` facade* — it contains none of the
71
+ engines. Each layer's owner is its own PyPI package, declared as an **optional
72
+ extra**, so you install only what you use:
73
+
74
+ ```bash
75
+ pip install media_intelligence # zero third-party deps — facade only
76
+ pip install "media_intelligence[ocr,enrich]" # just those layers
77
+ pip install "media_intelligence[all]" # the full platform
78
+ ```
79
+
80
+ The package has **no required third-party dependencies**: importing it is cheap
81
+ (~20 ms) and pulls **none** of the backing packages. Each sibling is imported
82
+ **lazily**, only when its layer is actually called; a missing one raises a clear
83
+ `MissingDependency` naming the extra to install.
84
+
85
+ Check what's usable in the current environment without importing anything:
86
+
87
+ ```python
88
+ import media_intelligence as mi
89
+ mi.available() # {'ingest': True, 'ocr': True, 'publish': False, ...}
90
+ mi.available("enrich") # True / False
91
+ ```
92
+
93
+ ## Usage
94
+
95
+ ### Direct namespace access
96
+
97
+ ```python
98
+ import media_intelligence as mi
99
+
100
+ text = mi.ocr.image_to_text("page.png")
101
+ kw = mi.enrich.keywords(text)
102
+ mi.documents.process_pdf("doc.pdf")
103
+ mi.ingest.download_video("https://site.com/v.mp4", download_directory="/data")
104
+ ```
105
+
106
+ ### Orchestrated pipeline (idempotent + resumable)
107
+
108
+ ```python
109
+ from media_intelligence import MediaPipeline
110
+
111
+ pipe = MediaPipeline("https://site.com/video.mp4", out_root="/data")
112
+ pipe.ingest().extract().structure().enrich().persist().publish()
113
+ print(pipe.report.summary)
114
+ # ... or simply:
115
+ pipe.run()
116
+ ```
117
+
118
+ The pipeline autodetects media kind, dispatches each stage accordingly, skips
119
+ stages already satisfied (idempotent), and rehydrates from a prior manifest on
120
+ re-run (resumable). Results land in `out_root/<media_id>/manifest.json`.
121
+
122
+ ### Persistence (DB-pluggable, two records)
123
+
124
+ Each item is persisted as **two** records so indexing stays cheap while
125
+ aggregation stays simple:
126
+
127
+ - `manifest.json` — lean index: ids, counts, `text_chars`, summary, keywords,
128
+ SEO, asset pointers. (The JSONB metadata row.)
129
+ - `document.json` — canonical content: full `text`, `pages`/segments,
130
+ `transcript`. The single source of truth for search / aggregation / LLM
131
+ datasets — one read per item, no re-stitching of per-owner on-disk files.
132
+
133
+ ```python
134
+ store = mi.persist.FileStore("/data")
135
+ store.save_manifest(item.media_id, manifest) # lean index
136
+ store.save_document(item.media_id, document) # full body
137
+ doc = store.load_document(item.media_id) # aggregation reads this
138
+
139
+ # later, identical interface, JSONB backend:
140
+ # store = mi.persist.PgStore(dsn=...) # planned (abstract_database)
141
+ # -> metadata in JSONB, body text in a full-text-indexed column
142
+ ```
143
+
144
+ `MediaPipeline.persist()` writes both. On re-run, the body is rehydrated from
145
+ `document.json`, so the `extract` and `enrich` stages are skipped (no re-OCR / re-transcribe).
@@ -0,0 +1,112 @@
1
+ # media_intelligence — Abstract Intelligence Platform
2
+
3
+ A unified, layered facade that turns raw media — **PDFs, images, and video** —
4
+ into **structured, searchable, SEO-ready data**. It does not reimplement any
5
+ engine: it selects the *best* function of each sibling package and exposes it
6
+ behind one clean, lazy API, plus an orchestrated pipeline.
7
+
8
+ ```text
9
+ Raw Media (PDF / Image / Video / URL)
10
+         │
11
+         ▼
12
+ ingest   →   extract   →   structure   →   enrich   →   persist   →   publish
13
+ (webtools)   (ocr/         (typed          (hugpy)      (FS / DB)     (react/
14
+               pdfs/         metadata)                                  nginx)
15
+               videos)
16
+ ```
17
+
18
+ ## Layers → canonical owners
19
+
20
+ | Layer | Owner package | What it does |
21
+ |--------------|----------------------|------------------------------------------------|
22
+ | `ingest` | `abstract_webtools` | scrape pages, download video (yt-dlp/ffmpeg) |
23
+ | `ocr` | `abstract_ocr` | layout-aware, multi-engine OCR |
24
+ | `documents` | `abstract_pdfs` | PDF decomposition + manifests + HTML |
25
+ | `video` | `abstract_videos` | registry pipeline: download/frames/transcribe |
26
+ | `transcribe` | `hugpy` (→ `abstract_ocr` fallback) | Whisper speech-to-text |
27
+ | `enrich` | `hugpy` | summaries, keywords, vision captioning, SEO |
28
+ | `persist` | filesystem (DB-pluggable) | typed JSON/JSONB manifests |
29
+ | `publish` | `abstract_react` + `abstract_nginx` | SEO/OG metadata + static HTML |
30
+
31
+ Overlapping capabilities are resolved to **one owner** (Whisper → `hugpy`;
32
+ video download → `webtools`; summarize/keywords → `hugpy`).
33
+
34
+ ## Install
35
+
36
+ `media_intelligence` is *just this `src/` facade* — it contains none of the
37
+ engines. Each layer's owner is its own PyPI package, declared as an **optional
38
+ extra**, so you install only what you use:
39
+
40
+ ```bash
41
+ pip install media_intelligence # zero third-party deps — facade only
42
+ pip install "media_intelligence[ocr,enrich]" # just those layers
43
+ pip install "media_intelligence[all]" # the full platform
44
+ ```
45
+
46
+ The package has **no required third-party dependencies**: importing it is cheap
47
+ (~20 ms) and pulls **none** of the backing packages. Each sibling is imported
48
+ **lazily**, only when its layer is actually called; a missing one raises a clear
49
+ `MissingDependency` naming the extra to install.
50
+
51
+ Check what's usable in the current environment without importing anything:
52
+
53
+ ```python
54
+ import media_intelligence as mi
55
+ mi.available() # {'ingest': True, 'ocr': True, 'publish': False, ...}
56
+ mi.available("enrich") # True / False
57
+ ```
58
+
59
+ ## Usage
60
+
61
+ ### Direct namespace access
62
+
63
+ ```python
64
+ import media_intelligence as mi
65
+
66
+ text = mi.ocr.image_to_text("page.png")
67
+ kw = mi.enrich.keywords(text)
68
+ mi.documents.process_pdf("doc.pdf")
69
+ mi.ingest.download_video("https://site.com/v.mp4", download_directory="/data")
70
+ ```
71
+
72
+ ### Orchestrated pipeline (idempotent + resumable)
73
+
74
+ ```python
75
+ from media_intelligence import MediaPipeline
76
+
77
+ pipe = MediaPipeline("https://site.com/video.mp4", out_root="/data")
78
+ pipe.ingest().extract().structure().enrich().persist().publish()
79
+ print(pipe.report.summary)
80
+ # ... or simply:
81
+ pipe.run()
82
+ ```
83
+
84
+ The pipeline autodetects media kind, dispatches each stage accordingly, skips
85
+ stages already satisfied (idempotent), and rehydrates from a prior manifest on
86
+ re-run (resumable). Results land in `out_root/<media_id>/manifest.json`.
87
+
88
+ ### Persistence (DB-pluggable, two records)
89
+
90
+ Each item is persisted as **two** records so indexing stays cheap while
91
+ aggregation stays simple:
92
+
93
+ - `manifest.json` — lean index: ids, counts, `text_chars`, summary, keywords,
94
+ SEO, asset pointers. (The JSONB metadata row.)
95
+ - `document.json` — canonical content: full `text`, `pages`/segments,
96
+ `transcript`. The single source of truth for search / aggregation / LLM
97
+ datasets — one read per item, no re-stitching of per-owner on-disk files.
98
+
99
+ ```python
100
+ store = mi.persist.FileStore("/data")
101
+ store.save_manifest(item.media_id, manifest) # lean index
102
+ store.save_document(item.media_id, document) # full body
103
+ doc = store.load_document(item.media_id) # aggregation reads this
104
+
105
+ # later, identical interface, JSONB backend:
106
+ # store = mi.persist.PgStore(dsn=...) # planned (abstract_database)
107
+ # -> metadata in JSONB, body text in a full-text-indexed column
108
+ ```
109
+
110
+ `MediaPipeline.persist()` writes both. On re-run, the body is rehydrated from
111
+ `document.json`, so the `extract` and `enrich` stages are skipped (no re-OCR / re-transcribe).
@@ -0,0 +1,46 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "media_intelligence"
7
+ version = "0.1.0"
8
+ description = "Abstract Intelligence Platform — a unified, layered pipeline that turns raw media (PDFs, images, video) into structured, searchable, SEO-ready data."
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ authors = [{ name = "AbstractEndeavors" }]
12
+ keywords = ["ocr", "pdf", "video", "transcription", "summarization", "seo", "media", "pipeline"]
13
+
14
+ # The facade is a thin, pure-stdlib access layer with ZERO required third-party
15
+ # deps: importing it pulls nothing. Each backing package is its own PyPI project,
16
+ # imported lazily only when that layer is used. Install just the layers you need
17
+ # via the extras below (or `[all]`). `abstract_essentials` is used opportunistically
18
+ # by the filesystem store but has a stdlib fallback, so it's optional too.
19
+ dependencies = []
20
+
21
+ [project.optional-dependencies]
22
+ # Each layer maps to one canonical owning package. Install only what you use.
23
+ core = ["abstract_essentials"] # nicer atomic JSON I/O for FileStore (optional)
24
+ ingest = ["abstract_webtools"]
25
+ ocr = ["abstract_ocr"]
26
+ documents = ["abstract_pdfs"]
27
+ video = ["abstract_videos"]
28
+ transcribe= ["hugpy"] # canonical ASR (falls back to abstract_ocr if absent)
29
+ enrich = ["hugpy"] # canonical ML/NLP namespace (latest iteration)
30
+ publish = ["abstract_react", "abstract_nginx"]
31
+ # Everything for the full end-to-end platform.
32
+ all = [
33
+ "abstract_webtools",
34
+ "abstract_ocr",
35
+ "abstract_pdfs",
36
+ "abstract_videos",
37
+ "hugpy",
38
+ "abstract_react",
39
+ "abstract_nginx",
40
+ ]
41
+
42
+ [tool.setuptools.packages.find]
43
+ where = ["src"]
44
+
45
+ [tool.setuptools.package-data]
46
+ media_intelligence = ["py.typed"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,158 @@
1
+ """media_intelligence — the Abstract Intelligence Platform facade.
2
+
3
+ A unified, layered access layer that turns raw media (PDFs, images, video) into
4
+ structured, searchable, SEO-ready data. It does not reimplement any engine; it
5
+ selects the *best* function of each sibling package and exposes it behind one
6
+ clean, lazy API.
7
+
8
+ Two ways to use it
9
+ ------------------
10
+
11
+ 1. Direct namespace access — grab one tool::
12
+
13
+ import media_intelligence as mi
14
+ text = mi.ocr.image_to_text("page.png")
15
+ kw = mi.enrich.keywords(text)
16
+ mi.documents.process_pdf("doc.pdf")
17
+
18
+ 2. The orchestrated, idempotent/resumable pipeline::
19
+
20
+ from media_intelligence import MediaPipeline
21
+ pipe = MediaPipeline("https://site.com/video.mp4", out_root="/data")
22
+ pipe.ingest().extract().structure().enrich().persist().publish()
23
+ print(pipe.report.summary)
24
+ # ... or just: pipe.run()
25
+
26
+ Layers map one-to-one onto canonical owning packages:
27
+
28
+ ingest -> abstract_webtools (scrape + yt-dlp video download)
29
+ ocr -> abstract_ocr (layout-aware multi-engine OCR)
30
+ documents -> abstract_pdfs (PDF decomposition + HTML)
31
+ video -> abstract_videos (registry pipeline: download/frames/SEO)
32
+ transcribe-> hugpy (Whisper ASR; abstract_ocr fallback)
33
+ enrich -> hugpy (summaries, keywords, vision, SEO)
34
+ persist -> filesystem now, DB-pluggable interface
35
+ publish -> abstract_react + abstract_nginx (SEO/OG + static HTML)
36
+
37
+ Every backing package is imported lazily, so ``import media_intelligence`` is
38
+ cheap and a missing optional package only errors when that layer is used.
39
+ """
40
+ from __future__ import annotations
41
+
42
+ import importlib
43
+ import importlib.util
44
+ from typing import TYPE_CHECKING
45
+
46
+ from ._lazy import MediaIntelligenceError, MissingDependency
47
+ from .schemas import (
48
+ MediaItem,
49
+ MediaKind,
50
+ PipelineReport,
51
+ Stage,
52
+ StageResult,
53
+ detect_media_kind,
54
+ )
55
+
56
# Package version string; pyproject.toml declares the same value.
__version__ = "0.1.0"

# Submodules exposed as lazy namespaces via module __getattr__ below.
# Membership here is what makes ``media_intelligence.<name>`` importable on
# first attribute access, and each name is also appended to ``__all__``.
_LAZY_SUBMODULES = {
    "ingest",
    "ocr",
    "documents",
    "video",
    "transcribe",
    "enrich",
    "structure",
    "persist",
    "publish",
}

# Which backing package(s) each layer needs. ``persist`` is pure-stdlib (always
# available); ``transcribe`` works if either hugpy or abstract_ocr is present.
# An empty tuple means "no backing package"; a tuple with several entries is
# treated by ``available()`` as any-of, not all-of.
_LAYER_PACKAGES = {
    "ingest": ("abstract_webtools",),
    "ocr": ("abstract_ocr",),
    "documents": ("abstract_pdfs",),
    "video": ("abstract_videos",),
    "transcribe": ("hugpy", "abstract_ocr"),  # any-of
    "enrich": ("hugpy",),
    "structure": (),
    "persist": (),
    "publish": ("abstract_react",),  # nginx HTML is an additional option
}

# Public names. ``MediaPipeline`` and the lazy submodule names are not bound at
# import time; they are resolved on demand by the module-level ``__getattr__``.
__all__ = [
    "MediaPipeline",
    "MediaItem",
    "MediaKind",
    "Stage",
    "StageResult",
    "PipelineReport",
    "detect_media_kind",
    "available",
    "MediaIntelligenceError",
    "MissingDependency",
    "__version__",
    *sorted(_LAZY_SUBMODULES),
]
99
+
100
+
101
def _installed(package: str) -> bool:
    """Tell whether the import system can locate ``package``.

    Only the module spec is looked up; ``package`` itself is not imported.
    A lookup that raises ``ImportError`` or ``ValueError`` is reported as
    "not installed" rather than propagated.
    """
    try:
        spec = importlib.util.find_spec(package)
    except (ImportError, ValueError):
        return False
    return spec is not None
107
+
108
+
109
def available(layer: str | None = None):
    """Report layer availability without importing any backing package.

    Called with no argument, returns a ``{layer_name: bool}`` mapping covering
    every known layer. Called with a layer name, returns a single ``bool``::

        import media_intelligence as mi
        mi.available()          # {'ingest': True, 'ocr': True, 'publish': False, ...}
        mi.available("enrich")  # True / False

    A layer counts as available when at least one of its backing packages is
    installed; layers with no backing package (``structure``, ``persist``) are
    always available.

    Raises:
        ValueError: if ``layer`` is not a known layer name.
    """
    def _usable(packages: tuple) -> bool:
        # No backing package at all means a pure-stdlib layer.
        if not packages:
            return True
        return any(_installed(pkg) for pkg in packages)

    if layer is None:
        return {key: _usable(needed) for key, needed in _LAYER_PACKAGES.items()}
    if layer not in _LAYER_PACKAGES:
        raise ValueError(f"unknown layer {layer!r}; choose from {sorted(_LAYER_PACKAGES)}")
    return _usable(_LAYER_PACKAGES[layer])
127
+
128
# Static-analysis-only imports. ``TYPE_CHECKING`` is False at runtime, so none
# of these submodules is loaded here; at runtime the same names are resolved on
# demand by the module-level ``__getattr__``.
if TYPE_CHECKING:  # for type checkers / IDEs only — no runtime import cost
    from . import (  # noqa: F401
        documents,
        enrich,
        ingest,
        ocr,
        persist,
        publish,
        structure,
        transcribe,
        video,
    )
    from .pipeline import MediaPipeline  # noqa: F401
141
+
142
+
143
def __getattr__(name: str):
    """Resolve lazily-provided module attributes (PEP 562).

    Layer namespaces listed in ``_LAZY_SUBMODULES`` and the ``MediaPipeline``
    class are imported only when first referenced, which keeps
    ``import media_intelligence`` itself light.

    Raises:
        AttributeError: for any other name, matching normal module behaviour.
    """
    if name in _LAZY_SUBMODULES:
        return importlib.import_module(f".{name}", __name__)
    if name != "MediaPipeline":
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
    pipeline = importlib.import_module(".pipeline", __name__)
    return pipeline.MediaPipeline
155
+
156
+
157
def __dir__():
    """List module attributes, including the lazy names declared in ``__all__``."""
    names = set(globals())
    names.update(__all__)
    return sorted(names)
@@ -0,0 +1,100 @@
1
+ """Lazy / soft import plumbing for the media_intelligence facade.
2
+
3
+ The whole point of this package is to be a *thin* unified access layer over a
4
+ set of heavy sibling packages (paddleocr, torch, yt-dlp, whisper, ...). Importing
5
+ ``media_intelligence`` must stay cheap, so every sibling package is imported
6
+ lazily — at first *use*, never at module import time — and the result is cached.
7
+
8
+ If an optional layer's backing package is not installed, we raise a single,
9
+ actionable :class:`MissingDependency` error that names the extra to install
10
+ rather than leaking a raw ``ModuleNotFoundError`` from deep inside a submodule.
11
+ """
12
+ from __future__ import annotations
13
+
14
+ import functools
15
+ import importlib
16
+ from types import ModuleType
17
+ from typing import Any, Callable
18
+
19
# Public surface of this module: the two error types plus the lazy-import
# helpers used by the layer modules.
__all__ = [
    "MediaIntelligenceError",
    "MissingDependency",
    "soft_import",
    "require",
    "lazy_namespace",
]
26
+
27
+
28
class MediaIntelligenceError(RuntimeError):
    """Base error for the media_intelligence facade."""


class MissingDependency(MediaIntelligenceError):
    """A layer was used but its backing package is not installed."""


# Which pip extra installs which backing package — used to build helpful errors.
# The parenthesised "(core)" value is a sentinel: for that package the hint is a
# plain ``pip install <package>`` rather than a ``media_intelligence[...]`` extra.
_EXTRA_FOR_PACKAGE = {
    "abstract_essentials": "(core)",
    "abstract_webtools": "ingest",
    "abstract_ocr": "ocr",
    "abstract_pdfs": "documents",
    "abstract_videos": "video",
    "hugpy": "enrich",
    "abstract_react": "publish",
    "abstract_nginx": "publish",
}

# Process-wide cache of modules already imported through ``soft_import``.
_MODULE_CACHE: dict[str, ModuleType] = {}


def soft_import(package: str, *, layer: str | None = None) -> ModuleType:
    """Import ``package`` lazily, caching the module.

    Args:
        package: Dotted module name to import (top-level package or submodule).
        layer: Facade layer on whose behalf the import happens; only used to
            word the error message. Defaults to ``package`` in the message.

    Returns:
        The imported module. Repeated calls return the cached object.

    Raises:
        MissingDependency: if ``package`` (or one of its parent packages) is
            not installed. The message carries an install hint.
        ModuleNotFoundError: unchanged, if the failure comes from some *other*
            module that the installed package tried to import.
    """
    cached = _MODULE_CACHE.get(package)
    if cached is not None:
        return cached
    try:
        module = importlib.import_module(package)
    except ModuleNotFoundError as exc:
        # Only translate a *missing backing package*; a genuine sub-import error
        # inside an installed package should surface unchanged.
        missing = exc.name
        if not (missing and (missing == package or package.startswith(missing + "."))):
            raise
        # Resolve the install hint against the *top-level* package, so a
        # missing submodule (e.g. "abstract_nginx.generate_htmls") still
        # points at the right extra.
        top = package.split(".", 1)[0]
        extra = _EXTRA_FOR_PACKAGE.get(top)
        if extra is not None and extra != "(core)":
            hint = f'pip install "media_intelligence[{extra}]"'
        else:
            # Unmapped packages have no matching extra; suggesting
            # ``media_intelligence[<package>]`` would name one that does not
            # exist, so point at the package itself.
            hint = f"pip install {top}"
        raise MissingDependency(
            f"The '{layer or package}' layer needs '{package}', which is not "
            f"installed. Install it with: {hint}"
        ) from exc
    _MODULE_CACHE[package] = module
    return module
80
+
81
+
82
def require(package: str, attr: str, *, layer: str | None = None) -> Any:
    """Fetch ``attr`` from ``package`` after soft-importing it.

    Args:
        package: Module name handed to ``soft_import``.
        attr: Attribute to look up on the imported module.
        layer: Facade layer name used in error messages (defaults to
            ``package`` there).

    Raises:
        MediaIntelligenceError: if the module imports but lacks ``attr`` — for
            example after an upstream rename — so the failure points at the
            facade rather than at the caller.
    """
    module = soft_import(package, layer=layer)
    try:
        value = getattr(module, attr)
    except AttributeError as exc:
        raise MediaIntelligenceError(
            f"'{package}.{attr}' is not available — the upstream API may have "
            f"changed. The media_intelligence '{layer or package}' layer needs updating."
        ) from exc
    return value
96
+
97
+
98
def lazy_namespace(loader: Callable[[], ModuleType]) -> Callable[[], ModuleType]:
    """Return a memoised version of ``loader``: it runs once, then the result is reused."""
    memoise = functools.lru_cache(maxsize=1)
    return memoise(loader)
@@ -0,0 +1,119 @@
1
+ """Extraction + structuring layer (documents) — PDFs.
2
+
3
+ Canonical owner: ``abstract_pdfs``. Page-level decomposition (text + images),
4
+ manifest generation, OCR (delegating to ``abstract_ocr``), enrichment, and
5
+ static HTML (viewer + gallery).
6
+ """
7
+ from __future__ import annotations
8
+
9
+ import json
10
+ import os
11
+ from typing import Any, Optional
12
+
13
+ from ._lazy import require, soft_import
14
+
15
# Backing package every wrapper in this module resolves its symbols from, and
# the layer name quoted in error messages raised by the lazy-import helpers.
_PKG = "abstract_pdfs"
_LAYER = "documents"

# Public names of the documents layer.
__all__ = [
    "process_pdf",
    "process_pdfs",
    "process_all_pdfs",
    "generate_pdf",
    "pdf_pages",
    "DocumentPipeline",
    "SliceManager",
]
27
+
28
+
29
def process_pdf(pdf_path: str, **kwargs: Any) -> dict:
    """Run the backing package's single-PDF processor on ``pdf_path``.

    Thin pass-through: ``process_pdf`` is resolved lazily from ``abstract_pdfs``
    and called with ``pdf_path`` and ``kwargs`` unchanged. Upstream describes
    it as processing every page (image→text→info→metadata→html + gallery).
    """
    return require(_PKG, "process_pdf", layer=_LAYER)(pdf_path, **kwargs)
33
+
34
+
35
def process_pdfs(pdf_paths: list[str], **kwargs: Any) -> list:
    """Run the backing package's batch processor over ``pdf_paths``.

    Thin pass-through to ``abstract_pdfs.process_pdfs``, resolved lazily.
    Upstream describes it as two-level parallel (PDFs × pages).
    """
    return require(_PKG, "process_pdfs", layer=_LAYER)(pdf_paths, **kwargs)
39
+
40
+
41
def process_all_pdfs(directory: str, **kwargs: Any):
    """Hand ``directory`` to the backing package's discover-and-process entry point.

    Thin pass-through to ``abstract_pdfs.process_all_pdfs``, resolved lazily;
    upstream is described as processing every ``.pdf`` under ``directory``.
    """
    return require(_PKG, "process_all_pdfs", layer=_LAYER)(directory, **kwargs)
45
+
46
+
47
def generate_pdf(pdf_path: str, **kwargs: Any) -> dict:
    """Run the backing package's one-call PDF generation on ``pdf_path``.

    ``generate_pdf`` is looked up on ``abstract_pdfs.pipeline`` first; if that
    module does not provide it, the top-level ``abstract_pdfs`` package is
    tried instead. Upstream describes the call as slice + OCR + enriched
    manifests + viewer HTML.
    """
    pipeline_mod = soft_import(_PKG + ".pipeline", layer=_LAYER)
    fn = getattr(pipeline_mod, "generate_pdf", None)
    if not fn:
        fn = require(_PKG, "generate_pdf", layer=_LAYER)
    return fn(pdf_path, **kwargs)
52
+
53
+
54
def _resolve_pdf_dir(pdf_path: str) -> Optional[str]:
    """Find the directory holding ``pages/`` for a processed PDF.

    ``process_pdf`` relocates ``<dir>/foo.pdf`` into ``<dir>/foo/foo.pdf`` and
    writes pages under ``<dir>/foo/pages/``. We check the relocated dir first,
    then the original dir, so this works whether or not relocation happened.

    Returns:
        The first candidate directory that contains a ``pages`` sub-directory,
        or ``None`` if neither does.
    """
    p = os.path.abspath(pdf_path)
    parent = os.path.dirname(p)
    stem = os.path.splitext(os.path.basename(p))[0]
    for candidate in (os.path.join(parent, stem), parent):
        if os.path.isdir(os.path.join(candidate, "pages")):
            return candidate
    return None


def _page_sort_key(name: str) -> tuple[int, int, str]:
    """Order numeric page directories by value, then all other names lexically."""
    if name.isdecimal():
        return (0, int(name), name)
    return (1, 0, name)


def pdf_pages(pdf_path: str) -> tuple[list[dict[str, Any]], Optional[str]]:
    """Read back the per-page OCR'd text/info that ``process_pdf`` wrote to disk.

    Reads the cached ``pages/<name>/text.txt`` + ``info.json`` layout directly —
    no re-OCR, no deep imports into the backing package.

    Args:
        pdf_path: Path of the PDF, either at its original location or after
            relocation into its own directory.

    Returns:
        ``(pages, full_text)``. ``pages`` is a list of
        ``{"index", "page", "text", "info"}`` dicts in page order; ``index`` is
        the numeric value of the directory name when it is numeric, else the
        position in the listing. ``full_text`` is the non-empty page texts
        joined by blank lines, or ``None`` when there is no text.
        ``([], None)`` if the PDF hasn't been processed yet.
    """
    base = _resolve_pdf_dir(pdf_path)
    if base is None:
        return [], None
    pages_dir = os.path.join(base, "pages")
    # Sort numeric names by value rather than lexically: identical for the
    # usual zero-padded layout (0001, 0002, ...), but still correct when names
    # are un-padded or outgrow the padding width ("10" after "2").
    names = sorted(
        (d for d in os.listdir(pages_dir) if os.path.isdir(os.path.join(pages_dir, d))),
        key=_page_sort_key,
    )
    pages: list[dict[str, Any]] = []
    for i, name in enumerate(names):
        pdir = os.path.join(pages_dir, name)
        text_path = os.path.join(pdir, "text.txt")
        info_path = os.path.join(pdir, "info.json")
        text = ""
        if os.path.isfile(text_path):
            with open(text_path, "r", encoding="utf-8", errors="replace") as fh:
                text = fh.read().strip()
        info: dict[str, Any] = {}
        if os.path.isfile(info_path):
            # Best-effort: an unreadable or malformed info.json leaves ``info``
            # empty instead of failing the whole read-back.
            try:
                with open(info_path, "r", encoding="utf-8") as fh:
                    info = json.load(fh)
            except (OSError, ValueError, RecursionError):
                info = {}
        # ``isdecimal`` matches exactly the strings ``int`` can parse as digits;
        # ``isdigit`` also accepts characters such as "²" that ``int`` rejects.
        index = int(name) if name.isdecimal() else i
        pages.append({"index": index, "page": name, "text": text, "info": info})
    full_text = "\n\n".join(p["text"] for p in pages if p["text"]).strip() or None
    return pages, full_text
108
+
109
+
110
def DocumentPipeline(*args: Any, **kwargs: Any):
    """Instantiate the backing package's per-PDF ``DocumentPipeline`` orchestrator.

    The class is resolved lazily from ``abstract_pdfs`` and called with the
    given positional and keyword arguments unchanged.
    """
    pipeline_cls = require(_PKG, "DocumentPipeline", layer=_LAYER)
    instance = pipeline_cls(*args, **kwargs)
    return instance
114
+
115
+
116
def SliceManager(*args: Any, **kwargs: Any):
    """Instantiate the backing package's ``SliceManager``.

    Upstream describes it as slice-aware, multi-engine column OCR. The class is
    resolved lazily from ``abstract_pdfs`` and called with the given arguments
    unchanged.
    """
    manager_cls = require(_PKG, "SliceManager", layer=_LAYER)
    instance = manager_cls(*args, **kwargs)
    return instance