media-intelligence 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- media_intelligence-0.1.0/PKG-INFO +146 -0
- media_intelligence-0.1.0/README.md +112 -0
- media_intelligence-0.1.0/pyproject.toml +46 -0
- media_intelligence-0.1.0/setup.cfg +4 -0
- media_intelligence-0.1.0/src/media_intelligence/__init__.py +158 -0
- media_intelligence-0.1.0/src/media_intelligence/_lazy.py +100 -0
- media_intelligence-0.1.0/src/media_intelligence/documents.py +119 -0
- media_intelligence-0.1.0/src/media_intelligence/enrich.py +94 -0
- media_intelligence-0.1.0/src/media_intelligence/ingest.py +74 -0
- media_intelligence-0.1.0/src/media_intelligence/ocr.py +65 -0
- media_intelligence-0.1.0/src/media_intelligence/persist/__init__.py +16 -0
- media_intelligence-0.1.0/src/media_intelligence/persist/base.py +41 -0
- media_intelligence-0.1.0/src/media_intelligence/persist/filestore.py +86 -0
- media_intelligence-0.1.0/src/media_intelligence/persist/pgstore.py +50 -0
- media_intelligence-0.1.0/src/media_intelligence/pipeline.py +281 -0
- media_intelligence-0.1.0/src/media_intelligence/publish.py +72 -0
- media_intelligence-0.1.0/src/media_intelligence/py.typed +0 -0
- media_intelligence-0.1.0/src/media_intelligence/schemas.py +163 -0
- media_intelligence-0.1.0/src/media_intelligence/structure.py +114 -0
- media_intelligence-0.1.0/src/media_intelligence/transcribe.py +37 -0
- media_intelligence-0.1.0/src/media_intelligence/video.py +59 -0
- media_intelligence-0.1.0/src/media_intelligence.egg-info/PKG-INFO +146 -0
- media_intelligence-0.1.0/src/media_intelligence.egg-info/SOURCES.txt +24 -0
- media_intelligence-0.1.0/src/media_intelligence.egg-info/dependency_links.txt +1 -0
- media_intelligence-0.1.0/src/media_intelligence.egg-info/requires.txt +34 -0
- media_intelligence-0.1.0/src/media_intelligence.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: media_intelligence
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Abstract Intelligence Platform — a unified, layered pipeline that turns raw media (PDFs, images, video) into structured, searchable, SEO-ready data.
|
|
5
|
+
Author: AbstractEndeavors
|
|
6
|
+
Keywords: ocr,pdf,video,transcription,summarization,seo,media,pipeline
|
|
7
|
+
Requires-Python: >=3.10
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
Provides-Extra: core
|
|
10
|
+
Requires-Dist: abstract_essentials; extra == "core"
|
|
11
|
+
Provides-Extra: ingest
|
|
12
|
+
Requires-Dist: abstract_webtools; extra == "ingest"
|
|
13
|
+
Provides-Extra: ocr
|
|
14
|
+
Requires-Dist: abstract_ocr; extra == "ocr"
|
|
15
|
+
Provides-Extra: documents
|
|
16
|
+
Requires-Dist: abstract_pdfs; extra == "documents"
|
|
17
|
+
Provides-Extra: video
|
|
18
|
+
Requires-Dist: abstract_videos; extra == "video"
|
|
19
|
+
Provides-Extra: transcribe
|
|
20
|
+
Requires-Dist: hugpy; extra == "transcribe"
|
|
21
|
+
Provides-Extra: enrich
|
|
22
|
+
Requires-Dist: hugpy; extra == "enrich"
|
|
23
|
+
Provides-Extra: publish
|
|
24
|
+
Requires-Dist: abstract_react; extra == "publish"
|
|
25
|
+
Requires-Dist: abstract_nginx; extra == "publish"
|
|
26
|
+
Provides-Extra: all
|
|
27
|
+
Requires-Dist: abstract_webtools; extra == "all"
|
|
28
|
+
Requires-Dist: abstract_ocr; extra == "all"
|
|
29
|
+
Requires-Dist: abstract_pdfs; extra == "all"
|
|
30
|
+
Requires-Dist: abstract_videos; extra == "all"
|
|
31
|
+
Requires-Dist: hugpy; extra == "all"
|
|
32
|
+
Requires-Dist: abstract_react; extra == "all"
|
|
33
|
+
Requires-Dist: abstract_nginx; extra == "all"
|
|
34
|
+
|
|
35
|
+
# media_intelligence — Abstract Intelligence Platform
|
|
36
|
+
|
|
37
|
+
A unified, layered facade that turns raw media — **PDFs, images, and video** —
|
|
38
|
+
into **structured, searchable, SEO-ready data**. It does not reimplement any
|
|
39
|
+
engine: it selects the *best* function of each sibling package and exposes it
|
|
40
|
+
behind one clean, lazy API, plus an orchestrated pipeline.
|
|
41
|
+
|
|
42
|
+
```text
|
|
43
|
+
Raw Media (PDF / Image / Video / URL)
|
|
44
|
+
│
|
|
45
|
+
▼
|
|
46
|
+
ingest → extract → structure → enrich → persist → publish
|
|
47
|
+
(webtools) (ocr/ (typed (hugpy) (FS / DB) (react/
|
|
48
|
+
pdfs/ metadata) nginx)
|
|
49
|
+
videos)
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
## Layers → canonical owners
|
|
53
|
+
|
|
54
|
+
| Layer | Owner package | What it does |
|
|
55
|
+
|--------------|----------------------|------------------------------------------------|
|
|
56
|
+
| `ingest` | `abstract_webtools` | scrape pages, download video (yt-dlp/ffmpeg) |
|
|
57
|
+
| `ocr` | `abstract_ocr` | layout-aware, multi-engine OCR |
|
|
58
|
+
| `documents` | `abstract_pdfs` | PDF decomposition + manifests + HTML |
|
|
59
|
+
| `video` | `abstract_videos` | registry pipeline: download/frames/transcribe |
|
|
60
|
+
| `transcribe` | `hugpy` (→ `abstract_ocr` fallback) | Whisper speech-to-text |
|
|
61
|
+
| `enrich` | `hugpy` | summaries, keywords, vision captioning, SEO |
|
|
62
|
+
| `persist` | filesystem (DB-pluggable) | typed JSON/JSONB manifests |
|
|
63
|
+
| `publish` | `abstract_react` + `abstract_nginx` | SEO/OG metadata + static HTML |
|
|
64
|
+
|
|
65
|
+
Overlapping capabilities are resolved to **one owner** (Whisper → `hugpy`;
|
|
66
|
+
video download → `webtools`; summarize/keywords → `hugpy`).
|
|
67
|
+
|
|
68
|
+
## Install
|
|
69
|
+
|
|
70
|
+
`media_intelligence` is *just this `src/` facade* — it contains none of the
|
|
71
|
+
engines. Each layer's owner is its own PyPI package, declared as an **optional
|
|
72
|
+
extra**, so you install only what you use:
|
|
73
|
+
|
|
74
|
+
```bash
|
|
75
|
+
pip install media_intelligence # zero third-party deps — facade only
|
|
76
|
+
pip install "media_intelligence[ocr,enrich]" # just those layers
|
|
77
|
+
pip install "media_intelligence[all]" # the full platform
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
The package has **no required third-party dependencies**: importing it is cheap
|
|
81
|
+
(~20 ms) and pulls **none** of the backing packages. Each sibling is imported
|
|
82
|
+
**lazily**, only when its layer is actually called; a missing one raises a clear
|
|
83
|
+
`MissingDependency` naming the extra to install.
|
|
84
|
+
|
|
85
|
+
Check what's usable in the current environment without importing anything:
|
|
86
|
+
|
|
87
|
+
```python
|
|
88
|
+
import media_intelligence as mi
|
|
89
|
+
mi.available() # {'ingest': True, 'ocr': True, 'publish': False, ...}
|
|
90
|
+
mi.available("enrich") # True / False
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
## Usage
|
|
94
|
+
|
|
95
|
+
### Direct namespace access
|
|
96
|
+
|
|
97
|
+
```python
|
|
98
|
+
import media_intelligence as mi
|
|
99
|
+
|
|
100
|
+
text = mi.ocr.image_to_text("page.png")
|
|
101
|
+
kw = mi.enrich.keywords(text)
|
|
102
|
+
mi.documents.process_pdf("doc.pdf")
|
|
103
|
+
mi.ingest.download_video("https://site.com/v.mp4", download_directory="/data")
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
### Orchestrated pipeline (idempotent + resumable)
|
|
107
|
+
|
|
108
|
+
```python
|
|
109
|
+
from media_intelligence import MediaPipeline
|
|
110
|
+
|
|
111
|
+
pipe = MediaPipeline("https://site.com/video.mp4", out_root="/data")
|
|
112
|
+
pipe.ingest().extract().structure().enrich().persist().publish()
|
|
113
|
+
print(pipe.report.summary)
|
|
114
|
+
# ... or simply:
|
|
115
|
+
pipe.run()
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
The pipeline autodetects media kind, dispatches each stage accordingly, skips
|
|
119
|
+
stages already satisfied (idempotent), and rehydrates from a prior manifest on
|
|
120
|
+
re-run (resumable). Results land in `out_root/<media_id>/manifest.json`.
|
|
121
|
+
|
|
122
|
+
### Persistence (DB-pluggable, two records)
|
|
123
|
+
|
|
124
|
+
Each item is persisted as **two** records so indexing stays cheap while
|
|
125
|
+
aggregation stays simple:
|
|
126
|
+
|
|
127
|
+
- `manifest.json` — lean index: ids, counts, `text_chars`, summary, keywords,
|
|
128
|
+
SEO, asset pointers. (The JSONB metadata row.)
|
|
129
|
+
- `document.json` — canonical content: full `text`, `pages`/segments,
|
|
130
|
+
`transcript`. The single source of truth for search / aggregation / LLM
|
|
131
|
+
datasets — one read per item, no re-stitching of per-owner on-disk files.
|
|
132
|
+
|
|
133
|
+
```python
|
|
134
|
+
store = mi.persist.FileStore("/data")
|
|
135
|
+
store.save_manifest(item.media_id, manifest) # lean index
|
|
136
|
+
store.save_document(item.media_id, document) # full body
|
|
137
|
+
doc = store.load_document(item.media_id) # aggregation reads this
|
|
138
|
+
|
|
139
|
+
# later, identical interface, JSONB backend:
|
|
140
|
+
# store = mi.persist.PgStore(dsn=...) # planned (abstract_database)
|
|
141
|
+
# -> metadata in JSONB, body text in a full-text-indexed column
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
`MediaPipeline.persist()` writes both. On re-run, the body is rehydrated from
|
|
145
|
+
`document.json`, so `extract`/`enrich` skip (no re-OCR / re-transcribe).
|
|
146
|
+
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
# media_intelligence — Abstract Intelligence Platform
|
|
2
|
+
|
|
3
|
+
A unified, layered facade that turns raw media — **PDFs, images, and video** —
|
|
4
|
+
into **structured, searchable, SEO-ready data**. It does not reimplement any
|
|
5
|
+
engine: it selects the *best* function of each sibling package and exposes it
|
|
6
|
+
behind one clean, lazy API, plus an orchestrated pipeline.
|
|
7
|
+
|
|
8
|
+
```text
|
|
9
|
+
Raw Media (PDF / Image / Video / URL)
|
|
10
|
+
│
|
|
11
|
+
▼
|
|
12
|
+
ingest → extract → structure → enrich → persist → publish
|
|
13
|
+
(webtools) (ocr/ (typed (hugpy) (FS / DB) (react/
|
|
14
|
+
pdfs/ metadata) nginx)
|
|
15
|
+
videos)
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
## Layers → canonical owners
|
|
19
|
+
|
|
20
|
+
| Layer | Owner package | What it does |
|
|
21
|
+
|--------------|----------------------|------------------------------------------------|
|
|
22
|
+
| `ingest` | `abstract_webtools` | scrape pages, download video (yt-dlp/ffmpeg) |
|
|
23
|
+
| `ocr` | `abstract_ocr` | layout-aware, multi-engine OCR |
|
|
24
|
+
| `documents` | `abstract_pdfs` | PDF decomposition + manifests + HTML |
|
|
25
|
+
| `video` | `abstract_videos` | registry pipeline: download/frames/transcribe |
|
|
26
|
+
| `transcribe` | `hugpy` (→ `abstract_ocr` fallback) | Whisper speech-to-text |
|
|
27
|
+
| `enrich` | `hugpy` | summaries, keywords, vision captioning, SEO |
|
|
28
|
+
| `persist` | filesystem (DB-pluggable) | typed JSON/JSONB manifests |
|
|
29
|
+
| `publish` | `abstract_react` + `abstract_nginx` | SEO/OG metadata + static HTML |
|
|
30
|
+
|
|
31
|
+
Overlapping capabilities are resolved to **one owner** (Whisper → `hugpy`;
|
|
32
|
+
video download → `webtools`; summarize/keywords → `hugpy`).
|
|
33
|
+
|
|
34
|
+
## Install
|
|
35
|
+
|
|
36
|
+
`media_intelligence` is *just this `src/` facade* — it contains none of the
|
|
37
|
+
engines. Each layer's owner is its own PyPI package, declared as an **optional
|
|
38
|
+
extra**, so you install only what you use:
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
pip install media_intelligence # zero third-party deps — facade only
|
|
42
|
+
pip install "media_intelligence[ocr,enrich]" # just those layers
|
|
43
|
+
pip install "media_intelligence[all]" # the full platform
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
The package has **no required third-party dependencies**: importing it is cheap
|
|
47
|
+
(~20 ms) and pulls **none** of the backing packages. Each sibling is imported
|
|
48
|
+
**lazily**, only when its layer is actually called; a missing one raises a clear
|
|
49
|
+
`MissingDependency` naming the extra to install.
|
|
50
|
+
|
|
51
|
+
Check what's usable in the current environment without importing anything:
|
|
52
|
+
|
|
53
|
+
```python
|
|
54
|
+
import media_intelligence as mi
|
|
55
|
+
mi.available() # {'ingest': True, 'ocr': True, 'publish': False, ...}
|
|
56
|
+
mi.available("enrich") # True / False
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
## Usage
|
|
60
|
+
|
|
61
|
+
### Direct namespace access
|
|
62
|
+
|
|
63
|
+
```python
|
|
64
|
+
import media_intelligence as mi
|
|
65
|
+
|
|
66
|
+
text = mi.ocr.image_to_text("page.png")
|
|
67
|
+
kw = mi.enrich.keywords(text)
|
|
68
|
+
mi.documents.process_pdf("doc.pdf")
|
|
69
|
+
mi.ingest.download_video("https://site.com/v.mp4", download_directory="/data")
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
### Orchestrated pipeline (idempotent + resumable)
|
|
73
|
+
|
|
74
|
+
```python
|
|
75
|
+
from media_intelligence import MediaPipeline
|
|
76
|
+
|
|
77
|
+
pipe = MediaPipeline("https://site.com/video.mp4", out_root="/data")
|
|
78
|
+
pipe.ingest().extract().structure().enrich().persist().publish()
|
|
79
|
+
print(pipe.report.summary)
|
|
80
|
+
# ... or simply:
|
|
81
|
+
pipe.run()
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
The pipeline autodetects media kind, dispatches each stage accordingly, skips
|
|
85
|
+
stages already satisfied (idempotent), and rehydrates from a prior manifest on
|
|
86
|
+
re-run (resumable). Results land in `out_root/<media_id>/manifest.json`.
|
|
87
|
+
|
|
88
|
+
### Persistence (DB-pluggable, two records)
|
|
89
|
+
|
|
90
|
+
Each item is persisted as **two** records so indexing stays cheap while
|
|
91
|
+
aggregation stays simple:
|
|
92
|
+
|
|
93
|
+
- `manifest.json` — lean index: ids, counts, `text_chars`, summary, keywords,
|
|
94
|
+
SEO, asset pointers. (The JSONB metadata row.)
|
|
95
|
+
- `document.json` — canonical content: full `text`, `pages`/segments,
|
|
96
|
+
`transcript`. The single source of truth for search / aggregation / LLM
|
|
97
|
+
datasets — one read per item, no re-stitching of per-owner on-disk files.
|
|
98
|
+
|
|
99
|
+
```python
|
|
100
|
+
store = mi.persist.FileStore("/data")
|
|
101
|
+
store.save_manifest(item.media_id, manifest) # lean index
|
|
102
|
+
store.save_document(item.media_id, document) # full body
|
|
103
|
+
doc = store.load_document(item.media_id) # aggregation reads this
|
|
104
|
+
|
|
105
|
+
# later, identical interface, JSONB backend:
|
|
106
|
+
# store = mi.persist.PgStore(dsn=...) # planned (abstract_database)
|
|
107
|
+
# -> metadata in JSONB, body text in a full-text-indexed column
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
`MediaPipeline.persist()` writes both. On re-run, the body is rehydrated from
|
|
111
|
+
`document.json`, so `extract`/`enrich` skip (no re-OCR / re-transcribe).
|
|
112
|
+
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "media_intelligence"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Abstract Intelligence Platform — a unified, layered pipeline that turns raw media (PDFs, images, video) into structured, searchable, SEO-ready data."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
authors = [{ name = "AbstractEndeavors" }]
|
|
12
|
+
keywords = ["ocr", "pdf", "video", "transcription", "summarization", "seo", "media", "pipeline"]
|
|
13
|
+
|
|
14
|
+
# The facade is a thin, pure-stdlib access layer with ZERO required third-party
|
|
15
|
+
# deps: importing it pulls nothing. Each backing package is its own PyPI project,
|
|
16
|
+
# imported lazily only when that layer is used. Install just the layers you need
|
|
17
|
+
# via the extras below (or `[all]`). `abstract_essentials` is used opportunistically
|
|
18
|
+
# by the filesystem store but has a stdlib fallback, so it's optional too.
|
|
19
|
+
dependencies = []
|
|
20
|
+
|
|
21
|
+
[project.optional-dependencies]
|
|
22
|
+
# Each layer maps to one canonical owning package. Install only what you use.
|
|
23
|
+
core = ["abstract_essentials"] # nicer atomic JSON I/O for FileStore (optional)
|
|
24
|
+
ingest = ["abstract_webtools"]
|
|
25
|
+
ocr = ["abstract_ocr"]
|
|
26
|
+
documents = ["abstract_pdfs"]
|
|
27
|
+
video = ["abstract_videos"]
|
|
28
|
+
transcribe= ["hugpy"] # canonical ASR (falls back to abstract_ocr if absent)
|
|
29
|
+
enrich = ["hugpy"] # canonical ML/NLP namespace (latest iteration)
|
|
30
|
+
publish = ["abstract_react", "abstract_nginx"]
|
|
31
|
+
# Everything for the full end-to-end platform.
|
|
32
|
+
all = [
|
|
33
|
+
"abstract_webtools",
|
|
34
|
+
"abstract_ocr",
|
|
35
|
+
"abstract_pdfs",
|
|
36
|
+
"abstract_videos",
|
|
37
|
+
"hugpy",
|
|
38
|
+
"abstract_react",
|
|
39
|
+
"abstract_nginx",
|
|
40
|
+
]
|
|
41
|
+
|
|
42
|
+
[tool.setuptools.packages.find]
|
|
43
|
+
where = ["src"]
|
|
44
|
+
|
|
45
|
+
[tool.setuptools.package-data]
|
|
46
|
+
media_intelligence = ["py.typed"]
|
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
"""media_intelligence — the Abstract Intelligence Platform facade.
|
|
2
|
+
|
|
3
|
+
A unified, layered access layer that turns raw media (PDFs, images, video) into
|
|
4
|
+
structured, searchable, SEO-ready data. It does not reimplement any engine; it
|
|
5
|
+
selects the *best* function of each sibling package and exposes it behind one
|
|
6
|
+
clean, lazy API.
|
|
7
|
+
|
|
8
|
+
Two ways to use it
|
|
9
|
+
------------------
|
|
10
|
+
|
|
11
|
+
1. Direct namespace access — grab one tool::
|
|
12
|
+
|
|
13
|
+
import media_intelligence as mi
|
|
14
|
+
text = mi.ocr.image_to_text("page.png")
|
|
15
|
+
kw = mi.enrich.keywords(text)
|
|
16
|
+
mi.documents.process_pdf("doc.pdf")
|
|
17
|
+
|
|
18
|
+
2. The orchestrated, idempotent/resumable pipeline::
|
|
19
|
+
|
|
20
|
+
from media_intelligence import MediaPipeline
|
|
21
|
+
pipe = MediaPipeline("https://site.com/video.mp4", out_root="/data")
|
|
22
|
+
pipe.ingest().extract().structure().enrich().persist().publish()
|
|
23
|
+
print(pipe.report.summary)
|
|
24
|
+
# ... or just: pipe.run()
|
|
25
|
+
|
|
26
|
+
Layers map one-to-one onto canonical owning packages:
|
|
27
|
+
|
|
28
|
+
ingest -> abstract_webtools (scrape + yt-dlp video download)
|
|
29
|
+
ocr -> abstract_ocr (layout-aware multi-engine OCR)
|
|
30
|
+
documents -> abstract_pdfs (PDF decomposition + HTML)
|
|
31
|
+
video -> abstract_videos (registry pipeline: download/frames/SEO)
|
|
32
|
+
transcribe-> hugpy (Whisper ASR; abstract_ocr fallback)
|
|
33
|
+
enrich -> hugpy (summaries, keywords, vision, SEO)
|
|
34
|
+
persist -> filesystem now, DB-pluggable interface
|
|
35
|
+
publish -> abstract_react + abstract_nginx (SEO/OG + static HTML)
|
|
36
|
+
|
|
37
|
+
Every backing package is imported lazily, so ``import media_intelligence`` is
|
|
38
|
+
cheap and a missing optional package only errors when that layer is used.
|
|
39
|
+
"""
|
|
40
|
+
from __future__ import annotations
|
|
41
|
+
|
|
42
|
+
import importlib
|
|
43
|
+
import importlib.util
|
|
44
|
+
from typing import TYPE_CHECKING
|
|
45
|
+
|
|
46
|
+
from ._lazy import MediaIntelligenceError, MissingDependency
|
|
47
|
+
from .schemas import (
|
|
48
|
+
MediaItem,
|
|
49
|
+
MediaKind,
|
|
50
|
+
PipelineReport,
|
|
51
|
+
Stage,
|
|
52
|
+
StageResult,
|
|
53
|
+
detect_media_kind,
|
|
54
|
+
)
|
|
55
|
+
|
|
56
|
+
# Package version string, re-exported through ``__all__``.
__version__ = "0.1.0"

# Submodules exposed as lazy namespaces via module __getattr__ below: the
# first reference to ``media_intelligence.<name>`` imports ``.<name>``.
_LAZY_SUBMODULES = {
    "ingest",
    "ocr",
    "documents",
    "video",
    "transcribe",
    "enrich",
    "structure",
    "persist",
    "publish",
}

# Which backing package(s) each layer needs, as consumed by ``available()``:
#   * an empty tuple means the layer has no backing package and is always
#     reported as available (``structure``, ``persist``);
#   * a tuple with several names is treated as any-of — the layer is reported
#     available when at least one of them can be found (``transcribe`` works
#     if either hugpy or abstract_ocr is present).
_LAYER_PACKAGES = {
    "ingest": ("abstract_webtools",),
    "ocr": ("abstract_ocr",),
    "documents": ("abstract_pdfs",),
    "video": ("abstract_videos",),
    "transcribe": ("hugpy", "abstract_ocr"),  # any-of
    "enrich": ("hugpy",),
    "structure": (),
    "persist": (),
    "publish": ("abstract_react",),  # nginx HTML is an additional option
}
|
|
84
|
+
|
|
85
|
+
__all__ = [
|
|
86
|
+
"MediaPipeline",
|
|
87
|
+
"MediaItem",
|
|
88
|
+
"MediaKind",
|
|
89
|
+
"Stage",
|
|
90
|
+
"StageResult",
|
|
91
|
+
"PipelineReport",
|
|
92
|
+
"detect_media_kind",
|
|
93
|
+
"available",
|
|
94
|
+
"MediaIntelligenceError",
|
|
95
|
+
"MissingDependency",
|
|
96
|
+
"__version__",
|
|
97
|
+
*sorted(_LAZY_SUBMODULES),
|
|
98
|
+
]
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def _installed(package: str) -> bool:
|
|
102
|
+
"""Whether ``package`` is importable — without importing it."""
|
|
103
|
+
try:
|
|
104
|
+
return importlib.util.find_spec(package) is not None
|
|
105
|
+
except (ImportError, ValueError):
|
|
106
|
+
return False
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def available(layer: str | None = None):
    """Report which layers are usable here, without importing their backends.

    Example::

        import media_intelligence as mi
        mi.available()           # {'ingest': True, 'ocr': True, 'publish': False, ...}
        mi.available("enrich")   # True / False

    Args:
        layer: A key of ``_LAYER_PACKAGES``, or ``None`` to query every layer.

    Returns:
        A ``bool`` for a single layer, or a ``{layer: bool}`` dict when
        ``layer`` is ``None``. A layer counts as available when any of its
        backing packages is installed; layers with no backing package
        (``structure``, ``persist``) are always available.

    Raises:
        ValueError: If ``layer`` is not a known layer name.
    """
    def _usable(packages: tuple) -> bool:
        # An empty requirement tuple marks a layer with no backing package.
        if not packages:
            return True
        return any(_installed(pkg) for pkg in packages)

    if layer is None:
        return {name: _usable(pkgs) for name, pkgs in _LAYER_PACKAGES.items()}
    if layer not in _LAYER_PACKAGES:
        raise ValueError(f"unknown layer {layer!r}; choose from {sorted(_LAYER_PACKAGES)}")
    return _usable(_LAYER_PACKAGES[layer])
|
|
127
|
+
|
|
128
|
+
if TYPE_CHECKING: # for type checkers / IDEs only — no runtime import cost
|
|
129
|
+
from . import ( # noqa: F401
|
|
130
|
+
documents,
|
|
131
|
+
enrich,
|
|
132
|
+
ingest,
|
|
133
|
+
ocr,
|
|
134
|
+
persist,
|
|
135
|
+
publish,
|
|
136
|
+
structure,
|
|
137
|
+
transcribe,
|
|
138
|
+
video,
|
|
139
|
+
)
|
|
140
|
+
from .pipeline import MediaPipeline # noqa: F401
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def __getattr__(name: str):
    """Resolve lazily-provided module attributes (PEP 562).

    Layer namespaces listed in ``_LAZY_SUBMODULES`` and the ``MediaPipeline``
    class are imported only when first referenced, so importing the package
    itself does not import them.

    Raises:
        AttributeError: For any other name.
    """
    if name in _LAZY_SUBMODULES:
        return importlib.import_module(f".{name}", __name__)
    if name != "MediaPipeline":
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
    pipeline_module = importlib.import_module(".pipeline", __name__)
    return pipeline_module.MediaPipeline
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def __dir__():
    """List the public API names together with the names already in globals."""
    names = {*__all__, *globals()}
    return sorted(names)
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
"""Lazy / soft import plumbing for the media_intelligence facade.
|
|
2
|
+
|
|
3
|
+
The whole point of this package is to be a *thin* unified access layer over a
|
|
4
|
+
set of heavy sibling packages (paddleocr, torch, yt-dlp, whisper, ...). Importing
|
|
5
|
+
``media_intelligence`` must stay cheap, so every sibling package is imported
|
|
6
|
+
lazily — at first *use*, never at module import time — and the result is cached.
|
|
7
|
+
|
|
8
|
+
If an optional layer's backing package is not installed, we raise a single,
|
|
9
|
+
actionable :class:`MissingDependency` error that names the extra to install
|
|
10
|
+
rather than leaking a raw ``ModuleNotFoundError`` from deep inside a submodule.
|
|
11
|
+
"""
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import functools
|
|
15
|
+
import importlib
|
|
16
|
+
from types import ModuleType
|
|
17
|
+
from typing import Any, Callable
|
|
18
|
+
|
|
19
|
+
__all__ = [
|
|
20
|
+
"MediaIntelligenceError",
|
|
21
|
+
"MissingDependency",
|
|
22
|
+
"soft_import",
|
|
23
|
+
"require",
|
|
24
|
+
"lazy_namespace",
|
|
25
|
+
]
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class MediaIntelligenceError(RuntimeError):
    """Root of the exception hierarchy raised by the media_intelligence facade."""
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class MissingDependency(MediaIntelligenceError):
    """Raised when a layer is used while its backing package is not installed."""
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
# Which pip extra installs which backing package — used to build helpful errors.
# Keys are top-level package names; values are the extra named in the
# ``pip install "media_intelligence[<extra>]"`` hint produced by ``soft_import``.
# The literal "(core)" is special-cased there: it makes the hint fall back to a
# plain ``pip install <package>`` instead of naming an extra.
_EXTRA_FOR_PACKAGE = {
    "abstract_essentials": "(core)",
    "abstract_webtools": "ingest",
    "abstract_ocr": "ocr",
    "abstract_pdfs": "documents",
    "abstract_videos": "video",
    "hugpy": "enrich",
    "abstract_react": "publish",
    "abstract_nginx": "publish",
}

# Modules already imported by ``soft_import``, keyed by the exact dotted name
# that was requested.
_MODULE_CACHE: dict[str, ModuleType] = {}
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def soft_import(package: str, *, layer: str | None = None) -> ModuleType:
    """Import ``package`` on demand and remember the resulting module.

    Args:
        package: Dotted module name to import.
        layer: Facade layer requesting the import; only used in the error text
            (the package name is used when omitted).

    Returns:
        The imported module, served from ``_MODULE_CACHE`` on later calls.

    Raises:
        MissingDependency: If ``package`` itself (or one of its parent
            packages) cannot be found. The message carries an install hint.
        ModuleNotFoundError: Re-raised untouched when the missing module is
            some *other* module, i.e. a failing import inside an installed
            package.
    """
    if (module := _MODULE_CACHE.get(package)) is not None:
        return module

    try:
        module = importlib.import_module(package)
    except ModuleNotFoundError as exc:
        missing = exc.name
        # Translate only when the module that failed to resolve is the one we
        # asked for, or one of its parents; anything else surfaces unchanged.
        if not (missing and (missing == package or package.startswith(missing + "."))):
            raise
        # The hint is keyed on the top-level package so that a missing
        # submodule (e.g. "abstract_nginx.generate_htmls") maps to its extra.
        top = package.split(".", 1)[0]
        extra = _EXTRA_FOR_PACKAGE.get(top, top)
        if extra and extra != "(core)":
            hint = f'pip install "media_intelligence[{extra}]"'
        else:
            hint = f"pip install {top}"
        raise MissingDependency(
            f"The '{layer or package}' layer needs '{package}', which is not "
            f"installed. Install it with: {hint}"
        ) from exc

    _MODULE_CACHE[package] = module
    return module
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def require(package: str, attr: str, *, layer: str | None = None) -> Any:
    """Fetch attribute ``attr`` from the soft-imported module ``package``.

    Args:
        package: Dotted module name, imported through ``soft_import``.
        attr: Name of the attribute to read from that module.
        layer: Facade layer on whose behalf the lookup is made; used in error
            messages (falls back to ``package``).

    Returns:
        Whatever object ``package`` exposes under ``attr``.

    Raises:
        MediaIntelligenceError: If the module imported fine but has no such
            attribute (for instance after an upstream rename).
    """
    module = soft_import(package, layer=layer)
    try:
        value = getattr(module, attr)
    except AttributeError as exc:
        raise MediaIntelligenceError(
            f"'{package}.{attr}' is not available — the upstream API may have "
            f"changed. The media_intelligence '{layer or package}' layer needs updating."
        ) from exc
    return value
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def lazy_namespace(loader: Callable[[], ModuleType]) -> Callable[[], ModuleType]:
    """Return a memoised version of ``loader`` so it only runs once.

    The wrapper is an ``functools.lru_cache(maxsize=1)`` around ``loader``.
    """
    memoise = functools.lru_cache(maxsize=1)
    return memoise(loader)
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
"""Extraction + structuring layer (documents) — PDFs.
|
|
2
|
+
|
|
3
|
+
Canonical owner: ``abstract_pdfs``. Page-level decomposition (text + images),
|
|
4
|
+
manifest generation, OCR (delegating to ``abstract_ocr``), enrichment, and
|
|
5
|
+
static HTML (viewer + gallery).
|
|
6
|
+
"""
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import json
|
|
10
|
+
import os
|
|
11
|
+
from typing import Any, Optional
|
|
12
|
+
|
|
13
|
+
from ._lazy import require, soft_import
|
|
14
|
+
|
|
15
|
+
# Backing package every wrapper in this module resolves its callables from.
_PKG = "abstract_pdfs"
# Layer name handed to ``require`` / ``soft_import`` for their error messages.
_LAYER = "documents"
|
|
17
|
+
|
|
18
|
+
__all__ = [
|
|
19
|
+
"process_pdf",
|
|
20
|
+
"process_pdfs",
|
|
21
|
+
"process_all_pdfs",
|
|
22
|
+
"generate_pdf",
|
|
23
|
+
"pdf_pages",
|
|
24
|
+
"DocumentPipeline",
|
|
25
|
+
"SliceManager",
|
|
26
|
+
]
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def process_pdf(pdf_path: str, **kwargs: Any) -> dict:
    """Process a single PDF by delegating to ``abstract_pdfs.process_pdf``.

    ``pdf_path`` and every keyword argument are forwarded unchanged, and the
    upstream result is returned as-is. The upstream callable is resolved
    lazily on each call.
    """
    return require(_PKG, "process_pdf", layer=_LAYER)(pdf_path, **kwargs)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def process_pdfs(pdf_paths: list[str], **kwargs: Any) -> list:
    """Process several PDFs by delegating to ``abstract_pdfs.process_pdfs``.

    ``pdf_paths`` and every keyword argument are forwarded unchanged, and the
    upstream result is returned as-is.
    """
    return require(_PKG, "process_pdfs", layer=_LAYER)(pdf_paths, **kwargs)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def process_all_pdfs(directory: str, **kwargs: Any):
    """Delegate to ``abstract_pdfs.process_all_pdfs`` for ``directory``.

    ``directory`` and every keyword argument are forwarded unchanged, and the
    upstream result is returned as-is.
    """
    return require(_PKG, "process_all_pdfs", layer=_LAYER)(directory, **kwargs)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def generate_pdf(pdf_path: str, **kwargs: Any) -> dict:
    """Run the upstream ``generate_pdf`` entry point on ``pdf_path``.

    The callable is looked up on ``abstract_pdfs.pipeline`` first; when that
    module has no (truthy) ``generate_pdf`` attribute, the top-level
    ``abstract_pdfs.generate_pdf`` is used instead.
    """
    # NOTE(review): if the ``pipeline`` submodule itself is missing,
    # ``soft_import`` raises before the top-level fallback is tried — confirm
    # that this is the intended behaviour.
    pipeline_mod = soft_import(_PKG + ".pipeline", layer=_LAYER)
    fn = getattr(pipeline_mod, "generate_pdf", None)
    if not fn:
        fn = require(_PKG, "generate_pdf", layer=_LAYER)
    return fn(pdf_path, **kwargs)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _resolve_pdf_dir(pdf_path: str) -> Optional[str]:
    """Find the directory holding ``pages/`` for a processed PDF.

    For ``<dir>/foo.pdf`` two locations are probed, in this order:
    ``<dir>/foo/`` (where the PDF ends up if it was relocated into a folder
    named after it) and ``<dir>/`` itself. The first one that contains a
    ``pages`` directory is returned; ``None`` if neither does.
    """
    p = os.path.abspath(pdf_path)
    parent = os.path.dirname(p)
    stem = os.path.splitext(os.path.basename(p))[0]
    for candidate in (os.path.join(parent, stem), parent):
        if os.path.isdir(os.path.join(candidate, "pages")):
            return candidate
    return None


def _page_sort_key(name: str) -> tuple[int, int, str]:
    """Sort key: numeric page directories by value, then other names by text."""
    # ``isdecimal`` (not ``isdigit``) so that every accepted name is something
    # ``int()`` can parse; e.g. "²" is a digit but not a decimal.
    if name.isdecimal():
        return (0, int(name), name)
    return (1, 0, name)


def pdf_pages(pdf_path: str) -> tuple[list[dict[str, Any]], Optional[str]]:
    """Read back the per-page text/info stored under ``pages/`` for a PDF.

    Each sub-directory of ``pages/`` is one page; its ``text.txt`` (stripped)
    and ``info.json`` are read directly from disk.

    Args:
        pdf_path: Path of the PDF whose processed output should be read.

    Returns:
        ``(pages, full_text)``. ``pages`` is a list of
        ``{"index", "page", "text", "info"}`` dicts in page order: numeric
        directory names are ordered by their integer value (so ``10000``
        follows ``9999`` even when the names are not equally padded), any
        other names come after them in lexical order. ``index`` is the integer
        value of a numeric name, otherwise the position in that ordering.
        ``info`` is ``{}`` when ``info.json`` is missing, unreadable, invalid,
        or does not contain a JSON object. ``full_text`` is the non-empty page
        texts joined by blank lines, or ``None`` when there is no text.
        ``([], None)`` is returned when no ``pages/`` directory is found.
    """
    base = _resolve_pdf_dir(pdf_path)
    if base is None:
        return [], None
    pages_dir = os.path.join(base, "pages")
    names = sorted(
        (d for d in os.listdir(pages_dir) if os.path.isdir(os.path.join(pages_dir, d))),
        key=_page_sort_key,
    )
    pages: list[dict[str, Any]] = []
    for i, name in enumerate(names):
        pdir = os.path.join(pages_dir, name)
        text_path = os.path.join(pdir, "text.txt")
        info_path = os.path.join(pdir, "info.json")

        text = ""
        if os.path.isfile(text_path):
            with open(text_path, "r", encoding="utf-8", errors="replace") as fh:
                text = fh.read().strip()

        # Best effort: a missing, unreadable or malformed info.json must not
        # prevent the page text from being returned.
        try:
            with open(info_path, "r", encoding="utf-8") as fh:
                loaded = json.load(fh)
        except (OSError, ValueError, RecursionError):
            loaded = None
        info: dict[str, Any] = loaded if isinstance(loaded, dict) else {}

        pages.append(
            {
                "index": int(name) if name.isdecimal() else i,
                "page": name,
                "text": text,
                "info": info,
            }
        )
    full_text = "\n\n".join(p["text"] for p in pages if p["text"]).strip() or None
    return pages, full_text
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def DocumentPipeline(*args: Any, **kwargs: Any):
    """Instantiate ``abstract_pdfs.DocumentPipeline`` with the given arguments.

    The class is resolved lazily on each call; positional and keyword
    arguments are forwarded unchanged and the new instance is returned.
    """
    return require(_PKG, "DocumentPipeline", layer=_LAYER)(*args, **kwargs)
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def SliceManager(*args: Any, **kwargs: Any):
    """Instantiate ``abstract_pdfs.SliceManager`` with the given arguments.

    The class is resolved lazily on each call; positional and keyword
    arguments are forwarded unchanged and the new instance is returned.
    """
    return require(_PKG, "SliceManager", layer=_LAYER)(*args, **kwargs)
|