arcus-provider-runtime 0.3.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arcus_provider_runtime-0.3.1/.gitignore +12 -0
- arcus_provider_runtime-0.3.1/LICENSE +21 -0
- arcus_provider_runtime-0.3.1/PKG-INFO +94 -0
- arcus_provider_runtime-0.3.1/README.md +50 -0
- arcus_provider_runtime-0.3.1/pyproject.toml +60 -0
- arcus_provider_runtime-0.3.1/src/arcus/provider_runtime/__init__.py +25 -0
- arcus_provider_runtime-0.3.1/src/arcus/provider_runtime/factory.py +174 -0
- arcus_provider_runtime-0.3.1/src/arcus/provider_runtime/log.py +46 -0
- arcus_provider_runtime-0.3.1/src/arcus/provider_runtime/provider_interface.py +86 -0
- arcus_provider_runtime-0.3.1/src/arcus/provider_runtime/providers/__init__.py +0 -0
- arcus_provider_runtime-0.3.1/src/arcus/provider_runtime/providers/_shared/__init__.py +0 -0
- arcus_provider_runtime-0.3.1/src/arcus/provider_runtime/providers/_shared/file_extract.py +433 -0
- arcus_provider_runtime-0.3.1/src/arcus/provider_runtime/providers/docs/__init__.py +0 -0
- arcus_provider_runtime-0.3.1/src/arcus/provider_runtime/providers/docs/docs.py +230 -0
- arcus_provider_runtime-0.3.1/src/arcus/provider_runtime/providers/html/__init__.py +0 -0
- arcus_provider_runtime-0.3.1/src/arcus/provider_runtime/providers/html/_athena_fetch_page.py +1272 -0
- arcus_provider_runtime-0.3.1/src/arcus/provider_runtime/providers/html/html.py +231 -0
- arcus_provider_runtime-0.3.1/src/arcus/provider_runtime/providers/html/html2md.mjs +202 -0
- arcus_provider_runtime-0.3.1/src/arcus/provider_runtime/providers/pdf/__init__.py +0 -0
- arcus_provider_runtime-0.3.1/src/arcus/provider_runtime/providers/pdf/pdf.py +230 -0
- arcus_provider_runtime-0.3.1/src/arcus/provider_runtime/providers/youtube/__init__.py +0 -0
- arcus_provider_runtime-0.3.1/src/arcus/provider_runtime/providers/youtube/nlm_fallback.py +126 -0
- arcus_provider_runtime-0.3.1/src/arcus/provider_runtime/providers/youtube/nlm_limit.py +69 -0
- arcus_provider_runtime-0.3.1/src/arcus/provider_runtime/providers/youtube/url.py +45 -0
- arcus_provider_runtime-0.3.1/src/arcus/provider_runtime/providers/youtube/vtt.py +102 -0
- arcus_provider_runtime-0.3.1/src/arcus/provider_runtime/providers/youtube/youtube.py +231 -0
- arcus_provider_runtime-0.3.1/src/arcus/provider_runtime/providers/youtube/ytdlp_adapter.py +155 -0
- arcus_provider_runtime-0.3.1/src/arcus/provider_runtime/registry.py +28 -0
- arcus_provider_runtime-0.3.1/src/arcus/provider_runtime/slug.py +61 -0
- arcus_provider_runtime-0.3.1/src/arcus/provider_runtime/types.py +71 -0
- arcus_provider_runtime-0.3.1/src/arcus/provider_runtime/writer.py +163 -0
- arcus_provider_runtime-0.3.1/tests/providers/__init__.py +0 -0
- arcus_provider_runtime-0.3.1/tests/providers/docs/__init__.py +0 -0
- arcus_provider_runtime-0.3.1/tests/providers/docs/fixtures/_make_fixtures.py +116 -0
- arcus_provider_runtime-0.3.1/tests/providers/docs/fixtures/small.docx +0 -0
- arcus_provider_runtime-0.3.1/tests/providers/docs/fixtures/small.epub +0 -0
- arcus_provider_runtime-0.3.1/tests/providers/docs/fixtures/small.pptx +0 -0
- arcus_provider_runtime-0.3.1/tests/providers/docs/fixtures/small.xlsx +0 -0
- arcus_provider_runtime-0.3.1/tests/providers/docs/test_docs_provider.py +221 -0
- arcus_provider_runtime-0.3.1/tests/providers/html/__init__.py +0 -0
- arcus_provider_runtime-0.3.1/tests/providers/html/test_html_provider.py +309 -0
- arcus_provider_runtime-0.3.1/tests/providers/pdf/__init__.py +0 -0
- arcus_provider_runtime-0.3.1/tests/providers/pdf/fixtures/small.pdf +0 -0
- arcus_provider_runtime-0.3.1/tests/providers/pdf/test_pdf_provider.py +278 -0
- arcus_provider_runtime-0.3.1/tests/providers/youtube/__init__.py +0 -0
- arcus_provider_runtime-0.3.1/tests/providers/youtube/test_nlm_fallback.py +25 -0
- arcus_provider_runtime-0.3.1/tests/providers/youtube/test_nlm_limit.py +62 -0
- arcus_provider_runtime-0.3.1/tests/providers/youtube/test_url.py +40 -0
- arcus_provider_runtime-0.3.1/tests/providers/youtube/test_vtt.py +77 -0
- arcus_provider_runtime-0.3.1/tests/providers/youtube/test_youtube_provider.py +108 -0
- arcus_provider_runtime-0.3.1/tests/providers/youtube/test_ytdlp_adapter.py +68 -0
- arcus_provider_runtime-0.3.1/tests/test_factory.py +193 -0
- arcus_provider_runtime-0.3.1/tests/test_log.py +44 -0
- arcus_provider_runtime-0.3.1/tests/test_slug.py +56 -0
- arcus_provider_runtime-0.3.1/tests/test_types.py +57 -0
- arcus_provider_runtime-0.3.1/tests/test_writer.py +134 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 POLLEO.AI
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: arcus-provider-runtime
|
|
3
|
+
Version: 0.3.1
|
|
4
|
+
Summary: Content-extraction provider runtime for arcus — turn a URL or file into normalized markdown + structured metadata.
|
|
5
|
+
Project-URL: Homepage, https://github.com/polleoai/arcus
|
|
6
|
+
Project-URL: Repository, https://github.com/polleoai/arcus
|
|
7
|
+
Project-URL: Issues, https://github.com/polleoai/arcus/issues
|
|
8
|
+
Author-email: "POLLEO.AI" <support@polleo.ai>
|
|
9
|
+
License: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: content-extraction,html-to-markdown,llm,pdf-extraction,rag,scraping,youtube-transcript
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Operating System :: OS Independent
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
20
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
21
|
+
Classifier: Topic :: Text Processing :: Markup
|
|
22
|
+
Requires-Python: >=3.11
|
|
23
|
+
Requires-Dist: pyyaml>=6.0
|
|
24
|
+
Requires-Dist: yt-dlp>=2025.5.1
|
|
25
|
+
Provides-Extra: all
|
|
26
|
+
Requires-Dist: openpyxl>=3.1; extra == 'all'
|
|
27
|
+
Requires-Dist: playwright>=1.40; extra == 'all'
|
|
28
|
+
Requires-Dist: pymupdf4llm>=0.0.10; extra == 'all'
|
|
29
|
+
Requires-Dist: python-docx>=1.0; extra == 'all'
|
|
30
|
+
Requires-Dist: python-pptx>=0.6; extra == 'all'
|
|
31
|
+
Provides-Extra: dev
|
|
32
|
+
Requires-Dist: pytest-mock>=3.12; extra == 'dev'
|
|
33
|
+
Requires-Dist: pytest>=8; extra == 'dev'
|
|
34
|
+
Requires-Dist: ruff>=0.4; extra == 'dev'
|
|
35
|
+
Provides-Extra: html
|
|
36
|
+
Requires-Dist: playwright>=1.40; extra == 'html'
|
|
37
|
+
Provides-Extra: office
|
|
38
|
+
Requires-Dist: openpyxl>=3.1; extra == 'office'
|
|
39
|
+
Requires-Dist: python-docx>=1.0; extra == 'office'
|
|
40
|
+
Requires-Dist: python-pptx>=0.6; extra == 'office'
|
|
41
|
+
Provides-Extra: pdf
|
|
42
|
+
Requires-Dist: pymupdf4llm>=0.0.10; extra == 'pdf'
|
|
43
|
+
Description-Content-Type: text/markdown
|
|
44
|
+
|
|
45
|
+
# arcus-provider-runtime
|
|
46
|
+
|
|
47
|
+
The content-extraction kernel behind [arcus](https://github.com/polleoai/arcus):
|
|
48
|
+
give it one URL or one file path, get back normalized markdown plus structured
|
|
49
|
+
metadata. No vault, no database, no project awareness — a pure download +
|
|
50
|
+
extraction layer you can drop into any pipeline (RAG ingest, knowledge bases,
|
|
51
|
+
LLM context building).
|
|
52
|
+
|
|
53
|
+
## Install
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
pip install "arcus-provider-runtime[html,pdf,office]"
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
Extras pull in only the heavy dependencies you need:
|
|
60
|
+
|
|
61
|
+
| Extra | Adds | For |
|
|
62
|
+
|---|---|---|
|
|
63
|
+
| `html` | `playwright` | JS-rendered pages, X.com / LinkedIn, SPA articles |
|
|
64
|
+
| `pdf` | `pymupdf4llm` | PDF → markdown extraction |
|
|
65
|
+
| `office` | `python-docx`, `python-pptx`, `openpyxl` | DOCX / PPTX / XLSX / EPUB |
|
|
66
|
+
| `all` | everything above | — |
|
|
67
|
+
|
|
68
|
+
The base install (YouTube transcripts via `yt-dlp`) needs no extras. The HTML
|
|
69
|
+
provider also needs Chromium (`python -m playwright install chromium`) and
|
|
70
|
+
`node` on `PATH` (the vendored `html2md.mjs` converter).
|
|
71
|
+
|
|
72
|
+
## Use
|
|
73
|
+
|
|
74
|
+
```python
|
|
75
|
+
from arcus.provider_runtime import Factory
|
|
76
|
+
|
|
77
|
+
result = Factory().run("https://example.com/article", out_dir="./out")
|
|
78
|
+
# result.markdown_path → ./out/<slug>.md (frontmatter + readable body)
|
|
79
|
+
# result.metadata_path → ./out/<slug>.json (segments, timing, provenance)
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
One `Factory.run()` entry point dispatches to the right provider by inspecting
|
|
83
|
+
the input. Providers live under
|
|
84
|
+
`arcus.provider_runtime.providers.<kind>/` and are individually registerable.
|
|
85
|
+
|
|
86
|
+
## What it deliberately does NOT do
|
|
87
|
+
|
|
88
|
+
arcus has zero awareness of any consuming app's storage, topics, or wiki. One
|
|
89
|
+
input in, one extracted artifact out. Vault-aware orchestration (dedup,
|
|
90
|
+
cross-referencing, synthesis) belongs in the consumer, not here.
|
|
91
|
+
|
|
92
|
+
## License
|
|
93
|
+
|
|
94
|
+
MIT © 2026 POLLEO.AI
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
# arcus-provider-runtime
|
|
2
|
+
|
|
3
|
+
The content-extraction kernel behind [arcus](https://github.com/polleoai/arcus):
|
|
4
|
+
give it one URL or one file path, get back normalized markdown plus structured
|
|
5
|
+
metadata. No vault, no database, no project awareness — a pure download +
|
|
6
|
+
extraction layer you can drop into any pipeline (RAG ingest, knowledge bases,
|
|
7
|
+
LLM context building).
|
|
8
|
+
|
|
9
|
+
## Install
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
pip install "arcus-provider-runtime[html,pdf,office]"
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
Extras pull in only the heavy dependencies you need:
|
|
16
|
+
|
|
17
|
+
| Extra | Adds | For |
|
|
18
|
+
|---|---|---|
|
|
19
|
+
| `html` | `playwright` | JS-rendered pages, X.com / LinkedIn, SPA articles |
|
|
20
|
+
| `pdf` | `pymupdf4llm` | PDF → markdown extraction |
|
|
21
|
+
| `office` | `python-docx`, `python-pptx`, `openpyxl` | DOCX / PPTX / XLSX / EPUB |
|
|
22
|
+
| `all` | everything above | — |
|
|
23
|
+
|
|
24
|
+
The base install (YouTube transcripts via `yt-dlp`) needs no extras. The HTML
|
|
25
|
+
provider also needs Chromium (`python -m playwright install chromium`) and
|
|
26
|
+
`node` on `PATH` (the vendored `html2md.mjs` converter).
|
|
27
|
+
|
|
28
|
+
## Use
|
|
29
|
+
|
|
30
|
+
```python
|
|
31
|
+
from arcus.provider_runtime import Factory
|
|
32
|
+
|
|
33
|
+
result = Factory().run("https://example.com/article", out_dir="./out")
|
|
34
|
+
# result.markdown_path → ./out/<slug>.md (frontmatter + readable body)
|
|
35
|
+
# result.metadata_path → ./out/<slug>.json (segments, timing, provenance)
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
One `Factory.run()` entry point dispatches to the right provider by inspecting
|
|
39
|
+
the input. Providers live under
|
|
40
|
+
`arcus.provider_runtime.providers.<kind>/` and are individually registerable.
|
|
41
|
+
|
|
42
|
+
## What it deliberately does NOT do
|
|
43
|
+
|
|
44
|
+
arcus has zero awareness of any consuming app's storage, topics, or wiki. One
|
|
45
|
+
input in, one extracted artifact out. Vault-aware orchestration (dedup,
|
|
46
|
+
cross-referencing, synthesis) belongs in the consumer, not here.
|
|
47
|
+
|
|
48
|
+
## License
|
|
49
|
+
|
|
50
|
+
MIT © 2026 POLLEO.AI
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "arcus-provider-runtime"
|
|
3
|
+
version = "0.3.1"
|
|
4
|
+
description = "Content-extraction provider runtime for arcus — turn a URL or file into normalized markdown + structured metadata."
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
requires-python = ">=3.11"
|
|
7
|
+
license = { text = "MIT" }
|
|
8
|
+
authors = [{ name = "POLLEO.AI", email = "support@polleo.ai" }]
|
|
9
|
+
keywords = [
|
|
10
|
+
"content-extraction", "html-to-markdown", "pdf-extraction",
|
|
11
|
+
"youtube-transcript", "scraping", "llm", "rag",
|
|
12
|
+
]
|
|
13
|
+
classifiers = [
|
|
14
|
+
"Development Status :: 4 - Beta",
|
|
15
|
+
"Intended Audience :: Developers",
|
|
16
|
+
"License :: OSI Approved :: MIT License",
|
|
17
|
+
"Operating System :: OS Independent",
|
|
18
|
+
"Programming Language :: Python :: 3",
|
|
19
|
+
"Programming Language :: Python :: 3.11",
|
|
20
|
+
"Programming Language :: Python :: 3.12",
|
|
21
|
+
"Programming Language :: Python :: 3.13",
|
|
22
|
+
"Topic :: Software Development :: Libraries",
|
|
23
|
+
"Topic :: Text Processing :: Markup",
|
|
24
|
+
]
|
|
25
|
+
dependencies = [
|
|
26
|
+
"pyyaml>=6.0",
|
|
27
|
+
"yt-dlp>=2025.5.1",
|
|
28
|
+
]
|
|
29
|
+
|
|
30
|
+
[project.urls]
|
|
31
|
+
Homepage = "https://github.com/polleoai/arcus"
|
|
32
|
+
Repository = "https://github.com/polleoai/arcus"
|
|
33
|
+
Issues = "https://github.com/polleoai/arcus/issues"
|
|
34
|
+
|
|
35
|
+
[project.optional-dependencies]
|
|
36
|
+
dev = ["pytest>=8", "pytest-mock>=3.12", "ruff>=0.4"]
|
|
37
|
+
# HtmlProvider — Playwright drives a headless browser; html2md.mjs (vendored)
|
|
38
|
+
# requires `node` on PATH at runtime, not a Python dep.
|
|
39
|
+
html = ["playwright>=1.40"]
|
|
40
|
+
# PdfProvider — pymupdf4llm is the primary extractor; pdftotext is the
|
|
41
|
+
# subprocess fallback (system binary, not a pip dep).
|
|
42
|
+
pdf = ["pymupdf4llm>=0.0.10"]
|
|
43
|
+
# Office extractors used by file_extract.py's docx/xlsx/pptx/epub paths.
|
|
44
|
+
# Athena's PdfProvider lift keeps these reachable even though arcus's
|
|
45
|
+
# canonical v1 surface is PDF only.
|
|
46
|
+
office = ["python-docx>=1.0", "python-pptx>=0.6", "openpyxl>=3.1"]
|
|
47
|
+
all = [
|
|
48
|
+
"playwright>=1.40",
|
|
49
|
+
"pymupdf4llm>=0.0.10",
|
|
50
|
+
"python-docx>=1.0",
|
|
51
|
+
"python-pptx>=0.6",
|
|
52
|
+
"openpyxl>=3.1",
|
|
53
|
+
]
|
|
54
|
+
|
|
55
|
+
[build-system]
|
|
56
|
+
requires = ["hatchling"]
|
|
57
|
+
build-backend = "hatchling.build"
|
|
58
|
+
|
|
59
|
+
[tool.hatch.build.targets.wheel]
|
|
60
|
+
packages = ["src/arcus"]
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
"""arcus provider-runtime public API."""
|
|
2
|
+
|
|
3
|
+
from .factory import Factory, register_defaults
|
|
4
|
+
from .provider_interface import ExtractionContext, Provider
|
|
5
|
+
from .registry import ProviderRegistry
|
|
6
|
+
from .types import (
|
|
7
|
+
EXIT_CODES,
|
|
8
|
+
DetectionResult,
|
|
9
|
+
ExtractionResult,
|
|
10
|
+
Segment,
|
|
11
|
+
SourceMetadata,
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
__all__ = [
|
|
15
|
+
"EXIT_CODES",
|
|
16
|
+
"DetectionResult",
|
|
17
|
+
"ExtractionContext",
|
|
18
|
+
"ExtractionResult",
|
|
19
|
+
"Factory",
|
|
20
|
+
"Provider",
|
|
21
|
+
"ProviderRegistry",
|
|
22
|
+
"Segment",
|
|
23
|
+
"SourceMetadata",
|
|
24
|
+
"register_defaults",
|
|
25
|
+
]
|
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
"""Factory: end-to-end orchestration of detection → cache check → extract → write."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
import tempfile
|
|
7
|
+
import traceback
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
from .log import EventLogger, now_iso
|
|
11
|
+
from .provider_interface import ExtractionContext, Provider
|
|
12
|
+
from .registry import ProviderRegistry
|
|
13
|
+
from .types import EXIT_CODES
|
|
14
|
+
from .writer import cache_hit_exists, write_failure_stub, write_success
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class Factory:
    """Orchestrate one-shot extraction: detect → cache check → extract → write.

    The factory owns a ``ProviderRegistry``. ``run()`` is the single entry
    point; ``detect()`` is a thin pass-through to the registry.
    """

    def __init__(self, registry: ProviderRegistry) -> None:
        """Store the registry that ``detect()`` and ``run()`` consult."""
        self.registry = registry

    def detect(self, raw_input: str):
        """Return ``registry.detect(raw_input)`` unchanged.

        ``run()`` treats that value as either ``None`` (no provider matched)
        or a ``(provider, detection)`` pair.
        """
        return self.registry.detect(raw_input)

    def run(
        self,
        raw_input: str,
        *,
        out_dir: Path | str,
        force: bool = False,
        json_log: bool = False,
        keep_intermediates: bool = False,
        notebook_tag: str | None = None,
    ) -> int:
        """Detect → cache check → extract → write outputs. Returns exit code.

        Args:
            raw_input: The URL or file path to extract.
            out_dir: Output directory. A plain string is accepted and is
                converted to ``Path`` before use.
            force: When true, skip the cache check and always extract.
            json_log: Forwarded to ``EventLogger`` as ``json_log_stderr``.
            keep_intermediates: Forwarded to the provider via
                ``ExtractionContext``.
            notebook_tag: Forwarded to the provider via ``ExtractionContext``.

        Returns:
            ``EXIT_CODES["EXTRACTORS_EXHAUSTED"]`` when no provider matched;
            ``EXIT_CODES["SUCCESS"]`` on a cache hit or successful extraction;
            ``EXIT_CODES["PROVIDER_PRIMARY_FAILED"]`` when the provider raised;
            otherwise the provider's ``result.exit_code`` (falling back to
            ``PROVIDER_PRIMARY_FAILED`` when that is falsy).
        """
        # Normalise once: EventLogger builds its log path with the `/`
        # operator, which raises TypeError on a plain string such as "./out".
        out_dir = Path(out_dir)

        logger = EventLogger(out_dir, json_log_stderr=json_log)
        logger.emit({"ts": now_iso(), "raw": raw_input, "status": "started"})

        match = self.registry.detect(raw_input)
        if match is None:
            logger.emit({"ts": now_iso(), "raw": raw_input, "status": "failed",
                         "error": "no provider matched"})
            return EXIT_CODES["EXTRACTORS_EXHAUSTED"]

        provider, detection = match
        logger.emit({
            "ts": now_iso(),
            "kind": provider.kind,
            "source_id": detection.source_id,
            "event": "detected",
        })

        # Cache check uses the provider's predicted slug. If the on-disk
        # file's frontmatter `source_id` matches this detection's source_id,
        # we trust the file and short-circuit. Disambiguated forms
        # (`<slug>--<8char>.md`) are checked too — see cache_hit_exists.
        try:
            predicted_slug = provider.predict_slug(detection)
        except Exception as e:
            # predict_slug failed (e.g., metadata fetch hit network error).
            # Fall through to extraction — the real extract() call will
            # surface the same error in a structured way.
            logger.emit({
                "ts": now_iso(),
                "kind": provider.kind,
                "source_id": detection.source_id,
                "event": "predict_slug_failed",
                "error": str(e),
            })
            predicted_slug = None

        if (
            not force
            and predicted_slug is not None
            and cache_hit_exists(out_dir, predicted_slug, detection.source_id)
        ):
            logger.emit({
                "ts": now_iso(),
                "kind": provider.kind,
                "source_id": detection.source_id,
                "status": "cache_hit",
                "slug": predicted_slug,
            })
            return EXIT_CODES["SUCCESS"]

        # Sanitize source_id for use as a tempdir prefix — URLs and local
        # paths both contain '/' which mkdtemp interprets as a path separator.
        safe_prefix = re.sub(r"[^A-Za-z0-9._-]", "_", detection.source_id)[:40]
        # The work directory stays alive until the outputs are written, so
        # anything the provider left in it is still readable by the writer.
        with tempfile.TemporaryDirectory(prefix=f"arcus-{safe_prefix}-") as tmp:
            context = ExtractionContext(
                out_dir=out_dir,
                work_dir=Path(tmp),
                notebook_tag=notebook_tag,
                keep_intermediates=keep_intermediates,
                factory=self,
            )
            try:
                result = provider.extract(detection, context)
            except Exception as e:  # provider-level uncaught — never crash the CLI
                tb = traceback.format_exc()
                logger.emit({
                    "ts": now_iso(),
                    "kind": provider.kind,
                    "source_id": detection.source_id,
                    "status": "failed",
                    "error": f"unhandled: {e}",
                    "traceback": tb,
                })
                # NOTE(review): the stub slug is the raw source_id, which may
                # contain '/' (see the tempdir comment above) — confirm that
                # write_failure_stub sanitizes it before building a filename.
                write_failure_stub(
                    out_dir,
                    slug=detection.source_id,
                    source=detection.raw,
                    source_id=detection.source_id,
                    kind=provider.kind,
                    title=None,
                    exit_code=EXIT_CODES["PROVIDER_PRIMARY_FAILED"],
                    extractor_attempted=[provider.kind],
                    error=str(e),
                )
                return EXIT_CODES["PROVIDER_PRIMARY_FAILED"]

            if result.status == "success":
                write_success(out_dir, result.metadata.slug, result)
                logger.emit({
                    "ts": now_iso(),
                    "kind": provider.kind,
                    "source_id": detection.source_id,
                    "status": "success",
                    "slug": result.metadata.slug,
                })
                return EXIT_CODES["SUCCESS"]

            # status == "failed"
            write_failure_stub(
                out_dir,
                slug=result.metadata.slug,
                source=detection.raw,
                source_id=detection.source_id,
                kind=provider.kind,
                title=result.metadata.title or None,
                exit_code=result.exit_code or EXIT_CODES["PROVIDER_PRIMARY_FAILED"],
                extractor_attempted=[provider.kind],
                error=result.error or "unknown failure",
            )
            logger.emit({
                "ts": now_iso(),
                "kind": provider.kind,
                "source_id": detection.source_id,
                "status": "failed",
                "error": result.error,
            })
            return result.exit_code or EXIT_CODES["PROVIDER_PRIMARY_FAILED"]
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def register_defaults(registry: ProviderRegistry) -> None:
    """Populate ``registry`` with the v1 providers, most specific first.

    Dispatch is first-match-wins, so registration order is significant:

    1. YouTube — specific host pattern.
    2. PDF — specific file suffix; has to precede HTML so that ``.pdf``
       URLs are not swallowed by HTML's broad scheme match.
    3. Docs — docx/xlsx/pptx/epub suffixes; precedes HTML for the same
       reason as PDF.
    4. HTML — catch-all for any other http(s) URL.
    """
    # Imported lazily inside the function, as in the original layout.
    from .providers.docs.docs import DocsProvider
    from .providers.html.html import HtmlProvider
    from .providers.pdf.pdf import PdfProvider
    from .providers.youtube.youtube import YouTubeProvider

    dispatch_order = (YouTubeProvider, PdfProvider, DocsProvider, HtmlProvider)
    for provider_cls in dispatch_order:
        registry.register(provider_cls())
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"""NDJSON event logger.
|
|
2
|
+
|
|
3
|
+
Every arcus invocation appends structured events to
|
|
4
|
+
`<out_dir>/.log/extract-log.ndjson`. Consumers parse this for
|
|
5
|
+
audit logging. Optionally mirrors to stderr (`--json-log` flag)
|
|
6
|
+
so subprocess callers can read events without touching disk.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import json
|
|
12
|
+
import sys
|
|
13
|
+
from datetime import datetime, timezone
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
from typing import Any
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def now_iso() -> str:
    """Return the current UTC time as an ISO-8601 string.

    Example shape: '2026-05-17T19:42:01.123456+00:00'.
    """
    current_utc = datetime.now(tz=timezone.utc)
    return current_utc.isoformat()
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class EventLogger:
|
|
24
|
+
"""Append-only NDJSON event logger."""
|
|
25
|
+
|
|
26
|
+
def __init__(self, out_dir: Path, *, json_log_stderr: bool = False) -> None:
|
|
27
|
+
self.out_dir = out_dir
|
|
28
|
+
self.json_log_stderr = json_log_stderr
|
|
29
|
+
self._log_dir = out_dir / ".log"
|
|
30
|
+
self._log_file = self._log_dir / "extract-log.ndjson"
|
|
31
|
+
self._dir_ready = False
|
|
32
|
+
|
|
33
|
+
def emit(self, event: dict[str, Any]) -> None:
|
|
34
|
+
"""Append a single event. Creates the .log directory lazily."""
|
|
35
|
+
line = json.dumps(event, ensure_ascii=False) + "\n"
|
|
36
|
+
|
|
37
|
+
if self.json_log_stderr:
|
|
38
|
+
sys.stderr.write(line)
|
|
39
|
+
sys.stderr.flush()
|
|
40
|
+
|
|
41
|
+
if not self._dir_ready:
|
|
42
|
+
self._log_dir.mkdir(parents=True, exist_ok=True)
|
|
43
|
+
self._dir_ready = True
|
|
44
|
+
|
|
45
|
+
with self._log_file.open("a", encoding="utf-8") as f:
|
|
46
|
+
f.write(line)
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
"""Provider Protocol — every content-extraction provider implements this shape.
|
|
2
|
+
|
|
3
|
+
This file is documentation by code. Concrete providers live under
|
|
4
|
+
`providers/<kind>/`. The Protocol is `@runtime_checkable` so the factory
|
|
5
|
+
can `isinstance(p, Provider)` for sanity in tests, though duck-typing is
|
|
6
|
+
the contract; the Protocol is not enforced beyond static type checking.
|
|
7
|
+
|
|
8
|
+
Lifecycle:
|
|
9
|
+
1. Factory.detect(input) walks registered providers calling .matches(input).
|
|
10
|
+
First non-None DetectionResult wins.
|
|
11
|
+
2. Caller passes detection to provider.extract(detection, context).
|
|
12
|
+
3. Provider returns ExtractionResult; caller writes via shared writer.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
from dataclasses import dataclass
|
|
18
|
+
from pathlib import Path
|
|
19
|
+
from typing import Protocol, runtime_checkable
|
|
20
|
+
|
|
21
|
+
from .types import DetectionResult, ExtractionResult
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@dataclass
|
|
25
|
+
class ExtractionContext:
|
|
26
|
+
"""Context passed to provider.extract().
|
|
27
|
+
|
|
28
|
+
arcus is a single-source download/extraction layer — see
|
|
29
|
+
feedback-arcus-pure-download-layer. Providers don't recurse; the
|
|
30
|
+
`factory` field is reserved/unused by current providers.
|
|
31
|
+
"""
|
|
32
|
+
|
|
33
|
+
out_dir: Path
|
|
34
|
+
work_dir: Path
|
|
35
|
+
notebook_tag: str | None = None
|
|
36
|
+
keep_intermediates: bool = False
|
|
37
|
+
# Reserved; no current provider recurses (per the pure-download-layer rule).
|
|
38
|
+
factory: "Factory | None" = None # forward-ref to break import cycle
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@runtime_checkable
class Provider(Protocol):
    """Structural interface for one content-extraction provider."""

    # Stable identifier — e.g. 'youtube', 'html', 'pdf', 'athena_topic'.
    kind: str

    def matches(self, raw_input: str) -> DetectionResult | None:
        """Decide from the input string alone whether this provider applies.

        Must be pure: no network access and no file IO. Returns the parsed
        detection when this provider handles the input, otherwise ``None``.
        """
        ...

    def predict_slug(self, detection: DetectionResult) -> str:
        """Return the bare (pre-disambiguation) slug ``extract()`` will use.

        The factory calls this before extracting so that it cache-checks the
        right filename. The value MUST equal what ``extract()`` later stores
        in ``ExtractionResult.metadata.slug``, apart from collision
        disambiguation, which the writer applies per file.

        Implementations MAY fetch metadata here; heavy fetches should be
        cached internally so ``extract()`` does not repeat them.
        """
        ...

    def extract(
        self,
        detection: DetectionResult,
        context: ExtractionContext,
    ) -> ExtractionResult:
        """Fetch and transform the content; network and filesystem IO allowed.

        Returns an ``ExtractionResult`` whose status is 'success' or
        'failed'. Single-source only — providers neither recurse nor
        aggregate.
        """
        ...
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
# `ExtractionContext.factory` is typed as the string `"Factory | None"`.
|
|
82
|
+
# We deliberately do NOT import Factory at module load time — `from __future__
|
|
83
|
+
# import annotations` keeps every annotation as a string, and dataclasses
|
|
84
|
+
# never resolves it. Composite providers that need the live class can
|
|
85
|
+
# `from .factory import Factory` lazily inside a method body without
|
|
86
|
+
# creating an import cycle.
|
|
File without changes
|
|
File without changes
|