arcus-provider-runtime 0.3.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. arcus_provider_runtime-0.3.1/.gitignore +12 -0
  2. arcus_provider_runtime-0.3.1/LICENSE +21 -0
  3. arcus_provider_runtime-0.3.1/PKG-INFO +94 -0
  4. arcus_provider_runtime-0.3.1/README.md +50 -0
  5. arcus_provider_runtime-0.3.1/pyproject.toml +60 -0
  6. arcus_provider_runtime-0.3.1/src/arcus/provider_runtime/__init__.py +25 -0
  7. arcus_provider_runtime-0.3.1/src/arcus/provider_runtime/factory.py +174 -0
  8. arcus_provider_runtime-0.3.1/src/arcus/provider_runtime/log.py +46 -0
  9. arcus_provider_runtime-0.3.1/src/arcus/provider_runtime/provider_interface.py +86 -0
  10. arcus_provider_runtime-0.3.1/src/arcus/provider_runtime/providers/__init__.py +0 -0
  11. arcus_provider_runtime-0.3.1/src/arcus/provider_runtime/providers/_shared/__init__.py +0 -0
  12. arcus_provider_runtime-0.3.1/src/arcus/provider_runtime/providers/_shared/file_extract.py +433 -0
  13. arcus_provider_runtime-0.3.1/src/arcus/provider_runtime/providers/docs/__init__.py +0 -0
  14. arcus_provider_runtime-0.3.1/src/arcus/provider_runtime/providers/docs/docs.py +230 -0
  15. arcus_provider_runtime-0.3.1/src/arcus/provider_runtime/providers/html/__init__.py +0 -0
  16. arcus_provider_runtime-0.3.1/src/arcus/provider_runtime/providers/html/_athena_fetch_page.py +1272 -0
  17. arcus_provider_runtime-0.3.1/src/arcus/provider_runtime/providers/html/html.py +231 -0
  18. arcus_provider_runtime-0.3.1/src/arcus/provider_runtime/providers/html/html2md.mjs +202 -0
  19. arcus_provider_runtime-0.3.1/src/arcus/provider_runtime/providers/pdf/__init__.py +0 -0
  20. arcus_provider_runtime-0.3.1/src/arcus/provider_runtime/providers/pdf/pdf.py +230 -0
  21. arcus_provider_runtime-0.3.1/src/arcus/provider_runtime/providers/youtube/__init__.py +0 -0
  22. arcus_provider_runtime-0.3.1/src/arcus/provider_runtime/providers/youtube/nlm_fallback.py +126 -0
  23. arcus_provider_runtime-0.3.1/src/arcus/provider_runtime/providers/youtube/nlm_limit.py +69 -0
  24. arcus_provider_runtime-0.3.1/src/arcus/provider_runtime/providers/youtube/url.py +45 -0
  25. arcus_provider_runtime-0.3.1/src/arcus/provider_runtime/providers/youtube/vtt.py +102 -0
  26. arcus_provider_runtime-0.3.1/src/arcus/provider_runtime/providers/youtube/youtube.py +231 -0
  27. arcus_provider_runtime-0.3.1/src/arcus/provider_runtime/providers/youtube/ytdlp_adapter.py +155 -0
  28. arcus_provider_runtime-0.3.1/src/arcus/provider_runtime/registry.py +28 -0
  29. arcus_provider_runtime-0.3.1/src/arcus/provider_runtime/slug.py +61 -0
  30. arcus_provider_runtime-0.3.1/src/arcus/provider_runtime/types.py +71 -0
  31. arcus_provider_runtime-0.3.1/src/arcus/provider_runtime/writer.py +163 -0
  32. arcus_provider_runtime-0.3.1/tests/providers/__init__.py +0 -0
  33. arcus_provider_runtime-0.3.1/tests/providers/docs/__init__.py +0 -0
  34. arcus_provider_runtime-0.3.1/tests/providers/docs/fixtures/_make_fixtures.py +116 -0
  35. arcus_provider_runtime-0.3.1/tests/providers/docs/fixtures/small.docx +0 -0
  36. arcus_provider_runtime-0.3.1/tests/providers/docs/fixtures/small.epub +0 -0
  37. arcus_provider_runtime-0.3.1/tests/providers/docs/fixtures/small.pptx +0 -0
  38. arcus_provider_runtime-0.3.1/tests/providers/docs/fixtures/small.xlsx +0 -0
  39. arcus_provider_runtime-0.3.1/tests/providers/docs/test_docs_provider.py +221 -0
  40. arcus_provider_runtime-0.3.1/tests/providers/html/__init__.py +0 -0
  41. arcus_provider_runtime-0.3.1/tests/providers/html/test_html_provider.py +309 -0
  42. arcus_provider_runtime-0.3.1/tests/providers/pdf/__init__.py +0 -0
  43. arcus_provider_runtime-0.3.1/tests/providers/pdf/fixtures/small.pdf +0 -0
  44. arcus_provider_runtime-0.3.1/tests/providers/pdf/test_pdf_provider.py +278 -0
  45. arcus_provider_runtime-0.3.1/tests/providers/youtube/__init__.py +0 -0
  46. arcus_provider_runtime-0.3.1/tests/providers/youtube/test_nlm_fallback.py +25 -0
  47. arcus_provider_runtime-0.3.1/tests/providers/youtube/test_nlm_limit.py +62 -0
  48. arcus_provider_runtime-0.3.1/tests/providers/youtube/test_url.py +40 -0
  49. arcus_provider_runtime-0.3.1/tests/providers/youtube/test_vtt.py +77 -0
  50. arcus_provider_runtime-0.3.1/tests/providers/youtube/test_youtube_provider.py +108 -0
  51. arcus_provider_runtime-0.3.1/tests/providers/youtube/test_ytdlp_adapter.py +68 -0
  52. arcus_provider_runtime-0.3.1/tests/test_factory.py +193 -0
  53. arcus_provider_runtime-0.3.1/tests/test_log.py +44 -0
  54. arcus_provider_runtime-0.3.1/tests/test_slug.py +56 -0
  55. arcus_provider_runtime-0.3.1/tests/test_types.py +57 -0
  56. arcus_provider_runtime-0.3.1/tests/test_writer.py +134 -0
@@ -0,0 +1,12 @@
1
+ .DS_Store
2
+ __pycache__/
3
+ *.pyc
4
+ .venv/
5
+ dist/
6
+ build/
7
+ *.egg-info/
8
+ .pytest_cache/
9
+ .ruff_cache/
10
+ .coverage
11
+ out/
12
+ .intermediate/
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 POLLEO.AI
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,94 @@
1
+ Metadata-Version: 2.4
2
+ Name: arcus-provider-runtime
3
+ Version: 0.3.1
4
+ Summary: Content-extraction provider runtime for arcus — turn a URL or file into normalized markdown + structured metadata.
5
+ Project-URL: Homepage, https://github.com/polleoai/arcus
6
+ Project-URL: Repository, https://github.com/polleoai/arcus
7
+ Project-URL: Issues, https://github.com/polleoai/arcus/issues
8
+ Author-email: "POLLEO.AI" <support@polleo.ai>
9
+ License: MIT
10
+ License-File: LICENSE
11
+ Keywords: content-extraction,html-to-markdown,llm,pdf-extraction,rag,scraping,youtube-transcript
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Operating System :: OS Independent
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Programming Language :: Python :: 3.13
20
+ Classifier: Topic :: Software Development :: Libraries
21
+ Classifier: Topic :: Text Processing :: Markup
22
+ Requires-Python: >=3.11
23
+ Requires-Dist: pyyaml>=6.0
24
+ Requires-Dist: yt-dlp>=2025.5.1
25
+ Provides-Extra: all
26
+ Requires-Dist: openpyxl>=3.1; extra == 'all'
27
+ Requires-Dist: playwright>=1.40; extra == 'all'
28
+ Requires-Dist: pymupdf4llm>=0.0.10; extra == 'all'
29
+ Requires-Dist: python-docx>=1.0; extra == 'all'
30
+ Requires-Dist: python-pptx>=0.6; extra == 'all'
31
+ Provides-Extra: dev
32
+ Requires-Dist: pytest-mock>=3.12; extra == 'dev'
33
+ Requires-Dist: pytest>=8; extra == 'dev'
34
+ Requires-Dist: ruff>=0.4; extra == 'dev'
35
+ Provides-Extra: html
36
+ Requires-Dist: playwright>=1.40; extra == 'html'
37
+ Provides-Extra: office
38
+ Requires-Dist: openpyxl>=3.1; extra == 'office'
39
+ Requires-Dist: python-docx>=1.0; extra == 'office'
40
+ Requires-Dist: python-pptx>=0.6; extra == 'office'
41
+ Provides-Extra: pdf
42
+ Requires-Dist: pymupdf4llm>=0.0.10; extra == 'pdf'
43
+ Description-Content-Type: text/markdown
44
+
45
+ # arcus-provider-runtime
46
+
47
+ The content-extraction kernel behind [arcus](https://github.com/polleoai/arcus):
48
+ give it one URL or one file path, get back normalized markdown plus structured
49
+ metadata. No vault, no database, no project awareness — a pure download +
50
+ extraction layer you can drop into any pipeline (RAG ingest, knowledge bases,
51
+ LLM context building).
52
+
53
+ ## Install
54
+
55
+ ```bash
56
+ pip install "arcus-provider-runtime[html,pdf,office]"
57
+ ```
58
+
59
+ Extras pull in only the heavy dependencies you need:
60
+
61
+ | Extra | Adds | For |
62
+ |---|---|---|
63
+ | `html` | `playwright` | JS-rendered pages, X.com / LinkedIn, SPA articles |
64
+ | `pdf` | `pymupdf4llm` | PDF → markdown extraction |
65
+ | `office` | `python-docx`, `python-pptx`, `openpyxl` | DOCX / PPTX / XLSX / EPUB |
66
+ | `all` | everything above | — |
67
+
68
+ The base install needs no extras and covers YouTube transcripts via `yt-dlp`. The HTML
69
+ provider also needs Chromium (`python -m playwright install chromium`) and
70
+ `node` on `PATH` (the vendored `html2md.mjs` converter).
71
+
72
+ ## Use
73
+
74
+ ```python
75
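+ from pathlib import Path
76
+ from arcus.provider_runtime import Factory, ProviderRegistry, register_defaults
77
+ register_defaults(registry := ProviderRegistry())  # YouTube, PDF, Docs, HTML
78
+ exit_code = Factory(registry).run("https://example.com/article", out_dir=Path("./out"))
79
+ # run() returns an int exit code (see EXIT_CODES); it writes ./out/<slug>.md (frontmatter + readable body) and ./out/<slug>.json (segments, timing, provenance)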
80
+ ```
81
+
82
+ One `Factory.run()` entry point dispatches to the right provider by inspecting
83
+ the input. Providers live under
84
+ `arcus.provider_runtime.providers.<kind>/` and are individually registerable.
85
+
86
+ ## What it deliberately does NOT do
87
+
88
+ arcus has zero awareness of any consuming app's storage, topics, or wiki. One
89
+ input in, one extracted artifact out. Vault-aware orchestration (dedup,
90
+ cross-referencing, synthesis) belongs in the consumer, not here.
91
+
92
+ ## License
93
+
94
+ MIT © 2026 POLLEO.AI
@@ -0,0 +1,50 @@
1
+ # arcus-provider-runtime
2
+
3
+ The content-extraction kernel behind [arcus](https://github.com/polleoai/arcus):
4
+ give it one URL or one file path, get back normalized markdown plus structured
5
+ metadata. No vault, no database, no project awareness — a pure download +
6
+ extraction layer you can drop into any pipeline (RAG ingest, knowledge bases,
7
+ LLM context building).
8
+
9
+ ## Install
10
+
11
+ ```bash
12
+ pip install "arcus-provider-runtime[html,pdf,office]"
13
+ ```
14
+
15
+ Extras pull in only the heavy dependencies you need:
16
+
17
+ | Extra | Adds | For |
18
+ |---|---|---|
19
+ | `html` | `playwright` | JS-rendered pages, X.com / LinkedIn, SPA articles |
20
+ | `pdf` | `pymupdf4llm` | PDF → markdown extraction |
21
+ | `office` | `python-docx`, `python-pptx`, `openpyxl` | DOCX / PPTX / XLSX / EPUB |
22
+ | `all` | everything above | — |
23
+
24
+ The base install needs no extras and covers YouTube transcripts via `yt-dlp`. The HTML
25
+ provider also needs Chromium (`python -m playwright install chromium`) and
26
+ `node` on `PATH` (the vendored `html2md.mjs` converter).
27
+
28
+ ## Use
29
+
30
+ ```python
31
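+ from pathlib import Path
32
+ from arcus.provider_runtime import Factory, ProviderRegistry, register_defaults
33
+ register_defaults(registry := ProviderRegistry())  # YouTube, PDF, Docs, HTML
34
+ exit_code = Factory(registry).run("https://example.com/article", out_dir=Path("./out"))
35
+ # run() returns an int exit code (see EXIT_CODES); it writes ./out/<slug>.md (frontmatter + readable body) and ./out/<slug>.json (segments, timing, provenance)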
36
+ ```
37
+
38
+ One `Factory.run()` entry point dispatches to the right provider by inspecting
39
+ the input. Providers live under
40
+ `arcus.provider_runtime.providers.<kind>/` and are individually registerable.
41
+
42
+ ## What it deliberately does NOT do
43
+
44
+ arcus has zero awareness of any consuming app's storage, topics, or wiki. One
45
+ input in, one extracted artifact out. Vault-aware orchestration (dedup,
46
+ cross-referencing, synthesis) belongs in the consumer, not here.
47
+
48
+ ## License
49
+
50
+ MIT © 2026 POLLEO.AI
@@ -0,0 +1,60 @@
1
+ [project]
2
+ name = "arcus-provider-runtime"
3
+ version = "0.3.1"
4
+ description = "Content-extraction provider runtime for arcus — turn a URL or file into normalized markdown + structured metadata."
5
+ readme = "README.md"
6
+ requires-python = ">=3.11"
7
+ license = { text = "MIT" }
8
+ authors = [{ name = "POLLEO.AI", email = "support@polleo.ai" }]
9
+ keywords = [
10
+ "content-extraction", "html-to-markdown", "pdf-extraction",
11
+ "youtube-transcript", "scraping", "llm", "rag",
12
+ ]
13
+ classifiers = [
14
+ "Development Status :: 4 - Beta",
15
+ "Intended Audience :: Developers",
16
+ "License :: OSI Approved :: MIT License",
17
+ "Operating System :: OS Independent",
18
+ "Programming Language :: Python :: 3",
19
+ "Programming Language :: Python :: 3.11",
20
+ "Programming Language :: Python :: 3.12",
21
+ "Programming Language :: Python :: 3.13",
22
+ "Topic :: Software Development :: Libraries",
23
+ "Topic :: Text Processing :: Markup",
24
+ ]
25
+ dependencies = [
26
+ "pyyaml>=6.0",
27
+ "yt-dlp>=2025.5.1",
28
+ ]
29
+
30
+ [project.urls]
31
+ Homepage = "https://github.com/polleoai/arcus"
32
+ Repository = "https://github.com/polleoai/arcus"
33
+ Issues = "https://github.com/polleoai/arcus/issues"
34
+
35
+ [project.optional-dependencies]
36
+ dev = ["pytest>=8", "pytest-mock>=3.12", "ruff>=0.4"]
37
+ # HtmlProvider — Playwright drives a headless browser; html2md.mjs (vendored)
38
+ # requires `node` on PATH at runtime, not a Python dep.
39
+ html = ["playwright>=1.40"]
40
+ # PdfProvider — pymupdf4llm is the primary extractor; pdftotext is the
41
+ # subprocess fallback (system binary, not a pip dep).
42
+ pdf = ["pymupdf4llm>=0.0.10"]
43
+ # Office extractors used by file_extract.py's docx/xlsx/pptx/epub paths.
44
+ # Athena's PdfProvider lift keeps these reachable even though arcus's
45
+ # canonical v1 surface is PDF only.
46
+ office = ["python-docx>=1.0", "python-pptx>=0.6", "openpyxl>=3.1"]
47
+ all = [
48
+ "playwright>=1.40",
49
+ "pymupdf4llm>=0.0.10",
50
+ "python-docx>=1.0",
51
+ "python-pptx>=0.6",
52
+ "openpyxl>=3.1",
53
+ ]
54
+
55
+ [build-system]
56
+ requires = ["hatchling"]
57
+ build-backend = "hatchling.build"
58
+
59
+ [tool.hatch.build.targets.wheel]
60
+ packages = ["src/arcus"]
@@ -0,0 +1,25 @@
1
+ """arcus provider-runtime public API."""
2
+
3
+ from .factory import Factory, register_defaults
4
+ from .provider_interface import ExtractionContext, Provider
5
+ from .registry import ProviderRegistry
6
+ from .types import (
7
+ EXIT_CODES,
8
+ DetectionResult,
9
+ ExtractionResult,
10
+ Segment,
11
+ SourceMetadata,
12
+ )
13
+
14
+ __all__ = [
15
+ "EXIT_CODES",
16
+ "DetectionResult",
17
+ "ExtractionContext",
18
+ "ExtractionResult",
19
+ "Factory",
20
+ "Provider",
21
+ "ProviderRegistry",
22
+ "Segment",
23
+ "SourceMetadata",
24
+ "register_defaults",
25
+ ]
@@ -0,0 +1,174 @@
1
+ """Factory: end-to-end orchestration of detection → cache check → extract → write."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+ import tempfile
7
+ import traceback
8
+ from pathlib import Path
9
+
10
+ from .log import EventLogger, now_iso
11
+ from .provider_interface import ExtractionContext, Provider
12
+ from .registry import ProviderRegistry
13
+ from .types import EXIT_CODES
14
+ from .writer import cache_hit_exists, write_failure_stub, write_success
15
+
16
+
17
+ class Factory:
18
+ """Owns a ProviderRegistry; provides `.run(input, out_dir)` for one-shot extraction."""
19
+
20
+ def __init__(self, registry: ProviderRegistry) -> None:
21
+ self.registry = registry
22
+
23
+ def detect(self, raw_input: str):
24
+ return self.registry.detect(raw_input)
25
+
26
+ def run(
27
+ self,
28
+ raw_input: str,
29
+ *,
30
+ out_dir: Path,
31
+ force: bool = False,
32
+ json_log: bool = False,
33
+ keep_intermediates: bool = False,
34
+ notebook_tag: str | None = None,
35
+ ) -> int:
36
+ """Detect → cache check → extract → write outputs. Returns exit code."""
37
+ logger = EventLogger(out_dir, json_log_stderr=json_log)
38
+ logger.emit({"ts": now_iso(), "raw": raw_input, "status": "started"})
39
+
40
+ match = self.registry.detect(raw_input)
41
+ if match is None:
42
+ logger.emit({"ts": now_iso(), "raw": raw_input, "status": "failed",
43
+ "error": "no provider matched"})
44
+ return EXIT_CODES["EXTRACTORS_EXHAUSTED"]
45
+
46
+ provider, detection = match
47
+ logger.emit({
48
+ "ts": now_iso(),
49
+ "kind": provider.kind,
50
+ "source_id": detection.source_id,
51
+ "event": "detected",
52
+ })
53
+
54
+ # Cache check uses the provider's predicted slug. If the on-disk
55
+ # file's frontmatter `source_id` matches this detection's source_id,
56
+ # we trust the file and short-circuit. Disambiguated forms
57
+ # (`<slug>--<8char>.md`) are checked too — see cache_hit_exists.
58
+ try:
59
+ predicted_slug = provider.predict_slug(detection)
60
+ except Exception as e:
61
+ # predict_slug failed (e.g., metadata fetch hit network error).
62
+ # Fall through to extraction — the real extract() call will
63
+ # surface the same error in a structured way.
64
+ logger.emit({
65
+ "ts": now_iso(),
66
+ "kind": provider.kind,
67
+ "source_id": detection.source_id,
68
+ "event": "predict_slug_failed",
69
+ "error": str(e),
70
+ })
71
+ predicted_slug = None
72
+
73
+ if (
74
+ not force
75
+ and predicted_slug is not None
76
+ and cache_hit_exists(out_dir, predicted_slug, detection.source_id)
77
+ ):
78
+ logger.emit({
79
+ "ts": now_iso(),
80
+ "kind": provider.kind,
81
+ "source_id": detection.source_id,
82
+ "status": "cache_hit",
83
+ "slug": predicted_slug,
84
+ })
85
+ return EXIT_CODES["SUCCESS"]
86
+
87
+ # Sanitize source_id for use as a tempdir prefix — URLs and local
88
+ # paths both contain '/' which mkdtemp interprets as a path separator.
89
+ safe_prefix = re.sub(r"[^A-Za-z0-9._-]", "_", detection.source_id)[:40]
90
+ with tempfile.TemporaryDirectory(prefix=f"arcus-{safe_prefix}-") as tmp:
91
+ context = ExtractionContext(
92
+ out_dir=out_dir,
93
+ work_dir=Path(tmp),
94
+ notebook_tag=notebook_tag,
95
+ keep_intermediates=keep_intermediates,
96
+ factory=self,
97
+ )
98
+ try:
99
+ result = provider.extract(detection, context)
100
+ except Exception as e: # provider-level uncaught — never crash the CLI
101
+ tb = traceback.format_exc()
102
+ logger.emit({
103
+ "ts": now_iso(),
104
+ "kind": provider.kind,
105
+ "source_id": detection.source_id,
106
+ "status": "failed",
107
+ "error": f"unhandled: {e}",
108
+ "traceback": tb,
109
+ })
110
+ write_failure_stub(
111
+ out_dir,
112
+ slug=detection.source_id,
113
+ source=detection.raw,
114
+ source_id=detection.source_id,
115
+ kind=provider.kind,
116
+ title=None,
117
+ exit_code=EXIT_CODES["PROVIDER_PRIMARY_FAILED"],
118
+ extractor_attempted=[provider.kind],
119
+ error=str(e),
120
+ )
121
+ return EXIT_CODES["PROVIDER_PRIMARY_FAILED"]
122
+
123
+ if result.status == "success":
124
+ write_success(out_dir, result.metadata.slug, result)
125
+ logger.emit({
126
+ "ts": now_iso(),
127
+ "kind": provider.kind,
128
+ "source_id": detection.source_id,
129
+ "status": "success",
130
+ "slug": result.metadata.slug,
131
+ })
132
+ return EXIT_CODES["SUCCESS"]
133
+
134
+ # status == "failed"
135
+ write_failure_stub(
136
+ out_dir,
137
+ slug=result.metadata.slug,
138
+ source=detection.raw,
139
+ source_id=detection.source_id,
140
+ kind=provider.kind,
141
+ title=result.metadata.title or None,
142
+ exit_code=result.exit_code or EXIT_CODES["PROVIDER_PRIMARY_FAILED"],
143
+ extractor_attempted=[provider.kind],
144
+ error=result.error or "unknown failure",
145
+ )
146
+ logger.emit({
147
+ "ts": now_iso(),
148
+ "kind": provider.kind,
149
+ "source_id": detection.source_id,
150
+ "status": "failed",
151
+ "error": result.error,
152
+ })
153
+ return result.exit_code or EXIT_CODES["PROVIDER_PRIMARY_FAILED"]
154
+
155
+
156
+ def register_defaults(registry: ProviderRegistry) -> None:
157
+ """Register the v1 providers.
158
+
159
+ Dispatch order matters — first-match-wins:
160
+ 1. YouTube (specific host pattern)
161
+ 2. PDF (specific file suffix; must run before HTML so .pdf URLs
162
+ don't get caught by HTML's broad scheme match)
163
+ 3. Docs (specific file suffixes — docx/xlsx/pptx/epub; same reason
164
+ as PDF — register before HTML)
165
+ 4. HTML (catch-all for any other http(s) URL)
166
+ """
167
+ from .providers.docs.docs import DocsProvider
168
+ from .providers.html.html import HtmlProvider
169
+ from .providers.pdf.pdf import PdfProvider
170
+ from .providers.youtube.youtube import YouTubeProvider
171
+ registry.register(YouTubeProvider())
172
+ registry.register(PdfProvider())
173
+ registry.register(DocsProvider())
174
+ registry.register(HtmlProvider())
@@ -0,0 +1,46 @@
1
+ """NDJSON event logger.
2
+
3
+ Every arcus invocation appends structured events to
4
+ `<out_dir>/.log/extract-log.ndjson`. Consumers parse this for
5
+ audit logging. Optionally mirrors to stderr (`--json-log` flag)
6
+ so subprocess callers can read events without touching disk.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import json
12
+ import sys
13
+ from datetime import datetime, timezone
14
+ from pathlib import Path
15
+ from typing import Any
16
+
17
+
18
+ def now_iso() -> str:
19
+ """UTC ISO-8601 timestamp like '2026-05-17T19:42:01.123456+00:00'."""
20
+ return datetime.now(timezone.utc).isoformat()
21
+
22
+
23
+ class EventLogger:
24
+ """Append-only NDJSON event logger."""
25
+
26
+ def __init__(self, out_dir: Path, *, json_log_stderr: bool = False) -> None:
27
+ self.out_dir = out_dir
28
+ self.json_log_stderr = json_log_stderr
29
+ self._log_dir = out_dir / ".log"
30
+ self._log_file = self._log_dir / "extract-log.ndjson"
31
+ self._dir_ready = False
32
+
33
+ def emit(self, event: dict[str, Any]) -> None:
34
+ """Append a single event. Creates the .log directory lazily."""
35
+ line = json.dumps(event, ensure_ascii=False) + "\n"
36
+
37
+ if self.json_log_stderr:
38
+ sys.stderr.write(line)
39
+ sys.stderr.flush()
40
+
41
+ if not self._dir_ready:
42
+ self._log_dir.mkdir(parents=True, exist_ok=True)
43
+ self._dir_ready = True
44
+
45
+ with self._log_file.open("a", encoding="utf-8") as f:
46
+ f.write(line)
@@ -0,0 +1,86 @@
1
+ """Provider Protocol — every content-extraction provider implements this shape.
2
+
3
+ This file is documentation by code. Concrete providers live under
4
+ `providers/<kind>/`. The Protocol is `@runtime_checkable` so the factory
5
+ can `isinstance(p, Provider)` for sanity in tests, though duck-typing is
6
+ the contract; the Protocol is not enforced beyond static type checking.
7
+
8
+ Lifecycle:
9
+ 1. Factory.detect(input) walks registered providers calling .matches(input).
10
+ First non-None DetectionResult wins.
11
+ 2. Caller passes detection to provider.extract(detection, context).
12
+ 3. Provider returns ExtractionResult; caller writes via shared writer.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ from dataclasses import dataclass
18
+ from pathlib import Path
19
+ from typing import Protocol, runtime_checkable
20
+
21
+ from .types import DetectionResult, ExtractionResult
22
+
23
+
24
+ @dataclass
25
+ class ExtractionContext:
26
+ """Context passed to provider.extract().
27
+
28
+ arcus is a single-source download/extraction layer — see
29
+ feedback-arcus-pure-download-layer. Providers don't recurse; the
30
+ `factory` field is reserved/unused by current providers.
31
+ """
32
+
33
+ out_dir: Path
34
+ work_dir: Path
35
+ notebook_tag: str | None = None
36
+ keep_intermediates: bool = False
37
+ # Reserved; no current provider recurses (per the pure-download-layer rule).
38
+ factory: "Factory | None" = None # forward-ref to break import cycle
39
+
40
+
41
+ @runtime_checkable
42
+ class Provider(Protocol):
43
+ """Single content-extraction provider."""
44
+
45
+ kind: str
46
+ """Stable identifier — e.g. 'youtube', 'html', 'pdf', 'athena_topic'."""
47
+
48
+ def matches(self, raw_input: str) -> DetectionResult | None:
49
+ """Pure: return parsed detection if this provider handles the input.
50
+
51
+ No network, no file IO. Detection uses string shape only.
52
+ """
53
+ ...
54
+
55
+ def predict_slug(self, detection: DetectionResult) -> str:
56
+ """Return the bare (pre-disambiguation) slug that `extract()` will use.
57
+
58
+ The factory calls this before extraction so it can cache-check the
59
+ right filename. The slug MUST match what `extract()` will eventually
60
+ assign to `ExtractionResult.metadata.slug` (modulo collision
61
+ disambiguation, which the writer handles per-file).
62
+
63
+ Providers MAY fetch metadata here. Heavy fetches should cache
64
+ internally so `extract()` doesn't repeat the work.
65
+ """
66
+ ...
67
+
68
+ def extract(
69
+ self,
70
+ detection: DetectionResult,
71
+ context: ExtractionContext,
72
+ ) -> ExtractionResult:
73
+ """Fetch + transform the content. Network IO + filesystem allowed.
74
+
75
+ Returns ExtractionResult with status='success' or 'failed'.
76
+ Single-source only — providers do not recurse or aggregate.
77
+ """
78
+ ...
79
+
80
+
81
+ # `ExtractionContext.factory` is typed as the string `"Factory | None"`.
82
+ # We deliberately do NOT import Factory at module load time — `from __future__
83
+ # import annotations` keeps every annotation as a string, and dataclasses
84
+ # never resolves it. Composite providers that need the live class can
85
+ # `from .factory import Factory` lazily inside a method body without
86
+ # creating an import cycle.