arcus-provider-runtime 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30) hide show
  1. arcus/provider_runtime/__init__.py +25 -0
  2. arcus/provider_runtime/factory.py +174 -0
  3. arcus/provider_runtime/log.py +46 -0
  4. arcus/provider_runtime/provider_interface.py +86 -0
  5. arcus/provider_runtime/providers/__init__.py +0 -0
  6. arcus/provider_runtime/providers/_shared/__init__.py +0 -0
  7. arcus/provider_runtime/providers/_shared/file_extract.py +433 -0
  8. arcus/provider_runtime/providers/docs/__init__.py +0 -0
  9. arcus/provider_runtime/providers/docs/docs.py +230 -0
  10. arcus/provider_runtime/providers/html/__init__.py +0 -0
  11. arcus/provider_runtime/providers/html/_athena_fetch_page.py +1272 -0
  12. arcus/provider_runtime/providers/html/html.py +231 -0
  13. arcus/provider_runtime/providers/html/html2md.mjs +202 -0
  14. arcus/provider_runtime/providers/pdf/__init__.py +0 -0
  15. arcus/provider_runtime/providers/pdf/pdf.py +230 -0
  16. arcus/provider_runtime/providers/youtube/__init__.py +0 -0
  17. arcus/provider_runtime/providers/youtube/nlm_fallback.py +126 -0
  18. arcus/provider_runtime/providers/youtube/nlm_limit.py +69 -0
  19. arcus/provider_runtime/providers/youtube/url.py +45 -0
  20. arcus/provider_runtime/providers/youtube/vtt.py +102 -0
  21. arcus/provider_runtime/providers/youtube/youtube.py +231 -0
  22. arcus/provider_runtime/providers/youtube/ytdlp_adapter.py +155 -0
  23. arcus/provider_runtime/registry.py +28 -0
  24. arcus/provider_runtime/slug.py +61 -0
  25. arcus/provider_runtime/types.py +71 -0
  26. arcus/provider_runtime/writer.py +163 -0
  27. arcus_provider_runtime-0.3.1.dist-info/METADATA +94 -0
  28. arcus_provider_runtime-0.3.1.dist-info/RECORD +30 -0
  29. arcus_provider_runtime-0.3.1.dist-info/WHEEL +4 -0
  30. arcus_provider_runtime-0.3.1.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,25 @@
1
+ """arcus provider-runtime public API."""
2
+
3
+ from .factory import Factory, register_defaults
4
+ from .provider_interface import ExtractionContext, Provider
5
+ from .registry import ProviderRegistry
6
+ from .types import (
7
+ EXIT_CODES,
8
+ DetectionResult,
9
+ ExtractionResult,
10
+ Segment,
11
+ SourceMetadata,
12
+ )
13
+
14
+ __all__ = [
15
+ "EXIT_CODES",
16
+ "DetectionResult",
17
+ "ExtractionContext",
18
+ "ExtractionResult",
19
+ "Factory",
20
+ "Provider",
21
+ "ProviderRegistry",
22
+ "Segment",
23
+ "SourceMetadata",
24
+ "register_defaults",
25
+ ]
@@ -0,0 +1,174 @@
1
+ """Factory: end-to-end orchestration of detection → cache check → extract → write."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+ import tempfile
7
+ import traceback
8
+ from pathlib import Path
9
+
10
+ from .log import EventLogger, now_iso
11
+ from .provider_interface import ExtractionContext, Provider
12
+ from .registry import ProviderRegistry
13
+ from .types import EXIT_CODES
14
+ from .writer import cache_hit_exists, write_failure_stub, write_success
15
+
16
+
17
class Factory:
    """Own a ProviderRegistry and drive one-shot extraction through it.

    `run()` is the end-to-end pipeline: detect → cache check → extract →
    write. Every stage appends a structured event via `EventLogger`.
    """

    def __init__(self, registry: ProviderRegistry) -> None:
        self.registry = registry

    def detect(self, raw_input: str):
        """Delegate detection to the registry and return its result unchanged.

        `run()` treats that result as either None (nothing matched) or a
        `(provider, detection)` pair.
        """
        return self.registry.detect(raw_input)

    def run(
        self,
        raw_input: str,
        *,
        out_dir: Path,
        force: bool = False,
        json_log: bool = False,
        keep_intermediates: bool = False,
        notebook_tag: str | None = None,
    ) -> int:
        """Detect → cache check → extract → write outputs. Returns exit code.

        Args:
            raw_input: The user-supplied source string handed to the registry.
            out_dir: Directory receiving outputs and the event log.
            force: When True, skip the cache check and always extract.
            json_log: Passed to `EventLogger` as `json_log_stderr`.
            keep_intermediates: Forwarded to the provider via the context.
            notebook_tag: Forwarded to the provider via the context.

        Returns:
            `EXIT_CODES["EXTRACTORS_EXHAUSTED"]` when no provider matches;
            `EXIT_CODES["SUCCESS"]` on a cache hit or successful extraction;
            `EXIT_CODES["PROVIDER_PRIMARY_FAILED"]` when the provider raises;
            otherwise the provider-reported `exit_code`, falling back to
            `EXIT_CODES["PROVIDER_PRIMARY_FAILED"]` when that is falsy.
        """
        logger = EventLogger(out_dir, json_log_stderr=json_log)
        logger.emit({"ts": now_iso(), "raw": raw_input, "status": "started"})

        match = self.registry.detect(raw_input)
        if match is None:
            logger.emit({"ts": now_iso(), "raw": raw_input, "status": "failed",
                         "error": "no provider matched"})
            return EXIT_CODES["EXTRACTORS_EXHAUSTED"]

        provider, detection = match
        logger.emit({
            "ts": now_iso(),
            "kind": provider.kind,
            "source_id": detection.source_id,
            "event": "detected",
        })

        # Cache check uses the provider's predicted slug. If the on-disk
        # file's frontmatter `source_id` matches this detection's source_id,
        # we trust the file and short-circuit. Disambiguated forms
        # (`<slug>--<8char>.md`) are checked too — see cache_hit_exists.
        try:
            predicted_slug = provider.predict_slug(detection)
        except Exception as e:
            # predict_slug failed (e.g., metadata fetch hit network error).
            # Fall through to extraction — the real extract() call will
            # surface the same error in a structured way.
            logger.emit({
                "ts": now_iso(),
                "kind": provider.kind,
                "source_id": detection.source_id,
                "event": "predict_slug_failed",
                "error": str(e),
            })
            predicted_slug = None

        if (
            not force
            and predicted_slug is not None
            and cache_hit_exists(out_dir, predicted_slug, detection.source_id)
        ):
            logger.emit({
                "ts": now_iso(),
                "kind": provider.kind,
                "source_id": detection.source_id,
                "status": "cache_hit",
                "slug": predicted_slug,
            })
            return EXIT_CODES["SUCCESS"]

        # Sanitize source_id for use as a tempdir prefix — URLs and local
        # paths both contain '/' which mkdtemp interprets as a path separator.
        safe_prefix = re.sub(r"[^A-Za-z0-9._-]", "_", detection.source_id)[:40]
        # Everything below stays inside the `with` so the work directory
        # still exists while results are written.
        with tempfile.TemporaryDirectory(prefix=f"arcus-{safe_prefix}-") as tmp:
            context = ExtractionContext(
                out_dir=out_dir,
                work_dir=Path(tmp),
                notebook_tag=notebook_tag,
                keep_intermediates=keep_intermediates,
                factory=self,
            )
            try:
                result = provider.extract(detection, context)
            except Exception as e:  # provider-level uncaught — never crash the CLI
                tb = traceback.format_exc()
                logger.emit({
                    "ts": now_iso(),
                    "kind": provider.kind,
                    "source_id": detection.source_id,
                    "status": "failed",
                    "error": f"unhandled: {e}",
                    "traceback": tb,
                })
                # NOTE(review): the stub slug here is the raw source_id, which
                # (per the tempdir comment above) may contain '/'. Confirm that
                # write_failure_stub sanitizes it before building a filename.
                write_failure_stub(
                    out_dir,
                    slug=detection.source_id,
                    source=detection.raw,
                    source_id=detection.source_id,
                    kind=provider.kind,
                    title=None,
                    exit_code=EXIT_CODES["PROVIDER_PRIMARY_FAILED"],
                    extractor_attempted=[provider.kind],
                    error=str(e),
                )
                return EXIT_CODES["PROVIDER_PRIMARY_FAILED"]

            if result.status == "success":
                write_success(out_dir, result.metadata.slug, result)
                logger.emit({
                    "ts": now_iso(),
                    "kind": provider.kind,
                    "source_id": detection.source_id,
                    "status": "success",
                    "slug": result.metadata.slug,
                })
                return EXIT_CODES["SUCCESS"]

            # Any other status is handled as a failure. Resolve the fallbacks
            # once so the stub, the log event, and the return value agree.
            failure_code = result.exit_code or EXIT_CODES["PROVIDER_PRIMARY_FAILED"]
            failure_reason = result.error or "unknown failure"
            write_failure_stub(
                out_dir,
                slug=result.metadata.slug,
                source=detection.raw,
                source_id=detection.source_id,
                kind=provider.kind,
                title=result.metadata.title or None,
                exit_code=failure_code,
                extractor_attempted=[provider.kind],
                error=failure_reason,
            )
            logger.emit({
                "ts": now_iso(),
                "kind": provider.kind,
                "source_id": detection.source_id,
                "status": "failed",
                "error": failure_reason,
            })
            return failure_code
154
+
155
+
156
def register_defaults(registry: ProviderRegistry) -> None:
    """Populate `registry` with the v1 provider set.

    The registry is first-match-wins, so registration order is the
    dispatch order:
      1. YouTube — matches a specific host pattern.
      2. PDF — matches a specific file suffix; has to precede HTML so a
         .pdf URL is not swallowed by HTML's broad scheme match.
      3. Docs — matches docx/xlsx/pptx/epub suffixes; precedes HTML for
         the same reason as PDF.
      4. HTML — catch-all for every remaining http(s) URL.
    """
    # Imported lazily, inside the function, exactly as the providers are needed.
    from .providers.docs.docs import DocsProvider
    from .providers.html.html import HtmlProvider
    from .providers.pdf.pdf import PdfProvider
    from .providers.youtube.youtube import YouTubeProvider

    dispatch_order = (YouTubeProvider, PdfProvider, DocsProvider, HtmlProvider)
    for provider_cls in dispatch_order:
        registry.register(provider_cls())
@@ -0,0 +1,46 @@
1
+ """NDJSON event logger.
2
+
3
+ Every arcus invocation appends structured events to
4
+ `<out_dir>/.log/extract-log.ndjson`. Consumers parse this for
5
+ audit logging. Optionally mirrors to stderr (`--json-log` flag)
6
+ so subprocess callers can read events without touching disk.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import json
12
+ import sys
13
+ from datetime import datetime, timezone
14
+ from pathlib import Path
15
+ from typing import Any
16
+
17
+
18
def now_iso() -> str:
    """UTC ISO-8601 timestamp like '2026-05-17T19:42:01.123456+00:00'.

    `timespec="microseconds"` forces the fractional part to be emitted even
    when the microsecond component is exactly zero, so every timestamp has
    the same width and shape as the example above.
    """
    return datetime.now(timezone.utc).isoformat(timespec="microseconds")
21
+
22
+
23
class EventLogger:
    """Appends events, one JSON object per line, to `<out_dir>/.log/extract-log.ndjson`."""

    def __init__(self, out_dir: Path, *, json_log_stderr: bool = False) -> None:
        log_dir = out_dir / ".log"
        self.out_dir = out_dir
        self.json_log_stderr = json_log_stderr
        self._log_dir = log_dir
        self._log_file = log_dir / "extract-log.ndjson"
        # Flipped to True after the first emit() has created the directory.
        self._dir_ready = False

    def emit(self, event: dict[str, Any]) -> None:
        """Serialize `event` and append it to the log file.

        When `json_log_stderr` is set, the same line is written to stderr
        (and flushed) before the file is touched. The `.log` directory is
        created on the first call rather than at construction time.
        """
        record = f"{json.dumps(event, ensure_ascii=False)}\n"

        if self.json_log_stderr:
            err_stream = sys.stderr
            err_stream.write(record)
            err_stream.flush()

        if not self._dir_ready:
            self._log_dir.mkdir(parents=True, exist_ok=True)
            self._dir_ready = True

        with self._log_file.open(mode="a", encoding="utf-8") as sink:
            sink.write(record)
@@ -0,0 +1,86 @@
1
+ """Provider Protocol — every content-extraction provider implements this shape.
2
+
3
+ This file is documentation by code. Concrete providers live under
4
+ `providers/<kind>/`. The Protocol is `@runtime_checkable` so the factory
5
+ can `isinstance(p, Provider)` for sanity in tests, though duck-typing is
6
+ the contract; the Protocol is not enforced beyond static type checking.
7
+
8
+ Lifecycle:
9
+ 1. Factory.detect(input) walks registered providers calling .matches(input).
10
+ First non-None DetectionResult wins.
11
+ 2. Caller passes detection to provider.extract(detection, context).
12
+ 3. Provider returns ExtractionResult; caller writes via shared writer.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ from dataclasses import dataclass
18
+ from pathlib import Path
19
+ from typing import Protocol, runtime_checkable
20
+
21
+ from .types import DetectionResult, ExtractionResult
22
+
23
+
24
@dataclass
class ExtractionContext:
    """Per-run settings handed to `provider.extract()`.

    arcus is a single-source download/extraction layer (see
    feedback-arcus-pure-download-layer), so providers do not recurse and
    the `factory` field is reserved: no current provider uses it.
    """

    # Output directory for the run.
    out_dir: Path
    # Working directory for the run.
    work_dir: Path
    notebook_tag: str | None = None
    keep_intermediates: bool = False
    # Reserved; unused by current providers (pure-download-layer rule).
    # Kept as a forward reference so this module need not import Factory.
    factory: "Factory | None" = None
39
+
40
+
41
@runtime_checkable
class Provider(Protocol):
    """Structural interface for one content-extraction provider."""

    # Stable identifier — e.g. 'youtube', 'html', 'pdf', 'athena_topic'.
    kind: str

    def matches(self, raw_input: str) -> DetectionResult | None:
        """Return a parsed detection when this provider handles `raw_input`.

        Must be pure: decide from the shape of the string alone, with no
        network access and no file IO. Return None when the input is not
        handled by this provider.
        """
        ...

    def predict_slug(self, detection: DetectionResult) -> str:
        """Return the bare (pre-disambiguation) slug `extract()` will use.

        The factory asks for this ahead of extraction so its cache check
        looks at the right filename. The value MUST equal what `extract()`
        later stores in `ExtractionResult.metadata.slug`, apart from
        collision disambiguation, which the writer applies per file.

        Fetching metadata here is allowed. Expensive fetches should be
        cached internally so `extract()` does not redo them.
        """
        ...

    def extract(
        self,
        detection: DetectionResult,
        context: ExtractionContext,
    ) -> ExtractionResult:
        """Fetch and transform the content; network and filesystem IO allowed.

        Returns an ExtractionResult whose status is 'success' or 'failed'.
        Strictly single-source: providers neither recurse nor aggregate.
        """
        ...
79
+
80
+
81
+ # `ExtractionContext.factory` is typed as the string `"Factory | None"`.
82
+ # We deliberately do NOT import Factory at module load time — `from __future__
83
+ # import annotations` keeps every annotation as a string, and dataclasses
84
+ # never resolves it. Composite providers that need the live class can
85
+ # `from .factory import Factory` lazily inside a method body without
86
+ # creating an import cycle.
File without changes
File without changes