arcus-provider-runtime 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arcus/provider_runtime/__init__.py +25 -0
- arcus/provider_runtime/factory.py +174 -0
- arcus/provider_runtime/log.py +46 -0
- arcus/provider_runtime/provider_interface.py +86 -0
- arcus/provider_runtime/providers/__init__.py +0 -0
- arcus/provider_runtime/providers/_shared/__init__.py +0 -0
- arcus/provider_runtime/providers/_shared/file_extract.py +433 -0
- arcus/provider_runtime/providers/docs/__init__.py +0 -0
- arcus/provider_runtime/providers/docs/docs.py +230 -0
- arcus/provider_runtime/providers/html/__init__.py +0 -0
- arcus/provider_runtime/providers/html/_athena_fetch_page.py +1272 -0
- arcus/provider_runtime/providers/html/html.py +231 -0
- arcus/provider_runtime/providers/html/html2md.mjs +202 -0
- arcus/provider_runtime/providers/pdf/__init__.py +0 -0
- arcus/provider_runtime/providers/pdf/pdf.py +230 -0
- arcus/provider_runtime/providers/youtube/__init__.py +0 -0
- arcus/provider_runtime/providers/youtube/nlm_fallback.py +126 -0
- arcus/provider_runtime/providers/youtube/nlm_limit.py +69 -0
- arcus/provider_runtime/providers/youtube/url.py +45 -0
- arcus/provider_runtime/providers/youtube/vtt.py +102 -0
- arcus/provider_runtime/providers/youtube/youtube.py +231 -0
- arcus/provider_runtime/providers/youtube/ytdlp_adapter.py +155 -0
- arcus/provider_runtime/registry.py +28 -0
- arcus/provider_runtime/slug.py +61 -0
- arcus/provider_runtime/types.py +71 -0
- arcus/provider_runtime/writer.py +163 -0
- arcus_provider_runtime-0.3.1.dist-info/METADATA +94 -0
- arcus_provider_runtime-0.3.1.dist-info/RECORD +30 -0
- arcus_provider_runtime-0.3.1.dist-info/WHEEL +4 -0
- arcus_provider_runtime-0.3.1.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
"""arcus provider-runtime public API."""
|
|
2
|
+
|
|
3
|
+
from .factory import Factory, register_defaults
|
|
4
|
+
from .provider_interface import ExtractionContext, Provider
|
|
5
|
+
from .registry import ProviderRegistry
|
|
6
|
+
from .types import (
|
|
7
|
+
EXIT_CODES,
|
|
8
|
+
DetectionResult,
|
|
9
|
+
ExtractionResult,
|
|
10
|
+
Segment,
|
|
11
|
+
SourceMetadata,
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
__all__ = [
|
|
15
|
+
"EXIT_CODES",
|
|
16
|
+
"DetectionResult",
|
|
17
|
+
"ExtractionContext",
|
|
18
|
+
"ExtractionResult",
|
|
19
|
+
"Factory",
|
|
20
|
+
"Provider",
|
|
21
|
+
"ProviderRegistry",
|
|
22
|
+
"Segment",
|
|
23
|
+
"SourceMetadata",
|
|
24
|
+
"register_defaults",
|
|
25
|
+
]
|
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
"""Factory: end-to-end orchestration of detection → cache check → extract → write."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
import tempfile
|
|
7
|
+
import traceback
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
from .log import EventLogger, now_iso
|
|
11
|
+
from .provider_interface import ExtractionContext, Provider
|
|
12
|
+
from .registry import ProviderRegistry
|
|
13
|
+
from .types import EXIT_CODES
|
|
14
|
+
from .writer import cache_hit_exists, write_failure_stub, write_success
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class Factory:
    """Run one extraction end to end: detect → cache check → extract → write.

    Owns a ProviderRegistry; `.run(input, out_dir=...)` is the one-shot entry
    point and reports its outcome as an exit code.
    """

    def __init__(self, registry: ProviderRegistry) -> None:
        self.registry = registry

    def detect(self, raw_input: str):
        """Delegate detection to the registry and return its answer unchanged.

        `run()` relies on that answer being either None (no provider matched)
        or a `(provider, detection)` pair.
        """
        return self.registry.detect(raw_input)

    def run(
        self,
        raw_input: str,
        *,
        out_dir: Path,
        force: bool = False,
        json_log: bool = False,
        keep_intermediates: bool = False,
        notebook_tag: str | None = None,
    ) -> int:
        """Detect → cache check → extract → write outputs. Returns exit code.

        Args:
            raw_input: The input string handed to the registry for detection.
            out_dir: Directory that receives outputs, failure stubs and the
                event log.
            force: When True, skip the cache check and always extract.
            json_log: Forwarded to EventLogger as `json_log_stderr`.
            keep_intermediates: Forwarded to the provider via ExtractionContext.
            notebook_tag: Forwarded to the provider via ExtractionContext.

        Returns:
            EXIT_CODES["EXTRACTORS_EXHAUSTED"] when no provider matches;
            EXIT_CODES["SUCCESS"] on a cache hit or a successful extraction;
            EXIT_CODES["PROVIDER_PRIMARY_FAILED"] when the provider raises;
            otherwise the provider's own `exit_code`, falling back to
            EXIT_CODES["PROVIDER_PRIMARY_FAILED"] when that is falsy.
        """
        logger = EventLogger(out_dir, json_log_stderr=json_log)
        logger.emit({"ts": now_iso(), "raw": raw_input, "status": "started"})

        match = self.registry.detect(raw_input)
        if match is None:
            logger.emit({"ts": now_iso(), "raw": raw_input, "status": "failed",
                         "error": "no provider matched"})
            return EXIT_CODES["EXTRACTORS_EXHAUSTED"]

        provider, detection = match
        logger.emit({
            "ts": now_iso(),
            "kind": provider.kind,
            "source_id": detection.source_id,
            "event": "detected",
        })

        # Cache check uses the provider's predicted slug. If the on-disk
        # file's frontmatter `source_id` matches this detection's source_id,
        # we trust the file and short-circuit. Disambiguated forms
        # (`<slug>--<8char>.md`) are checked too — see cache_hit_exists.
        try:
            predicted_slug = provider.predict_slug(detection)
        except Exception as e:
            # predict_slug failed (e.g., metadata fetch hit network error).
            # Fall through to extraction — the real extract() call will
            # surface the same error in a structured way.
            logger.emit({
                "ts": now_iso(),
                "kind": provider.kind,
                "source_id": detection.source_id,
                "event": "predict_slug_failed",
                "error": str(e),
            })
            predicted_slug = None

        if (
            not force
            and predicted_slug is not None
            and cache_hit_exists(out_dir, predicted_slug, detection.source_id)
        ):
            logger.emit({
                "ts": now_iso(),
                "kind": provider.kind,
                "source_id": detection.source_id,
                "status": "cache_hit",
                "slug": predicted_slug,
            })
            return EXIT_CODES["SUCCESS"]

        # Sanitize source_id for use as a tempdir prefix — URLs and local
        # paths both contain '/' which mkdtemp interprets as a path separator.
        safe_prefix = re.sub(r"[^A-Za-z0-9._-]", "_", detection.source_id)[:40]
        # The work directory only lives for the duration of this `with` block.
        with tempfile.TemporaryDirectory(prefix=f"arcus-{safe_prefix}-") as tmp:
            context = ExtractionContext(
                out_dir=out_dir,
                work_dir=Path(tmp),
                notebook_tag=notebook_tag,
                keep_intermediates=keep_intermediates,
                factory=self,
            )
            try:
                result = provider.extract(detection, context)
            except Exception as e:  # provider-level uncaught — never crash the CLI
                tb = traceback.format_exc()
                logger.emit({
                    "ts": now_iso(),
                    "kind": provider.kind,
                    "source_id": detection.source_id,
                    "status": "failed",
                    "error": f"unhandled: {e}",
                    "traceback": tb,
                })
                # File the stub under the slug the rest of the pipeline uses
                # (cache check, success and handled-failure paths). Only fall
                # back to the raw source_id when slug prediction failed.
                # NOTE(review): source_id may contain '/' (see the tempdir
                # prefix sanitisation above) — confirm write_failure_stub
                # sanitises the slug it receives.
                stub_slug = predicted_slug or detection.source_id
                write_failure_stub(
                    out_dir,
                    slug=stub_slug,
                    source=detection.raw,
                    source_id=detection.source_id,
                    kind=provider.kind,
                    title=None,
                    exit_code=EXIT_CODES["PROVIDER_PRIMARY_FAILED"],
                    extractor_attempted=[provider.kind],
                    error=str(e),
                )
                return EXIT_CODES["PROVIDER_PRIMARY_FAILED"]

            if result.status == "success":
                write_success(out_dir, result.metadata.slug, result)
                logger.emit({
                    "ts": now_iso(),
                    "kind": provider.kind,
                    "source_id": detection.source_id,
                    "status": "success",
                    "slug": result.metadata.slug,
                })
                return EXIT_CODES["SUCCESS"]

            # status == "failed"
            # Normalize once so the stub, the log event and the return value
            # can never disagree about why (or with which code) we failed.
            failure_code = result.exit_code or EXIT_CODES["PROVIDER_PRIMARY_FAILED"]
            failure_reason = result.error or "unknown failure"
            write_failure_stub(
                out_dir,
                slug=result.metadata.slug,
                source=detection.raw,
                source_id=detection.source_id,
                kind=provider.kind,
                title=result.metadata.title or None,
                exit_code=failure_code,
                extractor_attempted=[provider.kind],
                error=failure_reason,
            )
            logger.emit({
                "ts": now_iso(),
                "kind": provider.kind,
                "source_id": detection.source_id,
                "status": "failed",
                "error": failure_reason,
            })
            return failure_code
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def register_defaults(registry: ProviderRegistry) -> None:
    """Populate `registry` with the built-in v1 providers.

    Registration order is dispatch order (first match wins), so the narrow
    matchers go first and the catch-all goes last:
      1. YouTube — specific host pattern.
      2. PDF — specific file suffix; has to precede HTML so `.pdf` URLs are
         not swallowed by HTML's broad scheme match.
      3. Docs — docx/xlsx/pptx/epub suffixes; precedes HTML for the same
         reason as PDF.
      4. HTML — catch-all for any other http(s) URL.
    """
    # Imported lazily, inside the function, exactly as before.
    from .providers.docs.docs import DocsProvider
    from .providers.html.html import HtmlProvider
    from .providers.pdf.pdf import PdfProvider
    from .providers.youtube.youtube import YouTubeProvider

    # Tuple order IS the dispatch order — do not sort or regroup.
    dispatch_order = (YouTubeProvider, PdfProvider, DocsProvider, HtmlProvider)
    for provider_cls in dispatch_order:
        registry.register(provider_cls())
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"""NDJSON event logger.
|
|
2
|
+
|
|
3
|
+
Every arcus invocation appends structured events to
|
|
4
|
+
`<out_dir>/.log/extract-log.ndjson`. Consumers parse this for
|
|
5
|
+
audit logging. Optionally mirrors to stderr (`--json-log` flag)
|
|
6
|
+
so subprocess callers can read events without touching disk.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import json
|
|
12
|
+
import sys
|
|
13
|
+
from datetime import datetime, timezone
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
from typing import Any
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def now_iso() -> str:
    """Return the current UTC time as a fixed-width ISO-8601 timestamp.

    Example: '2026-05-17T19:42:01.123456+00:00'.

    `timespec="microseconds"` keeps the six fractional digits even when the
    microsecond component is exactly zero; plain `isoformat()` omits them in
    that case, which would make the timestamp width vary between log lines.
    """
    return datetime.now(timezone.utc).isoformat(timespec="microseconds")
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class EventLogger:
    """Append-only logger that writes one JSON object per line (NDJSON).

    Events go to `<out_dir>/.log/extract-log.ndjson`; when
    `json_log_stderr` is set, each line is also echoed to stderr first.
    """

    def __init__(self, out_dir: Path, *, json_log_stderr: bool = False) -> None:
        self.out_dir = out_dir
        self.json_log_stderr = json_log_stderr
        log_dir = out_dir / ".log"
        self._log_dir = log_dir
        self._log_file = log_dir / "extract-log.ndjson"
        # Nothing is created on disk until the first emit().
        self._dir_ready = False

    def emit(self, event: dict[str, Any]) -> None:
        """Serialize `event` and append it as a single line.

        The `.log` directory is created on the first call only.
        """
        record = json.dumps(event, ensure_ascii=False) + "\n"

        # Mirror to stderr before touching disk, flushing immediately.
        if self.json_log_stderr:
            stream = sys.stderr
            stream.write(record)
            stream.flush()

        self._ensure_log_dir()
        with open(self._log_file, mode="a", encoding="utf-8") as sink:
            sink.write(record)

    def _ensure_log_dir(self) -> None:
        """Create the log directory once; later calls are no-ops."""
        if self._dir_ready:
            return
        self._log_dir.mkdir(parents=True, exist_ok=True)
        self._dir_ready = True
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
"""Provider Protocol — every content-extraction provider implements this shape.
|
|
2
|
+
|
|
3
|
+
This file is documentation by code. Concrete providers live under
|
|
4
|
+
`providers/<kind>/`. The Protocol is `@runtime_checkable` so the factory
|
|
5
|
+
can `isinstance(p, Provider)` for sanity in tests, though duck-typing is
|
|
6
|
+
the contract; the Protocol is not enforced beyond static type checking.
|
|
7
|
+
|
|
8
|
+
Lifecycle:
|
|
9
|
+
1. Factory.detect(input) walks registered providers calling .matches(input).
|
|
10
|
+
First non-None DetectionResult wins.
|
|
11
|
+
2. Caller passes detection to provider.extract(detection, context).
|
|
12
|
+
3. Provider returns ExtractionResult; caller writes via shared writer.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
from dataclasses import dataclass
|
|
18
|
+
from pathlib import Path
|
|
19
|
+
from typing import Protocol, runtime_checkable
|
|
20
|
+
|
|
21
|
+
from .types import DetectionResult, ExtractionResult
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@dataclass
class ExtractionContext:
    """Context passed to provider.extract().

    arcus is a single-source download/extraction layer — see
    feedback-arcus-pure-download-layer. Providers don't recurse; the
    `factory` field is reserved/unused by current providers.

    Factory.run() builds one instance per extraction and hands it to the
    matched provider.
    """

    # Output directory the caller passed to Factory.run().
    out_dir: Path
    # Scratch directory; Factory.run() points this at a temporary directory
    # that only exists while the extraction is running.
    work_dir: Path
    # Forwarded unchanged from Factory.run(notebook_tag=...).
    # NOTE(review): meaning is provider-specific — not visible from here.
    notebook_tag: str | None = None
    # Forwarded unchanged from Factory.run(keep_intermediates=...).
    # Presumably asks providers to retain intermediate files — confirm
    # against the concrete providers.
    keep_intermediates: bool = False
    # Reserved; no current provider recurses (per the pure-download-layer rule).
    factory: "Factory | None" = None  # forward-ref to break import cycle
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@runtime_checkable
class Provider(Protocol):
    """Single content-extraction provider.

    Structural interface: any object exposing `kind`, `matches`,
    `predict_slug` and `extract` with these shapes satisfies it.
    """

    kind: str
    """Stable identifier — e.g. 'youtube', 'html', 'pdf', 'athena_topic'."""

    def matches(self, raw_input: str) -> DetectionResult | None:
        """Pure: return parsed detection if this provider handles the input.

        No network, no file IO. Detection uses string shape only.

        Args:
            raw_input: The input string to inspect.

        Returns:
            A DetectionResult when this provider claims the input, else None.
        """
        ...

    def predict_slug(self, detection: DetectionResult) -> str:
        """Return the bare (pre-disambiguation) slug that `extract()` will use.

        The factory calls this before extraction so it can cache-check the
        right filename. The slug MUST match what `extract()` will eventually
        assign to `ExtractionResult.metadata.slug` (modulo collision
        disambiguation, which the writer handles per-file).

        Providers MAY fetch metadata here. Heavy fetches should cache
        internally so `extract()` doesn't repeat the work.

        Raising is tolerated: the factory logs a `predict_slug_failed` event,
        skips the cache check and proceeds straight to `extract()`.

        Args:
            detection: The result previously returned by `matches()`.

        Returns:
            The predicted slug string.
        """
        ...

    def extract(
        self,
        detection: DetectionResult,
        context: ExtractionContext,
    ) -> ExtractionResult:
        """Fetch + transform the content. Network IO + filesystem allowed.

        Returns ExtractionResult with status='success' or 'failed'.
        Single-source only — providers do not recurse or aggregate.

        The factory reads `status`, `metadata.slug`, `metadata.title`,
        `exit_code` and `error` from the result. An exception escaping this
        method is caught by the factory, logged, and recorded as a failure
        stub rather than propagated.

        Args:
            detection: The result previously returned by `matches()`.
            context: Per-run directories and options supplied by the factory.
        """
        ...
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
# `ExtractionContext.factory` is typed as the string `"Factory | None"`.
|
|
82
|
+
# We deliberately do NOT import Factory at module load time — `from __future__
|
|
83
|
+
# import annotations` keeps every annotation as a string, and dataclasses
|
|
84
|
+
# never resolves it. Composite providers that need the live class can
|
|
85
|
+
# `from .factory import Factory` lazily inside a method body without
|
|
86
|
+
# creating an import cycle.
|
|
File without changes
|
|
File without changes
|