crawfish 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crawfish/__init__.py +420 -0
- crawfish/artifacts/__init__.py +12 -0
- crawfish/artifacts/base.py +62 -0
- crawfish/artifacts/local.py +93 -0
- crawfish/batch.py +167 -0
- crawfish/brain.py +130 -0
- crawfish/build.py +141 -0
- crawfish/ccexport.py +234 -0
- crawfish/cli.py +452 -0
- crawfish/config.py +102 -0
- crawfish/core/__init__.py +39 -0
- crawfish/core/compat.py +26 -0
- crawfish/core/context.py +96 -0
- crawfish/core/ids.py +10 -0
- crawfish/core/types.py +95 -0
- crawfish/cost.py +253 -0
- crawfish/definition/__init__.py +30 -0
- crawfish/definition/compiler.py +280 -0
- crawfish/definition/types.py +145 -0
- crawfish/deploy.py +530 -0
- crawfish/discovery.py +117 -0
- crawfish/doctor.py +118 -0
- crawfish/engine.py +54 -0
- crawfish/eval.py +178 -0
- crawfish/executor.py +244 -0
- crawfish/inspector.py +193 -0
- crawfish/ledger.py +122 -0
- crawfish/manage.py +499 -0
- crawfish/memory.py +80 -0
- crawfish/metrics.py +272 -0
- crawfish/nodes/__init__.py +61 -0
- crawfish/nodes/aggregator.py +161 -0
- crawfish/nodes/filter.py +109 -0
- crawfish/nodes/router.py +206 -0
- crawfish/nodes/sink.py +242 -0
- crawfish/nodes/source.py +146 -0
- crawfish/observe.py +184 -0
- crawfish/observer.py +312 -0
- crawfish/output.py +109 -0
- crawfish/py.typed +0 -0
- crawfish/retry.py +113 -0
- crawfish/run.py +196 -0
- crawfish/runtime/__init__.py +69 -0
- crawfish/runtime/base.py +111 -0
- crawfish/runtime/command.py +168 -0
- crawfish/runtime/context_strategy.py +203 -0
- crawfish/runtime/mcp.py +52 -0
- crawfish/runtime/mock.py +44 -0
- crawfish/runtime/prompt.py +69 -0
- crawfish/runtime/replay.py +63 -0
- crawfish/runtime/select.py +31 -0
- crawfish/runtime/stubs.py +40 -0
- crawfish/runtime/team.py +94 -0
- crawfish/sandbox.py +48 -0
- crawfish/scaffold.py +88 -0
- crawfish/secrets.py +192 -0
- crawfish/stability.py +133 -0
- crawfish/store/__init__.py +8 -0
- crawfish/store/base.py +55 -0
- crawfish/store/sqlite.py +155 -0
- crawfish/testing.py +216 -0
- crawfish/triggers.py +229 -0
- crawfish/typesystem/__init__.py +7 -0
- crawfish/typesystem/registry.py +186 -0
- crawfish/versioning/__init__.py +7 -0
- crawfish/versioning/version.py +62 -0
- crawfish/visualize.py +208 -0
- crawfish/workflow.py +187 -0
- crawfish-0.1.0.dist-info/METADATA +39 -0
- crawfish-0.1.0.dist-info/RECORD +74 -0
- crawfish-0.1.0.dist-info/WHEEL +4 -0
- crawfish-0.1.0.dist-info/entry_points.txt +2 -0
- crawfish-0.1.0.dist-info/licenses/LICENSE +201 -0
- crawfish-0.1.0.dist-info/licenses/NOTICE +8 -0
crawfish/__init__.py
ADDED
|
@@ -0,0 +1,420 @@
|
|
|
1
|
+
"""Crawfish — agents for bulk work over your data.
|
|
2
|
+
|
|
3
|
+
``Source → Batch (fan-out) → Aggregator (reduce) → Router (branch) → Sink``,
|
|
4
|
+
authored as directories and run locally via ``claude -p``.
|
|
5
|
+
|
|
6
|
+
This module re-exports the stable public surface. As primitives land (M1–M5) they
|
|
7
|
+
are added here, each placed in its stability tier.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
from crawfish.artifacts import (
|
|
13
|
+
ArtifactRef,
|
|
14
|
+
ArtifactStore,
|
|
15
|
+
LocalArtifactStore,
|
|
16
|
+
offload_if_large,
|
|
17
|
+
)
|
|
18
|
+
from crawfish.batch import Anomaly, Batch, Task
|
|
19
|
+
from crawfish.build import BuildPlan, generate_containerfile, plan_build, write_containerfile
|
|
20
|
+
from crawfish.ccexport import (
|
|
21
|
+
ClaudeCodeAgent,
|
|
22
|
+
ClaudeCodeSkill,
|
|
23
|
+
definition_to_cc_agent,
|
|
24
|
+
export_claude_code,
|
|
25
|
+
map_tools,
|
|
26
|
+
model_alias,
|
|
27
|
+
)
|
|
28
|
+
from crawfish.config import ProfileConfig, ProjectManifest, ProjectPaths, load_manifest
|
|
29
|
+
from crawfish.core import (
|
|
30
|
+
BudgetExceeded,
|
|
31
|
+
Cancelled,
|
|
32
|
+
CancelToken,
|
|
33
|
+
CostBudget,
|
|
34
|
+
Flow,
|
|
35
|
+
JSONValue,
|
|
36
|
+
Node,
|
|
37
|
+
NodeKind,
|
|
38
|
+
Parameter,
|
|
39
|
+
Policy,
|
|
40
|
+
PolicyKind,
|
|
41
|
+
RunContext,
|
|
42
|
+
new_id,
|
|
43
|
+
parameters_compatible,
|
|
44
|
+
)
|
|
45
|
+
from crawfish.cost import (
|
|
46
|
+
Budget,
|
|
47
|
+
BudgetState,
|
|
48
|
+
CostEstimate,
|
|
49
|
+
CostMeter,
|
|
50
|
+
estimate_cost,
|
|
51
|
+
spent_today,
|
|
52
|
+
)
|
|
53
|
+
from crawfish.definition import (
|
|
54
|
+
AgentSpec,
|
|
55
|
+
Coordination,
|
|
56
|
+
Definition,
|
|
57
|
+
DefinitionAssets,
|
|
58
|
+
DefinitionLoadError,
|
|
59
|
+
DefinitionRef,
|
|
60
|
+
MarketplacePackage,
|
|
61
|
+
MCPConnection,
|
|
62
|
+
Prompt,
|
|
63
|
+
TeamSpec,
|
|
64
|
+
load_definition,
|
|
65
|
+
)
|
|
66
|
+
from crawfish.deploy import (
|
|
67
|
+
DeployEntry,
|
|
68
|
+
DeployRegistry,
|
|
69
|
+
DeployStatus,
|
|
70
|
+
Supervisor,
|
|
71
|
+
deploy,
|
|
72
|
+
stop,
|
|
73
|
+
)
|
|
74
|
+
from crawfish.discovery import Registry, UnitRef
|
|
75
|
+
from crawfish.doctor import DoctorFinding, DoctorReport, diagnose
|
|
76
|
+
from crawfish.engine import Engine, run_pipeline
|
|
77
|
+
from crawfish.eval import (
|
|
78
|
+
EvalCase,
|
|
79
|
+
GoldenSet,
|
|
80
|
+
LLMJudge,
|
|
81
|
+
capture_case,
|
|
82
|
+
gate_against_baseline,
|
|
83
|
+
grade_output,
|
|
84
|
+
load_baseline,
|
|
85
|
+
save_baseline,
|
|
86
|
+
)
|
|
87
|
+
from crawfish.executor import (
|
|
88
|
+
BatchExecutor,
|
|
89
|
+
BatchRunResult,
|
|
90
|
+
CycleError,
|
|
91
|
+
DependencyGraph,
|
|
92
|
+
ExecutionPlan,
|
|
93
|
+
Roadmap,
|
|
94
|
+
)
|
|
95
|
+
from crawfish.inspector import RunReport, format_report, inspect_run, tail_events
|
|
96
|
+
from crawfish.ledger import ExecState, ExecutionLedger
|
|
97
|
+
from crawfish.manage import PipelineStatus, format_table, manage_list, restart_target
|
|
98
|
+
from crawfish.memory import Memory
|
|
99
|
+
from crawfish.metrics import (
|
|
100
|
+
Benchmark,
|
|
101
|
+
Metric,
|
|
102
|
+
Rubric,
|
|
103
|
+
compare,
|
|
104
|
+
confidence_threshold,
|
|
105
|
+
field_present,
|
|
106
|
+
is_nonempty,
|
|
107
|
+
is_regression,
|
|
108
|
+
output_number,
|
|
109
|
+
)
|
|
110
|
+
from crawfish.nodes import (
|
|
111
|
+
Aggregator,
|
|
112
|
+
ApprovalRequired,
|
|
113
|
+
Classifier,
|
|
114
|
+
Filter,
|
|
115
|
+
GitHubPRSink,
|
|
116
|
+
LinearSink,
|
|
117
|
+
PullRequestSource,
|
|
118
|
+
RepoSource,
|
|
119
|
+
Router,
|
|
120
|
+
Sink,
|
|
121
|
+
Source,
|
|
122
|
+
TargetMustBeStaticError,
|
|
123
|
+
UnroutableLabelError,
|
|
124
|
+
collect,
|
|
125
|
+
concat,
|
|
126
|
+
count,
|
|
127
|
+
dedupe,
|
|
128
|
+
definition_reducer,
|
|
129
|
+
fan_in,
|
|
130
|
+
fan_out,
|
|
131
|
+
field_equals,
|
|
132
|
+
field_matches,
|
|
133
|
+
limit,
|
|
134
|
+
title_contains,
|
|
135
|
+
)
|
|
136
|
+
from crawfish.observe import (
|
|
137
|
+
ObserverEvent,
|
|
138
|
+
ObserverSurface,
|
|
139
|
+
RunInfo,
|
|
140
|
+
Severity,
|
|
141
|
+
parse_since,
|
|
142
|
+
)
|
|
143
|
+
from crawfish.observer import (
|
|
144
|
+
CostSpike,
|
|
145
|
+
FailureRateAbove,
|
|
146
|
+
Observer,
|
|
147
|
+
ObserverContext,
|
|
148
|
+
Rule,
|
|
149
|
+
StuckRun,
|
|
150
|
+
)
|
|
151
|
+
from crawfish.output import Output, WireError, check_wire, output_satisfies_inputs
|
|
152
|
+
from crawfish.retry import ItemResult, ItemStatus, RetryPolicy
|
|
153
|
+
from crawfish.run import InputBindingError, Run, RunStatus, RunSuspended
|
|
154
|
+
from crawfish.runtime import (
|
|
155
|
+
AgentRuntime,
|
|
156
|
+
ClientRuntime,
|
|
157
|
+
CommandRuntime,
|
|
158
|
+
ManagedRuntime,
|
|
159
|
+
MockRuntime,
|
|
160
|
+
RecordReplayRuntime,
|
|
161
|
+
RunRequest,
|
|
162
|
+
RunResult,
|
|
163
|
+
RuntimeEvent,
|
|
164
|
+
get_runtime,
|
|
165
|
+
)
|
|
166
|
+
from crawfish.sandbox import EgressBroker, EgressDenied, run_out_of_process
|
|
167
|
+
from crawfish.scaffold import scaffold_project
|
|
168
|
+
from crawfish.secrets import (
|
|
169
|
+
Capabilities,
|
|
170
|
+
ScrubbingStore,
|
|
171
|
+
SecretManager,
|
|
172
|
+
load_env,
|
|
173
|
+
read_capabilities,
|
|
174
|
+
redact,
|
|
175
|
+
resolve_secret,
|
|
176
|
+
)
|
|
177
|
+
from crawfish.stability import (
|
|
178
|
+
Stability,
|
|
179
|
+
deprecated,
|
|
180
|
+
experimental,
|
|
181
|
+
is_breaking,
|
|
182
|
+
stability_of,
|
|
183
|
+
stable,
|
|
184
|
+
)
|
|
185
|
+
from crawfish.store import SqliteStore, Store
|
|
186
|
+
from crawfish.testing import (
|
|
187
|
+
assert_rubric,
|
|
188
|
+
assert_snapshot,
|
|
189
|
+
replaying,
|
|
190
|
+
run_fixtures,
|
|
191
|
+
snapshot_match,
|
|
192
|
+
)
|
|
193
|
+
from crawfish.triggers import (
|
|
194
|
+
Cron,
|
|
195
|
+
CronSchedule,
|
|
196
|
+
CronTrigger,
|
|
197
|
+
Trigger,
|
|
198
|
+
WebhookTrigger,
|
|
199
|
+
verify_webhook,
|
|
200
|
+
)
|
|
201
|
+
from crawfish.typesystem import TypeDef, TypeKind, TypeRegistry, default_registry
|
|
202
|
+
from crawfish.versioning import Freezable, FrozenError, Version
|
|
203
|
+
from crawfish.visualize import dashboard_state, serve_dashboard
|
|
204
|
+
from crawfish.workflow import Workflow
|
|
205
|
+
|
|
206
|
+
__version__ = "0.1.0"
|
|
207
|
+
|
|
208
|
+
__all__ = [
|
|
209
|
+
"__version__",
|
|
210
|
+
# core
|
|
211
|
+
"JSONValue",
|
|
212
|
+
"new_id",
|
|
213
|
+
"Flow",
|
|
214
|
+
"Parameter",
|
|
215
|
+
"NodeKind",
|
|
216
|
+
"Node",
|
|
217
|
+
"PolicyKind",
|
|
218
|
+
"Policy",
|
|
219
|
+
"parameters_compatible",
|
|
220
|
+
"RunContext",
|
|
221
|
+
"CostBudget",
|
|
222
|
+
"CancelToken",
|
|
223
|
+
"BudgetExceeded",
|
|
224
|
+
"Cancelled",
|
|
225
|
+
# type system
|
|
226
|
+
"TypeDef",
|
|
227
|
+
"TypeKind",
|
|
228
|
+
"TypeRegistry",
|
|
229
|
+
"default_registry",
|
|
230
|
+
# versioning
|
|
231
|
+
"Version",
|
|
232
|
+
"FrozenError",
|
|
233
|
+
"Freezable",
|
|
234
|
+
# store
|
|
235
|
+
"Store",
|
|
236
|
+
"SqliteStore",
|
|
237
|
+
# engine
|
|
238
|
+
"Engine",
|
|
239
|
+
"run_pipeline",
|
|
240
|
+
# output
|
|
241
|
+
"Output",
|
|
242
|
+
"output_satisfies_inputs",
|
|
243
|
+
"check_wire",
|
|
244
|
+
"WireError",
|
|
245
|
+
# definition
|
|
246
|
+
"Definition",
|
|
247
|
+
"AgentSpec",
|
|
248
|
+
"TeamSpec",
|
|
249
|
+
"Coordination",
|
|
250
|
+
"Prompt",
|
|
251
|
+
"DefinitionRef",
|
|
252
|
+
"DefinitionAssets",
|
|
253
|
+
"MarketplacePackage",
|
|
254
|
+
"MCPConnection",
|
|
255
|
+
"load_definition",
|
|
256
|
+
"DefinitionLoadError",
|
|
257
|
+
# runtime
|
|
258
|
+
"AgentRuntime",
|
|
259
|
+
"CommandRuntime",
|
|
260
|
+
"MockRuntime",
|
|
261
|
+
"ClientRuntime",
|
|
262
|
+
"ManagedRuntime",
|
|
263
|
+
"RecordReplayRuntime",
|
|
264
|
+
"RunRequest",
|
|
265
|
+
"RunResult",
|
|
266
|
+
"RuntimeEvent",
|
|
267
|
+
"get_runtime",
|
|
268
|
+
# nodes (M2)
|
|
269
|
+
"Source",
|
|
270
|
+
"RepoSource",
|
|
271
|
+
"PullRequestSource",
|
|
272
|
+
"fan_out",
|
|
273
|
+
"Sink",
|
|
274
|
+
"LinearSink",
|
|
275
|
+
"GitHubPRSink",
|
|
276
|
+
"TargetMustBeStaticError",
|
|
277
|
+
"ApprovalRequired",
|
|
278
|
+
"Filter",
|
|
279
|
+
"title_contains",
|
|
280
|
+
"field_equals",
|
|
281
|
+
"field_matches",
|
|
282
|
+
"limit",
|
|
283
|
+
"Memory",
|
|
284
|
+
# run (M2)
|
|
285
|
+
"Run",
|
|
286
|
+
"RunStatus",
|
|
287
|
+
"InputBindingError",
|
|
288
|
+
"RunSuspended",
|
|
289
|
+
# pipelines (M3)
|
|
290
|
+
"Batch",
|
|
291
|
+
"Task",
|
|
292
|
+
"Anomaly",
|
|
293
|
+
"Aggregator",
|
|
294
|
+
"collect",
|
|
295
|
+
"concat",
|
|
296
|
+
"count",
|
|
297
|
+
"dedupe",
|
|
298
|
+
"definition_reducer",
|
|
299
|
+
"fan_in",
|
|
300
|
+
"Router",
|
|
301
|
+
"Classifier",
|
|
302
|
+
"UnroutableLabelError",
|
|
303
|
+
"ArtifactRef",
|
|
304
|
+
"ArtifactStore",
|
|
305
|
+
"LocalArtifactStore",
|
|
306
|
+
"offload_if_large",
|
|
307
|
+
"DependencyGraph",
|
|
308
|
+
"CycleError",
|
|
309
|
+
"Roadmap",
|
|
310
|
+
"ExecutionPlan",
|
|
311
|
+
"BatchExecutor",
|
|
312
|
+
"BatchRunResult",
|
|
313
|
+
"ExecutionLedger",
|
|
314
|
+
"ObserverEvent",
|
|
315
|
+
"ObserverSurface",
|
|
316
|
+
"RunInfo",
|
|
317
|
+
"Severity",
|
|
318
|
+
"parse_since",
|
|
319
|
+
# operate / observe / integrate
|
|
320
|
+
"DeployEntry",
|
|
321
|
+
"DeployRegistry",
|
|
322
|
+
"DeployStatus",
|
|
323
|
+
"Supervisor",
|
|
324
|
+
"deploy",
|
|
325
|
+
"stop",
|
|
326
|
+
"PipelineStatus",
|
|
327
|
+
"manage_list",
|
|
328
|
+
"format_table",
|
|
329
|
+
"restart_target",
|
|
330
|
+
"Observer",
|
|
331
|
+
"ObserverContext",
|
|
332
|
+
"Rule",
|
|
333
|
+
"FailureRateAbove",
|
|
334
|
+
"CostSpike",
|
|
335
|
+
"StuckRun",
|
|
336
|
+
"dashboard_state",
|
|
337
|
+
"serve_dashboard",
|
|
338
|
+
"ClaudeCodeAgent",
|
|
339
|
+
"ClaudeCodeSkill",
|
|
340
|
+
"definition_to_cc_agent",
|
|
341
|
+
"export_claude_code",
|
|
342
|
+
"map_tools",
|
|
343
|
+
"model_alias",
|
|
344
|
+
"ExecState",
|
|
345
|
+
"RetryPolicy",
|
|
346
|
+
"ItemResult",
|
|
347
|
+
"ItemStatus",
|
|
348
|
+
"Workflow",
|
|
349
|
+
# measurement (M4)
|
|
350
|
+
"Metric",
|
|
351
|
+
"Rubric",
|
|
352
|
+
"Benchmark",
|
|
353
|
+
"output_number",
|
|
354
|
+
"field_present",
|
|
355
|
+
"is_nonempty",
|
|
356
|
+
"confidence_threshold",
|
|
357
|
+
"compare",
|
|
358
|
+
"is_regression",
|
|
359
|
+
"estimate_cost",
|
|
360
|
+
"CostEstimate",
|
|
361
|
+
"Budget",
|
|
362
|
+
"BudgetState",
|
|
363
|
+
"CostMeter",
|
|
364
|
+
"spent_today",
|
|
365
|
+
"inspect_run",
|
|
366
|
+
"tail_events",
|
|
367
|
+
"format_report",
|
|
368
|
+
"RunReport",
|
|
369
|
+
# eval data lifecycle (M4)
|
|
370
|
+
"EvalCase",
|
|
371
|
+
"GoldenSet",
|
|
372
|
+
"LLMJudge",
|
|
373
|
+
"capture_case",
|
|
374
|
+
"grade_output",
|
|
375
|
+
"save_baseline",
|
|
376
|
+
"load_baseline",
|
|
377
|
+
"gate_against_baseline",
|
|
378
|
+
# authoring / packaging / ship (M5)
|
|
379
|
+
"Registry",
|
|
380
|
+
"UnitRef",
|
|
381
|
+
"ProfileConfig",
|
|
382
|
+
"ProjectManifest",
|
|
383
|
+
"ProjectPaths",
|
|
384
|
+
"load_manifest",
|
|
385
|
+
"DoctorFinding",
|
|
386
|
+
"DoctorReport",
|
|
387
|
+
"diagnose",
|
|
388
|
+
"Cron",
|
|
389
|
+
"CronSchedule",
|
|
390
|
+
"scaffold_project",
|
|
391
|
+
"resolve_secret",
|
|
392
|
+
"load_env",
|
|
393
|
+
"SecretManager",
|
|
394
|
+
"ScrubbingStore",
|
|
395
|
+
"redact",
|
|
396
|
+
"read_capabilities",
|
|
397
|
+
"Capabilities",
|
|
398
|
+
"snapshot_match",
|
|
399
|
+
"assert_snapshot",
|
|
400
|
+
"run_fixtures",
|
|
401
|
+
"assert_rubric",
|
|
402
|
+
"replaying",
|
|
403
|
+
"generate_containerfile",
|
|
404
|
+
"plan_build",
|
|
405
|
+
"write_containerfile",
|
|
406
|
+
"BuildPlan",
|
|
407
|
+
"Trigger",
|
|
408
|
+
"CronTrigger",
|
|
409
|
+
"WebhookTrigger",
|
|
410
|
+
"verify_webhook",
|
|
411
|
+
"Stability",
|
|
412
|
+
"stable",
|
|
413
|
+
"experimental",
|
|
414
|
+
"deprecated",
|
|
415
|
+
"stability_of",
|
|
416
|
+
"is_breaking",
|
|
417
|
+
"EgressBroker",
|
|
418
|
+
"EgressDenied",
|
|
419
|
+
"run_out_of_process",
|
|
420
|
+
]
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
"""Artifact store — blob/file Outputs carried by reference.
|
|
2
|
+
|
|
3
|
+
Public surface: the :class:`ArtifactRef` envelope, the :class:`ArtifactStore`
|
|
4
|
+
protocol seam, the local-disk reference impl, and the ``offload_if_large`` helper.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from crawfish.artifacts.base import ArtifactRef, ArtifactStore
|
|
10
|
+
from crawfish.artifacts.local import LocalArtifactStore, offload_if_large
|
|
11
|
+
|
|
12
|
+
__all__ = ["ArtifactRef", "ArtifactStore", "LocalArtifactStore", "offload_if_large"]
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
"""The ``ArtifactStore`` seam — blob/file Outputs by reference.
|
|
2
|
+
|
|
3
|
+
Large payloads (files, blobs, big JSON) don't belong inline in an ``Output``.
|
|
4
|
+
Instead an Output carries a small, content-addressed :class:`ArtifactRef` (a dict
|
|
5
|
+
on ``Output.value``) and the bytes live in an :class:`ArtifactStore`. The product
|
|
6
|
+
model imports the *protocol*, never a concrete backend, so local-disk → S3/GCS is
|
|
7
|
+
a driver swap (mirrors the ``Store`` seam, ADR 0001/0003). Every operation carries
|
|
8
|
+
an ``org_id`` tenancy key (defaulted ``"local"``) so cloud multi-tenancy is a
|
|
9
|
+
driver swap, not a schema migration.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
from typing import Protocol, runtime_checkable
|
|
15
|
+
|
|
16
|
+
from pydantic import BaseModel
|
|
17
|
+
|
|
18
|
+
__all__ = ["ArtifactRef", "ArtifactStore"]
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class ArtifactRef(BaseModel):
    """Small, content-addressed handle for bytes kept in an ``ArtifactStore``.

    An ``Output`` holds one of these in place of the payload itself. Because
    ``uri`` and ``sha256`` are both derived from the content hash, artifacts
    with identical content collapse to the same reference.
    """

    # Where the stored bytes can be found; derived from the content hash.
    uri: str
    # Hex SHA-256 digest of the content; this is the dedupe key.
    sha256: str
    # Payload size (presumably in bytes -- confirm against the store in use).
    size: int
    # Media type of the payload; defaults to the generic binary type.
    content_type: str = "application/octet-stream"
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@runtime_checkable
class ArtifactStore(Protocol):
    """Contract for blob persistence: content-addressed, tenant-scoped, GC-able.

    Every operation takes a keyword-only ``org_id`` tenancy key that defaults
    to ``"local"``.
    """

    def put(
        self,
        data: bytes,
        *,
        content_type: str = "application/octet-stream",
        org_id: str = "local",
    ) -> ArtifactRef:
        """Persist ``data`` and hand back its content-addressed :class:`ArtifactRef`."""

    def get(self, ref: ArtifactRef, *, org_id: str = "local") -> bytes:
        """Fetch the bytes behind ``ref``; raises when ``org_id`` does not hold them."""

    def exists(self, ref: ArtifactRef, *, org_id: str = "local") -> bool:
        """Report whether the content of ``ref`` is stored under ``org_id``."""

    def delete(self, ref: ArtifactRef, *, org_id: str = "local") -> None:
        """Remove the content of ``ref`` for ``org_id``; absent content is a no-op."""

    def gc(self, live_refs: set[str], *, org_id: str = "local") -> int:
        """Drop every artifact whose sha256 is missing from ``live_refs``.

        Returns the number of artifacts deleted.
        """
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
"""Local-filesystem reference implementation of the ``ArtifactStore`` seam.
|
|
2
|
+
|
|
3
|
+
Content-addressed storage under ``root/<org_id>/<sha[:2]>/<sha>``: identical bytes
|
|
4
|
+
dedupe to one file, and each ``org_id`` gets a separate subtree (tenancy is a path
|
|
5
|
+
prefix, not a schema change — mirrors the SQLite ``Store`` impl, ADR 0001/0003).
|
|
6
|
+
All filesystem layout lives here; call sites use the protocol.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import hashlib
|
|
12
|
+
import json
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
|
|
15
|
+
from crawfish.artifacts.base import ArtifactRef, ArtifactStore
|
|
16
|
+
from crawfish.core.types import JSONValue
|
|
17
|
+
|
|
18
|
+
__all__ = ["LocalArtifactStore", "offload_if_large"]
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class LocalArtifactStore:
    """An ``ArtifactStore`` backed by the local filesystem, addressed by sha256.

    Blobs are laid out as ``root/<org_id>/<sha[:2]>/<sha>``, so identical bytes
    share one file per ``org_id`` and every ``org_id`` owns its own subtree.
    """

    # The alphabet of ``hashlib``'s ``hexdigest()`` output.
    _HEX_DIGITS = frozenset("0123456789abcdef")

    def __init__(self, root: str | Path) -> None:
        """Open (creating if necessary) the store rooted at ``root``."""
        # Anchor the root as an absolute path. ``put`` publishes
        # ``Path.as_uri()``, which raises ValueError for a relative path, so a
        # store opened with a relative root would write the blob and then fail
        # to return its ref. Absolute roots are left exactly as given.
        self._root = Path(root).absolute()
        self._root.mkdir(parents=True, exist_ok=True)

    @classmethod
    def _is_digest(cls, sha256: str) -> bool:
        """True iff ``sha256`` has the shape ``put`` produces (64 lowercase hex chars)."""
        return len(sha256) == 64 and cls._HEX_DIGITS.issuperset(sha256)

    def _path(self, sha256: str, org_id: str) -> Path:
        """Return the on-disk location of the blob ``sha256`` inside ``org_id``."""
        return self._root / org_id / sha256[:2] / sha256

    def put(
        self,
        data: bytes,
        *,
        content_type: str = "application/octet-stream",
        org_id: str = "local",
    ) -> ArtifactRef:
        """Store ``data`` under its sha256 and return the matching ref.

        Content that is already present for ``org_id`` is not rewritten.
        """
        sha256 = hashlib.sha256(data).hexdigest()
        dest = self._path(sha256, org_id)
        if not dest.exists():
            dest.parent.mkdir(parents=True, exist_ok=True)
            # Atomic write: stage to a temp file, then rename into place.
            # NOTE(review): the staging name is derived from the digest, so
            # concurrent puts of identical content share it -- confirm whether
            # parallel writers are expected.
            tmp = dest.with_name(dest.name + ".tmp")
            tmp.write_bytes(data)
            tmp.replace(dest)
        return ArtifactRef(
            uri=dest.as_uri(),
            sha256=sha256,
            size=len(data),
            content_type=content_type,
        )

    def get(self, ref: ArtifactRef, *, org_id: str = "local") -> bytes:
        """Return the bytes for ``ref``.

        Raises:
            FileNotFoundError: if nothing is stored for ``ref`` under ``org_id``.
                A ``ref.sha256`` that is not a well-formed digest is reported
                the same way, since no blob can be stored under such a name.
        """
        # Refs can arrive from deserialized data. Joining an arbitrary string
        # into the path (an absolute path, or one containing "..") would read
        # files outside the store, so only real digests are ever resolved.
        if not self._is_digest(ref.sha256):
            raise FileNotFoundError(f"no artifact with sha256 {ref.sha256!r}")
        return self._path(ref.sha256, org_id).read_bytes()

    def exists(self, ref: ArtifactRef, *, org_id: str = "local") -> bool:
        """True iff ``ref``'s content is stored as a file under ``org_id``."""
        return self._is_digest(ref.sha256) and self._path(ref.sha256, org_id).is_file()

    def delete(self, ref: ArtifactRef, *, org_id: str = "local") -> None:
        """Delete ``ref``'s content for ``org_id``; a no-op when it is absent."""
        # A malformed digest cannot name a stored blob, so it is simply absent;
        # refusing to resolve it keeps delete from unlinking outside the store.
        if self._is_digest(ref.sha256):
            self._path(ref.sha256, org_id).unlink(missing_ok=True)

    def gc(self, live_refs: set[str], *, org_id: str = "local") -> int:
        """Delete files under ``org_id`` whose name is not in ``live_refs``.

        Returns the number of files removed. Any file in a shard directory
        counts, so leftover ``.tmp`` staging files are swept and included in
        the total. An ``org_id`` with no directory yields ``0``.
        """
        base = self._root / org_id
        if not base.is_dir():
            return 0
        removed = 0
        for shard in base.iterdir():
            if not shard.is_dir():
                continue
            for blob in shard.iterdir():
                if blob.is_file() and blob.name not in live_refs:
                    blob.unlink()
                    removed += 1
        return removed
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def offload_if_large(
    value: JSONValue,
    store: ArtifactStore,
    *,
    threshold: int = 65536,
    org_id: str = "local",
) -> JSONValue | ArtifactRef:
    """Move ``value`` into ``store`` when its JSON encoding is over ``threshold`` bytes.

    The value is serialized with ``json.dumps`` and encoded as UTF-8. If that
    encoding is strictly longer than ``threshold``, the encoded bytes are put
    into ``store`` (content_type ``application/json``, under ``org_id``) and
    the resulting :class:`ArtifactRef` is returned. Otherwise ``value`` comes
    back untouched and ``store`` is never called. This is what keeps large
    payloads out of an Output record.
    """
    encoded = json.dumps(value).encode("utf-8")
    if len(encoded) > threshold:
        return store.put(encoded, content_type="application/json", org_id=org_id)
    return value
|