codebatch 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codebatch-0.1.0/.github/workflows/.gitkeep +0 -0
- codebatch-0.1.0/.gitignore +49 -0
- codebatch-0.1.0/.spec_baseline_hash +1 -0
- codebatch-0.1.0/CHANGELOG.md +33 -0
- codebatch-0.1.0/PKG-INFO +66 -0
- codebatch-0.1.0/README.md +46 -0
- codebatch-0.1.0/SPEC.md +342 -0
- codebatch-0.1.0/docs/PHASE2_ACCEPTANCE.md +158 -0
- codebatch-0.1.0/docs/PHASE2_CHARTER.md +175 -0
- codebatch-0.1.0/docs/PHASE2_RULES.md +142 -0
- codebatch-0.1.0/pyproject.toml +42 -0
- codebatch-0.1.0/schemas/batch.schema.json +53 -0
- codebatch-0.1.0/schemas/chunk-manifest.schema.json +72 -0
- codebatch-0.1.0/schemas/event-record.schema.json +75 -0
- codebatch-0.1.0/schemas/files-index-record.schema.json +55 -0
- codebatch-0.1.0/schemas/output-record.schema.json +79 -0
- codebatch-0.1.0/schemas/plan.schema.json +61 -0
- codebatch-0.1.0/schemas/snapshot.schema.json +73 -0
- codebatch-0.1.0/schemas/state.schema.json +89 -0
- codebatch-0.1.0/schemas/store.schema.json +38 -0
- codebatch-0.1.0/schemas/task.schema.json +91 -0
- codebatch-0.1.0/scripts/check_no_network.py +155 -0
- codebatch-0.1.0/scripts/check_spec_protected.py +113 -0
- codebatch-0.1.0/scripts/check_truth_stores.py +156 -0
- codebatch-0.1.0/src/codebatch/__init__.py +3 -0
- codebatch-0.1.0/src/codebatch/batch.py +366 -0
- codebatch-0.1.0/src/codebatch/cas.py +170 -0
- codebatch-0.1.0/src/codebatch/cli.py +432 -0
- codebatch-0.1.0/src/codebatch/common.py +104 -0
- codebatch-0.1.0/src/codebatch/paths.py +196 -0
- codebatch-0.1.0/src/codebatch/query.py +242 -0
- codebatch-0.1.0/src/codebatch/runner.py +495 -0
- codebatch-0.1.0/src/codebatch/snapshot.py +340 -0
- codebatch-0.1.0/src/codebatch/store.py +162 -0
- codebatch-0.1.0/src/codebatch/tasks/__init__.py +37 -0
- codebatch-0.1.0/src/codebatch/tasks/analyze.py +109 -0
- codebatch-0.1.0/src/codebatch/tasks/lint.py +244 -0
- codebatch-0.1.0/src/codebatch/tasks/parse.py +304 -0
- codebatch-0.1.0/src/codebatch/tasks/symbols.py +223 -0
- codebatch-0.1.0/tests/README.md +52 -0
- codebatch-0.1.0/tests/conftest.py +9 -0
- codebatch-0.1.0/tests/fixtures/corpus/binary.bin +0 -0
- codebatch-0.1.0/tests/fixtures/corpus/crlf_example.txt +3 -0
- codebatch-0.1.0/tests/fixtures/corpus/emoji_/360/237/216/211.md +5 -0
- codebatch-0.1.0/tests/fixtures/corpus/empty.txt +0 -0
- codebatch-0.1.0/tests/fixtures/corpus/hello.py +11 -0
- codebatch-0.1.0/tests/fixtures/corpus/unicode_/350/267/257/345/276/204.txt +3 -0
- codebatch-0.1.0/tests/fixtures/corpus-windows-only/CaseA.cs +9 -0
- codebatch-0.1.0/tests/fixtures/corpus-windows-only/casea.cs.expected +12 -0
- codebatch-0.1.0/tests/fixtures/golden/snapshot/files.index.jsonl +6 -0
- codebatch-0.1.0/tests/fixtures/golden/snapshot/snapshot.json +19 -0
- codebatch-0.1.0/tests/test_analyze_task.py +229 -0
- codebatch-0.1.0/tests/test_batch.py +176 -0
- codebatch-0.1.0/tests/test_cas.py +115 -0
- codebatch-0.1.0/tests/test_e2e_acceptance.py +296 -0
- codebatch-0.1.0/tests/test_lint_task.py +359 -0
- codebatch-0.1.0/tests/test_parse_task.py +206 -0
- codebatch-0.1.0/tests/test_paths.py +171 -0
- codebatch-0.1.0/tests/test_phase2_gates.py +537 -0
- codebatch-0.1.0/tests/test_query.py +185 -0
- codebatch-0.1.0/tests/test_runner.py +244 -0
- codebatch-0.1.0/tests/test_schema_validation.py +230 -0
- codebatch-0.1.0/tests/test_snapshot.py +146 -0
- codebatch-0.1.0/tests/test_store.py +139 -0
- codebatch-0.1.0/tests/test_symbols_task.py +296 -0
|
File without changes
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
*.so
|
|
6
|
+
.Python
|
|
7
|
+
build/
|
|
8
|
+
develop-eggs/
|
|
9
|
+
dist/
|
|
10
|
+
downloads/
|
|
11
|
+
eggs/
|
|
12
|
+
.eggs/
|
|
13
|
+
lib/
|
|
14
|
+
lib64/
|
|
15
|
+
parts/
|
|
16
|
+
sdist/
|
|
17
|
+
var/
|
|
18
|
+
wheels/
|
|
19
|
+
*.egg-info/
|
|
20
|
+
.installed.cfg
|
|
21
|
+
*.egg
|
|
22
|
+
|
|
23
|
+
# Virtual environments
|
|
24
|
+
.env
|
|
25
|
+
.venv
|
|
26
|
+
env/
|
|
27
|
+
venv/
|
|
28
|
+
ENV/
|
|
29
|
+
|
|
30
|
+
# Testing
|
|
31
|
+
.pytest_cache/
|
|
32
|
+
.coverage
|
|
33
|
+
htmlcov/
|
|
34
|
+
.tox/
|
|
35
|
+
.nox/
|
|
36
|
+
|
|
37
|
+
# IDE
|
|
38
|
+
.idea/
|
|
39
|
+
.vscode/
|
|
40
|
+
*.swp
|
|
41
|
+
*.swo
|
|
42
|
+
*~
|
|
43
|
+
|
|
44
|
+
# OS
|
|
45
|
+
.DS_Store
|
|
46
|
+
Thumbs.db
|
|
47
|
+
|
|
48
|
+
# CodeBatch stores (example stores, not the code)
|
|
49
|
+
*.store/
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
8578b4ba5bda9a45a2868f13a9a19b5e2f46ee4fed63803e8e85c25553a818ac
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to the CodeBatch specification and implementation.
|
|
4
|
+
|
|
5
|
+
## [Unreleased]
|
|
6
|
+
|
|
7
|
+
### Added
|
|
8
|
+
- Initial repository structure
|
|
9
|
+
- Core specification document
|
|
10
|
+
|
|
11
|
+
---
|
|
12
|
+
|
|
13
|
+
## [spec-v1.0-draft] - 2025-02-02
|
|
14
|
+
|
|
15
|
+
### Added
|
|
16
|
+
- Complete storage and execution specification (SPEC.md)
|
|
17
|
+
- Content-addressed object store layout
|
|
18
|
+
- Snapshot immutability contract
|
|
19
|
+
- Batch, task, and shard execution model
|
|
20
|
+
- Output record indexing semantics
|
|
21
|
+
- Query model guarantees
|
|
22
|
+
- Large output chunking rules
|
|
23
|
+
- Versioning requirements for all records
|
|
24
|
+
|
|
25
|
+
### Defined
|
|
26
|
+
- 14 specification sections covering full execution lifecycle
|
|
27
|
+
- 6 global invariants for system correctness
|
|
28
|
+
- Compliance criteria for implementations
|
|
29
|
+
|
|
30
|
+
---
|
|
31
|
+
|
|
32
|
+
[Unreleased]: https://github.com/mcp-tool-shop-org/code-batch/compare/spec-v1.0-draft...HEAD
|
|
33
|
+
[spec-v1.0-draft]: https://github.com/mcp-tool-shop-org/code-batch/releases/tag/spec-v1.0-draft
|
codebatch-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: codebatch
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Content-addressed batch execution engine
|
|
5
|
+
Author-email: mcp-tool-shop <64996768+mcp-tool-shop@users.noreply.github.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Keywords: batch,content-addressed,deterministic,execution
|
|
8
|
+
Classifier: Development Status :: 3 - Alpha
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Requires-Python: >=3.10
|
|
16
|
+
Provides-Extra: dev
|
|
17
|
+
Requires-Dist: pytest-cov>=4.0; extra == 'dev'
|
|
18
|
+
Requires-Dist: pytest>=7.0; extra == 'dev'
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
|
|
21
|
+
# CodeBatch
|
|
22
|
+
|
|
23
|
+
Content-addressed batch execution engine with deterministic sharding and queryable outputs.
|
|
24
|
+
|
|
25
|
+
## Overview
|
|
26
|
+
|
|
27
|
+
CodeBatch provides a filesystem-based execution substrate for running deterministic transformations over codebases. It captures inputs as immutable snapshots, executes work in isolated shards, and indexes all semantic outputs for efficient querying—without requiring a database.
|
|
28
|
+
|
|
29
|
+
## Documentation
|
|
30
|
+
|
|
31
|
+
- **[SPEC.md](./SPEC.md)** — Full storage and execution specification
|
|
32
|
+
- **[CHANGELOG.md](./CHANGELOG.md)** — Version history
|
|
33
|
+
|
|
34
|
+
## Spec Versioning
|
|
35
|
+
|
|
36
|
+
The specification uses semantic versioning with draft/stable markers. Each version is tagged in git (e.g., `spec-v1.0-draft`). Breaking changes increment the major version. Implementations should declare which spec version they target and tolerate unknown fields for forward compatibility.
|
|
37
|
+
|
|
38
|
+
## Project Structure
|
|
39
|
+
|
|
40
|
+
```
|
|
41
|
+
schemas/ JSON Schema definitions for all record types
|
|
42
|
+
src/ Core implementation
|
|
43
|
+
tests/ Test suites and fixtures
|
|
44
|
+
examples/ Usage examples
|
|
45
|
+
.github/ CI/CD workflows
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
## Quick Start
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
# Create a snapshot of a directory
|
|
52
|
+
codebatch snapshot ./my-project --store ./store
|
|
53
|
+
|
|
54
|
+
# Initialize a batch with a pipeline
|
|
55
|
+
codebatch batch init --snapshot <id> --pipeline parse
|
|
56
|
+
|
|
57
|
+
# Run a shard
|
|
58
|
+
codebatch run-shard --batch <id> --task 01_parse --shard ab
|
|
59
|
+
|
|
60
|
+
# Query results
|
|
61
|
+
codebatch query diagnostics --batch <id> --task 01_parse
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
## License
|
|
65
|
+
|
|
66
|
+
MIT
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
# CodeBatch
|
|
2
|
+
|
|
3
|
+
Content-addressed batch execution engine with deterministic sharding and queryable outputs.
|
|
4
|
+
|
|
5
|
+
## Overview
|
|
6
|
+
|
|
7
|
+
CodeBatch provides a filesystem-based execution substrate for running deterministic transformations over codebases. It captures inputs as immutable snapshots, executes work in isolated shards, and indexes all semantic outputs for efficient querying—without requiring a database.
|
|
8
|
+
|
|
9
|
+
## Documentation
|
|
10
|
+
|
|
11
|
+
- **[SPEC.md](./SPEC.md)** — Full storage and execution specification
|
|
12
|
+
- **[CHANGELOG.md](./CHANGELOG.md)** — Version history
|
|
13
|
+
|
|
14
|
+
## Spec Versioning
|
|
15
|
+
|
|
16
|
+
The specification uses semantic versioning with draft/stable markers. Each version is tagged in git (e.g., `spec-v1.0-draft`). Breaking changes increment the major version. Implementations should declare which spec version they target and tolerate unknown fields for forward compatibility.
|
|
17
|
+
|
|
18
|
+
## Project Structure
|
|
19
|
+
|
|
20
|
+
```
|
|
21
|
+
schemas/ JSON Schema definitions for all record types
|
|
22
|
+
src/ Core implementation
|
|
23
|
+
tests/ Test suites and fixtures
|
|
24
|
+
examples/ Usage examples
|
|
25
|
+
.github/ CI/CD workflows
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
## Quick Start
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
# Create a snapshot of a directory
|
|
32
|
+
codebatch snapshot ./my-project --store ./store
|
|
33
|
+
|
|
34
|
+
# Initialize a batch with a pipeline
|
|
35
|
+
codebatch batch init --snapshot <id> --pipeline parse --store ./store
|
|
36
|
+
|
|
37
|
+
# Run a shard
|
|
38
|
+
codebatch run-shard --batch <id> --task 01_parse --shard ab --store ./store
|
|
39
|
+
|
|
40
|
+
# Query results
|
|
41
|
+
codebatch query diagnostics --batch <id> --task 01_parse --store ./store
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
## License
|
|
45
|
+
|
|
46
|
+
MIT
|
codebatch-0.1.0/SPEC.md
ADDED
|
@@ -0,0 +1,342 @@
|
|
|
1
|
+
# CodeBatch Storage & Execution Specification
|
|
2
|
+
|
|
3
|
+
**Specification Version: 1.0 (Draft)**
|
|
4
|
+
**Schema Version: 1** (`schema_version` field in all records)
|
|
5
|
+
|
|
6
|
+
> **Note**: The specification version tracks the document itself. The schema version
|
|
7
|
+
> is the integer value written to `schema_version` fields in JSON records and is
|
|
8
|
+
> incremented when record formats change in backward-incompatible ways.
|
|
9
|
+
|
|
10
|
+
---
|
|
11
|
+
|
|
12
|
+
## 1. Scope and Non-Goals
|
|
13
|
+
|
|
14
|
+
This specification defines:
|
|
15
|
+
|
|
16
|
+
- The on-disk storage layout
|
|
17
|
+
- Content-addressed object rules
|
|
18
|
+
- Snapshot, batch, task, and shard contracts
|
|
19
|
+
- Execution and output indexing semantics
|
|
20
|
+
- Queryability guarantees
|
|
21
|
+
|
|
22
|
+
This specification does **not** define:
|
|
23
|
+
|
|
24
|
+
- Scheduling policies
|
|
25
|
+
- UI concerns
|
|
26
|
+
- Cloud-specific integrations
|
|
27
|
+
- Programming language requirements
|
|
28
|
+
|
|
29
|
+
All behaviors are defined in terms of filesystem structure and serialized records.
|
|
30
|
+
|
|
31
|
+
---
|
|
32
|
+
|
|
33
|
+
<!-- SPEC_PROTECTED_BEGIN -->
|
|
34
|
+
<!-- WARNING: Changes to sections 2-8 require Phase 3+ and schema version bump -->
|
|
35
|
+
<!-- Allowed changes: adding output kinds (§9), clarifying plan deps (§7), new schemas -->
|
|
36
|
+
|
|
37
|
+
## 2. Global Invariants
|
|
38
|
+
|
|
39
|
+
The system SHALL maintain the following invariants:
|
|
40
|
+
|
|
41
|
+
1. Content-addressed objects are immutable and add-only.
|
|
42
|
+
2. Snapshots represent frozen input state and never change after creation.
|
|
43
|
+
3. Batches represent execution attempts and may be repeated or discarded.
|
|
44
|
+
4. Semantic results are discoverable without reading execution logs.
|
|
45
|
+
5. Partial execution SHALL NOT corrupt previously completed work.
|
|
46
|
+
6. All indexes are append-only and rebuildable from authoritative data.
|
|
47
|
+
|
|
48
|
+
---
|
|
49
|
+
|
|
50
|
+
## 3. Store Root Layout
|
|
51
|
+
|
|
52
|
+
A CodeBatch store has the following top-level layout:
|
|
53
|
+
|
|
54
|
+
```
|
|
55
|
+
<store_root>/
|
|
56
|
+
store.json
|
|
57
|
+
objects/
|
|
58
|
+
snapshots/
|
|
59
|
+
batches/
|
|
60
|
+
indexes/ # optional acceleration only
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
`store.json` describes store-level configuration and versioning.
|
|
64
|
+
|
|
65
|
+
---
|
|
66
|
+
|
|
67
|
+
## 4. Object Store
|
|
68
|
+
|
|
69
|
+
### 4.1 Object Identity
|
|
70
|
+
|
|
71
|
+
- Each object is identified by `sha256(raw_bytes)`.
|
|
72
|
+
- Object identity is independent of filename, origin, or usage.
|
|
73
|
+
- Object hashes are stable across platforms.
|
|
74
|
+
|
|
75
|
+
### 4.2 Storage Layout
|
|
76
|
+
|
|
77
|
+
Objects SHALL be stored at:
|
|
78
|
+
|
|
79
|
+
```
|
|
80
|
+
objects/sha256/<aa>/<bb>/<full_hash>
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
Where `<aa>` and `<bb>` are the first two byte pairs of the hex hash.
|
|
84
|
+
|
|
85
|
+
### 4.3 Object Metadata (Optional)
|
|
86
|
+
|
|
87
|
+
An object MAY have an adjacent metadata file:
|
|
88
|
+
|
|
89
|
+
```
|
|
90
|
+
<full_hash>.meta.json
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
Metadata is advisory and SHALL NOT be required for correctness.
|
|
94
|
+
|
|
95
|
+
---
|
|
96
|
+
|
|
97
|
+
## 5. Snapshots
|
|
98
|
+
|
|
99
|
+
### 5.1 Snapshot Definition
|
|
100
|
+
|
|
101
|
+
- A snapshot represents a frozen view of an input source at a specific point in time.
|
|
102
|
+
- Snapshots are immutable once written.
|
|
103
|
+
|
|
104
|
+
### 5.2 Snapshot Layout
|
|
105
|
+
|
|
106
|
+
```
|
|
107
|
+
snapshots/<snapshot_id>/
|
|
108
|
+
snapshot.json
|
|
109
|
+
files.index.jsonl
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
### 5.3 Path Canonicalization
|
|
113
|
+
|
|
114
|
+
All file paths in a snapshot SHALL be canonicalized:
|
|
115
|
+
|
|
116
|
+
- UTF-8 encoded
|
|
117
|
+
- `/` as separator
|
|
118
|
+
- No `.` or `..` segments
|
|
119
|
+
- No trailing slash
|
|
120
|
+
- Stable casing preserved
|
|
121
|
+
|
|
122
|
+
A `path_key` field SHALL be included for normalized comparison.
|
|
123
|
+
|
|
124
|
+
### 5.4 File Index Records
|
|
125
|
+
|
|
126
|
+
Each line in `files.index.jsonl` describes exactly one file.
|
|
127
|
+
|
|
128
|
+
**Required fields:**
|
|
129
|
+
|
|
130
|
+
| Field | Description |
|
|
131
|
+
|-------|-------------|
|
|
132
|
+
| `schema_version` | Record schema version |
|
|
133
|
+
| `path` | Original file path |
|
|
134
|
+
| `path_key` | Normalized path for comparison |
|
|
135
|
+
| `object` | SHA-256 hash of file content |
|
|
136
|
+
| `size` | File size in bytes |
|
|
137
|
+
|
|
138
|
+
**Optional fields MAY include:**
|
|
139
|
+
|
|
140
|
+
- `text_hash`
|
|
141
|
+
- `lang_hint`
|
|
142
|
+
- `mode`
|
|
143
|
+
- `mtime`
|
|
144
|
+
|
|
145
|
+
---
|
|
146
|
+
|
|
147
|
+
## 6. Batches
|
|
148
|
+
|
|
149
|
+
### 6.1 Batch Definition
|
|
150
|
+
|
|
151
|
+
- A batch represents one execution attempt over a snapshot.
|
|
152
|
+
- Batches are isolated, repeatable, and discardable.
|
|
153
|
+
|
|
154
|
+
### 6.2 Batch Layout
|
|
155
|
+
|
|
156
|
+
```
|
|
157
|
+
batches/<batch_id>/
|
|
158
|
+
batch.json
|
|
159
|
+
plan.json
|
|
160
|
+
events.jsonl
|
|
161
|
+
tasks/
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
### 6.3 Batch Events
|
|
165
|
+
|
|
166
|
+
- `events.jsonl` records execution facts only.
|
|
167
|
+
- Events SHALL NOT be required to answer semantic questions about outputs.
|
|
168
|
+
|
|
169
|
+
---
|
|
170
|
+
|
|
171
|
+
## 7. Tasks
|
|
172
|
+
|
|
173
|
+
### 7.1 Task Definition
|
|
174
|
+
|
|
175
|
+
- A task performs a deterministic transformation over snapshot inputs or prior task outputs.
|
|
176
|
+
- Tasks SHALL be idempotent per shard.
|
|
177
|
+
|
|
178
|
+
### 7.2 Task Layout
|
|
179
|
+
|
|
180
|
+
```
|
|
181
|
+
tasks/<task_id>/
|
|
182
|
+
task.json
|
|
183
|
+
events.jsonl
|
|
184
|
+
shards/
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
### 7.3 Task Configuration
|
|
188
|
+
|
|
189
|
+
`task.json` SHALL fully describe:
|
|
190
|
+
|
|
191
|
+
- Task identity
|
|
192
|
+
- Input requirements
|
|
193
|
+
- Sharding strategy
|
|
194
|
+
- Resolved configuration parameters
|
|
195
|
+
|
|
196
|
+
Task configuration SHALL be treated as immutable.
|
|
197
|
+
|
|
198
|
+
---
|
|
199
|
+
|
|
200
|
+
## 8. Shards
|
|
201
|
+
|
|
202
|
+
### 8.1 Shard Identity
|
|
203
|
+
|
|
204
|
+
- Shards are deterministic partitions of task input space.
|
|
205
|
+
- Shard identifiers SHALL be stable across executions.
|
|
206
|
+
|
|
207
|
+
### 8.2 Shard Layout
|
|
208
|
+
|
|
209
|
+
```
|
|
210
|
+
shards/<shard_id>/
|
|
211
|
+
state.json
|
|
212
|
+
outputs.index.jsonl
|
|
213
|
+
```
|
|
214
|
+
|
|
215
|
+
### 8.3 Shard State
|
|
216
|
+
|
|
217
|
+
- `state.json` tracks shard execution status.
|
|
218
|
+
- Shard state transitions SHALL be monotonic.
|
|
219
|
+
|
|
220
|
+
### 8.4 Shard Execution Rules
|
|
221
|
+
|
|
222
|
+
A shard:
|
|
223
|
+
|
|
224
|
+
1. Reads only snapshot and prior task outputs
|
|
225
|
+
2. Writes only within its own shard directory
|
|
226
|
+
3. Adds objects to the object store
|
|
227
|
+
4. Appends records to its outputs index
|
|
228
|
+
5. Emits completion events only after outputs are committed
|
|
229
|
+
|
|
230
|
+
<!-- SPEC_PROTECTED_END -->
|
|
231
|
+
|
|
232
|
+
---
|
|
233
|
+
|
|
234
|
+
## 9. Outputs
|
|
235
|
+
|
|
236
|
+
### 9.1 Output Records
|
|
237
|
+
|
|
238
|
+
- All semantic results SHALL be represented as output records.
|
|
239
|
+
- Output records are append-only.
|
|
240
|
+
|
|
241
|
+
### 9.2 Output Index
|
|
242
|
+
|
|
243
|
+
Each shard SHALL maintain an `outputs.index.jsonl`.
|
|
244
|
+
|
|
245
|
+
Each record SHALL include:
|
|
246
|
+
|
|
247
|
+
| Field | Description |
|
|
248
|
+
|-------|-------------|
|
|
249
|
+
| `schema_version` | Record schema version |
|
|
250
|
+
| `snapshot_id` | Source snapshot |
|
|
251
|
+
| `batch_id` | Execution batch |
|
|
252
|
+
| `task_id` | Owning task |
|
|
253
|
+
| `shard_id` | Owning shard |
|
|
254
|
+
| `path` | Source file path |
|
|
255
|
+
| `kind` | Output type |
|
|
256
|
+
| `ts` | Timestamp |
|
|
257
|
+
|
|
258
|
+
Records MAY include:
|
|
259
|
+
|
|
260
|
+
- `object` — Content hash for stored outputs
|
|
261
|
+
- `format` — Output format identifier
|
|
262
|
+
- Diagnostic fields (`severity`, `code`, `message`)
|
|
263
|
+
|
|
264
|
+
### 9.3 Diagnostics
|
|
265
|
+
|
|
266
|
+
- Diagnostics SHALL be represented as output records with `kind = diagnostic`.
|
|
267
|
+
- Diagnostics SHALL NOT be inferred from execution events.
|
|
268
|
+
|
|
269
|
+
---
|
|
270
|
+
|
|
271
|
+
## 10. Large Outputs
|
|
272
|
+
|
|
273
|
+
### 10.1 Chunking Requirement
|
|
274
|
+
|
|
275
|
+
- Outputs exceeding a configured size threshold SHALL be chunked.
|
|
276
|
+
- Chunked outputs SHALL be represented by a manifest object.
|
|
277
|
+
|
|
278
|
+
### 10.2 Chunk Manifest
|
|
279
|
+
|
|
280
|
+
A chunk manifest SHALL include:
|
|
281
|
+
|
|
282
|
+
| Field | Description |
|
|
283
|
+
|-------|-------------|
|
|
284
|
+
| `schema_name` | Manifest schema identifier |
|
|
285
|
+
| `schema_version` | Manifest schema version |
|
|
286
|
+
| `kind` | Output kind |
|
|
287
|
+
| `format` | Content format |
|
|
288
|
+
| `chunks` | Array of chunk object references |
|
|
289
|
+
| `total_bytes` | Total size across all chunks |
|
|
290
|
+
|
|
291
|
+
Output records SHALL reference the manifest object.
|
|
292
|
+
|
|
293
|
+
---
|
|
294
|
+
|
|
295
|
+
## 11. Execution Semantics
|
|
296
|
+
|
|
297
|
+
### 11.1 Determinism
|
|
298
|
+
|
|
299
|
+
Given identical snapshot, task configuration, and shard identifier:
|
|
300
|
+
|
|
301
|
+
- The same outputs SHALL be produced
|
|
302
|
+
- Duplicate objects SHALL deduplicate naturally
|
|
303
|
+
|
|
304
|
+
### 11.2 Failure Handling
|
|
305
|
+
|
|
306
|
+
- Shard failure SHALL NOT invalidate other shards.
|
|
307
|
+
- Restarting a shard SHALL NOT require cleanup.
|
|
308
|
+
|
|
309
|
+
---
|
|
310
|
+
|
|
311
|
+
## 12. Query Model
|
|
312
|
+
|
|
313
|
+
The following questions SHALL be answerable without reading execution logs:
|
|
314
|
+
|
|
315
|
+
1. Which files produced diagnostics?
|
|
316
|
+
2. Which outputs exist for a given task?
|
|
317
|
+
3. Which files failed a given task?
|
|
318
|
+
4. What are the aggregate counts by kind, severity, or language?
|
|
319
|
+
|
|
320
|
+
Indexes MAY be accelerated but SHALL remain rebuildable.
|
|
321
|
+
|
|
322
|
+
---
|
|
323
|
+
|
|
324
|
+
## 13. Versioning
|
|
325
|
+
|
|
326
|
+
All structured records SHALL include:
|
|
327
|
+
|
|
328
|
+
- `schema_name`
|
|
329
|
+
- `schema_version`
|
|
330
|
+
|
|
331
|
+
Readers SHALL tolerate unknown fields.
|
|
332
|
+
|
|
333
|
+
---
|
|
334
|
+
|
|
335
|
+
## 14. Compliance
|
|
336
|
+
|
|
337
|
+
An implementation is compliant if:
|
|
338
|
+
|
|
339
|
+
1. All required structures are present
|
|
340
|
+
2. All invariants are preserved
|
|
341
|
+
3. Semantic state is discoverable from indexes alone
|
|
342
|
+
4. Partial execution does not corrupt prior results
|
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
# Phase 2 Acceptance Checklist
|
|
2
|
+
|
|
3
|
+
**Human-readable checklist mirroring automated gates.**
|
|
4
|
+
|
|
5
|
+
Use this when debugging CI failures.
|
|
6
|
+
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
## Pre-Flight Checks
|
|
10
|
+
|
|
11
|
+
- [ ] All Phase 1 tests still pass (121 baseline)
|
|
12
|
+
- [ ] No uncommitted changes to protected SPEC regions
|
|
13
|
+
- [ ] No network imports in new code
|
|
14
|
+
|
|
15
|
+
---
|
|
16
|
+
|
|
17
|
+
## Gate 1: Multi-Task Pipeline End-to-End
|
|
18
|
+
|
|
19
|
+
**What it tests**: Complete pipeline with deps works.
|
|
20
|
+
|
|
21
|
+
**Manual verification**:
|
|
22
|
+
```bash
|
|
23
|
+
# Init and snapshot
|
|
24
|
+
codebatch init ./test-store
|
|
25
|
+
codebatch snapshot ./fixtures/corpus --store ./test-store
|
|
26
|
+
|
|
27
|
+
# Create batch with full pipeline
|
|
28
|
+
codebatch batch init --snapshot <id> --pipeline full --store ./test-store
|
|
29
|
+
|
|
30
|
+
# Run tasks in order
|
|
31
|
+
codebatch run-task --batch <id> --task 01_parse --store ./test-store
|
|
32
|
+
codebatch run-task --batch <id> --task 02_analyze --store ./test-store
|
|
33
|
+
codebatch run-task --batch <id> --task 03_symbols --store ./test-store
|
|
34
|
+
codebatch run-task --batch <id> --task 04_lint --store ./test-store
|
|
35
|
+
|
|
36
|
+
# Verify all shards done (prints each state.json with status "done"; every shard should be listed)
|
|
37
|
+
find ./test-store/batches/*/tasks/*/shards/*/state.json -exec grep -l '"status": "done"' {} \;
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
**Pass condition**: All shard states are "done", and each task has outputs.
|
|
41
|
+
|
|
42
|
+
---
|
|
43
|
+
|
|
44
|
+
## Gate 2: Log Independence
|
|
45
|
+
|
|
46
|
+
**What it tests**: Semantic queries don't depend on events.
|
|
47
|
+
|
|
48
|
+
**Manual verification**:
|
|
49
|
+
```bash
|
|
50
|
+
# Run queries, save output
|
|
51
|
+
codebatch query diagnostics --batch <id> --task 04_lint --store ./test-store > before.json
|
|
52
|
+
codebatch query outputs --batch <id> --task 03_symbols --kind symbol --store ./test-store >> before.json
|
|
53
|
+
|
|
54
|
+
# Delete all events
|
|
55
|
+
find ./test-store -name "events.jsonl" -delete
|
|
56
|
+
|
|
57
|
+
# Rerun queries
|
|
58
|
+
codebatch query diagnostics --batch <id> --task 04_lint --store ./test-store > after.json
|
|
59
|
+
codebatch query outputs --batch <id> --task 03_symbols --kind symbol --store ./test-store >> after.json
|
|
60
|
+
|
|
61
|
+
# Compare (ignoring timestamps)
|
|
62
|
+
diff <(jq -S 'del(.ts)' before.json) <(jq -S 'del(.ts)' after.json)
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
**Pass condition**: Identical semantic results.
|
|
66
|
+
|
|
67
|
+
---
|
|
68
|
+
|
|
69
|
+
## Gate 3: Cache Deletion Equivalence
|
|
70
|
+
|
|
71
|
+
**What it tests**: indexes/ is truly optional.
|
|
72
|
+
|
|
73
|
+
**Manual verification**:
|
|
74
|
+
```bash
|
|
75
|
+
# Run queries with cache
|
|
76
|
+
codebatch query stats --batch <id> --task 03_symbols --store ./test-store > with_cache.json
|
|
77
|
+
|
|
78
|
+
# Delete cache
|
|
79
|
+
rm -rf ./test-store/indexes/
|
|
80
|
+
|
|
81
|
+
# Rerun queries
|
|
82
|
+
codebatch query stats --batch <id> --task 03_symbols --store ./test-store > without_cache.json
|
|
83
|
+
|
|
84
|
+
# Compare
|
|
85
|
+
diff with_cache.json without_cache.json
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
**Pass condition**: Identical results.
|
|
89
|
+
|
|
90
|
+
---
|
|
91
|
+
|
|
92
|
+
## Gate 4: Retry Determinism
|
|
93
|
+
|
|
94
|
+
**What it tests**: Per-shard replacement produces the same outputs on retry.
|
|
95
|
+
|
|
96
|
+
**Manual verification**:
|
|
97
|
+
```bash
|
|
98
|
+
# Run shard, capture outputs
|
|
99
|
+
codebatch run-shard --batch <id> --task 03_symbols --shard ab --store ./test-store
|
|
100
|
+
cp ./test-store/batches/<id>/tasks/03_symbols/shards/ab/outputs.index.jsonl run1.jsonl
|
|
101
|
+
|
|
102
|
+
# Reset shard (delete state and outputs)
|
|
103
|
+
rm ./test-store/batches/<id>/tasks/03_symbols/shards/ab/state.json
|
|
104
|
+
rm ./test-store/batches/<id>/tasks/03_symbols/shards/ab/outputs.index.jsonl
|
|
105
|
+
|
|
106
|
+
# Rerun
|
|
107
|
+
codebatch run-shard --batch <id> --task 03_symbols --shard ab --store ./test-store
|
|
108
|
+
cp ./test-store/batches/<id>/tasks/03_symbols/shards/ab/outputs.index.jsonl run2.jsonl
|
|
109
|
+
|
|
110
|
+
# Compare (ignoring ts)
|
|
111
|
+
diff <(jq -S 'del(.ts)' run1.jsonl) <(jq -S 'del(.ts)' run2.jsonl)
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
**Pass condition**: Same semantic records (objects may differ if content-equivalent).
|
|
115
|
+
|
|
116
|
+
---
|
|
117
|
+
|
|
118
|
+
## Gate 5: SPEC Stability
|
|
119
|
+
|
|
120
|
+
**What it tests**: Protected SPEC regions unchanged.
|
|
121
|
+
|
|
122
|
+
**Manual verification**:
|
|
123
|
+
```bash
|
|
124
|
+
# Check protected region
|
|
125
|
+
git diff origin/main -- SPEC.md | grep -A5 -B5 "SPEC_PROTECTED"
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
**Pass condition**: No changes between SPEC_PROTECTED markers.
|
|
129
|
+
|
|
130
|
+
---
|
|
131
|
+
|
|
132
|
+
## Task Output Verification
|
|
133
|
+
|
|
134
|
+
### 02_analyze
|
|
135
|
+
- [ ] Emits `kind=metric` records
|
|
136
|
+
- [ ] Metrics include: `loc`, `size`, and optionally `complexity`
|
|
137
|
+
|
|
138
|
+
### 03_symbols
|
|
139
|
+
- [ ] Emits `kind=symbol` records with `name`, `symbol_type`, `line`
|
|
140
|
+
- [ ] Emits `kind=edge` records with `edge_type`, `target`
|
|
141
|
+
- [ ] Symbol types include: `function`, `class`, `variable`
|
|
142
|
+
- [ ] Edge types include: `imports`, and optionally `calls`
|
|
143
|
+
|
|
144
|
+
### 04_lint
|
|
145
|
+
- [ ] Emits `kind=diagnostic` records
|
|
146
|
+
- [ ] Diagnostics have `severity`, `code`, `message`
|
|
147
|
+
- [ ] Line/col positions when available
|
|
148
|
+
|
|
149
|
+
---
|
|
150
|
+
|
|
151
|
+
## Final Checklist
|
|
152
|
+
|
|
153
|
+
- [ ] 150+ tests passing
|
|
154
|
+
- [ ] All 5 gates pass in CI
|
|
155
|
+
- [ ] No network imports
|
|
156
|
+
- [ ] No writes outside allowed paths
|
|
157
|
+
- [ ] Plan deps enforced
|
|
158
|
+
- [ ] Documentation updated
|