@kulapard/pi-caveman 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +6 -9
- package/CHANGELOG.md +46 -0
- package/README.md +6 -16
- package/package.json +2 -5
- package/skills/caveman-compress/README.md +10 -31
- package/skills/caveman-compress/SKILL.md +11 -14
- package/skills/caveman-compress/SECURITY.md +0 -31
- package/skills/caveman-compress/scripts/__init__.py +0 -9
- package/skills/caveman-compress/scripts/__main__.py +0 -3
- package/skills/caveman-compress/scripts/benchmark.py +0 -80
- package/skills/caveman-compress/scripts/cli.py +0 -85
- package/skills/caveman-compress/scripts/compress.py +0 -341
- package/skills/caveman-compress/scripts/detect.py +0 -169
- package/skills/caveman-compress/scripts/validate.py +0 -213
package/AGENTS.md
CHANGED
|
@@ -22,12 +22,12 @@ Project memory for agents working in this repo. Non-obvious conventions only.
|
|
|
22
22
|
SDK is needed at test time. A value import from `@earendil-works/pi-coding-agent`
|
|
23
23
|
would break the tests — `tests/extension.test.mjs` asserts this invariant.
|
|
24
24
|
- **Verbatim preservation**: caveman-compress never alters code blocks, inline
|
|
25
|
-
code, URLs, file paths, commands, or exact error strings.
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
25
|
+
code, URLs, file paths, commands, or exact error strings. The skill instructs
|
|
26
|
+
the agent to self-validate these against the original and, on any mismatch it
|
|
27
|
+
cannot fix, restore from the `.original` backup rather than leave a corrupted file.
|
|
28
|
+
- `caveman-compress` is **prompt-only**: the Pi agent performs the compression
|
|
29
|
+
with its own model and file tools, driven by `SKILL.md`. There is no Python and
|
|
30
|
+
no external model CLI; coverage is the doc-guard test `tests/compress-docs.test.mjs`.
|
|
31
31
|
|
|
32
32
|
## Tests / validation
|
|
33
33
|
|
|
@@ -36,9 +36,6 @@ Project memory for agents working in this repo. Non-obvious conventions only.
|
|
|
36
36
|
- The JS test glob (`tests/**/*.test.mjs`) is expanded by the Node `--test` runner,
|
|
37
37
|
not the shell. Directory-recursion (`--test tests/`) does **not** work on the
|
|
38
38
|
current Node — keep the glob.
|
|
39
|
-
- **Python tests are not on PATH.** Create a venv and install pytest:
|
|
40
|
-
`python3 -m venv .venv && .venv/bin/pip install pytest`, then run
|
|
41
|
-
`npm run test:py` (which calls `.venv/bin/pytest skills/caveman-compress`).
|
|
42
39
|
- Several tests are **phantom-reference guards** (`tests/stats-docs.test.mjs`,
|
|
43
40
|
`tests/cavecrew-docs.test.mjs`, `tests/compress-docs.test.mjs`): they assert the
|
|
44
41
|
docs do **not** mention a Claude-Code hooks layer, a plugin install path, the
|
package/CHANGELOG.md
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project are documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
## [Unreleased]
|
|
9
|
+
|
|
10
|
+
## [0.2.0] - 2026-06-29
|
|
11
|
+
|
|
12
|
+
### Changed
|
|
13
|
+
|
|
14
|
+
- `caveman-compress` is now a prompt-only skill: the Pi agent performs the
|
|
15
|
+
compression itself (its own model and file tools), driven by `SKILL.md`. The
|
|
16
|
+
`/caveman-compress` command and the compression rules are unchanged.
|
|
17
|
+
|
|
18
|
+
### Removed
|
|
19
|
+
|
|
20
|
+
- The Python `caveman-compress` toolkit (`scripts/` + the pytest suite) and the
|
|
21
|
+
`test:py` script. The package no longer requires Python or an external
|
|
22
|
+
`claude` CLI / `ANTHROPIC_API_KEY`.
|
|
23
|
+
|
|
24
|
+
## [0.1.0] - 2026-06-29
|
|
25
|
+
|
|
26
|
+
Initial release — a [Pi](https://github.com/earendil-works/pi-coding-agent)
|
|
27
|
+
port of [caveman](https://github.com/JuliusBrussee/caveman).
|
|
28
|
+
|
|
29
|
+
### Added
|
|
30
|
+
|
|
31
|
+
- Caveman output-mode extension (`extensions/caveman.ts`): six intensity modes
|
|
32
|
+
(lite, full, ultra, wenyan-lite, wenyan-full, wenyan-ultra; default full),
|
|
33
|
+
slash commands, natural-language activation/deactivation, and a session
|
|
34
|
+
statusline indicator.
|
|
35
|
+
- Skills: `caveman`, `caveman-commit`, `caveman-review`, `caveman-help`,
|
|
36
|
+
`caveman-stats`, `caveman-compress`, and the `cavecrew` subagent suite.
|
|
37
|
+
- npm packaging as `@kulapard/pi-caveman` (scoped, public) with a `files`
|
|
38
|
+
whitelist, a `prepublishOnly` test gate, and repository metadata.
|
|
39
|
+
- GitHub Actions: `ci.yml` (test on push and pull requests) and `publish.yml`
|
|
40
|
+
(publish to npm via Trusted Publishing / OIDC on `v[0-9]*` tags — no stored
|
|
41
|
+
token, automatic provenance, a tag-equals-version guard, and a concurrency
|
|
42
|
+
group).
|
|
43
|
+
|
|
44
|
+
[Unreleased]: https://github.com/kulapard/pi-caveman/compare/v0.2.0...HEAD
|
|
45
|
+
[0.2.0]: https://github.com/kulapard/pi-caveman/compare/v0.1.0...v0.2.0
|
|
46
|
+
[0.1.0]: https://github.com/kulapard/pi-caveman/releases/tag/v0.1.0
|
package/README.md
CHANGED
|
@@ -56,12 +56,6 @@ persistent install over a per-session `pi -e`.
|
|
|
56
56
|
```bash
|
|
57
57
|
npm install # fetch the Pi SDK + TypeScript dev deps
|
|
58
58
|
npm test # typecheck + extension/manifest/docs unit tests
|
|
59
|
-
|
|
60
|
-
# Python tests for the caveman-compress toolkit need pytest in a local venv
|
|
61
|
-
# (pytest is not on PATH on a fresh checkout):
|
|
62
|
-
python3 -m venv .venv
|
|
63
|
-
.venv/bin/pip install pytest
|
|
64
|
-
npm run test:py # runs: .venv/bin/pytest skills/caveman-compress
|
|
65
59
|
```
|
|
66
60
|
|
|
67
61
|
## Modes
|
|
@@ -122,16 +116,12 @@ tool descriptions. pi-caveman does **not** bundle it: that proxy works at the
|
|
|
122
116
|
MCP-client layer, independent of Pi, and upstream itself ships it as a separate
|
|
123
117
|
package — so it does not belong in this extension-plus-skills package.
|
|
124
118
|
|
|
125
|
-
The Pi-side equivalent is the
|
|
126
|
-
`/caveman-compress`
|
|
127
|
-
(writing a `FILE.original.md` backup)
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
equivalent" means *invoked via a Pi skill/command*, **not** *model-independent*.
|
|
132
|
-
This is the same nature as upstream's MCP shrink (also a model-mediated
|
|
133
|
-
transform); only the integration mechanism differs (a Pi skill here vs. MCP
|
|
134
|
-
middleware upstream).
|
|
119
|
+
The Pi-side equivalent is the `caveman-compress` skill, invoked via the
|
|
120
|
+
`/caveman-compress` command. It is prompt-only: the Pi agent itself compresses a
|
|
121
|
+
prose memory file in place (writing a `FILE.original.md` backup) using its own
|
|
122
|
+
model and file tools, preserving code, URLs, and paths verbatim. No Python and no
|
|
123
|
+
external Claude CLI are involved — compression is performed by the host Pi agent,
|
|
124
|
+
the same way the other skills work.
|
|
135
125
|
|
|
136
126
|
## Attribution & license
|
|
137
127
|
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@kulapard/pi-caveman",
|
|
3
|
-
"version": "0.1.0",
|
|
3
|
+
"version": "0.2.0",
|
|
4
4
|
"description": "Caveman for Pi: ultra-compressed agent output that preserves technical substance. Six intensity modes, slash commands, natural-language activation, and a session statusline.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"license": "MIT",
|
|
@@ -20,9 +20,7 @@
|
|
|
20
20
|
"skills",
|
|
21
21
|
"agents",
|
|
22
22
|
"AGENTS.md",
|
|
23
|
-
"
|
|
24
|
-
"!**/*.pyc",
|
|
25
|
-
"!skills/**/tests"
|
|
23
|
+
"CHANGELOG.md"
|
|
26
24
|
],
|
|
27
25
|
"engines": {
|
|
28
26
|
"node": ">=18"
|
|
@@ -47,7 +45,6 @@
|
|
|
47
45
|
"pretest": "npm run typecheck",
|
|
48
46
|
"test": "node --experimental-strip-types --test tests/**/*.test.mjs",
|
|
49
47
|
"typecheck": "tsc --noEmit",
|
|
50
|
-
"test:py": ".venv/bin/pytest skills/caveman-compress",
|
|
51
48
|
"prepublishOnly": "npm test"
|
|
52
49
|
},
|
|
53
50
|
"devDependencies": {
|
|
@@ -40,7 +40,7 @@ Real results on real project files:
|
|
|
40
40
|
| `mixed-with-code.md` | 888 | 560 | **36.9%** |
|
|
41
41
|
| **Average** | **898** | **481** | **46%** |
|
|
42
42
|
|
|
43
|
-
|
|
43
|
+
Headings, code blocks, URLs, and file paths preserved exactly.
|
|
44
44
|
|
|
45
45
|
## Before / After
|
|
46
46
|
|
|
@@ -65,23 +65,14 @@ All validations passed ✅ — headings, code blocks, URLs, file paths preserved
|
|
|
65
65
|
|
|
66
66
|
**Same instructions. ~60% fewer tokens in this example (46% average across the files above). Every. Single. Session.**
|
|
67
67
|
|
|
68
|
-
## Security
|
|
69
|
-
|
|
70
|
-
`caveman-compress` is flagged as Snyk High Risk due to subprocess and file I/O patterns detected by static analysis. This is a false positive — see [SECURITY.md](./SECURITY.md) for a full explanation of what the skill does and does not do.
|
|
71
|
-
|
|
72
68
|
## Install
|
|
73
69
|
|
|
74
70
|
This skill ships inside the pi-caveman package. Load the package (see the
|
|
75
71
|
[root README](../../README.md) for the `pi -e … --skill …` / `pi install`
|
|
76
72
|
mechanism), then use `/caveman-compress` in a Pi session.
|
|
77
73
|
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
**Requires:** Python 3.10+. Compression calls a model (the Anthropic SDK if
|
|
82
|
-
`ANTHROPIC_API_KEY` is set, otherwise the `claude --print` CLI), so one of those
|
|
83
|
-
must be available — see [Security](#security) and the root README's
|
|
84
|
-
"Compression vs. upstream MCP shrink" note for why this step is model-bound.
|
|
74
|
+
No extra runtime is required: the Pi agent performs the compression itself with
|
|
75
|
+
its own model and file tools — there is no separate tool or language to install.
|
|
85
76
|
|
|
86
77
|
## Usage
|
|
87
78
|
|
|
@@ -111,23 +102,20 @@ Examples:
|
|
|
111
102
|
```
|
|
112
103
|
/caveman-compress AGENTS.md
|
|
113
104
|
↓
|
|
114
|
-
|
|
105
|
+
agent detects file type (prose? else skip)
|
|
115
106
|
↓
|
|
116
|
-
|
|
107
|
+
agent backs up original → AGENTS.original.md (verbatim, never overwritten)
|
|
117
108
|
↓
|
|
118
|
-
|
|
119
|
-
checks: headings, code blocks, URLs, file paths, bullets
|
|
109
|
+
agent rewrites prose to caveman, code/URLs/paths left exact
|
|
120
110
|
↓
|
|
121
|
-
|
|
122
|
-
does NOT recompress — only patches broken parts
|
|
111
|
+
agent self-validates: protected tokens byte-identical to original
|
|
123
112
|
↓
|
|
124
|
-
|
|
113
|
+
if a protected token changed: fix it, or restore from backup and report
|
|
125
114
|
↓
|
|
126
|
-
write compressed
|
|
127
|
-
write original → AGENTS.original.md
|
|
115
|
+
write compressed → AGENTS.md
|
|
128
116
|
```
|
|
129
117
|
|
|
130
|
-
|
|
118
|
+
The agent does this with its own model and file tools — no external CLI, no separate runtime.
|
|
131
119
|
|
|
132
120
|
## What Is Preserved
|
|
133
121
|
|
|
@@ -149,15 +137,6 @@ A memory file (`AGENTS.md` / `CLAUDE.md`) loads on **every session start**. A 10
|
|
|
149
137
|
|
|
150
138
|
Caveman cut that by ~46% on average. Same instructions. Same accuracy. Less waste.
|
|
151
139
|
|
|
152
|
-
```
|
|
153
|
-
┌────────────────────────────────────────────┐
|
|
154
|
-
│ TOKEN SAVINGS PER FILE █████ 46% │
|
|
155
|
-
│ SESSIONS THAT BENEFIT ██████████ 100% │
|
|
156
|
-
│ INFORMATION PRESERVED ██████████ 100% │
|
|
157
|
-
│ SETUP TIME █ 1x │
|
|
158
|
-
└────────────────────────────────────────────┘
|
|
159
|
-
```
|
|
160
|
-
|
|
161
140
|
## Part of Caveman
|
|
162
141
|
|
|
163
142
|
This skill is part of the [caveman](https://github.com/JuliusBrussee/caveman) toolkit — making the agent use fewer tokens without losing accuracy. pi-caveman is the [Pi](https://github.com/earendil-works/pi-coding-agent) port.
|
|
@@ -19,21 +19,17 @@ Compress natural language files (`AGENTS.md`, `CLAUDE.md`, todos, preferences) i
|
|
|
19
19
|
|
|
20
20
|
## Process
|
|
21
21
|
|
|
22
|
-
|
|
22
|
+
You (the Pi agent) perform the compression directly — there is no separate tool to run. Given `/caveman-compress <filepath>`:
|
|
23
23
|
|
|
24
|
-
|
|
24
|
+
1. **Skip backups.** If the path ends in `.original.<ext>` (e.g. `AGENTS.original.md`), stop — never compress a backup file.
|
|
25
|
+
2. **Check it is compressible** per **Boundaries** below: prose files (`.md`, `.txt`, `.rst`, `.typ`, `.typst`, `.tex`, or extensionless natural language). If it is code/config (`.py`, `.js`, `.ts`, `.json`, `.yaml`, …) or larger than ~500 KB, report it is out of scope and stop.
|
|
26
|
+
3. **Read** the file's full contents.
|
|
27
|
+
4. **Back up the original.** Write a verbatim copy to `<filename>.original.<ext>` (e.g. `AGENTS.md` → `AGENTS.original.md`), **only if that backup does not already exist** — never overwrite an existing `.original` backup.
|
|
28
|
+
5. **Rewrite** the file in place, applying the **Compression Rules** below. Treat code blocks, inline code, URLs, paths, commands, headings, and table structure as read-only regions.
|
|
29
|
+
6. **Self-validate** against the contents you read in step 3: every protected token — fenced and inline code, URLs, file paths, heading text, table structure, dates/version numbers — must be byte-for-byte identical. If any changed, fix just that region; if you cannot make it identical, restore the file from the `.original` backup and report the failure rather than leave a corrupted file.
|
|
30
|
+
7. **Report** the result: bytes before/after and the approximate reduction.
|
|
25
31
|
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
3. The CLI will:
|
|
29
|
-
- detect file type (no tokens)
|
|
30
|
-
- call a model to compress (Anthropic SDK if `ANTHROPIC_API_KEY` is set, else the `claude --print` CLI)
|
|
31
|
-
- validate output (no tokens)
|
|
32
|
-
- if errors: cherry-pick fix via the same model call (targeted fixes only, no recompression)
|
|
33
|
-
- retry up to 2 times
|
|
34
|
-
- if still failing after 2 retries: report error to user, leave original file untouched
|
|
35
|
-
|
|
36
|
-
4. Return result to user
|
|
32
|
+
Only the rewrite needs the model (you); detection, backup, and validation are mechanical.
|
|
37
33
|
|
|
38
34
|
## Compression Rules
|
|
39
35
|
|
|
@@ -103,8 +99,9 @@ Compressed:
|
|
|
103
99
|
|
|
104
100
|
## Boundaries
|
|
105
101
|
|
|
106
|
-
- ONLY compress natural language files (.md, .txt, .typ, .typst, .tex, extensionless)
|
|
102
|
+
- ONLY compress natural language files (.md, .txt, .rst, .typ, .typst, .tex, extensionless)
|
|
107
103
|
- NEVER modify: .py, .js, .ts, .json, .yaml, .yml, .toml, .env, .lock, .css, .html, .xml, .sql, .sh
|
|
104
|
+
- Skip files larger than ~500 KB (too big to rewrite safely in one pass)
|
|
108
105
|
- If file has mixed content (prose + code), compress ONLY the prose sections
|
|
109
106
|
- If unsure whether something is code or prose, leave it unchanged
|
|
110
107
|
- Original file is backed up as FILE.original.md before overwriting
|
|
@@ -1,31 +0,0 @@
|
|
|
1
|
-
# Security
|
|
2
|
-
|
|
3
|
-
## Snyk High Risk Rating
|
|
4
|
-
|
|
5
|
-
`caveman-compress` receives a Snyk High Risk rating due to static analysis heuristics. This document explains what the skill does and does not do.
|
|
6
|
-
|
|
7
|
-
### What triggers the rating
|
|
8
|
-
|
|
9
|
-
1. **subprocess usage**: The skill calls the `claude` CLI via `subprocess.run()` as a fallback when `ANTHROPIC_API_KEY` is not set. The subprocess call uses a fixed argument list — no shell interpolation occurs. User file content is passed via stdin, not as a shell argument.
|
|
10
|
-
|
|
11
|
-
2. **File read/write**: The skill reads the file the user explicitly points it at, compresses it, and writes the result back to the same path. A `.original.md` backup is saved alongside it. No files outside the user-specified path are read or written.
|
|
12
|
-
|
|
13
|
-
### What the skill does NOT do
|
|
14
|
-
|
|
15
|
-
- Does not execute user file content as code
|
|
16
|
-
- Does not make network requests except to Anthropic's API (via SDK or CLI)
|
|
17
|
-
- Does not access files outside the path the user provides
|
|
18
|
-
- Does not use shell=True or string interpolation in subprocess calls
|
|
19
|
-
- Does not collect or transmit any data beyond the file being compressed
|
|
20
|
-
|
|
21
|
-
### Auth behavior
|
|
22
|
-
|
|
23
|
-
If `ANTHROPIC_API_KEY` is set, the skill uses the Anthropic Python SDK directly (no subprocess). If not set, it falls back to the `claude` CLI, which uses the user's existing Claude desktop authentication.
|
|
24
|
-
|
|
25
|
-
### File size limit
|
|
26
|
-
|
|
27
|
-
Files larger than 500KB are rejected before any API call is made.
|
|
28
|
-
|
|
29
|
-
### Reporting a vulnerability
|
|
30
|
-
|
|
31
|
-
If you believe you've found a genuine security issue, please open a GitHub issue with the label `security`.
|
|
@@ -1,80 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
|
-
from pathlib import Path
|
|
3
|
-
import sys
|
|
4
|
-
|
|
5
|
-
# Support both direct execution and module import
|
|
6
|
-
try:
|
|
7
|
-
from .validate import validate
|
|
8
|
-
except ImportError:
|
|
9
|
-
sys.path.insert(0, str(Path(__file__).parent))
|
|
10
|
-
from validate import validate
|
|
11
|
-
|
|
12
|
-
try:
|
|
13
|
-
import tiktoken
|
|
14
|
-
_enc = tiktoken.get_encoding("o200k_base")
|
|
15
|
-
except ImportError:
|
|
16
|
-
_enc = None
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
def count_tokens(text):
|
|
20
|
-
if _enc is None:
|
|
21
|
-
return len(text.split()) # fallback: word count
|
|
22
|
-
return len(_enc.encode(text))
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
def benchmark_pair(orig_path: Path, comp_path: Path):
|
|
26
|
-
orig_text = orig_path.read_text()
|
|
27
|
-
comp_text = comp_path.read_text()
|
|
28
|
-
|
|
29
|
-
orig_tokens = count_tokens(orig_text)
|
|
30
|
-
comp_tokens = count_tokens(comp_text)
|
|
31
|
-
saved = 100 * (orig_tokens - comp_tokens) / orig_tokens if orig_tokens > 0 else 0.0
|
|
32
|
-
result = validate(orig_path, comp_path)
|
|
33
|
-
|
|
34
|
-
return (comp_path.name, orig_tokens, comp_tokens, saved, result.is_valid)
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
def print_table(rows):
|
|
38
|
-
print("\n| File | Original | Compressed | Saved % | Valid |")
|
|
39
|
-
print("|------|----------|------------|---------|-------|")
|
|
40
|
-
for r in rows:
|
|
41
|
-
print(f"| {r[0]} | {r[1]} | {r[2]} | {r[3]:.1f}% | {'✅' if r[4] else '❌'} |")
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
def main():
|
|
45
|
-
# Direct file pair: python3 benchmark.py original.md compressed.md
|
|
46
|
-
if len(sys.argv) == 3:
|
|
47
|
-
orig = Path(sys.argv[1]).resolve()
|
|
48
|
-
comp = Path(sys.argv[2]).resolve()
|
|
49
|
-
if not orig.exists():
|
|
50
|
-
print(f"❌ Not found: {orig}")
|
|
51
|
-
sys.exit(1)
|
|
52
|
-
if not comp.exists():
|
|
53
|
-
print(f"❌ Not found: {comp}")
|
|
54
|
-
sys.exit(1)
|
|
55
|
-
print_table([benchmark_pair(orig, comp)])
|
|
56
|
-
return
|
|
57
|
-
|
|
58
|
-
# Glob mode: repo_root/tests/caveman-compress/
|
|
59
|
-
# __file__ lives at <repo_root>/skills/caveman-compress/scripts/benchmark.py
|
|
60
|
-
# Walk up four dirs: scripts → caveman-compress → skills → repo_root.
|
|
61
|
-
tests_dir = Path(__file__).resolve().parents[3] / "tests" / "caveman-compress"
|
|
62
|
-
if not tests_dir.exists():
|
|
63
|
-
print(f"❌ Tests dir not found: {tests_dir}")
|
|
64
|
-
sys.exit(1)
|
|
65
|
-
|
|
66
|
-
rows = []
|
|
67
|
-
for orig in sorted(tests_dir.glob("*.original.md")):
|
|
68
|
-
comp = orig.with_name(orig.stem.removesuffix(".original") + ".md")
|
|
69
|
-
if comp.exists():
|
|
70
|
-
rows.append(benchmark_pair(orig, comp))
|
|
71
|
-
|
|
72
|
-
if not rows:
|
|
73
|
-
print("No compressed file pairs found.")
|
|
74
|
-
return
|
|
75
|
-
|
|
76
|
-
print_table(rows)
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
if __name__ == "__main__":
|
|
80
|
-
main()
|
|
@@ -1,85 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
|
-
"""
|
|
3
|
-
Caveman Compress CLI
|
|
4
|
-
|
|
5
|
-
Usage:
|
|
6
|
-
caveman <filepath>
|
|
7
|
-
"""
|
|
8
|
-
|
|
9
|
-
import sys
|
|
10
|
-
|
|
11
|
-
# Force UTF-8 on stdout/stderr before any code can print. Windows consoles
|
|
12
|
-
# default to cp1252 and crash on the ❌ glyphs in error/validation branches,
|
|
13
|
-
# masking the real error and leaving the user with a half-compressed file.
|
|
14
|
-
for _stream in (sys.stdout, sys.stderr):
|
|
15
|
-
reconfigure = getattr(_stream, "reconfigure", None)
|
|
16
|
-
if callable(reconfigure):
|
|
17
|
-
try:
|
|
18
|
-
reconfigure(encoding="utf-8", errors="replace")
|
|
19
|
-
except Exception:
|
|
20
|
-
pass
|
|
21
|
-
|
|
22
|
-
from pathlib import Path
|
|
23
|
-
|
|
24
|
-
from .compress import backup_dir_for, compress_file
|
|
25
|
-
from .detect import detect_file_type, should_compress
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
def print_usage():
|
|
29
|
-
print("Usage: caveman <filepath>")
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
def main():
|
|
33
|
-
if len(sys.argv) != 2:
|
|
34
|
-
print_usage()
|
|
35
|
-
sys.exit(1)
|
|
36
|
-
|
|
37
|
-
filepath = Path(sys.argv[1])
|
|
38
|
-
|
|
39
|
-
# Check file exists
|
|
40
|
-
if not filepath.exists():
|
|
41
|
-
print(f"❌ File not found: {filepath}")
|
|
42
|
-
sys.exit(1)
|
|
43
|
-
|
|
44
|
-
if not filepath.is_file():
|
|
45
|
-
print(f"❌ Not a file: {filepath}")
|
|
46
|
-
sys.exit(1)
|
|
47
|
-
|
|
48
|
-
filepath = filepath.resolve()
|
|
49
|
-
|
|
50
|
-
# Detect file type
|
|
51
|
-
file_type = detect_file_type(filepath)
|
|
52
|
-
|
|
53
|
-
print(f"Detected: {file_type}")
|
|
54
|
-
|
|
55
|
-
# Check if compressible
|
|
56
|
-
if not should_compress(filepath):
|
|
57
|
-
print("Skipping: file is not natural language (code/config)")
|
|
58
|
-
sys.exit(0)
|
|
59
|
-
|
|
60
|
-
print("Starting caveman compression...\n")
|
|
61
|
-
|
|
62
|
-
try:
|
|
63
|
-
success = compress_file(filepath)
|
|
64
|
-
|
|
65
|
-
if success:
|
|
66
|
-
print("\nCompression completed successfully")
|
|
67
|
-
backup_path = backup_dir_for(filepath) / (filepath.stem + ".original.md")
|
|
68
|
-
print(f"Compressed: {filepath}")
|
|
69
|
-
print(f"Original: {backup_path}")
|
|
70
|
-
sys.exit(0)
|
|
71
|
-
else:
|
|
72
|
-
print("\n❌ Compression failed after retries")
|
|
73
|
-
sys.exit(2)
|
|
74
|
-
|
|
75
|
-
except KeyboardInterrupt:
|
|
76
|
-
print("\nInterrupted by user")
|
|
77
|
-
sys.exit(130)
|
|
78
|
-
|
|
79
|
-
except Exception as e:
|
|
80
|
-
print(f"\n❌ Error: {e}")
|
|
81
|
-
sys.exit(1)
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
if __name__ == "__main__":
|
|
85
|
-
main()
|
|
@@ -1,341 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
|
-
"""
|
|
3
|
-
Caveman Memory Compression Orchestrator
|
|
4
|
-
|
|
5
|
-
Usage:
|
|
6
|
-
python scripts/compress.py <filepath>
|
|
7
|
-
"""
|
|
8
|
-
|
|
9
|
-
import os
|
|
10
|
-
import re
|
|
11
|
-
import shutil
|
|
12
|
-
import subprocess
|
|
13
|
-
import sys
|
|
14
|
-
from pathlib import Path
|
|
15
|
-
|
|
16
|
-
OUTER_FENCE_REGEX = re.compile(
|
|
17
|
-
r"\A\s*(`{3,}|~{3,})[^\n]*\n(.*)\n\1\s*\Z", re.DOTALL
|
|
18
|
-
)
|
|
19
|
-
|
|
20
|
-
# YAML frontmatter: starts at file start with --- on its own line, ends with --- on its own line.
|
|
21
|
-
# Captures the entire block (including delimiters and trailing newline) and the body after.
|
|
22
|
-
FRONTMATTER_REGEX = re.compile(
|
|
23
|
-
r"\A(---\r?\n.*?\r?\n---\r?\n)(.*)", re.DOTALL
|
|
24
|
-
)
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
def split_frontmatter(text: str):
|
|
28
|
-
"""Split YAML frontmatter from body. Returns (frontmatter, body).
|
|
29
|
-
|
|
30
|
-
Memory files (and many other markdown docs) start with a YAML frontmatter
|
|
31
|
-
block delimited by `---` lines. The compression LLM has a habit of stripping
|
|
32
|
-
or rewriting these despite preserve-structure rules in the prompt — so we
|
|
33
|
-
surgically remove the frontmatter before compression and prepend it back
|
|
34
|
-
verbatim to the output. Files without frontmatter pass through unchanged.
|
|
35
|
-
"""
|
|
36
|
-
m = FRONTMATTER_REGEX.match(text)
|
|
37
|
-
if m:
|
|
38
|
-
return m.group(1), m.group(2)
|
|
39
|
-
return "", text
|
|
40
|
-
|
|
41
|
-
# Filenames and paths that almost certainly hold secrets or PII. Compressing
|
|
42
|
-
# them ships raw bytes to the Anthropic API — a third-party data boundary that
|
|
43
|
-
# developers on sensitive codebases cannot cross. detect.py already skips .env
|
|
44
|
-
# by extension, but credentials.md / secrets.txt / ~/.aws/credentials would
|
|
45
|
-
# slip through the natural-language filter. This is a hard refuse before read.
|
|
46
|
-
SENSITIVE_BASENAME_REGEX = re.compile(
|
|
47
|
-
r"(?ix)^("
|
|
48
|
-
r"\.env(\..+)?"
|
|
49
|
-
r"|\.netrc"
|
|
50
|
-
r"|credentials(\..+)?"
|
|
51
|
-
r"|secrets?(\..+)?"
|
|
52
|
-
r"|passwords?(\..+)?"
|
|
53
|
-
r"|id_(rsa|dsa|ecdsa|ed25519)(\.pub)?"
|
|
54
|
-
r"|authorized_keys"
|
|
55
|
-
r"|known_hosts"
|
|
56
|
-
r"|.*\.(pem|key|p12|pfx|crt|cer|jks|keystore|asc|gpg)"
|
|
57
|
-
r")$"
|
|
58
|
-
)
|
|
59
|
-
|
|
60
|
-
SENSITIVE_PATH_COMPONENTS = frozenset({".ssh", ".aws", ".gnupg", ".kube", ".docker"})
|
|
61
|
-
|
|
62
|
-
SENSITIVE_NAME_TOKENS = (
|
|
63
|
-
"secret", "credential", "password", "passwd",
|
|
64
|
-
"apikey", "accesskey", "token", "privatekey",
|
|
65
|
-
)
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
def backup_dir_for(filepath: Path) -> Path:
|
|
69
|
-
"""Resolve the out-of-tree backup directory for a given source file.
|
|
70
|
-
|
|
71
|
-
Backups must live OUTSIDE the source directory so skill auto-loaders
|
|
72
|
-
(Claude Code rules/, opencode instructions/, etc.) stop re-ingesting the
|
|
73
|
-
`.original.md` copies as live files. Base dir is platform-aware:
|
|
74
|
-
- Windows: %LOCALAPPDATA%\\caveman-compress\\backups
|
|
75
|
-
- else: $XDG_DATA_HOME/caveman-compress/backups if set,
|
|
76
|
-
else ~/.local/share/caveman-compress/backups
|
|
77
|
-
|
|
78
|
-
The source file's parent-dir name is mirrored under the base to reduce
|
|
79
|
-
cross-project collisions (e.g. two `task.md` files in different repos).
|
|
80
|
-
"""
|
|
81
|
-
if os.name == "nt" or sys.platform == "win32":
|
|
82
|
-
local_appdata = os.environ.get("LOCALAPPDATA")
|
|
83
|
-
base = Path(local_appdata) if local_appdata else Path.home() / "AppData" / "Local"
|
|
84
|
-
base = base / "caveman-compress" / "backups"
|
|
85
|
-
else:
|
|
86
|
-
xdg = os.environ.get("XDG_DATA_HOME")
|
|
87
|
-
base = Path(xdg) if xdg else Path.home() / ".local" / "share"
|
|
88
|
-
base = base / "caveman-compress" / "backups"
|
|
89
|
-
return base / filepath.parent.name
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
def is_sensitive_path(filepath: Path) -> bool:
|
|
93
|
-
"""Heuristic denylist for files that must never be shipped to a third-party API."""
|
|
94
|
-
name = filepath.name
|
|
95
|
-
if SENSITIVE_BASENAME_REGEX.match(name):
|
|
96
|
-
return True
|
|
97
|
-
lowered_parts = {p.lower() for p in filepath.parts}
|
|
98
|
-
if lowered_parts & SENSITIVE_PATH_COMPONENTS:
|
|
99
|
-
return True
|
|
100
|
-
# Normalize separators so "api-key" and "api_key" both match "apikey".
|
|
101
|
-
lower = re.sub(r"[_\-\s.]", "", name.lower())
|
|
102
|
-
return any(tok in lower for tok in SENSITIVE_NAME_TOKENS)
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
def strip_llm_wrapper(text: str) -> str:
|
|
106
|
-
"""Strip outer ```markdown ... ``` fence when it wraps the entire output."""
|
|
107
|
-
m = OUTER_FENCE_REGEX.match(text)
|
|
108
|
-
if m:
|
|
109
|
-
return m.group(2)
|
|
110
|
-
return text
|
|
111
|
-
|
|
112
|
-
from .detect import should_compress
|
|
113
|
-
from .validate import validate
|
|
114
|
-
|
|
115
|
-
MAX_RETRIES = 2
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
# ---------- Claude Calls ----------
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
def call_claude(prompt: str) -> str:
|
|
122
|
-
"""Send a prompt to Claude.
|
|
123
|
-
|
|
124
|
-
Prefers the Anthropic SDK when ANTHROPIC_API_KEY is set; otherwise falls
|
|
125
|
-
back to the ``claude --print`` CLI (which handles desktop auth).
|
|
126
|
-
|
|
127
|
-
On Windows the CLI subprocess decoding defaults to the system codepage
|
|
128
|
-
(cp1251 / cp1252) and crashes on UTF-8 output — see issue #152. Pinning
|
|
129
|
-
``encoding="utf-8"`` with ``errors="replace"`` matches the CLI's actual
|
|
130
|
-
native I/O and prevents the UnicodeDecodeError before validation can
|
|
131
|
-
report. Windows users with non-ASCII content can also set
|
|
132
|
-
``ANTHROPIC_API_KEY`` to route through the SDK and skip the subprocess.
|
|
133
|
-
"""
|
|
134
|
-
api_key = os.environ.get("ANTHROPIC_API_KEY")
|
|
135
|
-
if api_key:
|
|
136
|
-
try:
|
|
137
|
-
import anthropic
|
|
138
|
-
|
|
139
|
-
client = anthropic.Anthropic(api_key=api_key)
|
|
140
|
-
msg = client.messages.create(
|
|
141
|
-
model=os.environ.get("CAVEMAN_MODEL", "claude-sonnet-4-5"),
|
|
142
|
-
max_tokens=8192,
|
|
143
|
-
messages=[{"role": "user", "content": prompt}],
|
|
144
|
-
)
|
|
145
|
-
return strip_llm_wrapper(msg.content[0].text.strip())
|
|
146
|
-
except ImportError:
|
|
147
|
-
pass # anthropic not installed, fall back to CLI
|
|
148
|
-
# Fallback: use claude CLI (handles desktop auth).
|
|
149
|
-
# Resolve binary via shutil.which so Windows .cmd/.bat shims (e.g.
|
|
150
|
-
# %APPDATA%\npm\claude.CMD) work without shell=True. On POSIX,
|
|
151
|
-
# shutil.which returns the same absolute path as the implicit lookup,
|
|
152
|
-
# so this is a no-op there. Falls back to bare "claude" if not found
|
|
153
|
-
# on PATH so subprocess raises a clear FileNotFoundError.
|
|
154
|
-
claude_bin = shutil.which("claude") or "claude"
|
|
155
|
-
try:
|
|
156
|
-
result = subprocess.run(
|
|
157
|
-
[claude_bin, "--print"],
|
|
158
|
-
input=prompt,
|
|
159
|
-
text=True,
|
|
160
|
-
capture_output=True,
|
|
161
|
-
check=True,
|
|
162
|
-
encoding="utf-8",
|
|
163
|
-
errors="replace",
|
|
164
|
-
)
|
|
165
|
-
return strip_llm_wrapper(result.stdout.strip())
|
|
166
|
-
except subprocess.CalledProcessError as e:
|
|
167
|
-
raise RuntimeError(f"Claude call failed:\n{e.stderr}")
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
def build_compress_prompt(original: str) -> str:
|
|
171
|
-
return f"""
|
|
172
|
-
Compress this markdown into caveman format.
|
|
173
|
-
|
|
174
|
-
STRICT RULES:
|
|
175
|
-
- Do NOT modify anything inside ``` code blocks
|
|
176
|
-
- Do NOT modify anything inside inline backticks
|
|
177
|
-
- Preserve ALL URLs exactly
|
|
178
|
-
- Preserve ALL headings exactly
|
|
179
|
-
- Preserve file paths and commands
|
|
180
|
-
- Return ONLY the compressed markdown body — do NOT wrap the entire output in a ```markdown fence or any other fence. Inner code blocks from the original stay as-is; do not add a new outer fence around the whole file.
|
|
181
|
-
|
|
182
|
-
Only compress natural language.
|
|
183
|
-
|
|
184
|
-
TEXT:
|
|
185
|
-
{original}
|
|
186
|
-
"""
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
def build_fix_prompt(original: str, compressed: str, errors: list[str]) -> str:
    """Return the prompt asking the model to repair specific validation errors.

    Args:
        original: Reference text used to restore missing content.
        compressed: The compressed text that failed validation.
        errors: Validation error messages, rendered one per bullet line.
    """
    bullet_list = "\n".join(["- " + f"{problem}" for problem in errors])
    pieces = [
        "You are fixing a caveman-compressed markdown file. Specific validation errors were found.",
        "",
        "CRITICAL RULES:",
        "- DO NOT recompress or rephrase the file",
        "- ONLY fix the listed errors — leave everything else exactly as-is",
        "- The ORIGINAL is provided as reference only (to restore missing content)",
        "- Preserve caveman style in all untouched sections",
        "",
        "ERRORS TO FIX:",
        bullet_list,
        "",
        "HOW TO FIX:",
        "- Missing URL: find it in ORIGINAL, restore it exactly where it belongs in COMPRESSED",
        "- Code block mismatch: find the exact code block in ORIGINAL, restore it in COMPRESSED",
        "- Heading mismatch: restore the exact heading text from ORIGINAL into COMPRESSED",
        "- Do not touch any section not mentioned in the errors",
        "",
        "ORIGINAL (reference only):",
        original,
        "",
        "COMPRESSED (fix this):",
        compressed,
        "",
        "Return ONLY the fixed compressed file. No explanation.",
        "",  # yields the trailing newline
    ]
    return "\n".join(pieces)
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
# ---------- Core Logic ----------
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
def compress_file(filepath: Path) -> bool:
    """Compress one natural-language file in place, keeping a backup of the original.

    The file body (YAML frontmatter excluded) is sent to Claude, the result is
    written over the input, and the output is validated against the backup.
    Failed validations are repaired with follow-up calls, up to MAX_RETRIES
    validation attempts; if the last attempt still fails, the original bytes
    are restored and the backup is removed.

    Args:
        filepath: File to compress; resolved to an absolute path first.

    Returns:
        True when the file was compressed and passed validation. False when it
        was skipped, the run was aborted before touching the file, or the
        original had to be restored.

    Raises:
        FileNotFoundError: The path does not exist.
        ValueError: The file exceeds the size limit or its name looks sensitive.
        Exception: Anything raised by the Claude call is propagated; if it
            happens during a fix attempt the original is restored first.
    """
    # Resolve and validate path
    filepath = filepath.resolve()
    MAX_FILE_SIZE = 500_000  # 500KB
    if not filepath.exists():
        raise FileNotFoundError(f"File not found: {filepath}")
    if filepath.stat().st_size > MAX_FILE_SIZE:
        raise ValueError(f"File too large to compress safely (max 500KB): {filepath}")

    # Refuse files that look like they contain secrets or PII. Compressing ships
    # the raw bytes to the Anthropic API — a third-party boundary — so we fail
    # loudly rather than silently exfiltrate credentials or keys. Override is
    # intentional: the user must rename the file if the heuristic is wrong.
    if is_sensitive_path(filepath):
        raise ValueError(
            f"Refusing to compress {filepath}: filename looks sensitive "
            "(credentials, keys, secrets, or known private paths). "
            "Compression sends file contents to the Anthropic API. "
            "Rename the file if this is a false positive."
        )

    print(f"Processing: {filepath}")

    if not should_compress(filepath):
        print("Skipping (not natural language)")
        return False

    # Keep the exact bytes for backup/restore: the decoded text drops any
    # undecodable bytes (errors="ignore"), so restoring from it could be lossy.
    original_bytes = filepath.read_bytes()
    # Decode explicitly as UTF-8 so behavior does not depend on the locale's
    # preferred encoding (the Claude CLI call also exchanges UTF-8 text).
    original_text = filepath.read_text(encoding="utf-8", errors="ignore")

    # Store backup outside the source directory so skill auto-loaders don't
    # re-ingest the `.original.md` copy as a live file. Mirror the source's
    # parent-dir name + stem under a platform-aware base to reduce collisions.
    # The directory itself is only created right before the backup is written,
    # so aborted runs leave nothing behind.
    backup_dir = backup_dir_for(filepath)
    backup_path = backup_dir / (filepath.stem + ".original.md")

    if not original_text.strip():
        print("❌ Refusing to compress: file is empty or whitespace-only.")
        return False

    # Check if backup already exists to prevent accidental overwriting
    if backup_path.exists():
        print(f"⚠️ Backup file already exists: {backup_path}")
        print("The original backup may contain important content.")
        print("Aborting to prevent data loss. Please remove or rename the backup file if you want to proceed.")
        return False

    # Split YAML frontmatter off before compression. Claude tends to strip or
    # rewrite frontmatter despite preserve-structure rules; we keep it verbatim
    # by removing it from the input and re-prepending it to the output.
    frontmatter, body = split_frontmatter(original_text)
    if frontmatter:
        print(f"Detected YAML frontmatter ({len(frontmatter)} chars) — preserving verbatim")

    if not body.strip():
        print("❌ Refusing to compress: body is empty after frontmatter removal.")
        return False

    # Step 1: Compress (body only, frontmatter excluded)
    print("Compressing with Claude...")
    compressed_body = call_claude(build_compress_prompt(body))

    if compressed_body is None or not compressed_body.strip():
        print("❌ Compression aborted: Claude returned an empty response.")
        print(" Original file is untouched (no backup created).")
        return False

    # Compare the BODY (not the whole file) — frontmatter is preserved verbatim
    # and would never change, so identity must be judged on the compressible part.
    if compressed_body.strip() == body.strip():
        print("❌ Compression aborted: output is identical to input.")
        print(" Likely causes: Claude refused, returned the prompt verbatim, or the file is")
        print(" already in caveman form. Original file is untouched (no backup created).")
        return False

    # Save the original as a byte-exact backup, then verify the readback before
    # touching the input file. If the filesystem dropped bytes (antivirus,
    # disk full), unlink the bad backup and abort instead of leaving the user
    # with a corrupt backup + compressed primary.
    backup_dir.mkdir(parents=True, exist_ok=True)
    backup_path.write_bytes(original_bytes)
    if backup_path.read_bytes() != original_bytes:
        print(f"❌ Backup write verification failed: {backup_path}")
        print(" In-memory original differs from on-disk backup. Aborting before touching the input file.")
        try:
            backup_path.unlink()
        except OSError:
            pass  # best effort: the bad backup may already be gone
        return False

    def _restore_original() -> None:
        # Put the untouched bytes back and drop the now-redundant backup.
        filepath.write_bytes(original_bytes)
        backup_path.unlink(missing_ok=True)

    # Reassemble: frontmatter (verbatim) + compressed body
    filepath.write_text(frontmatter + compressed_body, encoding="utf-8")

    # Step 2: Validate + Retry
    for attempt in range(MAX_RETRIES):
        print(f"\nValidation attempt {attempt + 1}")

        result = validate(backup_path, filepath)

        if result.is_valid:
            print("Validation passed")
            break

        print("❌ Validation failed:")
        for err in result.errors:
            print(f" - {err}")

        if attempt == MAX_RETRIES - 1:
            # Restore original on failure
            _restore_original()
            print("❌ Failed after retries — original restored")
            return False

        print("Fixing with Claude...")
        try:
            # Repair the body only: frontmatter is re-attached verbatim below,
            # exactly as in step 1, so the model never gets to rewrite it.
            fixed_body = call_claude(
                build_fix_prompt(body, compressed_body, result.errors)
            )
        except Exception:
            # Never leave a known-invalid file behind when the call fails.
            _restore_original()
            raise

        if fixed_body is None or not fixed_body.strip():
            _restore_original()
            print("❌ Fix aborted: Claude returned an empty response — original restored")
            return False

        compressed_body = fixed_body
        filepath.write_text(frontmatter + compressed_body, encoding="utf-8")

    return True
|
|
@@ -1,169 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
|
-
"""Detect whether a file is natural language (compressible) or code/config (skip)."""
|
|
3
|
-
|
|
4
|
-
import json
|
|
5
|
-
import re
|
|
6
|
-
from pathlib import Path
|
|
7
|
-
|
|
8
|
-
# Suffixes of prose-like documents; files with these are compression candidates.
COMPRESSIBLE_EXTENSIONS = {
    ".markdown",
    ".md",
    ".rst",
    ".tex",
    ".txt",
    ".typ",
    ".typst",
}

# Suffixes of source code, configuration and data files; these are never compressed.
SKIP_EXTENSIONS = {
    # scripting / web
    ".py", ".js", ".jsx", ".ts", ".tsx", ".rb", ".php", ".lua",
    ".css", ".scss", ".html", ".xml",
    # shells and queries
    ".sh", ".bash", ".zsh", ".sql",
    # compiled languages
    ".go", ".rs", ".java", ".kt", ".swift", ".c", ".cpp", ".h", ".hpp",
    # build files and data
    ".dockerfile", ".makefile", ".lock", ".csv",
    # configuration
    ".json", ".yaml", ".yml", ".toml", ".ini", ".cfg", ".env",
}

# Configuration-only subset of SKIP_EXTENSIONS. It decides whether a skipped
# file is reported as "config" rather than "code"; the assertion right below
# keeps the two sets from drifting apart unnoticed.
CONFIG_EXTENSIONS = {
    ".cfg",
    ".env",
    ".ini",
    ".json",
    ".toml",
    ".yaml",
    ".yml",
}
assert CONFIG_EXTENSIONS <= SKIP_EXTENSIONS, (
    "CONFIG_EXTENSIONS must be a subset of SKIP_EXTENSIONS: "
    f"{CONFIG_EXTENSIONS - SKIP_EXTENSIONS} not in SKIP_EXTENSIONS"
)

# Well-known build/config filenames that have an empty `Path.suffix`, keyed by
# lowercased full filename and mapped to their classification. Without this
# table they would skip the extension check and be content-sniffed, possibly
# as natural language.
#
# Names that are literally SKIP_EXTENSIONS entries (such as ".env") are left
# out on purpose: detect_file_type also looks the bare filename up in
# SKIP_EXTENSIONS, so repeating them here would only need hand-syncing.
SKIP_FILENAMES = {
    **dict.fromkeys(("dockerfile", "makefile", "gnumakefile"), "code"),
    **dict.fromkeys(
        (
            ".gitignore",
            ".gitattributes",
            ".dockerignore",
            ".editorconfig",
            ".npmrc",
            ".prettierrc",
            ".eslintrc",
            ".babelrc",
        ),
        "config",
    ),
}
|
|
56
|
-
|
|
57
|
-
# Line-level heuristics: a line matching any of these is counted as code.
CODE_PATTERNS = [
    re.compile(source)
    for source in (
        r"^\s*(import |from .+ import |require\(|const |let |var )",  # imports / declarations
        r"^\s*(def |class |function |async function |export )",  # definitions
        r"^\s*(if\s*\(|for\s*\(|while\s*\(|switch\s*\(|try\s*\{)",  # C-style control flow
        r"^\s*[\}\]\);]+\s*$",  # closing braces/brackets
        r"^\s*@\w+",  # decorators/annotations
        r'^\s*"[^"]+"\s*:\s*',  # JSON-like key-value
        r"^\s*\w+\s*=\s*[{\[\(\"']",  # assignment with literal
    )
]


def _is_code_line(line: str) -> bool:
    """Return True when *line* matches at least one entry of CODE_PATTERNS."""
    for pattern in CODE_PATTERNS:
        if pattern.match(line):
            return True
    return False
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
def _is_json_content(text: str) -> bool:
    """Return True when *text* parses as a JSON document."""
    try:
        json.loads(text)
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass, so this covers both.
        return False
    else:
        return True
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
def _is_yaml_content(lines: list[str]) -> bool:
    """Heuristically decide whether *lines* look like YAML.

    Only the first 30 lines are inspected. A line counts as YAML-like when,
    after stripping, it starts with ``---``, looks like ``key: value``, or is
    a ``- `` list item containing a colon. The content is judged to be YAML
    when more than 60% of the non-empty inspected lines are YAML-like.
    """
    populated = [row for row in (raw.strip() for raw in lines[:30]) if row]
    if not populated:
        return False

    def _yaml_like(row: str) -> bool:
        if row.startswith("---"):
            return True
        if re.match(r"^\w[\w\s]*:\s", row):
            return True
        return row.startswith("- ") and ":" in row

    hits = sum(1 for row in populated if _yaml_like(row))
    return hits / len(populated) > 0.6
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
def detect_file_type(filepath: Path) -> str:
    """Classify a file as 'natural_language', 'code', 'config', or 'unknown'.

    Files with a known suffix are classified from the suffix alone. Files
    without a suffix are classified by well-known filename first and by
    sniffing their content otherwise. A suffix that appears in neither table
    yields 'unknown'.

    Args:
        filepath: Path of the file to classify. It is only read when it has no
            suffix and its name is not a known build/config filename.

    Returns:
        One of: 'natural_language', 'code', 'config', 'unknown'
    """
    ext = filepath.suffix.lower()

    # Extension-based classification
    if ext in COMPRESSIBLE_EXTENSIONS:
        return "natural_language"
    if ext in SKIP_EXTENSIONS:
        return "config" if ext in CONFIG_EXTENSIONS else "code"

    # Files without a suffix (like TODO or LICENSE) — check name, then content.
    if not ext:
        # Leading-dot / build files (".env", "Dockerfile", "Makefile",
        # ".gitignore") have an empty suffix, so the SKIP_EXTENSIONS check above
        # never matched them and they would be content-sniffed as natural
        # language. Classify them by full filename first so they are never
        # offered up for compression to a third-party API.
        name = filepath.name.lower()
        if name in SKIP_FILENAMES:
            return SKIP_FILENAMES[name]
        if name in SKIP_EXTENSIONS:
            return "config" if name in CONFIG_EXTENSIONS else "code"

        try:
            text = filepath.read_text(errors="ignore")
        except OSError:  # includes PermissionError
            return "unknown"

        lines = text.splitlines()[:50]

        # Parse the complete text: a truncated prefix of a valid JSON document
        # is itself invalid JSON, so sniffing only the first N characters would
        # miss every JSON file longer than N (e.g. minified single-line JSON,
        # which none of the line heuristics below recognise either).
        if _is_json_content(text):
            return "config"
        if _is_yaml_content(lines):
            return "config"

        code_lines = sum(1 for line in lines if line.strip() and _is_code_line(line))
        non_empty = sum(1 for line in lines if line.strip())
        if non_empty > 0 and code_lines / non_empty > 0.4:
            return "code"

        return "natural_language"

    return "unknown"
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
def should_compress(filepath: Path) -> bool:
    """Tell whether *filepath* is a regular natural-language file worth compressing.

    Paths that are not regular files and ``*.original.md`` backup copies are
    always rejected; everything else is decided by detect_file_type.
    """
    is_backup_copy = filepath.name.endswith(".original.md")
    return (
        filepath.is_file()
        and not is_backup_copy
        and detect_file_type(filepath) == "natural_language"
    )
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
if __name__ == "__main__":
    import sys

    # Every command-line argument is a file to classify.
    targets = sys.argv[1:]
    if not targets:
        print("Usage: python detect.py <file1> [file2] ...")
        sys.exit(1)

    for raw_target in targets:
        resolved = Path(raw_target).resolve()
        kind = detect_file_type(resolved)
        verdict = should_compress(resolved)
        # One aligned report line per file.
        print(f" {resolved.name:30s} type={kind:20s} compress={verdict}")
|
|
@@ -1,213 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
|
-
import re
|
|
3
|
-
from collections import Counter
|
|
4
|
-
from pathlib import Path
|
|
5
|
-
|
|
6
|
-
# http(s) URL: runs until the next whitespace character or closing parenthesis.
URL_REGEX = re.compile(r"https?://[^\s)]+")
# Fence line: up to three leading whitespace chars, a run of 3+ backticks or
# tildes, then the rest of the line (info string on openers, empty on closers).
FENCE_OPEN_REGEX = re.compile(r"^(\s{0,3})(`{3,}|~{3,})(.*)$")
# ATX heading: 1-6 '#' characters, whitespace, then the title.
HEADING_REGEX = re.compile(r"^(#{1,6})\s+(.*)", flags=re.MULTILINE)
# Bullet list item introduced by '-', '*' or '+'.
BULLET_REGEX = re.compile(r"^\s*[-*+]\s+", flags=re.MULTILINE)

# Rough path detector. A match needs either a path prefix (./ ../ / or a
# drive letter) or a slash/backslash somewhere inside it.
PATH_REGEX = re.compile(
    r"(?:\./|\.\./|/|[A-Za-z]:\\)[\w\-/\\\.]+"  # prefixed path
    r"|[\w\-\.]+[/\\][\w\-/\\\.]+"  # relative path with an inner separator
)
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
class ValidationResult:
    """Outcome of a validation run: a validity flag plus collected messages."""

    def __init__(self) -> None:
        # Warnings never affect validity; the first error clears the flag.
        self.warnings = []
        self.errors = []
        self.is_valid = True

    def add_warning(self, msg) -> None:
        """Record a non-fatal finding; validity is left untouched."""
        self.warnings.append(msg)

    def add_error(self, msg) -> None:
        """Record a fatal finding and mark the result as invalid."""
        self.errors.append(msg)
        self.is_valid = False
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
def read_file(path: Path) -> str:
    """Return the text content of *path*, decoded as UTF-8.

    The encoding is fixed instead of following the locale's preferred
    encoding, so both files of a comparison decode the same way on every
    platform. Undecodable bytes are dropped rather than raising.
    """
    return path.read_text(encoding="utf-8", errors="ignore")
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
# ---------- Extractors ----------
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
def extract_headings(text):
    """Return ``(marker, title)`` pairs for every heading matched in *text*.

    ``marker`` is the run of ``#`` characters and ``title`` is the heading
    text with surrounding whitespace stripped.
    """
    headings = []
    for found in HEADING_REGEX.finditer(text):
        marker, title = found.group(1), found.group(2)
        headings.append((marker, title.strip()))
    return headings
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
def extract_code_blocks(text):
    """Collect every closed fenced code block in *text*, fences included.

    Works line by line. A block opens on a line matching FENCE_OPEN_REGEX and
    closes on the first later fence line that uses the same fence character,
    is at least as long as the opener, and has nothing but whitespace after
    the fence. Shorter fences inside a longer one are therefore part of the
    block's content (nested fences).

    Returns:
        The blocks in document order, each as one newline-joined string.
    """
    rows = text.split("\n")
    total = len(rows)
    found = []
    cursor = 0
    while cursor < total:
        opener = FENCE_OPEN_REGEX.match(rows[cursor])
        start = cursor
        cursor += 1
        if opener is None:
            continue

        marker = opener.group(2)
        for end in range(cursor, total):
            candidate = FENCE_OPEN_REGEX.match(rows[end])
            if candidate is None:
                continue
            run = candidate.group(2)
            if run[0] == marker[0] and len(run) >= len(marker) and not candidate.group(3).strip():
                found.append("\n".join(rows[start:end + 1]))
                cursor = end + 1
                break
        else:
            # No closing fence: the block is malformed and is not reported
            # (reporting it would cause false-positive validation failures).
            # Everything up to the end of the text belongs to it, so stop.
            cursor = total
    return found
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
def extract_urls(text):
    """Return the set of distinct URLs matched by URL_REGEX in *text*."""
    return {hit.group(0) for hit in URL_REGEX.finditer(text)}
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
def extract_paths(text):
    """Return the set of distinct path-like strings matched by PATH_REGEX in *text*."""
    return {hit.group(0) for hit in PATH_REGEX.finditer(text)}
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
def count_bullets(text):
    """Return how many bullet markers BULLET_REGEX matches in *text*."""
    return sum(1 for _ in BULLET_REGEX.finditer(text))
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
def extract_inline_codes(text):
    """Return the contents of inline code spans found outside fenced code blocks.

    Fenced blocks are removed with the same line-based rules that
    extract_code_blocks applies: a fence is up to three leading whitespace
    characters followed by 3+ backticks or tildes, and it is closed by a later
    fence of the same character that is at least as long and has nothing after
    it. This covers indented fences and longer fences wrapping shorter ones,
    which a "first ``` to next ```" match gets wrong (content of the block
    leaks out and its backticks pair up with unrelated prose).

    A fence line that is never closed is kept as ordinary text.

    Returns:
        The span contents in document order, duplicates included.
    """
    fence = re.compile(r"^(\s{0,3})(`{3,}|~{3,})(.*)$")
    rows = text.split("\n")
    total = len(rows)
    prose = []
    idx = 0
    while idx < total:
        opener = fence.match(rows[idx])
        closer_at = None
        if opener is not None:
            marker = opener.group(2)
            for probe in range(idx + 1, total):
                hit = fence.match(rows[probe])
                if (
                    hit is not None
                    and hit.group(2)[0] == marker[0]
                    and len(hit.group(2)) >= len(marker)
                    and not hit.group(3).strip()
                ):
                    closer_at = probe
                    break
        if closer_at is None:
            # Ordinary line, or an opener without a closer: keep it.
            prose.append(rows[idx])
            idx += 1
        else:
            # Drop the whole block, both fence lines included.
            idx = closer_at + 1
    return re.findall(r"`([^`]+)`", "\n".join(prose))
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
# ---------- Validators ----------
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
def validate_headings(orig, comp, result):
    """Compare the headings of *orig* and *comp* and report into *result*.

    A different number of headings is an error; any difference in heading
    markers, text or order is additionally reported as a warning.
    """
    before = extract_headings(orig)
    after = extract_headings(comp)

    before_count, after_count = len(before), len(after)
    if before_count != after_count:
        result.add_error(f"Heading count mismatch: {before_count} vs {after_count}")

    if not before == after:
        result.add_warning("Heading text/order changed")
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
def validate_code_blocks(orig, comp, result):
    """Report an error unless both texts contain the same fenced code blocks in the same order."""
    unchanged = extract_code_blocks(orig) == extract_code_blocks(comp)
    if unchanged:
        return
    result.add_error("Code blocks not preserved exactly")
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
def validate_urls(orig, comp, result):
    """Report an error when the set of URLs in *comp* differs from that in *orig*.

    The message lists the URLs that disappeared and the ones that were added.
    """
    before = extract_urls(orig)
    after = extract_urls(comp)
    if before == after:
        return

    result.add_error(f"URL mismatch: lost={before - after}, added={after - before}")
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
def validate_paths(orig, comp, result):
    """Warn when the set of path-like strings in *comp* differs from that in *orig*.

    Path detection is heuristic, so a difference is a warning, not an error.
    """
    before = extract_paths(orig)
    after = extract_paths(comp)
    if before == after:
        return

    result.add_warning(f"Path mismatch: lost={before - after}, added={after - before}")
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
def validate_bullets(orig, comp, result):
    """Warn when the bullet count changed by more than 15% relative to *orig*.

    Nothing is reported when *orig* has no bullets at all.
    """
    baseline = count_bullets(orig)
    current = count_bullets(comp)

    if not baseline:
        # No bullets to compare against (and no division by zero).
        return

    relative_change = abs(baseline - current) / baseline
    if relative_change > 0.15:
        result.add_warning(f"Bullet count changed too much: {baseline} -> {current}")
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
def validate_inline_codes(orig, comp, result):
    """Compare inline code spans, with multiplicity, between *orig* and *comp*.

    Spans that vanished entirely, or that occur fewer times than before, are
    reported together as one error. Spans that are new in *comp* are reported
    as a warning.
    """
    before = Counter(extract_inline_codes(orig))
    after = Counter(extract_inline_codes(comp))
    if before == after:
        return

    lost = set(before) - set(after)
    added = set(after) - set(before)

    # Spans still present, but fewer times than in the original.
    lost.update(
        f"{code} (lost {count - after[code]} of {count} occurrences)"
        for code, count in before.items()
        if code in after and after[code] < count
    )

    if lost:
        result.add_error(f"Inline code lost: {lost}")
    if added:
        result.add_warning(f"Inline code added: {added}")
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
# ---------- Main ----------
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
def validate(original_path: Path, compressed_path: Path) -> ValidationResult:
    """Run every structural check on a compressed file against its original.

    Args:
        original_path: File holding the text before compression.
        compressed_path: File holding the compressed text.

    Returns:
        A ValidationResult with the errors and warnings of all checks.
    """
    original_text = read_file(original_path)
    compressed_text = read_file(compressed_path)

    outcome = ValidationResult()
    checks = (
        validate_headings,
        validate_code_blocks,
        validate_urls,
        validate_paths,
        validate_bullets,
        validate_inline_codes,
    )
    for check in checks:
        check(original_text, compressed_text, outcome)
    return outcome
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
# ---------- CLI ----------
|
|
190
|
-
|
|
191
|
-
if __name__ == "__main__":
    import sys

    # Expect exactly two arguments: the original file and the compressed file.
    if len(sys.argv) != 3:
        print("Usage: python validate.py <original> <compressed>")
        sys.exit(1)

    original_arg, compressed_arg = sys.argv[1:]
    res = validate(Path(original_arg).resolve(), Path(compressed_arg).resolve())

    print(f"\nValid: {res.is_valid}")

    # Each non-empty message list is printed under its own heading.
    for heading, messages in (("\nErrors:", res.errors), ("\nWarnings:", res.warnings)):
        if messages:
            print(heading)
            for message in messages:
                print(f" - {message}")
|