ultimate-pi 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. package/.agents/skills/caveman/SKILL.md +67 -0
  2. package/.agents/skills/compress/SKILL.md +111 -0
  3. package/.agents/skills/compress/scripts/__init__.py +9 -0
  4. package/.agents/skills/compress/scripts/__main__.py +3 -0
  5. package/.agents/skills/compress/scripts/benchmark.py +78 -0
  6. package/.agents/skills/compress/scripts/cli.py +73 -0
  7. package/.agents/skills/compress/scripts/compress.py +227 -0
  8. package/.agents/skills/compress/scripts/detect.py +121 -0
  9. package/.agents/skills/compress/scripts/validate.py +189 -0
  10. package/.agents/skills/context7-cli/SKILL.md +73 -0
  11. package/.agents/skills/context7-cli/references/docs.md +121 -0
  12. package/.agents/skills/context7-cli/references/setup.md +43 -0
  13. package/.agents/skills/context7-cli/references/skills.md +118 -0
  14. package/.agents/skills/emil-design-eng/SKILL.md +679 -0
  15. package/.agents/skills/lean-ctx/SKILL.md +149 -0
  16. package/.agents/skills/lean-ctx/scripts/install.sh +95 -0
  17. package/.agents/skills/scrapling-official/LICENSE.txt +28 -0
  18. package/.agents/skills/scrapling-official/SKILL.md +390 -0
  19. package/.agents/skills/scrapling-official/examples/01_fetcher_session.py +26 -0
  20. package/.agents/skills/scrapling-official/examples/02_dynamic_session.py +26 -0
  21. package/.agents/skills/scrapling-official/examples/03_stealthy_session.py +26 -0
  22. package/.agents/skills/scrapling-official/examples/04_spider.py +58 -0
  23. package/.agents/skills/scrapling-official/examples/README.md +45 -0
  24. package/.agents/skills/scrapling-official/references/fetching/choosing.md +78 -0
  25. package/.agents/skills/scrapling-official/references/fetching/dynamic.md +352 -0
  26. package/.agents/skills/scrapling-official/references/fetching/static.md +432 -0
  27. package/.agents/skills/scrapling-official/references/fetching/stealthy.md +255 -0
  28. package/.agents/skills/scrapling-official/references/mcp-server.md +214 -0
  29. package/.agents/skills/scrapling-official/references/migrating_from_beautifulsoup.md +86 -0
  30. package/.agents/skills/scrapling-official/references/parsing/adaptive.md +212 -0
  31. package/.agents/skills/scrapling-official/references/parsing/main_classes.md +586 -0
  32. package/.agents/skills/scrapling-official/references/parsing/selection.md +494 -0
  33. package/.agents/skills/scrapling-official/references/spiders/advanced.md +344 -0
  34. package/.agents/skills/scrapling-official/references/spiders/architecture.md +94 -0
  35. package/.agents/skills/scrapling-official/references/spiders/getting-started.md +164 -0
  36. package/.agents/skills/scrapling-official/references/spiders/proxy-blocking.md +235 -0
  37. package/.agents/skills/scrapling-official/references/spiders/requests-responses.md +196 -0
  38. package/.agents/skills/scrapling-official/references/spiders/sessions.md +205 -0
  39. package/.github/banner.png +0 -0
  40. package/.pi/SYSTEM.md +40 -0
  41. package/.pi/settings.json +5 -0
  42. package/PLAN.md +11 -0
  43. package/README.md +58 -0
  44. package/extensions/lean-ctx-enforce.ts +166 -0
  45. package/package.json +17 -0
  46. package/skills-lock.json +35 -0
  47. package/wiki/README.md +10 -0
  48. package/wiki/decisions/0001-establish-project-wiki-and-decision-record-format.md +25 -0
  49. package/wiki/decisions/0002-add-project-banner-to-readme.md +26 -0
  50. package/wiki/decisions/0003-remove-redundant-readme-title-heading.md +26 -0
  51. package/wiki/decisions/0004-publish-package-to-npm-as-ultimate-pi.md +26 -0
@@ -0,0 +1,67 @@
1
+ ---
2
+ name: caveman
3
+ description: >
4
+ Ultra-compressed communication mode. Cuts token usage ~75% by speaking like caveman
5
+ while keeping full technical accuracy. Supports intensity levels: lite, full (default), ultra,
6
+ wenyan-lite, wenyan-full, wenyan-ultra.
7
+ Use when user says "caveman mode", "talk like caveman", "use caveman", "less tokens",
8
+ "be brief", or invokes /caveman. Also auto-triggers when token efficiency is requested.
9
+ ---
10
+
11
+ Respond terse like smart caveman. All technical substance stay. Only fluff die.
12
+
13
+ ## Persistence
14
+
15
+ ACTIVE EVERY RESPONSE. No revert after many turns. No filler drift. Still active if unsure. Off only: "stop caveman" / "normal mode".
16
+
17
+ Default: **full**. Switch: `/caveman lite|full|ultra`.
18
+
19
+ ## Rules
20
+
21
+ Drop: articles (a/an/the), filler (just/really/basically/actually/simply), pleasantries (sure/certainly/of course/happy to), hedging. Fragments OK. Short synonyms (big not extensive, fix not "implement a solution for"). Technical terms exact. Code blocks unchanged. Errors quoted exact.
22
+
23
+ Pattern: `[thing] [action] [reason]. [next step].`
24
+
25
+ Not: "Sure! I'd be happy to help you with that. The issue you're experiencing is likely caused by..."
26
+ Yes: "Bug in auth middleware. Token expiry check use `<` not `<=`. Fix:"
27
+
28
+ ## Intensity
29
+
30
+ | Level | What change |
31
+ |-------|------------|
32
+ | **lite** | No filler/hedging. Keep articles + full sentences. Professional but tight |
33
+ | **full** | Drop articles, fragments OK, short synonyms. Classic caveman |
34
+ | **ultra** | Abbreviate (DB/auth/config/req/res/fn/impl), strip conjunctions, arrows for causality (X → Y), one word when one word enough |
35
+ | **wenyan-lite** | Semi-classical. Drop filler/hedging but keep grammar structure, classical register |
36
+ | **wenyan-full** | Maximum classical terseness. Fully 文言文. 80-90% character reduction. Classical sentence patterns, verbs precede objects, subjects often omitted, classical particles (之/乃/為/其) |
37
+ | **wenyan-ultra** | Extreme abbreviation while keeping classical Chinese feel. Maximum compression, ultra terse |
38
+
39
+ Example — "Why React component re-render?"
40
+ - lite: "Your component re-renders because you create a new object reference each render. Wrap it in `useMemo`."
41
+ - full: "New object ref each render. Inline object prop = new ref = re-render. Wrap in `useMemo`."
42
+ - ultra: "Inline obj prop → new ref → re-render. `useMemo`."
43
+ - wenyan-lite: "組件頻重繪,以每繪新生對象參照故。以 useMemo 包之。"
44
+ - wenyan-full: "物出新參照,致重繪。useMemo Wrap之。"
45
+ - wenyan-ultra: "新參照→重繪。useMemo Wrap。"
46
+
47
+ Example — "Explain database connection pooling."
48
+ - lite: "Connection pooling reuses open connections instead of creating new ones per request. Avoids repeated handshake overhead."
49
+ - full: "Pool reuse open DB connections. No new connection per request. Skip handshake overhead."
50
+ - ultra: "Pool = reuse DB conn. Skip handshake → fast under load."
51
+ - wenyan-full: "池reuse open connection。不每req新開。skip handshake overhead。"
52
+ - wenyan-ultra: "池reuse conn。skip handshake → fast。"
53
+
54
+ ## Auto-Clarity
55
+
56
+ Drop caveman for: security warnings, irreversible action confirmations, multi-step sequences where fragment order risks misread, user asks to clarify or repeats question. Resume caveman after clear part done.
57
+
58
+ Example — destructive op:
59
+ > **Warning:** This will permanently delete all rows in the `users` table and cannot be undone.
60
+ > ```sql
61
+ > DROP TABLE users;
62
+ > ```
63
+ > Caveman resume. Verify backup exist first.
64
+
65
+ ## Boundaries
66
+
67
+ Code/commits/PRs: write normal. "stop caveman" or "normal mode": revert. Level persist until changed or session end.
@@ -0,0 +1,111 @@
1
+ ---
2
+ name: compress
3
+ description: >
4
+ Compress natural language memory files (CLAUDE.md, todos, preferences) into caveman format
5
+ to save input tokens. Preserves all technical substance, code, URLs, and structure.
6
+ Compressed version overwrites the original file. Human-readable backup saved as FILE.original.md.
7
+ Trigger: /caveman:compress <filepath> or "compress memory file"
8
+ ---
9
+
10
+ # Caveman Compress
11
+
12
+ ## Purpose
13
+
14
+ Compress natural language files (CLAUDE.md, todos, preferences) into caveman-speak to reduce input tokens. Compressed version overwrites original. Human-readable backup saved as `<filename>.original.md`.
15
+
16
+ ## Trigger
17
+
18
+ `/caveman:compress <filepath>` or when user asks to compress a memory file.
19
+
20
+ ## Process
21
+
22
+ 1. This SKILL.md lives alongside `scripts/` in the same directory. Find that directory.
23
+
24
+ 2. Run:
25
+
26
+ cd <directory_containing_this_SKILL.md> && python3 -m scripts <absolute_filepath>
27
+
28
+ 3. The CLI will:
29
+ - detect file type (no tokens)
30
+ - call Claude to compress
31
+ - validate output (no tokens)
32
+ - if errors: cherry-pick fix with Claude (targeted fixes only, no recompression)
33
+ - retry up to 2 times
34
+ - if still failing after 2 retries: report error to user, leave original file untouched
35
+
36
+ 4. Return result to user
37
+
38
+ ## Compression Rules
39
+
40
+ ### Remove
41
+ - Articles: a, an, the
42
+ - Filler: just, really, basically, actually, simply, essentially, generally
43
+ - Pleasantries: "sure", "certainly", "of course", "happy to", "I'd recommend"
44
+ - Hedging: "it might be worth", "you could consider", "it would be good to"
45
+ - Redundant phrasing: "in order to" → "to", "make sure to" → "ensure", "the reason is because" → "because"
46
+ - Connective fluff: "however", "furthermore", "additionally", "in addition"
47
+
48
+ ### Preserve EXACTLY (never modify)
49
+ - Code blocks (fenced ``` and indented)
50
+ - Inline code (`backtick content`)
51
+ - URLs and links (full URLs, markdown links)
52
+ - File paths (`/src/components/...`, `./config.yaml`)
53
+ - Commands (`npm install`, `git commit`, `docker build`)
54
+ - Technical terms (library names, API names, protocols, algorithms)
55
+ - Proper nouns (project names, people, companies)
56
+ - Dates, version numbers, numeric values
57
+ - Environment variables (`$HOME`, `NODE_ENV`)
58
+
59
+ ### Preserve Structure
60
+ - All markdown headings (keep exact heading text, compress body below)
61
+ - Bullet point hierarchy (keep nesting level)
62
+ - Numbered lists (keep numbering)
63
+ - Tables (compress cell text, keep structure)
64
+ - Frontmatter/YAML headers in markdown files
65
+
66
+ ### Compress
67
+ - Use short synonyms: "big" not "extensive", "fix" not "implement a solution for", "use" not "utilize"
68
+ - Fragments OK: "Run tests before commit" not "You should always run tests before committing"
69
+ - Drop "you should", "make sure to", "remember to" — just state the action
70
+ - Merge redundant bullets that say the same thing differently
71
+ - Keep one example where multiple examples show the same pattern
72
+
73
+ CRITICAL RULE:
74
+ Anything inside ``` ... ``` must be copied EXACTLY.
75
+ Do not:
76
+ - remove comments
77
+ - remove spacing
78
+ - reorder lines
79
+ - shorten commands
80
+ - simplify anything
81
+
82
+ Inline code (`...`) must be preserved EXACTLY.
83
+ Do not modify anything inside backticks.
84
+
85
+ If file contains code blocks:
86
+ - Treat code blocks as read-only regions
87
+ - Only compress text outside them
88
+ - Do not merge sections around code
89
+
90
+ ## Pattern
91
+
92
+ Original:
93
+ > You should always make sure to run the test suite before pushing any changes to the main branch. This is important because it helps catch bugs early and prevents broken builds from being deployed to production.
94
+
95
+ Compressed:
96
+ > Run tests before push to main. Catch bugs early, prevent broken prod deploys.
97
+
98
+ Original:
99
+ > The application uses a microservices architecture with the following components. The API gateway handles all incoming requests and routes them to the appropriate service. The authentication service is responsible for managing user sessions and JWT tokens.
100
+
101
+ Compressed:
102
+ > Microservices architecture. API gateway route all requests to services. Auth service manage user sessions + JWT tokens.
103
+
104
+ ## Boundaries
105
+
106
+ - ONLY compress natural language files (.md, .markdown, .txt, .rst, extensionless)
107
+ - NEVER modify: .py, .js, .ts, .json, .yaml, .yml, .toml, .env, .lock, .css, .html, .xml, .sql, .sh
108
+ - If file has mixed content (prose + code), compress ONLY the prose sections
109
+ - If unsure whether something is code or prose, leave it unchanged
110
+ - Original file is backed up as FILE.original.md before overwriting
111
+ - Never compress FILE.original.md (skip it)
@@ -0,0 +1,9 @@
1
+ """Caveman compress scripts.
2
+
3
+ This package provides tools to compress natural language markdown files
4
+ into caveman format to save input tokens.
5
+ """
6
+
7
+ __all__ = ["cli", "compress", "detect", "validate"]
8
+
9
+ __version__ = "1.0.0"
@@ -0,0 +1,3 @@
1
+ from .cli import main
2
+
3
+ main()
@@ -0,0 +1,78 @@
1
+ #!/usr/bin/env python3
2
+ from pathlib import Path
3
+ import sys
4
+
5
+ # Support both direct execution and module import
6
+ try:
7
+ from .validate import validate
8
+ except ImportError:
9
+ sys.path.insert(0, str(Path(__file__).parent))
10
+ from validate import validate
11
+
12
# tiktoken is optional. Besides being absent, it can also fail while loading
# the encoding (the vocabulary may need to be fetched on first use), so any
# failure there degrades to the word-count fallback instead of crashing at
# import time.
try:
    import tiktoken
except ImportError:
    _enc = None
else:
    try:
        _enc = tiktoken.get_encoding("o200k_base")
    except Exception:  # best-effort optional dependency
        _enc = None


def count_tokens(text):
    """Return the token count of *text*.

    Uses the ``o200k_base`` tiktoken encoding when it is available; otherwise
    falls back to a whitespace-separated word count as a rough approximation.
    """
    if _enc is None:
        return len(text.split())  # fallback: word count
    return len(_enc.encode(text))
23
+
24
+
25
def benchmark_pair(orig_path: Path, comp_path: Path):
    """Measure token savings for one original/compressed file pair.

    Args:
        orig_path: Path to the uncompressed original file.
        comp_path: Path to the compressed counterpart.

    Returns:
        A tuple ``(compressed_file_name, original_tokens, compressed_tokens,
        saved_percent, is_valid)`` where ``is_valid`` comes from ``validate``.
    """
    # Decode explicitly as UTF-8: the locale default mis-decodes markdown on
    # platforms whose preferred encoding is not UTF-8. Undecodable bytes are
    # replaced rather than aborting the whole benchmark run.
    orig_text = orig_path.read_text(encoding="utf-8", errors="replace")
    comp_text = comp_path.read_text(encoding="utf-8", errors="replace")

    orig_tokens = count_tokens(orig_text)
    comp_tokens = count_tokens(comp_text)
    # Guard against division by zero for an empty original.
    saved = 100 * (orig_tokens - comp_tokens) / orig_tokens if orig_tokens > 0 else 0.0
    result = validate(orig_path, comp_path)

    return (comp_path.name, orig_tokens, comp_tokens, saved, result.is_valid)
35
+
36
+
37
def print_table(rows):
    """Print benchmark rows as a markdown table on stdout.

    Each row is indexed as (file name, original tokens, compressed tokens,
    saved percent, valid flag).
    """
    header = "\n| File | Original | Compressed | Saved % | Valid |"
    divider = "|------|----------|------------|---------|-------|"
    print(header)
    print(divider)
    for row in rows:
        name, orig_tokens, comp_tokens, saved, is_valid = row[:5]
        mark = "✅" if is_valid else "❌"
        print(f"| {name} | {orig_tokens} | {comp_tokens} | {saved:.1f}% | {mark} |")
42
+
43
+
44
def main():
    """Benchmark one explicit file pair, or every pair in the tests directory.

    With exactly two CLI arguments (original, compressed) only that pair is
    benchmarked. Otherwise every ``*.original.md`` under
    ``<three levels up>/tests/caveman-compress`` is paired with its ``.md``
    sibling and benchmarked.
    """
    # Direct file pair: python3 benchmark.py original.md compressed.md
    if len(sys.argv) == 3:
        orig = Path(sys.argv[1]).resolve()
        comp = Path(sys.argv[2]).resolve()
        for candidate in (orig, comp):
            if not candidate.exists():
                print(f"❌ Not found: {candidate}")
                sys.exit(1)
        print_table([benchmark_pair(orig, comp)])
        return

    # Glob mode: repo_root/tests/caveman-compress/
    tests_dir = Path(__file__).parent.parent.parent / "tests" / "caveman-compress"
    if not tests_dir.exists():
        print(f"❌ Tests dir not found: {tests_dir}")
        sys.exit(1)

    candidates = (
        (original, original.with_name(original.stem.removesuffix(".original") + ".md"))
        for original in sorted(tests_dir.glob("*.original.md"))
    )
    rows = [
        benchmark_pair(original, compressed)
        for original, compressed in candidates
        if compressed.exists()
    ]

    if rows:
        print_table(rows)
    else:
        print("No compressed file pairs found.")
75
+
76
+
77
+ if __name__ == "__main__":
78
+ main()
@@ -0,0 +1,73 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Caveman Compress CLI
4
+
5
+ Usage:
6
+ caveman <filepath>
7
+ """
8
+
9
+ import sys
10
+ from pathlib import Path
11
+
12
+ from .compress import compress_file
13
+ from .detect import detect_file_type, should_compress
14
+
15
+
16
def print_usage():
    """Print the one-line command usage to stdout."""
    usage = "Usage: caveman <filepath>"
    print(usage)
18
+
19
+
20
def main():
    """CLI entry point: validate the argument, then compress the file.

    Exit codes: 0 on success or when the file is skipped as non-prose,
    1 on usage/path errors or an unexpected exception, 2 when compression
    reports failure, 130 when interrupted from the keyboard.
    """
    if len(sys.argv) != 2:
        print_usage()
        sys.exit(1)

    filepath = Path(sys.argv[1])

    # Reject missing paths and non-files before doing any work.
    if not filepath.exists():
        print(f"❌ File not found: {filepath}")
        sys.exit(1)
    if not filepath.is_file():
        print(f"❌ Not a file: {filepath}")
        sys.exit(1)

    filepath = filepath.resolve()

    file_type = detect_file_type(filepath)
    print(f"Detected: {file_type}")

    # Code and config files are never compressed.
    if not should_compress(filepath):
        print("Skipping: file is not natural language (code/config)")
        sys.exit(0)

    print("Starting caveman compression...\n")

    try:
        if not compress_file(filepath):
            print("\n❌ Compression failed after retries")
            sys.exit(2)

        print("\nCompression completed successfully")
        backup_path = filepath.with_name(filepath.stem + ".original.md")
        print(f"Compressed: {filepath}")
        print(f"Original: {backup_path}")
        sys.exit(0)
    except KeyboardInterrupt:
        print("\nInterrupted by user")
        sys.exit(130)
    except Exception as e:
        print(f"\n❌ Error: {e}")
        sys.exit(1)
70
+
71
+
72
+ if __name__ == "__main__":
73
+ main()
@@ -0,0 +1,227 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Caveman Memory Compression Orchestrator
4
+
5
+ Usage:
6
+ python3 -m scripts <filepath>   (run from the skill directory; this module uses relative imports)
7
+ """
8
+
9
+ import os
10
+ import re
11
+ import subprocess
12
+ from pathlib import Path
13
+ from typing import List
14
+
15
+ OUTER_FENCE_REGEX = re.compile(
16
+ r"\A\s*(`{3,}|~{3,})[^\n]*\n(.*)\n\1\s*\Z", re.DOTALL
17
+ )
18
+
19
+ # Filenames and paths that almost certainly hold secrets or PII. Compressing
20
+ # them ships raw bytes to the Anthropic API — a third-party data boundary that
21
+ # developers on sensitive codebases cannot cross. detect.py already skips .env
22
+ # by extension, but credentials.md / secrets.txt / ~/.aws/credentials would
23
+ # slip through the natural-language filter. This is a hard refuse before read.
24
# Basenames that are refused outright (case-insensitive, verbose-mode regex).
SENSITIVE_BASENAME_REGEX = re.compile(
    r"(?ix)^("
    r"\.env(\..+)?"
    r"|\.netrc"
    r"|credentials(\..+)?"
    r"|secrets?(\..+)?"
    r"|passwords?(\..+)?"
    r"|id_(rsa|dsa|ecdsa|ed25519)(\.pub)?"
    r"|authorized_keys"
    r"|known_hosts"
    r"|.*\.(pem|key|p12|pfx|crt|cer|jks|keystore|asc|gpg)"
    r")$"
)

# Directory names that mark the whole path as private.
SENSITIVE_PATH_COMPONENTS = frozenset({".ssh", ".aws", ".gnupg", ".kube", ".docker"})

# Substrings searched for in the separator-stripped, lowercased basename.
SENSITIVE_NAME_TOKENS = (
    "secret", "credential", "password", "passwd",
    "apikey", "accesskey", "token", "privatekey",
)


def is_sensitive_path(filepath: Path) -> bool:
    """Return True when *filepath* looks like it holds secrets.

    Three heuristics are tried in order: the basename denylist regex, a
    private directory anywhere in the path, and a sensitive token inside the
    basename once separators are removed.
    """
    basename = filepath.name
    if SENSITIVE_BASENAME_REGEX.match(basename) is not None:
        return True

    if any(part.lower() in SENSITIVE_PATH_COMPONENTS for part in filepath.parts):
        return True

    # Strip separators so "api-key" and "api_key" both collapse to "apikey".
    squashed = re.sub(r"[_\-\s.]", "", basename.lower())
    for token in SENSITIVE_NAME_TOKENS:
        if token in squashed:
            return True
    return False
57
+
58
+
59
def strip_llm_wrapper(text: str) -> str:
    """Return *text* without an outer code fence that wraps the whole output.

    When ``OUTER_FENCE_REGEX`` matches the entire text, only the fenced body
    is returned; otherwise the text is returned unchanged.
    """
    wrapped = OUTER_FENCE_REGEX.match(text)
    return text if wrapped is None else wrapped.group(2)
65
+
66
+ from .detect import should_compress
67
+ from .validate import validate
68
+
69
+ MAX_RETRIES = 2
70
+
71
+
72
+ # ---------- Claude Calls ----------
73
+
74
+
75
def call_claude(prompt: str) -> str:
    """Send *prompt* to Claude and return the reply with any outer fence stripped.

    The ``anthropic`` SDK is used when ``ANTHROPIC_API_KEY`` is set and the
    package is importable (model taken from ``CAVEMAN_MODEL``, defaulting to
    ``claude-sonnet-4-5``). Otherwise the prompt is piped to ``claude --print``.

    Raises:
        RuntimeError: if the ``claude`` CLI is not installed or exits non-zero.
    """
    api_key = os.environ.get("ANTHROPIC_API_KEY")
    if api_key:
        # Guard only the import: a missing SDK means "fall back to the CLI",
        # while errors from the API call itself should surface to the caller.
        try:
            import anthropic
        except ImportError:
            anthropic = None  # anthropic not installed, fall back to CLI

        if anthropic is not None:
            client = anthropic.Anthropic(api_key=api_key)
            msg = client.messages.create(
                model=os.environ.get("CAVEMAN_MODEL", "claude-sonnet-4-5"),
                max_tokens=8192,
                messages=[{"role": "user", "content": prompt}],
            )
            return strip_llm_wrapper(msg.content[0].text.strip())

    # Fallback: use claude CLI (handles desktop auth)
    try:
        result = subprocess.run(
            ["claude", "--print"],
            input=prompt,
            text=True,
            capture_output=True,
            check=True,
        )
    except FileNotFoundError as e:
        # Without this the caller sees a bare "No such file or directory".
        raise RuntimeError(
            "Claude call failed: `claude` CLI not found on PATH. "
            "Install it, or set ANTHROPIC_API_KEY with the `anthropic` package installed."
        ) from e
    except subprocess.CalledProcessError as e:
        raise RuntimeError(f"Claude call failed:\n{e.stderr}") from e
    return strip_llm_wrapper(result.stdout.strip())
102
+
103
+
104
def build_compress_prompt(original: str) -> str:
    """Build the prompt asking Claude to compress *original* into caveman format.

    The prompt lists the strict preservation rules and then appends the
    original text verbatim after a ``TEXT:`` marker.
    """
    rules = [
        "- Do NOT modify anything inside ``` code blocks",
        "- Do NOT modify anything inside inline backticks",
        "- Preserve ALL URLs exactly",
        "- Preserve ALL headings exactly",
        "- Preserve file paths and commands",
        "- Return ONLY the compressed markdown body — do NOT wrap the entire output in a ```markdown fence or any other fence. Inner code blocks from the original stay as-is; do not add a new outer fence around the whole file.",
    ]
    parts = [
        "",
        "Compress this markdown into caveman format.",
        "",
        "STRICT RULES:",
        *rules,
        "",
        "Only compress natural language.",
        "",
        "TEXT:",
        f"{original}",
        "",
    ]
    return "\n".join(parts)
121
+
122
+
123
def build_fix_prompt(original: str, compressed: str, errors: List[str]) -> str:
    """Build the prompt asking Claude to repair specific validation errors.

    Args:
        original: The uncompressed text, supplied as reference only.
        compressed: The compressed text that failed validation.
        errors: Validation error messages, rendered one bullet per entry.

    Returns:
        The full prompt text, ending with a trailing newline.
    """
    bullet_list = "\n".join(f"- {problem}" for problem in errors)
    parts = [
        "You are fixing a caveman-compressed markdown file. Specific validation errors were found.",
        "",
        "CRITICAL RULES:",
        "- DO NOT recompress or rephrase the file",
        "- ONLY fix the listed errors — leave everything else exactly as-is",
        "- The ORIGINAL is provided as reference only (to restore missing content)",
        "- Preserve caveman style in all untouched sections",
        "",
        "ERRORS TO FIX:",
        bullet_list,
        "",
        "HOW TO FIX:",
        "- Missing URL: find it in ORIGINAL, restore it exactly where it belongs in COMPRESSED",
        "- Code block mismatch: find the exact code block in ORIGINAL, restore it in COMPRESSED",
        "- Heading mismatch: restore the exact heading text from ORIGINAL into COMPRESSED",
        "- Do not touch any section not mentioned in the errors",
        "",
        "ORIGINAL (reference only):",
        f"{original}",
        "",
        "COMPRESSED (fix this):",
        f"{compressed}",
        "",
        "Return ONLY the fixed compressed file. No explanation.",
        "",
    ]
    return "\n".join(parts)
150
+
151
+
152
+ # ---------- Core Logic ----------
153
+
154
+
155
def compress_file(filepath: Path) -> bool:
    """Compress *filepath* in place, keeping a backup of the original.

    The original is saved next to the file as ``<stem>.original.md`` and the
    compressed text overwrites *filepath*. The output is then validated
    against the backup; on failure Claude is asked for targeted fixes up to
    ``MAX_RETRIES`` times. If validation still fails, or anything raises
    after the backup was written, the original bytes are restored and the
    backup is removed.

    Args:
        filepath: File to compress.

    Returns:
        True when a validated compressed file was written. False when the
        file was skipped (not natural language, or a backup already exists)
        or validation failed after all retries.

    Raises:
        FileNotFoundError: if *filepath* does not exist.
        ValueError: if the file exceeds 500KB or its name looks sensitive.
    """
    # Resolve and validate path
    filepath = filepath.resolve()
    MAX_FILE_SIZE = 500_000  # 500KB
    if not filepath.exists():
        raise FileNotFoundError(f"File not found: {filepath}")
    if filepath.stat().st_size > MAX_FILE_SIZE:
        raise ValueError(f"File too large to compress safely (max 500KB): {filepath}")

    # Refuse files that look like they contain secrets or PII. Compressing ships
    # the raw bytes to the Anthropic API — a third-party boundary — so we fail
    # loudly rather than silently exfiltrate credentials or keys. Override is
    # intentional: the user must rename the file if the heuristic is wrong.
    if is_sensitive_path(filepath):
        raise ValueError(
            f"Refusing to compress {filepath}: filename looks sensitive "
            "(credentials, keys, secrets, or known private paths). "
            "Compression sends file contents to the Anthropic API. "
            "Rename the file if this is a false positive."
        )

    print(f"Processing: {filepath}")

    if not should_compress(filepath):
        print("Skipping (not natural language)")
        return False

    # Keep the exact bytes for backup/restore. The decoded text drops
    # undecodable bytes and normalizes newlines, so it is only good for the
    # prompt and must never be what gets written back as "the original".
    original_bytes = filepath.read_bytes()
    original_text = filepath.read_text(encoding="utf-8", errors="ignore")
    backup_path = filepath.with_name(filepath.stem + ".original.md")

    # Check if backup already exists to prevent accidental overwriting
    if backup_path.exists():
        print(f"⚠️ Backup file already exists: {backup_path}")
        print("The original backup may contain important content.")
        print("Aborting to prevent data loss. Please remove or rename the backup file if you want to proceed.")
        return False

    def _restore_original() -> None:
        """Put the untouched original back and drop the backup."""
        filepath.write_bytes(original_bytes)
        backup_path.unlink(missing_ok=True)

    # Step 1: Compress (nothing on disk has been modified yet)
    print("Compressing with Claude...")
    compressed = call_claude(build_compress_prompt(original_text))

    # Save original as backup, write compressed to original path
    backup_path.write_bytes(original_bytes)
    try:
        filepath.write_text(compressed, encoding="utf-8")

        # Step 2: Validate + Retry — one initial validation plus up to
        # MAX_RETRIES fix-and-revalidate rounds.
        for attempt in range(MAX_RETRIES + 1):
            print(f"\nValidation attempt {attempt + 1}")

            result = validate(backup_path, filepath)

            if result.is_valid:
                print("Validation passed")
                return True

            print("❌ Validation failed:")
            for err in result.errors:
                print(f" - {err}")

            if attempt == MAX_RETRIES:
                break  # retries exhausted

            print("Fixing with Claude...")
            compressed = call_claude(
                build_fix_prompt(original_text, compressed, result.errors)
            )
            filepath.write_text(compressed, encoding="utf-8")
    except BaseException:
        # A failed fix call, a validator crash, or Ctrl-C must not leave an
        # unvalidated compressed file in place of the user's original.
        _restore_original()
        raise

    # Restore original on failure
    _restore_original()
    print("❌ Failed after retries — original restored")
    return False
@@ -0,0 +1,121 @@
1
+ #!/usr/bin/env python3
2
+ """Detect whether a file is natural language (compressible) or code/config (skip)."""
3
+
4
+ import json
5
+ import re
6
+ from pathlib import Path
7
+
8
+ # Extensions that are natural language and compressible
9
+ COMPRESSIBLE_EXTENSIONS = {".md", ".txt", ".markdown", ".rst"}
10
+
11
+ # Extensions that are code/config and should be skipped
12
+ SKIP_EXTENSIONS = {
13
+ ".py", ".js", ".ts", ".tsx", ".jsx", ".json", ".yaml", ".yml",
14
+ ".toml", ".env", ".lock", ".css", ".scss", ".html", ".xml",
15
+ ".sql", ".sh", ".bash", ".zsh", ".go", ".rs", ".java", ".c",
16
+ ".cpp", ".h", ".hpp", ".rb", ".php", ".swift", ".kt", ".lua",
17
+ ".dockerfile", ".makefile", ".csv", ".ini", ".cfg",
18
+ }
19
+
20
+ # Patterns that indicate a line is code
21
+ CODE_PATTERNS = [
22
+ re.compile(r"^\s*(import |from .+ import |require\(|const |let |var )"),
23
+ re.compile(r"^\s*(def |class |function |async function |export )"),
24
+ re.compile(r"^\s*(if\s*\(|for\s*\(|while\s*\(|switch\s*\(|try\s*\{)"),
25
+ re.compile(r"^\s*[\}\]\);]+\s*$"), # closing braces/brackets
26
+ re.compile(r"^\s*@\w+"), # decorators/annotations
27
+ re.compile(r'^\s*"[^"]+"\s*:\s*'), # JSON-like key-value
28
+ re.compile(r"^\s*\w+\s*=\s*[{\[\(\"']"), # assignment with literal
29
+ ]
30
+
31
+
32
+ def _is_code_line(line: str) -> bool:
33
+ """Check if a line looks like code."""
34
+ return any(p.match(line) for p in CODE_PATTERNS)
35
+
36
+
37
+ def _is_json_content(text: str) -> bool:
38
+ """Check if content is valid JSON."""
39
+ try:
40
+ json.loads(text)
41
+ return True
42
+ except (json.JSONDecodeError, ValueError):
43
+ return False
44
+
45
+
46
+ def _is_yaml_content(lines: list[str]) -> bool:
47
+ """Heuristic: check if content looks like YAML."""
48
+ yaml_indicators = 0
49
+ for line in lines[:30]:
50
+ stripped = line.strip()
51
+ if stripped.startswith("---"):
52
+ yaml_indicators += 1
53
+ elif re.match(r"^\w[\w\s]*:\s", stripped):
54
+ yaml_indicators += 1
55
+ elif stripped.startswith("- ") and ":" in stripped:
56
+ yaml_indicators += 1
57
+ # If most non-empty lines look like YAML
58
+ non_empty = sum(1 for l in lines[:30] if l.strip())
59
+ return non_empty > 0 and yaml_indicators / non_empty > 0.6
60
+
61
+
62
# Members of SKIP_EXTENSIONS that are configuration rather than source code.
_CONFIG_EXTENSIONS = frozenset({".json", ".yaml", ".yml", ".toml", ".ini", ".cfg", ".env"})

# Well-known extensionless build files. Their lines ("FROM ...", "RUN ...",
# "target: deps") do not trip the code-line heuristics, so without this list
# they would be classified as prose and sent for compression.
_CODE_FILENAMES = frozenset({
    "dockerfile", "makefile", "gnumakefile", "rakefile",
    "gemfile", "procfile", "vagrantfile", "jenkinsfile",
})


def detect_file_type(filepath: Path) -> str:
    """Classify a file as 'natural_language', 'code', 'config', or 'unknown'.

    Classification is by extension first. Files without a recognised
    extension fall back to sniffing the first lines of their content; a file
    with an unrecognised extension, or one that cannot be read, is 'unknown'.

    Returns:
        One of: 'natural_language', 'code', 'config', 'unknown'
    """
    name = filepath.name.lower()
    suffix = filepath.suffix.lower()
    # pathlib reports an empty suffix for dotfiles (Path(".env").suffix == ""),
    # so compare the whole name against the extension sets in that case.
    ext = suffix or (name if name.startswith(".") else "")

    # Extension-based classification
    if ext in COMPRESSIBLE_EXTENSIONS:
        return "natural_language"
    if ext in SKIP_EXTENSIONS:
        return "config" if ext in _CONFIG_EXTENSIONS else "code"
    if name in _CODE_FILENAMES:
        return "code"

    # An extension that is in neither set: make no guess.
    if suffix:
        return "unknown"

    # Extensionless files (like TODO or README) — check content
    try:
        text = filepath.read_text(errors="ignore")
    except OSError:  # PermissionError is an OSError subclass
        return "unknown"

    lines = text.splitlines()[:50]

    if _is_json_content(text[:10000]):
        return "config"
    if _is_yaml_content(lines):
        return "config"

    non_empty = [line for line in lines if line.strip()]
    code_lines = sum(1 for line in non_empty if _is_code_line(line))
    if non_empty and code_lines / len(non_empty) > 0.4:
        return "code"

    return "natural_language"
98
+
99
+
100
def should_compress(filepath: Path) -> bool:
    """Return True only for existing natural-language files that are not backups."""
    is_backup = filepath.name.endswith(".original.md")
    if is_backup or not filepath.is_file():
        # Missing paths, directories and *.original.md backups are never compressed.
        return False
    return detect_file_type(filepath) == "natural_language"
108
+
109
+
110
if __name__ == "__main__":
    # Ad-hoc CLI: print the detected type and compress decision for each path.
    import sys

    targets = sys.argv[1:]
    if not targets:
        print("Usage: python detect.py <file1> [file2] ...")
        sys.exit(1)

    for raw_path in targets:
        p = Path(raw_path).resolve()
        file_type = detect_file_type(p)
        compress = should_compress(p)
        print(f" {p.name:30s} type={file_type:20s} compress={compress}")