sourcepack 1.10.0a0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sourcepack/packet.py ADDED
@@ -0,0 +1,837 @@
1
+ from __future__ import annotations
2
+
3
+ import fnmatch
4
+ import hashlib
5
+ import json
6
+ import os
7
+ import re
8
+ import shutil
9
+ import subprocess
10
+ import tomllib
11
+ from dataclasses import dataclass, asdict
12
+ from datetime import datetime, timezone
13
+ from pathlib import Path
14
+ from typing import Iterable
15
+ from xml.sax.saxutils import escape as xml_escape
16
+
17
+ from .ecosystems.python import PY_IMPORT_ALIASES
18
+
19
+ try:
20
+ from . import __version__
21
+ except Exception:
22
+ __version__ = "1.10.0-alpha"
23
+
24
+
25
+
26
# Directory names that the scanner never descends into.
DEFAULT_IGNORED_DIRS = {
    ".git",
    "node_modules",
    ".venv",
    "venv",
    "__pycache__",
    "dist",
    "build",
    ".next",
    ".cache",
    "target",
    "coverage",
    ".pytest_cache",
    ".sourcepack",
}

# fnmatch-style patterns for files that are always left out of a packet.
DEFAULT_IGNORED_PATTERNS = {
    ".env",
    ".env.*",
    "*.pem",
    "*.key",
    "*.sqlite",
    "*.db",
    "*.png",
    "*.jpg",
    "*.jpeg",
    "*.gif",
    "*.webp",
    "*.pdf",
    "*.zip",
    "*.tar",
    "*.gz",
    "*.exe",
    "*.dll",
    "*.so",
    "*.dylib",
    "*.bin",
    "*.pyc",
}

# Lower-case suffixes accepted as text; files without a suffix are judged
# by content only.
DEFAULT_TEXT_EXTENSIONS = {
    ".txt",
    ".md",
    ".py",
    ".js",
    ".ts",
    ".tsx",
    ".jsx",
    ".json",
    ".yaml",
    ".yml",
    ".html",
    ".css",
    ".csv",
    ".toml",
    ".ini",
    ".sql",
    ".sh",
    ".bat",
    ".ps1",
    ".rs",
    ".go",
    ".java",
    ".c",
    ".cpp",
    ".h",
    ".hpp",
    ".rb",
    ".php",
    ".xml",
}

# (label, compiled pattern) pairs, applied in this order by redact_secrets().
SECRET_PATTERNS = [
    ("openai_key", re.compile(r"sk-proj-[A-Za-z0-9_\-]{12,}|sk-[A-Za-z0-9]{24,}")),
    ("aws_access_key", re.compile(r"AKIA[0-9A-Z]{16}")),
    ("private_key", re.compile(r"-----BEGIN [A-Z ]*PRIVATE KEY-----")),
    ("generic_api_key", re.compile(r"(?i)(api[_-]?key|secret|token)\s*[:=]\s*['\"]?[A-Za-z0-9_\-]{16,}")),
    ("github_token", re.compile(r"ghp_[A-Za-z0-9_]{20,}|github_pat_[A-Za-z0-9_]{20,}")),
    ("slack_token", re.compile(r"xox[baprs]-[A-Za-z0-9\-]{20,}")),
]

# Dependency names the inventory is allowed to report.
COMMON_DEPENDENCIES = [
    "fastapi",
    "flask",
    "django",
    "react",
    "vue",
    "svelte",
    "pytest",
    "typer",
    "click",
    "sqlalchemy",
    "prisma",
    "pydantic",
    "pyyaml",
    "pillow",
    "beautifulsoup4",
    "opencv-python",
    "scikit-learn",
    "python-dotenv",
    "pyjwt",
    "python-dateutil",
    "boto3",
    "requests",
]

# Capability labels used by the feature inventory.
FEATURE_NAMES = (
    "pdf",
    "ocr",
    "web server",
    "react",
    "docker",
    "authentication",
    "database",
)
50
+
51
+
52
def utc_now() -> str:
    """Return the current UTC time as a timezone-aware ISO-8601 string."""
    moment = datetime.now(tz=timezone.utc)
    return moment.isoformat()
54
+
55
+
56
def sha256_file(path: Path) -> str:
    """Return the hex SHA-256 digest of the file at *path*.

    The file is consumed in 1 MiB chunks so large files are not loaded
    into memory at once.
    """
    digest = hashlib.sha256()
    chunk_size = 1024 * 1024
    with path.open("rb") as handle:
        while chunk := handle.read(chunk_size):
            digest.update(chunk)
    return digest.hexdigest()
62
+
63
+
64
def sha256_text(text: str) -> str:
    """Return the hex SHA-256 digest of *text* encoded as UTF-8."""
    encoded = text.encode("utf-8")
    return hashlib.sha256(encoded).hexdigest()
66
+
67
+
68
def estimate_tokens(text: str) -> int:
    """Estimate the token count of *text* as ceil(len(text) / 4)."""
    whole, leftover = divmod(len(text), 4)
    return whole + 1 if leftover else whole
70
+
71
+
72
def is_probably_binary(path: Path, sample_size: int = 4096) -> bool:
    """Heuristically decide whether the file at *path* is binary.

    Only the first *sample_size* bytes are read from disk (the previous
    implementation loaded the whole file before slicing it).

    Returns:
        True when the file cannot be read, when the sample contains a NUL
        byte, or when more than 30% of the sampled bytes are control
        characters other than tab, LF, VT, FF and CR. An empty sample is
        treated as text.
    """
    try:
        with path.open("rb") as handle:
            data = handle.read(sample_size)
    except OSError:
        # Unreadable files are reported as binary so callers skip them.
        return True
    if b"\x00" in data:
        return True
    if not data:
        return False
    # Bytes 9..13 (tab, LF, VT, FF, CR) are ordinary text whitespace.
    nonprintable = sum(1 for b in data if b < 9 or (13 < b < 32))
    return (nonprintable / len(data)) > 0.30
83
+
84
+
85
def matches_any(name: str, patterns: Iterable[str]) -> bool:
    """Return True when *name* matches at least one fnmatch pattern."""
    for candidate in patterns:
        if fnmatch.fnmatch(name, candidate):
            return True
    return False
87
+
88
+
89
def redact_secrets(text: str):
    """Replace every SECRET_PATTERNS match in *text* with a placeholder.

    Patterns are applied one after another, each to the output of the
    previous substitution, so the recorded ``span_start``/``span_end``
    offsets refer to the text as it stood when that pattern ran, not
    necessarily to the original input.

    Returns:
        A ``(redacted_text, redactions)`` tuple; each redaction is a dict
        with ``pattern``, ``span_start`` and ``span_end`` keys.
    """
    found = []

    def replacer_for(label):
        # One replacement callback per label; it records the hit and
        # returns the placeholder that is spliced into the text.
        def replace(match):
            found.append({"pattern": label, "span_start": match.start(), "span_end": match.end()})
            return f"[REDACTED:{label}]"

        return replace

    result = text
    for label, pattern in SECRET_PATTERNS:
        result = pattern.sub(replacer_for(label), result)
    return result, found
98
+
99
+
100
@dataclass
class IncludedFile:
    """Record of one source file that was accepted into the packet."""

    # Path relative to the scan root, as produced by the scanner.
    relative_path: str
    # Fully resolved path on the scanning machine.
    absolute_path: str
    # Size reported by stat() at scan time.
    size_bytes: int
    # The scanner stores the same value here as in packet_sha256.
    sha256: str
    # Digest of the text as read from disk, before any redaction.
    source_sha256: str
    # Digest of the text as stored in the packet.
    packet_sha256: str
    # Rough token count of the stored text.
    estimated_tokens: int
    # Lower-cased file suffix ("" when there is none).
    extension: str
    # The text that goes into the packet.
    content: str
111
+
112
+
113
@dataclass
class IgnoredFile:
    """Record of a path that was left out of the packet, with the reason."""

    # Path relative to the scan root; directories carry a trailing "/".
    relative_path: str
    # Short machine-readable reason code, e.g. "hidden_file".
    reason: str
117
+
118
+
119
class SourceScanner:
    """Walk a directory tree and collect the text files that belong in a packet.

    After :meth:`scan`, ``included_files`` holds the accepted files,
    ``ignored_files`` holds every skipped path with a reason code,
    ``redactions`` holds the secret redactions that were applied, and
    ``total_seen`` counts every file encountered in visited directories.
    """

    def __init__(self, input_path: str | Path, max_file_size: int = 1_000_000, include_hidden: bool = False, redact: bool = True):
        """Configure a scan of *input_path*.

        Args:
            input_path: Directory to scan; resolved to an absolute path.
            max_file_size: Files larger than this many bytes are ignored.
            include_hidden: When False, dot-files and dot-directories are skipped.
            redact: When True, secret patterns are replaced before storage.
        """
        self.input_path = Path(input_path).resolve()
        self.max_file_size = max_file_size
        self.include_hidden = include_hidden
        self.redact = redact
        self.included_files: list[IncludedFile] = []
        self.ignored_files: list[IgnoredFile] = []
        self.redactions: list[dict] = []
        self.total_seen = 0

    def ignore(self, path: Path, reason: str):
        """Record *path* as ignored for *reason*.

        Paths under the scan root are stored relative to it. Any other
        path (relative, or absolute but outside the root) is stored as
        given; previously an absolute path outside the root raised
        ``ValueError``.
        """
        try:
            rel = str(path.relative_to(self.input_path))
        except ValueError:
            rel = str(path)
        self.ignored_files.append(IgnoredFile(rel, reason))

    def scan(self):
        """Walk the input directory and populate the result lists.

        Returns:
            ``self``, so calls can be chained.

        Raises:
            FileNotFoundError: The input path does not exist.
            NotADirectoryError: The input path is not a directory.
        """
        if not self.input_path.exists():
            raise FileNotFoundError(f"Input path does not exist: {self.input_path}")
        if not self.input_path.is_dir():
            raise NotADirectoryError(f"Input path is not a directory: {self.input_path}")
        for root, dirs, files in os.walk(self.input_path, followlinks=False):
            root_path = Path(root)
            # Sorting makes traversal order deterministic.
            dirs[:] = sorted(dirs)
            files = sorted(files)
            kept_dirs = []
            for d in dirs:
                dpath = root_path / d
                rel = dpath.relative_to(self.input_path)
                if d in DEFAULT_IGNORED_DIRS:
                    self.ignored_files.append(IgnoredFile(str(rel) + "/", "ignored_directory"))
                elif not self.include_hidden and d.startswith("."):
                    self.ignored_files.append(IgnoredFile(str(rel) + "/", "hidden_directory"))
                elif dpath.is_symlink():
                    self.ignored_files.append(IgnoredFile(str(rel) + "/", "symlink_skipped"))
                else:
                    kept_dirs.append(d)
            # Pruning in place stops os.walk from descending into skipped dirs.
            dirs[:] = kept_dirs
            for filename in files:
                fp = root_path / filename
                rel = fp.relative_to(self.input_path)
                self.total_seen += 1
                rel_str = str(rel)
                if fp.is_symlink():
                    self.ignored_files.append(IgnoredFile(rel_str, "symlink_skipped"))
                    continue
                if not self.include_hidden and filename.startswith("."):
                    self.ignored_files.append(IgnoredFile(rel_str, "hidden_file"))
                    continue
                if matches_any(filename, DEFAULT_IGNORED_PATTERNS) or matches_any(rel_str, DEFAULT_IGNORED_PATTERNS):
                    self.ignored_files.append(IgnoredFile(rel_str, "ignored_pattern"))
                    continue
                try:
                    size = fp.stat().st_size
                except OSError:
                    self.ignored_files.append(IgnoredFile(rel_str, "stat_error"))
                    continue
                if size > self.max_file_size:
                    self.ignored_files.append(IgnoredFile(rel_str, "max_file_size_exceeded"))
                    continue
                # Files without a suffix skip this check and are judged by content.
                if fp.suffix and fp.suffix.lower() not in DEFAULT_TEXT_EXTENSIONS:
                    self.ignored_files.append(IgnoredFile(rel_str, "unsupported_extension"))
                    continue
                if is_probably_binary(fp):
                    self.ignored_files.append(IgnoredFile(rel_str, "binary_detected"))
                    continue
                try:
                    content = fp.read_text(encoding="utf-8")
                except UnicodeDecodeError:
                    self.ignored_files.append(IgnoredFile(rel_str, "decode_error"))
                    continue
                except OSError:
                    self.ignored_files.append(IgnoredFile(rel_str, "read_error"))
                    continue
                # Hash the text before redaction so the source can be re-verified later.
                source_sha256 = sha256_text(content)
                if self.redact:
                    redacted, reds = redact_secrets(content)
                    for r in reds:
                        r["file"] = rel_str
                    self.redactions.extend(reds)
                    content = redacted
                packet_sha256 = sha256_text(content)
                self.included_files.append(IncludedFile(
                    relative_path=rel_str,
                    absolute_path=str(fp.resolve()),
                    size_bytes=size,
                    sha256=packet_sha256,
                    source_sha256=source_sha256,
                    packet_sha256=packet_sha256,
                    estimated_tokens=estimate_tokens(content),
                    extension=fp.suffix.lower(),
                    content=content,
                ))
        self.included_files.sort(key=lambda x: x.relative_path)
        self.ignored_files.sort(key=lambda x: x.relative_path)
        return self
205
+
206
+
207
def _tracked_file_inventory(root: Path, included_records: list[dict]) -> dict:
    """Build the ``file_inventory.json`` payload for the project at *root*.

    The file list comes from ``git ls-files -z`` run in *root* when that
    succeeds and prints at least one path; otherwise it falls back to the
    scanner's included files. Each entry notes whether the file is part
    of the prompt context and, when the file exists, its SHA-256 and a
    text/binary classification.
    """
    included = {str(rec.get("relative_path", "")).replace("\\", "/") for rec in included_records}

    tracked: list[str] = []
    try:
        proc = subprocess.run(["git", "ls-files", "-z"], cwd=root, text=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except (OSError, ValueError):
        proc = None
    if proc is not None and proc.returncode == 0:
        # -z output is NUL separated; surrogateescape keeps undecodable names.
        tracked = [chunk.decode("utf-8", "surrogateescape") for chunk in proc.stdout.split(b"\0") if chunk]

    if tracked:
        source = "git_ls_files"
        raw_paths = tracked
    else:
        source = "scanner_included_files"
        raw_paths = sorted(included)

    files: list[dict] = []
    for raw in raw_paths:
        rel = raw.replace("\\", "/")
        target = root / rel
        entry = {"relative_path": rel, "included_in_prompt_context": rel in included, "source": source}
        try:
            if target.exists() and target.is_file():
                entry["sha256"] = sha256_file(target)
                entry["file_type"] = "binary" if is_probably_binary(target) else "text"
            else:
                entry["file_type"] = "missing"
        except OSError:
            entry["file_type"] = "unreadable"
        files.append(entry)
    return {"schema_version": "sourcepack.file_inventory.v1", "generated_at": utc_now(), "source": source, "files": files}
236
+
237
+
238
class PacketWriter:
    """Write a scanned project out as a SourcePack packet directory."""

    # Files whose hashes are recorded in receipt.json.
    OUTPUT_FILES = ["manifest.json", "context.md", "context.xml", "file_tree.txt", "ignored_files.txt", "token_report.json", "redactions.json", "reality_map.json", "ai_instructions.md", "file_inventory.json"]

    def __init__(self, out: str | Path, scanner: SourceScanner, force: bool = False):
        """Remember the output directory, the finished scanner and the overwrite flag."""
        self.out = Path(out)
        self.scanner = scanner
        self.force = force

    def prepare_out(self):
        """Create the output directory, emptying it first when ``force`` is set.

        Raises:
            FileExistsError: The directory is non-empty and ``force`` is False.
        """
        if self.out.exists() and any(self.out.iterdir()):
            if not self.force:
                raise FileExistsError(f"Output directory is non-empty: {self.out}")
            for child in self.out.iterdir():
                # shutil.rmtree refuses symlinks, so a symlinked directory
                # is unlinked like a file instead of being followed.
                if child.is_dir() and not child.is_symlink():
                    shutil.rmtree(child)
                else:
                    child.unlink()
        self.out.mkdir(parents=True, exist_ok=True)

    def write_all(self):
        """Write every packet artifact and return the output directory.

        Order matters: ``context.md`` must exist before the reality map
        is generated, and ``receipt.json`` is written last because it
        hashes the other outputs.
        """
        self.prepare_out()
        included_records = []
        for f in self.scanner.included_files:
            rec = asdict(f)
            # File bodies live in context.md/context.xml, not in the manifest.
            rec.pop("content")
            included_records.append(rec)
        ignored_records = [asdict(f) for f in self.scanner.ignored_files]
        total_tokens = sum(f.estimated_tokens for f in self.scanner.included_files)
        total_bytes = sum(f.size_bytes for f in self.scanner.included_files)
        manifest = {
            "input_path": str(self.scanner.input_path),
            "generated_at": utc_now(),
            "tool_version": __version__,
            "total_files_seen": self.scanner.total_seen,
            "total_files_included": len(included_records),
            "total_files_ignored": len(ignored_records),
            "total_bytes_included": total_bytes,
            "total_estimated_tokens": total_tokens,
            "included_files": included_records,
            "ignored_files": ignored_records,
        }
        (self.out / "manifest.json").write_text(json.dumps(manifest, indent=2), encoding="utf-8")
        (self.out / "file_inventory.json").write_text(json.dumps(_tracked_file_inventory(self.scanner.input_path, included_records), indent=2), encoding="utf-8")
        md_parts = ["# SourcePack Context Packet", "", "## Source Manifest Summary", "", f"Input path: {manifest['input_path']}", f"Generated at: {manifest['generated_at']}", f"Files included: {len(included_records)}", f"Estimated tokens: {total_tokens}", ""]
        for f in self.scanner.included_files:
            md_parts.extend([
                f"## File: {f.relative_path}", "", "Metadata:", f"- sha256: {f.sha256}", f"- bytes: {f.size_bytes}", f"- estimated_tokens: {f.estimated_tokens}", "", "Content:", "", f.content, "", "---", ""
            ])
        (self.out / "context.md").write_text("\n".join(md_parts), encoding="utf-8")
        xml_parts = ["<sourcepack>", " <files>"]
        for f in self.scanner.included_files:
            # Attribute values also need the double quote escaped, which
            # xml_escape does not do unless told to.
            safe_path = xml_escape(f.relative_path, {'"': "&quot;"})
            xml_parts.append(f' <file path="{safe_path}" sha256="{f.sha256}" bytes="{f.size_bytes}" estimated_tokens="{f.estimated_tokens}">')
            xml_parts.append(" <content>")
            xml_parts.append(xml_escape(f.content))
            xml_parts.append(" </content>")
            xml_parts.append(" </file>")
        xml_parts.extend([" </files>", "</sourcepack>"])
        (self.out / "context.xml").write_text("\n".join(xml_parts), encoding="utf-8")
        tree_lines = [f"[INC] {f.relative_path}" for f in self.scanner.included_files]
        tree_lines.extend(f"[IGN] {f.relative_path} - {f.reason}" for f in self.scanner.ignored_files)
        (self.out / "file_tree.txt").write_text("\n".join(sorted(tree_lines)) + "\n", encoding="utf-8")
        (self.out / "ignored_files.txt").write_text("\n".join(f"{f.relative_path}\t{f.reason}" for f in self.scanner.ignored_files) + "\n", encoding="utf-8")
        token_report = {
            "total_estimated_tokens": total_tokens,
            # Each entry is a token threshold the packet exceeds.
            "warnings": [limit for limit in [32_000, 128_000, 200_000, 1_000_000] if total_tokens > limit],
            "per_file": [{"relative_path": f.relative_path, "estimated_tokens": f.estimated_tokens} for f in self.scanner.included_files],
        }
        (self.out / "token_report.json").write_text(json.dumps(token_report, indent=2), encoding="utf-8")
        (self.out / "redactions.json").write_text(json.dumps({"redactions": self.scanner.redactions}, indent=2), encoding="utf-8")
        reality_map = generate_reality_map(manifest, self.out)
        (self.out / "reality_map.json").write_text(json.dumps(reality_map, indent=2), encoding="utf-8")
        (self.out / "ai_instructions.md").write_text(render_ai_instructions(reality_map), encoding="utf-8")
        hashes = {name: sha256_file(self.out / name) for name in self.OUTPUT_FILES if (self.out / name).exists()}
        receipt = {"generated_at": utc_now(), "tool_version": __version__, "hashes": hashes}
        (self.out / "receipt.json").write_text(json.dumps(receipt, indent=2), encoding="utf-8")
        return self.out
317
+
318
+
319
+
320
+ def _included_paths(manifest: dict) -> set[str]:
321
+ return {rec.get("relative_path", "").replace("\\", "/") for rec in manifest.get("included_files", [])}
322
+
323
+
324
def _package_json_scripts(packet: Path) -> dict[str, str]:
    """Return the ``scripts`` table of the first package.json in the packet.

    Only the first file named ``package.json`` (case-insensitive) is
    considered. An empty dict is returned when there is none, when it is
    not valid JSON, when its top level is not an object, or when
    ``scripts`` is missing or not an object.
    """
    contents = _packet_file_contents(packet)
    for rel, content in contents.items():
        if Path(rel).name.lower() != "package.json":
            continue
        try:
            package = json.loads(content)
        except json.JSONDecodeError:
            return {}
        # Valid JSON need not be an object; "[]" used to crash on .get().
        if not isinstance(package, dict):
            return {}
        scripts = package.get("scripts")
        return scripts if isinstance(scripts, dict) else {}
    return {}
335
+
336
+
337
def _is_poetry_project(packet: Path) -> bool:
    """Return True when any pyproject.toml in the packet has a ``[tool.poetry]`` table header."""
    return any(
        Path(rel).name.lower() == "pyproject.toml"
        and re.search(r"(?m)^\s*\[tool\.poetry\]\s*$", content)
        for rel, content in _packet_file_contents(packet).items()
    )
342
+
343
+
344
def _uses_unittest(packet: Path) -> bool:
    """Return True when any ``.py`` file in the packet imports ``unittest``."""
    return any(
        Path(rel).suffix.lower() == ".py"
        and re.search(r"(?m)^\s*(import\s+unittest|from\s+unittest\s+import\s+)", content)
        for rel, content in _packet_file_contents(packet).items()
    )
349
+
350
+
351
def generate_reality_map(manifest: dict, packet: Path) -> dict:
    """Summarise what the packet structurally proves about the project.

    Combines the manifest's file lists with the dependency and feature
    inventories to produce the ``reality_map.json`` payload: detected
    project types, package managers, frameworks, commands, safe claims
    and the fixed lists of unknowns, claim boundaries and AI constraints.
    """
    files = _included_paths(manifest)
    lower_files = {f.lower() for f in files}
    # Lower-cased base names, computed once for the name-based checks below.
    base_names = {Path(f).name.lower() for f in files}
    deps = dependency_inventory(manifest, packet)
    features = feature_inventory(manifest, packet, deps)
    scripts = _package_json_scripts(packet)

    project_types = []
    package_managers = []
    frameworks = []
    supported_commands = []
    test_commands = []
    build_commands = []
    run_commands = []

    if "pyproject.toml" in lower_files:
        project_types.append("python")
    if any(Path(f).name.lower().startswith("requirements") and f.endswith(".txt") for f in lower_files):
        project_types.append("python")
        package_managers.append("pip")
    if _is_poetry_project(packet):
        package_managers.append("poetry")
    if "package.json" in lower_files:
        project_types.append("node")
        package_managers.append("npm")

    # NOTE(review): the original indentation was lost in this view, so it is
    # unclear whether this loop sat inside the package.json check above or at
    # function level; it is reconstructed at function level -- confirm.
    for name in sorted(scripts):
        cmd = "npm test" if name == "test" else f"npm run {name}"
        supported_commands.append(cmd)
        if name == "test":
            test_commands.append(cmd)
        elif name in {"build", "compile"}:
            build_commands.append(cmd)
        elif name in {"start", "dev", "serve"}:
            run_commands.append(cmd)

    if "dockerfile" in base_names:
        supported_commands.append("docker build")
        build_commands.append("docker build")
    if base_names & {"docker-compose.yml", "compose.yaml", "compose.yml"}:
        supported_commands.append("docker compose up")
        run_commands.append("docker compose up")
    if "pytest" in deps or any(f == "tests" or f.startswith("tests/") for f in lower_files):
        supported_commands.append("pytest")
        test_commands.append("pytest")
    if _uses_unittest(packet):
        supported_commands.append("python -m unittest")
        test_commands.append("python -m unittest")

    framework_map = {"fastapi": "FastAPI", "flask": "Flask", "django": "Django", "react": "React"}
    for dep, label in framework_map.items():
        if dep in deps or (dep == "react" and "react" in features):
            frameworks.append(label)

    ignored = manifest.get("ignored_files", [])
    ignored_reasons = {}
    for rec in ignored:
        reason = rec.get("reason", "unknown")
        ignored_reasons[reason] = ignored_reasons.get(reason, 0) + 1
    included_count = len(manifest.get("included_files", []))

    safe_claims = [
        f"This packet includes {included_count} source files.",
        f"SourcePack scanned input path: {manifest.get('input_path', '')}.",
    ]
    for name in ["pyproject.toml", "package.json", "Dockerfile"]:
        present = name.lower() in base_names
        safe_claims.append(f"The project {'contains' if present else 'does not include'} {name}.")
    if "react" not in deps and "react" not in features:
        safe_claims.append("No React dependency was detected.")
    if "pdf" not in features:
        safe_claims.append("No PDF parsing capability was detected.")
    if ignored:
        safe_claims.append("The packet includes ignored file records for safety or relevance reasons.")

    claim_boundaries = [
        "SourcePack did not execute the application.",
        "SourcePack did not prove semantic correctness.",
        "SourcePack did not verify external services.",
        "SourcePack did not prove security.",
        "SourcePack did not prove production readiness.",
        "Absence of evidence means unknown, not impossible.",
        "Unsupported claims should be treated as ungrounded.",
    ]
    return {
        "reality_map_schema_version": "1.0",
        "tool_version": __version__,
        "generated_at": utc_now(),
        "input_path": manifest.get("input_path", ""),
        "project_types": sorted(set(project_types)),
        "package_managers": sorted(set(package_managers)),
        "frameworks": sorted(set(frameworks)),
        "entry_points": sorted(f for f in files if Path(f).name in {"main.py", "app.py", "server.py", "cli.py"}),
        "test_commands": sorted(set(test_commands)),
        "build_commands": sorted(set(build_commands)),
        "run_commands": sorted(set(run_commands)),
        "supported_commands": sorted(set(supported_commands)),
        "detected_dependencies": sorted(deps),
        "supported_capabilities": sorted(features),
        # Only the first 25 ignored records are embedded; the totals cover all.
        "excluded_files_summary": {"total": len(ignored), "reasons": ignored_reasons, "records": ignored[:25]},
        "included_file_count": included_count,
        "confirmed_files": sorted(files),
        "ignored_file_count": len(ignored),
        "safe_claims": safe_claims,
        "unknowns": [
            "Runtime behavior was not executed.",
            "Semantic correctness was not proven.",
            "External services were not verified.",
            "Capabilities not present in structural evidence must be treated as unknown.",
            "Missing files must not be invented.",
        ],
        "claim_boundaries": claim_boundaries,
        "ai_constraints": [
            "Use only the packet and reality map as project evidence.",
            "Do not invent files, commands, dependencies, frameworks, services, or capabilities.",
            "If a required file is missing, say it is missing.",
            "If a command is unsupported by detected evidence, say it is unsupported.",
            "If a capability is not in supported_capabilities, treat it as unknown or unsupported.",
            "Cite file paths when making project-specific claims.",
            "Do not claim SourcePack proves semantic truth.",
            "Ask for missing files rather than hallucinating them.",
        ],
    }
463
+
464
+
465
def render_ai_instructions(reality_map: dict) -> str:
    """Render ``ai_instructions.md`` from a reality map.

    The fixed rules come first, followed by the supported commands,
    supported capabilities, up to 200 confirmed files, the required
    answer contract and the claim boundaries. Empty command and
    capability lists are shown as "- None detected".
    """
    nothing = ["- None detected"]
    commands = [f"- `{cmd}`" for cmd in reality_map.get("supported_commands", [])]
    capabilities = [f"- {cap}" for cap in reality_map.get("supported_capabilities", [])]
    confirmed = [f"- `{path}`" for path in reality_map.get("confirmed_files", [])[:200]]
    boundaries = [f"- {boundary}" for boundary in reality_map.get("claim_boundaries", [])]

    sections = [
        "# AI Instructions for This SourcePack Packet",
        "",
        "Use only the packet and `reality_map.json` as project evidence.",
        "Do not invent files, commands, dependencies, frameworks, services, or capabilities.",
        "If a required file is missing, say it is missing and ask for it rather than hallucinating it.",
        "If a command is unsupported by detected evidence, say it is unsupported.",
        "If a capability is not listed in `supported_capabilities`, treat it as unknown or unsupported.",
        "If you introduce a new external dependency, modify the appropriate dependency manifest in the same patch and list it under Dependency Changes.",
        "Only recommend commands listed under Supported Commands unless your patch also adds the project file that defines the new command.",
        "Before referencing a file as existing, it must appear in Confirmed Files; label intentional creations as NEW FILE.",
        "If required evidence is missing, say UNKNOWN and ask for the missing file/output instead of guessing.",
        "Cite file paths when making project-specific claims.",
        "Do not claim SourcePack proves semantic truth, security, production readiness, or external service behavior.",
        "",
        "## Supported Commands",
        "",
        *(commands or nothing),
        "",
        "## Supported Capabilities",
        "",
        *(capabilities or nothing),
        "",
        "## Confirmed Files",
        "",
        *confirmed,
        "",
        "## Required Answer Contract",
        "",
        "- Files to modify",
        "- New files",
        "- Dependency changes",
        "- Commands to run",
        "- Assumptions/unknowns",
        "- Patch or code",
        "",
        "## Claim Boundaries",
        "",
        *boundaries,
    ]
    return "\n".join(sections) + "\n"
491
+
492
def load_manifest(packet: Path) -> dict:
    """Read and parse ``manifest.json`` from the packet directory."""
    manifest_path = packet / "manifest.json"
    raw = manifest_path.read_text(encoding="utf-8")
    return json.loads(raw)
494
+
495
+
496
def verify_packet(packet_path: str | Path, against: str | Path | None = None) -> bool:
    """Check a packet against its receipt and, optionally, its source tree.

    Every file listed in ``receipt.json`` must exist and match its
    recorded SHA-256. When *against* is given, each included file is also
    re-read from that directory and compared with the manifest, and
    text-like files that are not in the packet are reported as warnings.
    Progress is printed as PASS/FAIL/WARN lines followed by an OVERALL line.

    Returns:
        True when no FAIL line was printed; warnings do not fail the check.
    """
    packet = Path(packet_path)
    ok = True
    receipt_path = packet / "receipt.json"
    if not receipt_path.exists():
        print("FAIL receipt.json missing")
        return False
    receipt = json.loads(receipt_path.read_text(encoding="utf-8"))
    for name, expected in receipt.get("hashes", {}).items():
        path = packet / name
        if not path.exists():
            print(f"FAIL {name} missing")
            ok = False
            continue
        actual = sha256_file(path)
        if actual == expected:
            print(f"PASS {name}")
        else:
            print(f"FAIL {name} hash mismatch")
            ok = False
    if against:
        manifest = load_manifest(packet)
        source = Path(against).resolve()
        included = {rec["relative_path"]: rec for rec in manifest.get("included_files", [])}
        for rel, rec in included.items():
            source_file = source / rel
            if not source_file.exists():
                print(f"FAIL source missing {rel}")
                ok = False
            elif is_probably_binary(source_file):
                print(f"WARN source now binary {rel}")
            else:
                try:
                    content = source_file.read_text(encoding="utf-8")
                except (OSError, UnicodeError):
                    print(f"FAIL source unreadable {rel}")
                    ok = False
                    continue
                expected_source_hash = rec.get("source_sha256")
                if expected_source_hash is None:
                    # No pre-redaction hash recorded: compare the redacted
                    # text against the packet hash instead.
                    expected_source_hash = rec.get("sha256")
                    redacted, _ = redact_secrets(content)
                    content_hash = sha256_text(redacted)
                else:
                    content_hash = sha256_text(content)
                if content_hash != expected_source_hash:
                    print(f"FAIL source changed {rel}")
                    ok = False
        current_files = []
        for root, dirs, files in os.walk(source, followlinks=False):
            # Prune ignored and hidden directories in place.
            dirs[:] = [d for d in sorted(dirs) if d not in DEFAULT_IGNORED_DIRS and not d.startswith(".")]
            for filename in sorted(files):
                fp = Path(root) / filename
                if filename.startswith(".") or fp.suffix.lower() not in DEFAULT_TEXT_EXTENSIONS:
                    continue
                rel = str(fp.relative_to(source))
                if rel not in included:
                    current_files.append(rel)
        for rel in current_files:
            print(f"WARN new source file not in packet {rel}")
    print("OVERALL", "PASS" if ok else "FAIL")
    return ok
559
+
560
+
561
# Suffixes that make a token in AI output look like a file reference.
PATHLIKE_EXTENSIONS = {
    ".py",
    ".js",
    ".jsx",
    ".ts",
    ".tsx",
    ".json",
    ".toml",
    ".yaml",
    ".yml",
    ".md",
    ".txt",
    ".cfg",
    ".ini",
    ".css",
    ".html",
    ".rs",
    ".go",
    ".java",
    ".rb",
    ".php",
    ".sh",
}

# Leading directory names that mark a bare token as a project path.
PROJECT_PATH_PREFIXES = {
    "src",
    "sourcepack",
    "tests",
    "test",
    "frontend",
    "backend",
    "docs",
    "app",
    "lib",
    "packages",
    "public",
    "config",
    "scripts",
}
563
+
564
+
565
def _normalize_ai_ref(ref: str) -> str | None:
    """Clean up a candidate file reference taken from AI output.

    Surrounding quotes and punctuation, one trailing colon and any
    leading ``./`` segments are removed and backslashes become forward
    slashes. Returns None for empty or absolute references (POSIX or
    drive-letter) and for anything ``_normalize_diff_path`` flags as
    unsafe or reduces to nothing.
    """
    cleaned = ref.strip().strip("`'\".,;)").replace("\\", "/")
    cleaned = cleaned.removesuffix(":")
    while cleaned.startswith("./"):
        cleaned = cleaned[2:]

    if not cleaned:
        return None
    if cleaned.startswith("/") or re.match(r"^[A-Za-z]:/", cleaned):
        return None

    normalized, unsafe = _normalize_diff_path(cleaned)
    if unsafe or not normalized:
        return None
    return normalized
578
+
579
+
580
def _looks_like_ai_file_ref(ref: str) -> bool:
    """Return True when *ref* plausibly names a project file.

    Well-known manifest and Docker file names always qualify. Otherwise
    the suffix must be in PATHLIKE_EXTENSIONS and the reference must
    either contain a directory separator or start with one of
    PROJECT_PATH_PREFIXES.
    """
    # Imported here because the module's visible import block only brings in
    # ``Path``; without this the function raised NameError.
    from pathlib import PurePosixPath

    normalized = ref.replace("\\", "/")
    pure = PurePosixPath(normalized)
    if pure.name in {"Dockerfile", "docker-compose.yml", "compose.yaml", "compose.yml", "pyproject.toml", "package.json", "requirements.txt"}:
        return True
    if pure.suffix.lower() not in PATHLIKE_EXTENSIONS:
        return False
    if "/" in normalized:
        return True
    parts = [p for p in pure.parts if p not in {"."}]
    # bool() keeps the declared return type; the bare ``parts and ...``
    # expression could hand back an empty list.
    return bool(parts) and parts[0] in PROJECT_PATH_PREFIXES
590
+
591
+
592
def extract_refs(text: str) -> set[str]:
    """Collect normalised file references mentioned in *text*.

    Candidates are found in four ways: quoted or back-ticked tokens,
    bullet-list items, tokens following verbs such as "edit" or "open",
    and bare paths under a known project directory. A candidate is kept
    only if it normalises cleanly and looks like a project file.
    """
    token = r"(?:\./)?[A-Za-z0-9_.-]+(?:[\\/][A-Za-z0-9_.-]+)*\.[A-Za-z0-9_.-]+:?|Dockerfile"
    quoted = rf"[`'\"]({token})[`'\"]"
    bulleted = rf"(?m)^\s*[-*]\s+({token})\b"
    after_verb = rf"\b(?:edit|open|update|modify|change|in|file)\s+({token})\b"
    under_project_dir = rf"\b((?:\./)?(?:src|sourcepack|tests|test|frontend|backend|docs|app|lib|packages|public|config|scripts)[\\/][A-Za-z0-9_./\\-]+\.[A-Za-z0-9_.-]+:?)\b"

    found: set[str] = set()
    for expression in (quoted, bulleted, after_verb, under_project_dir):
        for hit in re.finditer(expression, text, re.I):
            cleaned = _normalize_ai_ref(hit.group(1))
            if cleaned and _looks_like_ai_file_ref(cleaned):
                found.add(cleaned)
    return found
602
+
603
+
604
+ def _packet_file_contents(packet: Path) -> dict[str, str]:
605
+ context_path = packet / "context.md"
606
+ if not context_path.exists():
607
+ return {}
608
+ text = context_path.read_text(encoding="utf-8", errors="ignore")
609
+ contents: dict[str, str] = {}
610
+ current: str | None = None
611
+ body: list[str] = []
612
+ in_content = False
613
+ for line in text.splitlines():
614
+ if line.startswith("## File: "):
615
+ if current is not None:
616
+ contents[current] = "\n".join(body).rstrip("\n")
617
+ current = line.removeprefix("## File: ").strip()
618
+ body = []
619
+ in_content = False
620
+ elif current is not None and line == "Content:":
621
+ in_content = True
622
+ body = []
623
+ elif current is not None and in_content and line == "---":
624
+ contents[current] = "\n".join(body).rstrip("\n")
625
+ current = None
626
+ body = []
627
+ in_content = False
628
+ elif current is not None and in_content:
629
+ body.append(line)
630
+ if current is not None:
631
+ contents[current] = "\n".join(body).rstrip("\n")
632
+ return contents
633
+
634
+
635
+ def _normalize_dependency_name(name: str) -> str:
636
+ return name.strip().lower().replace("_", "-")
637
+
638
+
639
def _dependency_name_for_import(name: str) -> str:
    """Map an imported module name to a dependency name.

    The normalised name is looked up in PY_IMPORT_ALIASES; when there is
    no alias the normalised name itself is returned.
    """
    key = _normalize_dependency_name(name)
    if key in PY_IMPORT_ALIASES:
        return PY_IMPORT_ALIASES[key]
    return key
642
+
643
+
644
+ def _js_package_root(imported: str) -> str:
645
+ imported = imported.strip().lower()
646
+ parts = imported.split("/")
647
+ if imported.startswith("@") and len(parts) >= 2 and parts[0] != "@":
648
+ return "/".join(parts[:2])
649
+ if imported.startswith("@/"):
650
+ return imported
651
+ return parts[0]
652
+
653
+
654
def _python_dependency_names_from_requirement_lines(text: str) -> set[str]:
    """Extract normalised package names from requirements-file text.

    Comments, blank lines and option lines (those starting with ``-``)
    are skipped. The name is whatever precedes the first version
    operator, extras bracket, environment marker, ``@`` direct reference,
    parenthesis or whitespace, so ``requests @ https://...`` and
    ``requests (>=2)`` both yield ``requests``. Lines that leave no name
    are ignored.
    """
    deps: set[str] = set()
    for line in text.splitlines():
        cleaned = line.split("#", 1)[0].strip()
        # "-r", "-e", "--hash" and friends are options, not requirements.
        if not cleaned or cleaned.startswith("-"):
            continue
        name = re.split(r"[<>=!~;\[@(\s]", cleaned, maxsplit=1)[0]
        if name:
            deps.add(_normalize_dependency_name(name))
    return deps
661
+
662
+
663
def _python_dependency_names_from_pyproject(content: str) -> set[str]:
    """Extract normalised dependency names from ``pyproject.toml`` text.

    Sources read: ``project.dependencies``,
    ``project.optional-dependencies``, the Poetry tables
    (``dependencies``, ``dev-dependencies`` and every group's
    ``dependencies``), ``dev-dependencies`` / ``dependency-groups`` under
    ``tool.pdm`` and ``tool.uv``, and the top-level
    ``dependency-groups`` table. Invalid TOML yields an empty set.

    Changes from the earlier version: requirement strings are also cut at
    ``@``, ``(`` and whitespace; the ``python`` pseudo-dependency is
    skipped in Poetry group tables as well as the main tables; and a
    list-valued ``dev-dependencies`` entry (the form uv uses) is read
    rather than ignored.
    """
    try:
        data = tomllib.loads(content)
    except tomllib.TOMLDecodeError:
        return set()
    deps: set[str] = set()

    def add_requirement(req: object) -> None:
        # Non-string entries (e.g. include-group tables) are skipped.
        if isinstance(req, str):
            name = re.split(r"[<>=!~;\[@(\s]", req.strip(), maxsplit=1)[0]
            if name:
                deps.add(_normalize_dependency_name(name))

    def add_requirement_list(reqs: object) -> None:
        if isinstance(reqs, list):
            for req in reqs:
                add_requirement(req)

    def add_requirement_groups(groups: object) -> None:
        # Accepts {group_name: [requirement, ...]}.
        if isinstance(groups, dict):
            for group in groups.values():
                add_requirement_list(group)

    def add_poetry_table(section: object) -> None:
        # Poetry tables map dependency names to constraints.
        if isinstance(section, dict):
            for dep in section:
                if dep.lower() != "python":
                    deps.add(_normalize_dependency_name(dep))

    project = data.get("project", {})
    if isinstance(project, dict):
        add_requirement_list(project.get("dependencies"))
        add_requirement_groups(project.get("optional-dependencies", {}))

    tool = data.get("tool", {})
    if isinstance(tool, dict):
        poetry = tool.get("poetry", {})
        if isinstance(poetry, dict):
            for section_name in ("dependencies", "dev-dependencies"):
                add_poetry_table(poetry.get(section_name, {}))
            group = poetry.get("group", {})
            if isinstance(group, dict):
                for group_data in group.values():
                    if isinstance(group_data, dict):
                        add_poetry_table(group_data.get("dependencies", {}))
        for tool_name in ("pdm", "uv"):
            tool_data = tool.get(tool_name, {})
            if isinstance(tool_data, dict):
                for key in ("dev-dependencies", "dependency-groups"):
                    groups = tool_data.get(key, {})
                    # A table of groups, or a flat list of requirements.
                    add_requirement_groups(groups)
                    add_requirement_list(groups)

    add_requirement_groups(data.get("dependency-groups", {}))
    return deps
721
+
722
+
723
def _add_common_dependency(deps: set[str], name: str):
    """Record ``name`` in ``deps`` if it matches an entry of ``COMMON_DEPENDENCIES``.

    Both sides are compared after ``_normalize_dependency_name``; the entry
    that matches is stored lower-cased. ``deps`` is mutated in place and
    nothing is returned.
    """
    wanted = _normalize_dependency_name(name)
    deps.update(
        known.lower()
        for known in COMMON_DEPENDENCIES
        if _normalize_dependency_name(known) == wanted
    )
728
+
729
+
730
def dependency_inventory(manifest: dict, packet: Path) -> set[str]:
    """Return the well-known dependencies referenced by the packet's files.

    Every record in ``manifest["included_files"]`` is inspected according to
    its file name or extension:

    * ``pyproject.toml`` and ``requirements*.txt``: declared requirements
    * ``package.json``: keys of the four npm dependency sections
    * ``*.py``: the first module named on each ``import`` / ``from`` line
    * ``*.js`` / ``*.jsx`` / ``*.ts`` / ``*.tsx``: ``from "..."``,
      ``import("...")`` and ``require("...")`` specifiers

    Only names accepted by ``_add_common_dependency`` end up in the result.

    Args:
        manifest: Packet manifest; only ``included_files`` is read.
        packet: Packet location handed to ``_packet_file_contents``.

    Returns:
        The set of matched dependency names.
    """
    deps: set[str] = set()
    contents = _packet_file_contents(packet)
    for rec in manifest.get("included_files", []):
        rel = rec.get("relative_path", "")
        # Files without stored content are treated as empty.
        content = contents.get(rel, "")
        rel_path = Path(rel)
        name = rel_path.name.lower()
        suffix = rel_path.suffix.lower()
        if name == "pyproject.toml":
            for dep in _python_dependency_names_from_pyproject(content):
                _add_common_dependency(deps, dep)
        elif name.startswith("requirements") and name.endswith(".txt"):
            for dep in _python_dependency_names_from_requirement_lines(content):
                _add_common_dependency(deps, dep)
        elif name == "package.json":
            try:
                package = json.loads(content)
            except json.JSONDecodeError:
                package = {}
            # Valid JSON is not necessarily an object ("[]", "null", ...);
            # anything else has no dependency sections to read.
            if not isinstance(package, dict):
                package = {}
            for section in ("dependencies", "devDependencies", "peerDependencies", "optionalDependencies"):
                section_deps = package.get(section)
                if isinstance(section_deps, dict):
                    for dep_name in section_deps:
                        _add_common_dependency(deps, dep_name)
        elif suffix == ".py":
            for imported in re.findall(r"(?m)^\s*(?:import|from)\s+([A-Za-z_][A-Za-z0-9_]*)", content):
                _add_common_dependency(deps, imported)
        elif suffix in {".js", ".jsx", ".ts", ".tsx"}:
            for imported in re.findall(r"""(?:from\s+["']|import\s*\(\s*["']|require\s*\(\s*["'])(@?[A-Za-z0-9_.-]+)""", content):
                _add_common_dependency(deps, _js_package_root(imported))
    return deps
761
+
762
+
763
+ def _has_import(content: str, *modules: str) -> bool:
764
+ module_pattern = "|".join(re.escape(module) for module in modules)
765
+ return bool(re.search(rf"(?m)^\s*(?:import|from)\s+({module_pattern})(?:\b|[._])", content))
766
+
767
+
768
# Dependency names whose presence in a Python manifest marks the packet as
# PDF-related. They are compared against the names returned by the
# pyproject / requirements parsers.
PDF_DEPENDENCIES = {"pypdf", "pdfplumber", "fitz", "pymupdf"}


def _declares_pdf_dependency(rel: str, content: str) -> bool:
    """Tell whether a Python dependency manifest lists a PDF library.

    Only ``pyproject.toml`` and ``requirements*.txt`` files (matched on the
    lower-cased base name of ``rel``) are examined; any other file yields
    False.
    """
    base_name = Path(rel).name.lower()
    if base_name == "pyproject.toml":
        declared = _python_dependency_names_from_pyproject(content)
    elif base_name.startswith("requirements") and base_name.endswith(".txt"):
        declared = _python_dependency_names_from_requirement_lines(content)
    else:
        return False
    for dependency in declared:
        if dependency in PDF_DEPENDENCIES:
            return True
    return False
778
+
779
+
780
def feature_inventory(manifest: dict, packet: Path, deps: set[str] | None = None) -> set[str]:
    """Infer coarse feature labels for a packet.

    Labels are derived from three kinds of evidence: the detected
    dependencies, the relative paths listed in the manifest, and the text of
    the packet's files. Possible labels are ``"docker"``, ``"pdf"``,
    ``"react"``, ``"web server"``, ``"database"``, ``"authentication"`` and
    ``"ocr"``.

    Args:
        manifest: Packet manifest; only ``included_files`` is read.
        packet: Packet location handed to ``_packet_file_contents``.
        deps: Precomputed result of ``dependency_inventory``. When omitted it
            is computed here.

    Returns:
        The set of feature labels that had at least one piece of evidence.
    """
    if deps is None:
        deps = dependency_inventory(manifest, packet)
    contents = _packet_file_contents(packet)
    # Paths are compared with forward slashes regardless of origin platform.
    files = {rec.get("relative_path", "").replace("\\", "/") for rec in manifest.get("included_files", [])}
    lower_files = {rel.lower() for rel in files}
    features: set[str] = set()

    # --- Evidence from file names and declared dependencies ---------------
    if any(Path(rel).name.lower() in {"dockerfile", "docker-compose.yml", "compose.yaml", "compose.yml"} for rel in files):
        features.add("docker")
    if any(rel.endswith(("/pdf_parser.py", "pdf_parser.py")) for rel in lower_files):
        features.add("pdf")
    if any(_declares_pdf_dependency(rel, content) for rel, content in contents.items()):
        features.add("pdf")
    if "react" in deps or any(rel in {"frontend/app.tsx", "frontend/app.jsx"} for rel in lower_files):
        features.add("react")
    if deps & {"fastapi", "flask", "django"} or any(Path(rel).name.lower() in {"server.py", "app.py"} for rel in files):
        features.add("web server")
    # Wrapping the path in slashes lets a leading or trailing "migrations"
    # directory component match as well.
    if deps & {"sqlalchemy", "prisma"} or any("/migrations/" in f"/{rel}/" or Path(rel).name.lower() in {"schema.prisma", "schema.sql"} for rel in files):
        features.add("database")
    if any(part == "auth" or part.startswith("auth_") for rel in lower_files for part in Path(rel).parts):
        features.add("authentication")

    # --- Evidence from file contents ---------------------------------------
    for rel, content in contents.items():
        suffix = Path(rel).suffix.lower()
        if suffix == ".py":
            # "pymupdf" is the current import name of PyMuPDF ("fitz" is the
            # legacy one); both are accepted, matching the names that count
            # as PDF libraries in dependency manifests.
            if _has_import(content, "pypdf", "pdfplumber", "fitz", "pymupdf"):
                features.add("pdf")
            if _has_import(content, "fastapi", "flask", "django") or re.search(r"(?m)^\s*@\w+\.(?:route|get|post|put|patch|delete)\(", content):
                features.add("web server")
            if _has_import(content, "sqlalchemy", "prisma") or re.search(r"(?i)\b(sqlite|postgres(?:ql)?|mysql)://", content):
                features.add("database")
            if _has_import(content, "jwt", "oauthlib", "authlib") or re.search(r"(?i)@\w+\.(?:route|get|post)\([^)]*login", content):
                features.add("authentication")
            if _has_import(content, "pytesseract", "easyocr"):
                features.add("ocr")
        elif suffix in {".js", ".jsx", ".ts", ".tsx"}:
            if re.search(r"""(?:from\s+["']react["']|require\s*\(\s*["']react["']|import\s+React\b)""", content):
                features.add("react")
            if re.search(r"(?i)\b(jwt|oauth|session|login)\b", content):
                features.add("authentication")
        elif Path(rel).name.lower() == "package.json":
            # Textual check for a "react" key anywhere in the manifest.
            if re.search(r'"react"\s*:', content):
                features.add("react")
    return features
825
+
826
+
827
+
828
def scanner_config_hash() -> str:
    """Return a fingerprint of the scanner's default configuration.

    The default ignore lists and text extensions (sorted for a stable order)
    plus fixed values for ``max_file_size``, ``include_hidden`` and
    ``redact`` are serialized as key-sorted JSON and passed to
    ``sha256_text``.
    """
    settings = dict(
        ignored_dirs=sorted(DEFAULT_IGNORED_DIRS),
        ignored_patterns=sorted(DEFAULT_IGNORED_PATTERNS),
        text_extensions=sorted(DEFAULT_TEXT_EXTENSIONS),
        max_file_size=1_000_000,
        include_hidden=False,
        redact=True,
    )
    serialized = json.dumps(settings, sort_keys=True)
    return sha256_text(serialized)