sourcepack 1.10.0a0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sourcepack/__init__.py +19 -0
- sourcepack/assets/__init__.py +1 -0
- sourcepack/assets/audit_template.md +3 -0
- sourcepack/assets/packet_instructions.md +3 -0
- sourcepack/baseline.py +285 -0
- sourcepack/cli.py +2991 -0
- sourcepack/commands.py +149 -0
- sourcepack/dependencies.py +98 -0
- sourcepack/diff_parser.py +122 -0
- sourcepack/ecosystems/__init__.py +3 -0
- sourcepack/ecosystems/generic.py +13 -0
- sourcepack/ecosystems/node.py +3 -0
- sourcepack/ecosystems/python.py +12 -0
- sourcepack/errors.py +19 -0
- sourcepack/evidence.py +109 -0
- sourcepack/execution_ledger.py +252 -0
- sourcepack/git.py +50 -0
- sourcepack/judgment.py +1922 -0
- sourcepack/packet.py +837 -0
- sourcepack/paths.py +68 -0
- sourcepack/policy.py +38 -0
- sourcepack/reason_codes.py +72 -0
- sourcepack/reports/__init__.py +5 -0
- sourcepack/reports/html.py +88 -0
- sourcepack/reports/json.py +123 -0
- sourcepack/reports/markdown.py +61 -0
- sourcepack/schemas.py +63 -0
- sourcepack-1.10.0a0.dist-info/METADATA +311 -0
- sourcepack-1.10.0a0.dist-info/RECORD +33 -0
- sourcepack-1.10.0a0.dist-info/WHEEL +5 -0
- sourcepack-1.10.0a0.dist-info/entry_points.txt +2 -0
- sourcepack-1.10.0a0.dist-info/licenses/LICENSE +21 -0
- sourcepack-1.10.0a0.dist-info/top_level.txt +1 -0
sourcepack/packet.py
ADDED
|
@@ -0,0 +1,837 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import fnmatch
|
|
4
|
+
import hashlib
|
|
5
|
+
import json
|
|
6
|
+
import os
|
|
7
|
+
import re
|
|
8
|
+
import shutil
|
|
9
|
+
import subprocess
|
|
10
|
+
import tomllib
|
|
11
|
+
from dataclasses import dataclass, asdict
|
|
12
|
+
from datetime import datetime, timezone
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import Iterable
|
|
15
|
+
from xml.sax.saxutils import escape as xml_escape
|
|
16
|
+
|
|
17
|
+
from .ecosystems.python import PY_IMPORT_ALIASES
|
|
18
|
+
|
|
19
|
+
# Prefer the version string exported by the package itself; the literal below
# is only used when that import fails for any reason.
try:
    from . import __version__
except Exception:
    __version__ = "1.10.0-alpha"
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# Directory names the scanner records as "ignored_directory" and never
# descends into.
DEFAULT_IGNORED_DIRS = {
    ".git", "node_modules", ".venv", "venv", "__pycache__", "dist", "build",
    ".next", ".cache", "target", "coverage", ".pytest_cache", ".sourcepack"
}
# fnmatch-style patterns; a file is ignored when either its name or its
# path relative to the scan root matches one of them.
DEFAULT_IGNORED_PATTERNS = {
    ".env", ".env.*", "*.pem", "*.key", "*.sqlite", "*.db", "*.png", "*.jpg",
    "*.jpeg", "*.gif", "*.webp", "*.pdf", "*.zip", "*.tar", "*.gz", "*.exe",
    "*.dll", "*.so", "*.dylib", "*.bin", "*.pyc"
}
# Lower-case suffixes accepted as text. The scanner lets suffix-less files
# through this filter; a file with any other suffix is ignored.
DEFAULT_TEXT_EXTENSIONS = {
    ".txt", ".md", ".py", ".js", ".ts", ".tsx", ".jsx", ".json", ".yaml", ".yml",
    ".html", ".css", ".csv", ".toml", ".ini", ".sql", ".sh", ".bat", ".ps1", ".rs",
    ".go", ".java", ".c", ".cpp", ".h", ".hpp", ".rb", ".php", ".xml"
}
# (label, compiled regex) pairs. redact_secrets() applies them in this order,
# each one operating on the output of the previous substitution.
SECRET_PATTERNS = [
    ("openai_key", re.compile(r"sk-proj-[A-Za-z0-9_\-]{12,}|sk-[A-Za-z0-9]{24,}")),
    ("aws_access_key", re.compile(r"AKIA[0-9A-Z]{16}")),
    ("private_key", re.compile(r"-----BEGIN [A-Z ]*PRIVATE KEY-----")),
    ("generic_api_key", re.compile(r"(?i)(api[_-]?key|secret|token)\s*[:=]\s*['\"]?[A-Za-z0-9_\-]{16,}")),
    ("github_token", re.compile(r"ghp_[A-Za-z0-9_]{20,}|github_pat_[A-Za-z0-9_]{20,}")),
    ("slack_token", re.compile(r"xox[baprs]-[A-Za-z0-9\-]{20,}")),
]
# NOTE(review): the two constants below are not referenced in the portion of
# the module visible here; presumably they feed the dependency and feature
# inventories defined further down -- confirm against those functions.
COMMON_DEPENDENCIES = ["fastapi", "flask", "django", "react", "vue", "svelte", "pytest", "typer", "click", "sqlalchemy", "prisma", "pydantic", "pyyaml", "pillow", "beautifulsoup4", "opencv-python", "scikit-learn", "python-dotenv", "pyjwt", "python-dateutil", "boto3", "requests"]
FEATURE_NAMES = ("pdf", "ocr", "web server", "react", "docker", "authentication", "database")
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def utc_now() -> str:
    """Return the current time in UTC as an ISO 8601 string."""
    moment = datetime.now(tz=timezone.utc)
    return moment.isoformat()
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def sha256_file(path: Path) -> str:
    """Return the hex SHA-256 digest of the file at *path*.

    The file is read in 1 MiB chunks so large files are never held in memory
    as a whole.
    """
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        while chunk := stream.read(1024 * 1024):
            digest.update(chunk)
    return digest.hexdigest()
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def sha256_text(text: str) -> str:
    """Return the hex SHA-256 digest of *text* encoded as UTF-8."""
    encoded = text.encode("utf-8")
    return hashlib.sha256(encoded).hexdigest()
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def estimate_tokens(text: str) -> int:
    """Estimate the token count of *text* as one token per four characters.

    The character count is divided by four and rounded up, so any non-empty
    text counts as at least one token.
    """
    whole, leftover = divmod(len(text), 4)
    if leftover:
        return whole + 1
    return whole
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def is_probably_binary(path: Path, sample_size: int = 4096) -> bool:
    """Guess whether the file at *path* holds binary data.

    Only the first *sample_size* bytes are read from disk, so the check costs
    the same for a 1 KiB file and a multi-gigabyte one.

    Args:
        path: File to inspect.
        sample_size: Number of leading bytes to examine.

    Returns:
        True when the file cannot be read, when the sample contains a NUL
        byte, or when more than 30% of the sample consists of control bytes
        other than tab, newline, vertical tab, form feed and carriage return.
        False for an empty file and for everything else.
    """
    try:
        with path.open("rb") as stream:
            data = stream.read(sample_size)
    except OSError:
        # An unreadable file is treated as binary so callers skip it.
        return True
    if not data:
        return False
    if b"\x00" in data:
        return True
    # Bytes 9-13 are ordinary whitespace controls; the rest below 32 are not
    # expected in text.
    nonprintable = sum(1 for byte in data if byte < 9 or 13 < byte < 32)
    return nonprintable / len(data) > 0.30
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def matches_any(name: str, patterns: Iterable[str]) -> bool:
    """Return True if *name* matches at least one fnmatch-style pattern."""
    for candidate in patterns:
        if fnmatch.fnmatch(name, candidate):
            return True
    return False
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def redact_secrets(text: str):
    """Replace every match of SECRET_PATTERNS in *text* with a marker.

    Patterns are applied one after another, each to the output of the
    previous substitution, so the recorded spans are offsets into the text as
    it stood just before that pattern ran.

    Returns:
        A ``(redacted_text, redactions)`` tuple. Each redaction is a dict
        with the keys ``pattern``, ``span_start`` and ``span_end``.
    """
    findings = []
    current = text
    for label, pattern in SECRET_PATTERNS:
        marker = f"[REDACTED:{label}]"
        findings.extend(
            {"pattern": label, "span_start": hit.start(), "span_end": hit.end()}
            for hit in pattern.finditer(current)
        )
        # A callable replacement keeps the marker literal (no backslash or
        # group-reference processing).
        current = pattern.sub(lambda _hit, marker=marker: marker, current)
    return current, findings
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
@dataclass
class IncludedFile:
    """Record for one source file that the scanner accepted into the packet."""

    # Path relative to the scan root, as produced by str() on the relative path.
    relative_path: str
    # Fully resolved path of the file on disk.
    absolute_path: str
    # Size reported by stat() for the on-disk file (before any redaction).
    size_bytes: int
    # The scanner stores the same value here as in packet_sha256.
    sha256: str
    # SHA-256 of the text exactly as read from disk.
    source_sha256: str
    # SHA-256 of the text as stored in the packet (after redaction, if enabled).
    packet_sha256: str
    # Token estimate computed from the stored content.
    estimated_tokens: int
    # Lower-cased file suffix, empty for suffix-less files.
    extension: str
    # Text stored in the packet (redacted when redaction is enabled).
    content: str
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
@dataclass
class IgnoredFile:
    """Record for a file or directory that the scanner skipped."""

    # Path relative to the scan root; directory entries carry a trailing "/".
    relative_path: str
    # Short machine-readable reason, e.g. "ignored_pattern" or "binary_detected".
    reason: str
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
class SourceScanner:
|
|
120
|
+
def __init__(self, input_path: str | Path, max_file_size: int = 1_000_000, include_hidden: bool = False, redact: bool = True):
|
|
121
|
+
self.input_path = Path(input_path).resolve()
|
|
122
|
+
self.max_file_size = max_file_size
|
|
123
|
+
self.include_hidden = include_hidden
|
|
124
|
+
self.redact = redact
|
|
125
|
+
self.included_files: list[IncludedFile] = []
|
|
126
|
+
self.ignored_files: list[IgnoredFile] = []
|
|
127
|
+
self.redactions: list[dict] = []
|
|
128
|
+
self.total_seen = 0
|
|
129
|
+
|
|
130
|
+
def ignore(self, path: Path, reason: str):
|
|
131
|
+
rel = str(path.relative_to(self.input_path)) if path.is_absolute() or self.input_path in path.parents else str(path)
|
|
132
|
+
self.ignored_files.append(IgnoredFile(rel, reason))
|
|
133
|
+
|
|
134
|
+
def scan(self):
|
|
135
|
+
if not self.input_path.exists():
|
|
136
|
+
raise FileNotFoundError(f"Input path does not exist: {self.input_path}")
|
|
137
|
+
if not self.input_path.is_dir():
|
|
138
|
+
raise NotADirectoryError(f"Input path is not a directory: {self.input_path}")
|
|
139
|
+
for root, dirs, files in os.walk(self.input_path, followlinks=False):
|
|
140
|
+
root_path = Path(root)
|
|
141
|
+
dirs[:] = sorted(dirs)
|
|
142
|
+
files = sorted(files)
|
|
143
|
+
kept_dirs = []
|
|
144
|
+
for d in dirs:
|
|
145
|
+
dpath = root_path / d
|
|
146
|
+
rel = dpath.relative_to(self.input_path)
|
|
147
|
+
if d in DEFAULT_IGNORED_DIRS:
|
|
148
|
+
self.ignored_files.append(IgnoredFile(str(rel) + "/", "ignored_directory"))
|
|
149
|
+
elif not self.include_hidden and d.startswith("."):
|
|
150
|
+
self.ignored_files.append(IgnoredFile(str(rel) + "/", "hidden_directory"))
|
|
151
|
+
elif dpath.is_symlink():
|
|
152
|
+
self.ignored_files.append(IgnoredFile(str(rel) + "/", "symlink_skipped"))
|
|
153
|
+
else:
|
|
154
|
+
kept_dirs.append(d)
|
|
155
|
+
dirs[:] = kept_dirs
|
|
156
|
+
for filename in files:
|
|
157
|
+
fp = root_path / filename
|
|
158
|
+
rel = fp.relative_to(self.input_path)
|
|
159
|
+
self.total_seen += 1
|
|
160
|
+
rel_str = str(rel)
|
|
161
|
+
if fp.is_symlink():
|
|
162
|
+
self.ignored_files.append(IgnoredFile(rel_str, "symlink_skipped")); continue
|
|
163
|
+
if not self.include_hidden and filename.startswith("."):
|
|
164
|
+
self.ignored_files.append(IgnoredFile(rel_str, "hidden_file")); continue
|
|
165
|
+
if matches_any(filename, DEFAULT_IGNORED_PATTERNS) or matches_any(rel_str, DEFAULT_IGNORED_PATTERNS):
|
|
166
|
+
self.ignored_files.append(IgnoredFile(rel_str, "ignored_pattern")); continue
|
|
167
|
+
try:
|
|
168
|
+
size = fp.stat().st_size
|
|
169
|
+
except OSError:
|
|
170
|
+
self.ignored_files.append(IgnoredFile(rel_str, "stat_error")); continue
|
|
171
|
+
if size > self.max_file_size:
|
|
172
|
+
self.ignored_files.append(IgnoredFile(rel_str, "max_file_size_exceeded")); continue
|
|
173
|
+
if fp.suffix and fp.suffix.lower() not in DEFAULT_TEXT_EXTENSIONS:
|
|
174
|
+
self.ignored_files.append(IgnoredFile(rel_str, "unsupported_extension")); continue
|
|
175
|
+
if is_probably_binary(fp):
|
|
176
|
+
self.ignored_files.append(IgnoredFile(rel_str, "binary_detected")); continue
|
|
177
|
+
try:
|
|
178
|
+
content = fp.read_text(encoding="utf-8")
|
|
179
|
+
except UnicodeDecodeError:
|
|
180
|
+
self.ignored_files.append(IgnoredFile(rel_str, "decode_error")); continue
|
|
181
|
+
except OSError:
|
|
182
|
+
self.ignored_files.append(IgnoredFile(rel_str, "read_error")); continue
|
|
183
|
+
source_sha256 = sha256_text(content)
|
|
184
|
+
if self.redact:
|
|
185
|
+
redacted, reds = redact_secrets(content)
|
|
186
|
+
for r in reds:
|
|
187
|
+
r["file"] = rel_str
|
|
188
|
+
self.redactions.extend(reds)
|
|
189
|
+
content = redacted
|
|
190
|
+
packet_sha256 = sha256_text(content)
|
|
191
|
+
self.included_files.append(IncludedFile(
|
|
192
|
+
relative_path=rel_str,
|
|
193
|
+
absolute_path=str(fp.resolve()),
|
|
194
|
+
size_bytes=size,
|
|
195
|
+
sha256=packet_sha256,
|
|
196
|
+
source_sha256=source_sha256,
|
|
197
|
+
packet_sha256=packet_sha256,
|
|
198
|
+
estimated_tokens=estimate_tokens(content),
|
|
199
|
+
extension=fp.suffix.lower(),
|
|
200
|
+
content=content,
|
|
201
|
+
))
|
|
202
|
+
self.included_files.sort(key=lambda x: x.relative_path)
|
|
203
|
+
self.ignored_files.sort(key=lambda x: x.relative_path)
|
|
204
|
+
return self
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
def _tracked_file_inventory(root: Path, included_records: list[dict]) -> dict:
    """Build the ``file_inventory.json`` payload for *root*.

    The file list comes from ``git ls-files -z`` when that command succeeds
    and reports at least one path; otherwise it falls back to the paths in
    *included_records*. Each entry notes whether the path is part of the
    prompt context and, for existing regular files, its SHA-256 and a
    text/binary guess.
    """
    included = {str(rec.get("relative_path", "")).replace("\\", "/") for rec in included_records}

    try:
        completed = subprocess.run(["git", "ls-files", "-z"], cwd=root, text=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except (OSError, ValueError):
        completed = None

    tracked: list[str] = []
    if completed is not None and completed.returncode == 0:
        # -z output is NUL-separated; surrogateescape keeps undecodable bytes.
        tracked = [chunk.decode("utf-8", "surrogateescape") for chunk in completed.stdout.split(b"\0") if chunk]

    if tracked:
        source = "git_ls_files"
        raw_paths = tracked
    else:
        source = "scanner_included_files"
        raw_paths = sorted(included)

    files: list[dict] = []
    for raw in raw_paths:
        rel = raw.replace("\\", "/")
        target = root / rel
        entry = {"relative_path": rel, "included_in_prompt_context": rel in included, "source": source}
        try:
            if not (target.exists() and target.is_file()):
                entry["file_type"] = "missing"
            else:
                entry["sha256"] = sha256_file(target)
                entry["file_type"] = "binary" if is_probably_binary(target) else "text"
        except OSError:
            entry["file_type"] = "unreadable"
        files.append(entry)

    return {"schema_version": "sourcepack.file_inventory.v1", "generated_at": utc_now(), "source": source, "files": files}
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
class PacketWriter:
    """Write the results of a ``SourceScanner`` to disk as a packet directory."""

    # Files covered by the hashes stored in receipt.json.
    OUTPUT_FILES = ["manifest.json", "context.md", "context.xml", "file_tree.txt", "ignored_files.txt", "token_report.json", "redactions.json", "reality_map.json", "ai_instructions.md", "file_inventory.json"]

    def __init__(self, out: str | Path, scanner: SourceScanner, force: bool = False):
        """Store the output directory, the completed scanner and the force flag."""
        self.out = Path(out)
        self.scanner = scanner
        self.force = force

    def prepare_out(self):
        """Ensure the output directory exists and is empty.

        Raises:
            FileExistsError: The directory is non-empty and ``force`` is False.
        """
        if self.out.exists() and any(self.out.iterdir()):
            if not self.force:
                raise FileExistsError(f"Output directory is non-empty: {self.out}")
            # With force, every existing child of the output directory is removed.
            for child in self.out.iterdir():
                if child.is_dir():
                    shutil.rmtree(child)
                else:
                    child.unlink()
        self.out.mkdir(parents=True, exist_ok=True)

    def _write_text(self, name: str, text: str) -> None:
        """Write *text* to the packet file *name* as UTF-8."""
        (self.out / name).write_text(text, encoding="utf-8")

    def _write_json(self, name: str, payload) -> None:
        """Write *payload* to the packet file *name* as indented JSON."""
        self._write_text(name, json.dumps(payload, indent=2))

    def write_all(self):
        """Write every packet file plus ``receipt.json``.

        Returns:
            The output directory path.
        """
        self.prepare_out()
        scanned = self.scanner.included_files
        skipped = self.scanner.ignored_files

        # Manifest records carry the metadata only; the content goes into
        # context.md and context.xml.
        included_records = []
        for f in scanned:
            rec = asdict(f)
            rec.pop("content")
            included_records.append(rec)
        ignored_records = [asdict(f) for f in skipped]
        total_tokens = sum(f.estimated_tokens for f in scanned)
        total_bytes = sum(f.size_bytes for f in scanned)
        manifest = {
            "input_path": str(self.scanner.input_path),
            "generated_at": utc_now(),
            "tool_version": __version__,
            "total_files_seen": self.scanner.total_seen,
            "total_files_included": len(included_records),
            "total_files_ignored": len(ignored_records),
            "total_bytes_included": total_bytes,
            "total_estimated_tokens": total_tokens,
            "included_files": included_records,
            "ignored_files": ignored_records,
        }
        self._write_json("manifest.json", manifest)
        self._write_json("file_inventory.json", _tracked_file_inventory(self.scanner.input_path, included_records))

        md_parts = ["# SourcePack Context Packet", "", "## Source Manifest Summary", "", f"Input path: {manifest['input_path']}", f"Generated at: {manifest['generated_at']}", f"Files included: {len(included_records)}", f"Estimated tokens: {total_tokens}", ""]
        for f in scanned:
            md_parts.extend([
                f"## File: {f.relative_path}", "", "Metadata:", f"- sha256: {f.sha256}", f"- bytes: {f.size_bytes}", f"- estimated_tokens: {f.estimated_tokens}", "", "Content:", "", f.content, "", "---", ""
            ])
        self._write_text("context.md", "\n".join(md_parts))

        xml_parts = ["<sourcepack>", " <files>"]
        for f in scanned:
            # xml_escape() leaves double quotes alone by default, so the
            # quote must be mapped explicitly for a double-quoted attribute.
            path_attr = xml_escape(f.relative_path, {'"': "&quot;"})
            xml_parts.append(f' <file path="{path_attr}" sha256="{f.sha256}" bytes="{f.size_bytes}" estimated_tokens="{f.estimated_tokens}">')
            xml_parts.append(" <content>")
            xml_parts.append(xml_escape(f.content))
            xml_parts.append(" </content>")
            xml_parts.append(" </file>")
        xml_parts.extend([" </files>", "</sourcepack>"])
        self._write_text("context.xml", "\n".join(xml_parts))

        tree_lines = [f"[INC] {f.relative_path}" for f in scanned]
        tree_lines.extend(f"[IGN] {f.relative_path} - {f.reason}" for f in skipped)
        self._write_text("file_tree.txt", "\n".join(sorted(tree_lines)) + "\n")
        self._write_text("ignored_files.txt", "\n".join(f"{f.relative_path}\t{f.reason}" for f in skipped) + "\n")

        token_report = {
            "total_estimated_tokens": total_tokens,
            # Every listed threshold that the packet exceeds.
            "warnings": [limit for limit in [32_000, 128_000, 200_000, 1_000_000] if total_tokens > limit],
            "per_file": [{"relative_path": f.relative_path, "estimated_tokens": f.estimated_tokens} for f in scanned],
        }
        self._write_json("token_report.json", token_report)
        self._write_json("redactions.json", {"redactions": self.scanner.redactions})

        # The reality map is generated after context.md has been written to
        # the output directory, which is passed to it as the packet path.
        reality_map = generate_reality_map(manifest, self.out)
        self._write_json("reality_map.json", reality_map)
        self._write_text("ai_instructions.md", render_ai_instructions(reality_map))

        hashes = {name: sha256_file(self.out / name) for name in self.OUTPUT_FILES if (self.out / name).exists()}
        receipt = {"generated_at": utc_now(), "tool_version": __version__, "hashes": hashes}
        self._write_json("receipt.json", receipt)
        return self.out
|
|
317
|
+
|
|
318
|
+
|
|
319
|
+
|
|
320
|
+
def _included_paths(manifest: dict) -> set[str]:
|
|
321
|
+
return {rec.get("relative_path", "").replace("\\", "/") for rec in manifest.get("included_files", [])}
|
|
322
|
+
|
|
323
|
+
|
|
324
|
+
def _package_json_scripts(packet: Path) -> dict[str, str]:
    """Return the ``scripts`` table of the first ``package.json`` in the packet.

    Only the first file named ``package.json`` (case-insensitive) found in
    the packet contents is consulted. An empty dict is returned when there is
    no such file, when it is not valid JSON, when its top-level value is not
    an object, or when ``scripts`` is missing or not an object.
    """
    for rel, content in _packet_file_contents(packet).items():
        if Path(rel).name.lower() != "package.json":
            continue
        try:
            package = json.loads(content)
        except json.JSONDecodeError:
            return {}
        # Valid JSON need not be an object; a list or scalar has no .get().
        if not isinstance(package, dict):
            return {}
        scripts = package.get("scripts")
        return scripts if isinstance(scripts, dict) else {}
    return {}
|
|
335
|
+
|
|
336
|
+
|
|
337
|
+
def _is_poetry_project(packet: Path) -> bool:
    """Return True if any ``pyproject.toml`` in the packet has a ``[tool.poetry]`` table header."""
    return any(
        Path(rel).name.lower() == "pyproject.toml"
        and re.search(r"(?m)^\s*\[tool\.poetry\]\s*$", content) is not None
        for rel, content in _packet_file_contents(packet).items()
    )
|
|
342
|
+
|
|
343
|
+
|
|
344
|
+
def _uses_unittest(packet: Path) -> bool:
    """Return True if any ``.py`` file in the packet imports ``unittest``."""
    return any(
        Path(rel).suffix.lower() == ".py"
        and re.search(r"(?m)^\s*(import\s+unittest|from\s+unittest\s+import\s+)", content) is not None
        for rel, content in _packet_file_contents(packet).items()
    )
|
|
349
|
+
|
|
350
|
+
|
|
351
|
+
def generate_reality_map(manifest: dict, packet: Path) -> dict:
    """Derive the structural "reality map" for a packet.

    The map lists what can be stated from file names, detected dependencies
    and features, and ``package.json`` scripts: project types, package
    managers, frameworks, entry points, supported commands, plus the fixed
    lists of unknowns, claim boundaries and AI constraints.

    Args:
        manifest: Packet manifest; ``included_files``, ``ignored_files`` and
            ``input_path`` are read from it.
        packet: Packet directory, passed on to the helpers that read the
            packet contents.

    Returns:
        A JSON-serialisable dict.
    """
    files = _included_paths(manifest)
    lower_files = {path.lower() for path in files}
    basenames = {Path(path).name.lower() for path in files}
    deps = dependency_inventory(manifest, packet)
    features = feature_inventory(manifest, packet, deps)
    scripts = _package_json_scripts(packet)

    # Every list in the result is emitted as sorted(set(...)), so sets are
    # enough while collecting.
    project_types: set[str] = set()
    package_managers: set[str] = set()
    supported_commands: set[str] = set()
    test_commands: set[str] = set()
    build_commands: set[str] = set()
    run_commands: set[str] = set()

    if "pyproject.toml" in lower_files:
        project_types.add("python")
    has_requirements = any(
        Path(path).name.lower().startswith("requirements") and path.endswith(".txt")
        for path in lower_files
    )
    if has_requirements:
        project_types.add("python")
        package_managers.add("pip")
    if _is_poetry_project(packet):
        package_managers.add("poetry")

    if "package.json" in lower_files:
        project_types.add("node")
        package_managers.add("npm")
        for script in sorted(scripts):
            command = "npm test" if script == "test" else f"npm run {script}"
            supported_commands.add(command)
            if script == "test":
                test_commands.add(command)
            elif script in {"build", "compile"}:
                build_commands.add(command)
            elif script in {"start", "dev", "serve"}:
                run_commands.add(command)

    if "dockerfile" in basenames:
        supported_commands.add("docker build")
        build_commands.add("docker build")
    if basenames & {"docker-compose.yml", "compose.yaml", "compose.yml"}:
        supported_commands.add("docker compose up")
        run_commands.add("docker compose up")

    has_tests_dir = any(path == "tests" or path.startswith("tests/") for path in lower_files)
    if "pytest" in deps or has_tests_dir:
        supported_commands.add("pytest")
        test_commands.add("pytest")
    if _uses_unittest(packet):
        supported_commands.add("python -m unittest")
        test_commands.add("python -m unittest")

    framework_map = {"fastapi": "FastAPI", "flask": "Flask", "django": "Django", "react": "React"}
    frameworks = {
        label
        for dep, label in framework_map.items()
        if dep in deps or (dep == "react" and "react" in features)
    }

    ignored = manifest.get("ignored_files", [])
    ignored_reasons: dict = {}
    for record in ignored:
        reason = record.get("reason", "unknown")
        ignored_reasons.setdefault(reason, 0)
        ignored_reasons[reason] += 1
    included_count = len(manifest.get("included_files", []))

    safe_claims = [
        f"This packet includes {included_count} source files.",
        f"SourcePack scanned input path: {manifest.get('input_path', '')}.",
    ]
    for name in ["pyproject.toml", "package.json", "Dockerfile"]:
        verb = "contains" if name.lower() in basenames else "does not include"
        safe_claims.append(f"The project {verb} {name}.")
    if "react" not in deps and "react" not in features:
        safe_claims.append("No React dependency was detected.")
    if "pdf" not in features:
        safe_claims.append("No PDF parsing capability was detected.")
    if ignored:
        safe_claims.append("The packet includes ignored file records for safety or relevance reasons.")

    claim_boundaries = [
        "SourcePack did not execute the application.",
        "SourcePack did not prove semantic correctness.",
        "SourcePack did not verify external services.",
        "SourcePack did not prove security.",
        "SourcePack did not prove production readiness.",
        "Absence of evidence means unknown, not impossible.",
        "Unsupported claims should be treated as ungrounded.",
    ]
    unknowns = [
        "Runtime behavior was not executed.",
        "Semantic correctness was not proven.",
        "External services were not verified.",
        "Capabilities not present in structural evidence must be treated as unknown.",
        "Missing files must not be invented.",
    ]
    ai_constraints = [
        "Use only the packet and reality map as project evidence.",
        "Do not invent files, commands, dependencies, frameworks, services, or capabilities.",
        "If a required file is missing, say it is missing.",
        "If a command is unsupported by detected evidence, say it is unsupported.",
        "If a capability is not in supported_capabilities, treat it as unknown or unsupported.",
        "Cite file paths when making project-specific claims.",
        "Do not claim SourcePack proves semantic truth.",
        "Ask for missing files rather than hallucinating them.",
    ]
    entry_point_names = {"main.py", "app.py", "server.py", "cli.py"}

    return {
        "reality_map_schema_version": "1.0",
        "tool_version": __version__,
        "generated_at": utc_now(),
        "input_path": manifest.get("input_path", ""),
        "project_types": sorted(project_types),
        "package_managers": sorted(package_managers),
        "frameworks": sorted(frameworks),
        "entry_points": sorted(path for path in files if Path(path).name in entry_point_names),
        "test_commands": sorted(test_commands),
        "build_commands": sorted(build_commands),
        "run_commands": sorted(run_commands),
        "supported_commands": sorted(supported_commands),
        "detected_dependencies": sorted(deps),
        "supported_capabilities": sorted(features),
        # Only the first 25 ignored records are embedded; totals cover all.
        "excluded_files_summary": {"total": len(ignored), "reasons": ignored_reasons, "records": ignored[:25]},
        "included_file_count": included_count,
        "confirmed_files": sorted(files),
        "ignored_file_count": len(ignored),
        "safe_claims": safe_claims,
        "unknowns": unknowns,
        "claim_boundaries": claim_boundaries,
        "ai_constraints": ai_constraints,
    }
|
|
463
|
+
|
|
464
|
+
|
|
465
|
+
def render_ai_instructions(reality_map: dict) -> str:
    """Render ``ai_instructions.md`` from a reality map.

    The document consists of a fixed rule list followed by the supported
    commands, supported capabilities, up to 200 confirmed files, the answer
    contract and the claim boundaries taken from *reality_map*. Empty command
    or capability lists are rendered as "- None detected".
    """
    rules = [
        "# AI Instructions for This SourcePack Packet", "",
        "Use only the packet and `reality_map.json` as project evidence.",
        "Do not invent files, commands, dependencies, frameworks, services, or capabilities.",
        "If a required file is missing, say it is missing and ask for it rather than hallucinating it.",
        "If a command is unsupported by detected evidence, say it is unsupported.",
        "If a capability is not listed in `supported_capabilities`, treat it as unknown or unsupported.",
        "If you introduce a new external dependency, modify the appropriate dependency manifest in the same patch and list it under Dependency Changes.",
        "Only recommend commands listed under Supported Commands unless your patch also adds the project file that defines the new command.",
        "Before referencing a file as existing, it must appear in Confirmed Files; label intentional creations as NEW FILE.",
        "If required evidence is missing, say UNKNOWN and ask for the missing file/output instead of guessing.",
        "Cite file paths when making project-specific claims.",
        "Do not claim SourcePack proves semantic truth, security, production readiness, or external service behavior.", "",
    ]
    commands = [f"- `{cmd}`" for cmd in reality_map.get("supported_commands", [])]
    capabilities = [f"- {cap}" for cap in reality_map.get("supported_capabilities", [])]
    confirmed = [f"- `{path}`" for path in reality_map.get("confirmed_files", [])[:200]]
    boundaries = [f"- {boundary}" for boundary in reality_map.get("claim_boundaries", [])]
    contract = ["- Files to modify", "- New files", "- Dependency changes", "- Commands to run", "- Assumptions/unknowns", "- Patch or code"]

    document = [
        *rules,
        "## Supported Commands", "",
        *(commands or ["- None detected"]),
        "", "## Supported Capabilities", "",
        *(capabilities or ["- None detected"]),
        "", "## Confirmed Files", "",
        *confirmed,
        "", "## Required Answer Contract", "",
        *contract,
        "", "## Claim Boundaries", "",
        *boundaries,
    ]
    return "\n".join(document) + "\n"
|
|
491
|
+
|
|
492
|
+
def load_manifest(packet: Path) -> dict:
    """Read and parse ``manifest.json`` from the packet directory."""
    manifest_path = packet / "manifest.json"
    with manifest_path.open(encoding="utf-8") as handle:
        return json.load(handle)
|
|
494
|
+
|
|
495
|
+
|
|
496
|
+
def verify_packet(packet_path: str | Path, against: str | Path | None = None) -> bool:
|
|
497
|
+
packet = Path(packet_path)
|
|
498
|
+
ok = True
|
|
499
|
+
receipt_path = packet / "receipt.json"
|
|
500
|
+
if not receipt_path.exists():
|
|
501
|
+
print("FAIL receipt.json missing")
|
|
502
|
+
return False
|
|
503
|
+
receipt = json.loads(receipt_path.read_text(encoding="utf-8"))
|
|
504
|
+
for name, expected in receipt.get("hashes", {}).items():
|
|
505
|
+
path = packet / name
|
|
506
|
+
if not path.exists():
|
|
507
|
+
print(f"FAIL {name} missing")
|
|
508
|
+
ok = False
|
|
509
|
+
continue
|
|
510
|
+
actual = sha256_file(path)
|
|
511
|
+
if actual == expected:
|
|
512
|
+
print(f"PASS {name}")
|
|
513
|
+
else:
|
|
514
|
+
print(f"FAIL {name} hash mismatch")
|
|
515
|
+
ok = False
|
|
516
|
+
if against:
|
|
517
|
+
manifest = load_manifest(packet)
|
|
518
|
+
source = Path(against).resolve()
|
|
519
|
+
included = {rec["relative_path"]: rec for rec in manifest.get("included_files", [])}
|
|
520
|
+
for rel, rec in included.items():
|
|
521
|
+
source_file = source / rel
|
|
522
|
+
if not source_file.exists():
|
|
523
|
+
print(f"FAIL source missing {rel}")
|
|
524
|
+
ok = False
|
|
525
|
+
elif is_probably_binary(source_file):
|
|
526
|
+
print(f"WARN source now binary {rel}")
|
|
527
|
+
else:
|
|
528
|
+
try:
|
|
529
|
+
content = source_file.read_text(encoding="utf-8")
|
|
530
|
+
except Exception:
|
|
531
|
+
print(f"FAIL source unreadable {rel}")
|
|
532
|
+
ok = False
|
|
533
|
+
continue
|
|
534
|
+
expected_source_hash = rec.get("source_sha256")
|
|
535
|
+
expected_source_hash = rec.get("source_sha256")
|
|
536
|
+
if expected_source_hash is None:
|
|
537
|
+
expected_source_hash = rec.get("sha256")
|
|
538
|
+
redacted, _ = redact_secrets(content)
|
|
539
|
+
content_hash = sha256_text(redacted)
|
|
540
|
+
else:
|
|
541
|
+
content_hash = sha256_text(content)
|
|
542
|
+
if content_hash != expected_source_hash:
|
|
543
|
+
print(f"FAIL source changed {rel}")
|
|
544
|
+
ok = False
|
|
545
|
+
current_files = []
|
|
546
|
+
for root, dirs, files in os.walk(source, followlinks=False):
|
|
547
|
+
dirs[:] = [d for d in sorted(dirs) if d not in DEFAULT_IGNORED_DIRS and not d.startswith(".")]
|
|
548
|
+
for filename in sorted(files):
|
|
549
|
+
fp = Path(root) / filename
|
|
550
|
+
if filename.startswith(".") or fp.suffix.lower() not in DEFAULT_TEXT_EXTENSIONS:
|
|
551
|
+
continue
|
|
552
|
+
rel = str(fp.relative_to(source))
|
|
553
|
+
if rel not in included:
|
|
554
|
+
current_files.append(rel)
|
|
555
|
+
for rel in current_files:
|
|
556
|
+
print(f"WARN new source file not in packet {rel}")
|
|
557
|
+
print("OVERALL", "PASS" if ok else "FAIL")
|
|
558
|
+
return ok
|
|
559
|
+
|
|
560
|
+
|
|
561
|
+
# Suffixes a candidate must have for _looks_like_ai_file_ref() to treat it as
# a file reference (well-known file names such as Dockerfile are accepted
# separately).
PATHLIKE_EXTENSIONS = {".py", ".js", ".jsx", ".ts", ".tsx", ".json", ".toml", ".yaml", ".yml", ".md", ".txt", ".cfg", ".ini", ".css", ".html", ".rs", ".go", ".java", ".rb", ".php", ".sh"}
# First path components checked by _looks_like_ai_file_ref() for references
# that contain no "/".
PROJECT_PATH_PREFIXES = {"src", "sourcepack", "tests", "test", "frontend", "backend", "docs", "app", "lib", "packages", "public", "config", "scripts"}
|
|
563
|
+
|
|
564
|
+
|
|
565
|
+
def _normalize_ai_ref(ref: str) -> str | None:
|
|
566
|
+
ref = ref.strip().strip("`'\".,;)")
|
|
567
|
+
ref = ref.replace("\\", "/")
|
|
568
|
+
if ref.endswith(":"):
|
|
569
|
+
ref = ref[:-1]
|
|
570
|
+
while ref.startswith("./"):
|
|
571
|
+
ref = ref[2:]
|
|
572
|
+
if not ref or ref.startswith("/") or re.match(r"^[A-Za-z]:/", ref):
|
|
573
|
+
return None
|
|
574
|
+
normalized, unsafe = _normalize_diff_path(ref)
|
|
575
|
+
if unsafe or not normalized:
|
|
576
|
+
return None
|
|
577
|
+
return normalized
|
|
578
|
+
|
|
579
|
+
|
|
580
|
+
def _looks_like_ai_file_ref(ref: str) -> bool:
|
|
581
|
+
normalized = ref.replace("\\", "/")
|
|
582
|
+
name = PurePosixPath(normalized).name
|
|
583
|
+
if name in {"Dockerfile", "docker-compose.yml", "compose.yaml", "compose.yml", "pyproject.toml", "package.json", "requirements.txt"}:
|
|
584
|
+
return True
|
|
585
|
+
suffix = PurePosixPath(normalized).suffix.lower()
|
|
586
|
+
if suffix not in PATHLIKE_EXTENSIONS:
|
|
587
|
+
return False
|
|
588
|
+
parts = [p for p in PurePosixPath(normalized).parts if p not in {"."}]
|
|
589
|
+
return "/" in normalized or (parts and parts[0] in PROJECT_PATH_PREFIXES)
|
|
590
|
+
|
|
591
|
+
|
|
592
|
+
def extract_refs(text: str) -> set[str]:
    """Collect normalized file references mentioned in *text*.

    Candidates are tokens that appear quoted/backticked, as a ``-``/``*``
    bullet item, after a verb such as "edit" or "open", or that start with a
    known project directory. Each candidate is kept only if
    ``_normalize_ai_ref`` accepts it and ``_looks_like_ai_file_ref`` agrees.
    Matching is case-insensitive.
    """
    token = r"(?:\./)?[A-Za-z0-9_.-]+(?:[\\/][A-Za-z0-9_.-]+)*\.[A-Za-z0-9_.-]+:?|Dockerfile"
    candidate_patterns = (
        # Token wrapped in quotes or backticks.
        rf"[`'\"]({token})[`'\"]",
        # Token as the first word of a markdown bullet.
        rf"(?m)^\s*[-*]\s+({token})\b",
        # Token following an instruction word.
        rf"\b(?:edit|open|update|modify|change|in|file)\s+({token})\b",
        # Bare path rooted in a conventional project directory.
        rf"\b((?:\./)?(?:src|sourcepack|tests|test|frontend|backend|docs|app|lib|packages|public|config|scripts)[\\/][A-Za-z0-9_./\\-]+\.[A-Za-z0-9_.-]+:?)\b",
    )

    found: set[str] = set()
    for candidate_pattern in candidate_patterns:
        for match in re.finditer(candidate_pattern, text, re.I):
            cleaned = _normalize_ai_ref(match.group(1))
            if cleaned and _looks_like_ai_file_ref(cleaned):
                found.add(cleaned)
    return found
|
|
602
|
+
|
|
603
|
+
|
|
604
|
+
def _packet_file_contents(packet: Path) -> dict[str, str]:
|
|
605
|
+
context_path = packet / "context.md"
|
|
606
|
+
if not context_path.exists():
|
|
607
|
+
return {}
|
|
608
|
+
text = context_path.read_text(encoding="utf-8", errors="ignore")
|
|
609
|
+
contents: dict[str, str] = {}
|
|
610
|
+
current: str | None = None
|
|
611
|
+
body: list[str] = []
|
|
612
|
+
in_content = False
|
|
613
|
+
for line in text.splitlines():
|
|
614
|
+
if line.startswith("## File: "):
|
|
615
|
+
if current is not None:
|
|
616
|
+
contents[current] = "\n".join(body).rstrip("\n")
|
|
617
|
+
current = line.removeprefix("## File: ").strip()
|
|
618
|
+
body = []
|
|
619
|
+
in_content = False
|
|
620
|
+
elif current is not None and line == "Content:":
|
|
621
|
+
in_content = True
|
|
622
|
+
body = []
|
|
623
|
+
elif current is not None and in_content and line == "---":
|
|
624
|
+
contents[current] = "\n".join(body).rstrip("\n")
|
|
625
|
+
current = None
|
|
626
|
+
body = []
|
|
627
|
+
in_content = False
|
|
628
|
+
elif current is not None and in_content:
|
|
629
|
+
body.append(line)
|
|
630
|
+
if current is not None:
|
|
631
|
+
contents[current] = "\n".join(body).rstrip("\n")
|
|
632
|
+
return contents
|
|
633
|
+
|
|
634
|
+
|
|
635
|
+
def _normalize_dependency_name(name: str) -> str:
|
|
636
|
+
return name.strip().lower().replace("_", "-")
|
|
637
|
+
|
|
638
|
+
|
|
639
|
+
def _dependency_name_for_import(name: str) -> str:
    """Map *name* to its entry in ``PY_IMPORT_ALIASES``, if one exists.

    The lookup key is the normalized form of *name*; when no alias is
    registered the normalized name itself is returned.
    """
    canonical = _normalize_dependency_name(name)
    resolved = PY_IMPORT_ALIASES.get(canonical, canonical)
    return resolved
|
|
642
|
+
|
|
643
|
+
|
|
644
|
+
def _js_package_root(imported: str) -> str:
|
|
645
|
+
imported = imported.strip().lower()
|
|
646
|
+
parts = imported.split("/")
|
|
647
|
+
if imported.startswith("@") and len(parts) >= 2 and parts[0] != "@":
|
|
648
|
+
return "/".join(parts[:2])
|
|
649
|
+
if imported.startswith("@/"):
|
|
650
|
+
return imported
|
|
651
|
+
return parts[0]
|
|
652
|
+
|
|
653
|
+
|
|
654
|
+
def _python_dependency_names_from_requirement_lines(text: str) -> set[str]:
    """Extract normalized package names from ``requirements.txt``-style text.

    Everything after ``#`` on a line is discarded, and blank lines and option
    lines (those starting with ``-``, e.g. ``-r other.txt`` or
    ``--index-url``) are skipped. The name is whatever precedes the first
    version operator, environment marker, extras bracket, ``@`` direct
    reference, parenthesis, or whitespace.

    Returns:
        The set of names as produced by ``_normalize_dependency_name``.
    """
    deps: set[str] = set()
    for line in text.splitlines():
        cleaned = line.split("#", 1)[0].strip()
        if not cleaned or cleaned.startswith("-"):
            continue
        # "@", "(" and whitespace are included so that "pkg @ https://..."
        # and "pkg (>=1.0)" yield "pkg" rather than the whole line.
        name = re.split(r"[<>=!~;\[@(\s]", cleaned, maxsplit=1)[0]
        if name:
            # A line such as "==1.0" has no name; do not record "".
            deps.add(_normalize_dependency_name(name))
    return deps
|
|
661
|
+
|
|
662
|
+
|
|
663
|
+
def _python_dependency_names_from_pyproject(content: str) -> set[str]:
|
|
664
|
+
try:
|
|
665
|
+
data = tomllib.loads(content)
|
|
666
|
+
except tomllib.TOMLDecodeError:
|
|
667
|
+
return set()
|
|
668
|
+
deps: set[str] = set()
|
|
669
|
+
|
|
670
|
+
def add_requirement(req: object) -> None:
|
|
671
|
+
if isinstance(req, str):
|
|
672
|
+
name = re.split(r"[<>=!~;\[]", req.strip(), maxsplit=1)[0]
|
|
673
|
+
if name:
|
|
674
|
+
deps.add(_normalize_dependency_name(name))
|
|
675
|
+
|
|
676
|
+
project = data.get("project", {})
|
|
677
|
+
if isinstance(project, dict):
|
|
678
|
+
for req in project.get("dependencies", []) if isinstance(project.get("dependencies"), list) else []:
|
|
679
|
+
add_requirement(req)
|
|
680
|
+
optional = project.get("optional-dependencies", {})
|
|
681
|
+
if isinstance(optional, dict):
|
|
682
|
+
for group in optional.values():
|
|
683
|
+
if isinstance(group, list):
|
|
684
|
+
for req in group:
|
|
685
|
+
add_requirement(req)
|
|
686
|
+
|
|
687
|
+
tool = data.get("tool", {})
|
|
688
|
+
if isinstance(tool, dict):
|
|
689
|
+
poetry = tool.get("poetry", {})
|
|
690
|
+
if isinstance(poetry, dict):
|
|
691
|
+
for section_name in ("dependencies", "dev-dependencies"):
|
|
692
|
+
section = poetry.get(section_name, {})
|
|
693
|
+
if isinstance(section, dict):
|
|
694
|
+
for dep in section:
|
|
695
|
+
if dep.lower() != "python":
|
|
696
|
+
deps.add(_normalize_dependency_name(dep))
|
|
697
|
+
group = poetry.get("group", {})
|
|
698
|
+
if isinstance(group, dict):
|
|
699
|
+
for group_data in group.values():
|
|
700
|
+
if isinstance(group_data, dict):
|
|
701
|
+
section = group_data.get("dependencies", {})
|
|
702
|
+
if isinstance(section, dict):
|
|
703
|
+
deps.update(_normalize_dependency_name(dep) for dep in section)
|
|
704
|
+
for tool_name in ("pdm", "uv"):
|
|
705
|
+
tool_data = tool.get(tool_name, {})
|
|
706
|
+
if isinstance(tool_data, dict):
|
|
707
|
+
for key in ("dev-dependencies", "dependency-groups"):
|
|
708
|
+
groups = tool_data.get(key, {})
|
|
709
|
+
if isinstance(groups, dict):
|
|
710
|
+
for group in groups.values():
|
|
711
|
+
if isinstance(group, list):
|
|
712
|
+
for req in group:
|
|
713
|
+
add_requirement(req)
|
|
714
|
+
dependency_groups = data.get("dependency-groups", {})
|
|
715
|
+
if isinstance(dependency_groups, dict):
|
|
716
|
+
for group in dependency_groups.values():
|
|
717
|
+
if isinstance(group, list):
|
|
718
|
+
for req in group:
|
|
719
|
+
add_requirement(req)
|
|
720
|
+
return deps
|
|
721
|
+
|
|
722
|
+
|
|
723
|
+
def _add_common_dependency(deps: set[str], name: str):
    """Add to *deps* every ``COMMON_DEPENDENCIES`` entry matching *name*.

    Names are compared after normalization on both sides; the lower-cased
    ``COMMON_DEPENDENCIES`` spelling is what gets stored. *deps* is mutated
    in place and names with no match are ignored.
    """
    wanted = _normalize_dependency_name(name)
    deps.update(
        known.lower()
        for known in COMMON_DEPENDENCIES
        if _normalize_dependency_name(known) == wanted
    )
|
|
728
|
+
|
|
729
|
+
|
|
730
|
+
def dependency_inventory(manifest: dict, packet: Path) -> set[str]:
    """Return the known dependencies that the packet's files declare or import.

    Each entry of ``manifest["included_files"]`` is looked up by its
    ``relative_path`` in the packet's parsed ``context.md`` and inspected
    according to its file name:

    * ``pyproject.toml`` and ``requirements*.txt``: declared requirements
    * ``package.json``: the four npm dependency sections
    * ``*.py``: the first module of each ``import`` / ``from`` line
    * ``*.js`` / ``*.jsx`` / ``*.ts`` / ``*.tsx``: ``from "..."``,
      ``import("...")`` and ``require("...")`` specifiers

    Only names matching ``COMMON_DEPENDENCIES`` are recorded (see
    ``_add_common_dependency``). Files missing from ``context.md`` are
    treated as empty.
    """
    deps: set[str] = set()
    contents = _packet_file_contents(packet)
    for rec in manifest.get("included_files", []):
        rel = rec.get("relative_path", "")
        content = contents.get(rel, "")
        name = Path(rel).name.lower()
        suffix = Path(rel).suffix.lower()
        if name == "pyproject.toml":
            for dep in _python_dependency_names_from_pyproject(content):
                _add_common_dependency(deps, dep)
        elif name.startswith("requirements") and name.endswith(".txt"):
            for dep in _python_dependency_names_from_requirement_lines(content):
                _add_common_dependency(deps, dep)
        elif name == "package.json":
            try:
                package = json.loads(content)
            except json.JSONDecodeError:
                package = {}
            # Valid JSON that is not an object (e.g. ``[]`` or ``null``) has
            # no ``.get``; treat it like an unparsable manifest.
            if not isinstance(package, dict):
                package = {}
            for section in ("dependencies", "devDependencies", "peerDependencies", "optionalDependencies"):
                section_deps = package.get(section)
                if isinstance(section_deps, dict):
                    for dep_name in section_deps:
                        _add_common_dependency(deps, dep_name)
        elif suffix == ".py":
            for imported in re.findall(r"(?m)^\s*(?:import|from)\s+([A-Za-z_][A-Za-z0-9_]*)", content):
                _add_common_dependency(deps, imported)
        elif suffix in {".js", ".jsx", ".ts", ".tsx"}:
            # Capture one optional "/segment" so scoped packages reach
            # _js_package_root as "@scope/pkg" instead of just "@scope".
            for imported in re.findall(r"""(?:from\s+["']|import\s*\(\s*["']|require\s*\(\s*["'])(@?[A-Za-z0-9_.-]+(?:/[A-Za-z0-9_.-]+)?)""", content):
                _add_common_dependency(deps, _js_package_root(imported))
    return deps
|
|
761
|
+
|
|
762
|
+
|
|
763
|
+
def _has_import(content: str, *modules: str) -> bool:
|
|
764
|
+
module_pattern = "|".join(re.escape(module) for module in modules)
|
|
765
|
+
return bool(re.search(rf"(?m)^\s*(?:import|from)\s+({module_pattern})(?:\b|[._])", content))
|
|
766
|
+
|
|
767
|
+
|
|
768
|
+
PDF_DEPENDENCIES = {"pypdf", "pdfplumber", "fitz", "pymupdf"}
|
|
769
|
+
|
|
770
|
+
|
|
771
|
+
def _declares_pdf_dependency(rel: str, content: str) -> bool:
|
|
772
|
+
name = Path(rel).name.lower()
|
|
773
|
+
if name == "pyproject.toml":
|
|
774
|
+
return any(dep in PDF_DEPENDENCIES for dep in _python_dependency_names_from_pyproject(content))
|
|
775
|
+
if name.startswith("requirements") and name.endswith(".txt"):
|
|
776
|
+
return any(dep in PDF_DEPENDENCIES for dep in _python_dependency_names_from_requirement_lines(content))
|
|
777
|
+
return False
|
|
778
|
+
|
|
779
|
+
|
|
780
|
+
def feature_inventory(manifest: dict, packet: Path, deps: set[str] | None = None) -> set[str]:
    """Infer coarse feature labels for the project captured in a packet.

    Labels come from three kinds of evidence: paths listed in
    ``manifest["included_files"]``, the dependency inventory, and the file
    bodies parsed from the packet's ``context.md``. Possible labels are
    ``"docker"``, ``"pdf"``, ``"react"``, ``"web server"``, ``"database"``,
    ``"authentication"`` and ``"ocr"``.

    Args:
        manifest: Packet manifest; only ``included_files[*].relative_path``
            is read.
        packet: Packet directory containing ``context.md``.
        deps: Pre-computed dependency inventory. When ``None`` it is computed
            with ``dependency_inventory(manifest, packet)``.

    Returns:
        The set of detected feature labels.
    """
    if deps is None:
        deps = dependency_inventory(manifest, packet)
    contents = _packet_file_contents(packet)
    files = {rec.get("relative_path", "").replace("\\", "/") for rec in manifest.get("included_files", [])}
    lower_files = {rel.lower() for rel in files}
    features: set[str] = set()

    # Signals from file paths, declared dependencies, and manifests.
    if any(Path(rel).name.lower() in {"dockerfile", "docker-compose.yml", "compose.yaml", "compose.yml"} for rel in files):
        features.add("docker")
    if any(rel.endswith(("/pdf_parser.py", "pdf_parser.py")) for rel in lower_files):
        features.add("pdf")
    if any(_declares_pdf_dependency(rel, content) for rel, content in contents.items()):
        features.add("pdf")
    if "react" in deps or any(rel in {"frontend/app.tsx", "frontend/app.jsx"} for rel in lower_files):
        features.add("react")
    if deps & {"fastapi", "flask", "django"} or any(Path(rel).name.lower() in {"server.py", "app.py"} for rel in files):
        features.add("web server")
    # Wrapping rel in slashes lets a leading or trailing "migrations"
    # directory match the "/migrations/" probe as well.
    if deps & {"sqlalchemy", "prisma"} or any("/migrations/" in f"/{rel}/" or Path(rel).name.lower() in {"schema.prisma", "schema.sql"} for rel in files):
        features.add("database")
    if any(part == "auth" or part.startswith("auth_") for rel in lower_files for part in Path(rel).parts):
        features.add("authentication")

    # Signals from file bodies.
    for rel, content in contents.items():
        suffix = Path(rel).suffix.lower()
        if suffix == ".py":
            # "pymupdf" is checked alongside "fitz" so the import check agrees
            # with PDF_DEPENDENCIES, which lists both names.
            if _has_import(content, "pypdf", "pdfplumber", "fitz", "pymupdf"):
                features.add("pdf")
            if _has_import(content, "fastapi", "flask", "django") or re.search(r"(?m)^\s*@\w+\.(?:route|get|post|put|patch|delete)\(", content):
                features.add("web server")
            if _has_import(content, "sqlalchemy", "prisma") or re.search(r"(?i)\b(sqlite|postgres(?:ql)?|mysql)://", content):
                features.add("database")
            if _has_import(content, "jwt", "oauthlib", "authlib") or re.search(r"(?i)@\w+\.(?:route|get|post)\([^)]*login", content):
                features.add("authentication")
            if _has_import(content, "pytesseract", "easyocr"):
                features.add("ocr")
        elif suffix in {".js", ".jsx", ".ts", ".tsx"}:
            if re.search(r"""(?:from\s+["']react["']|require\s*\(\s*["']react["']|import\s+React\b)""", content):
                features.add("react")
            if re.search(r"(?i)\b(jwt|oauth|session|login)\b", content):
                features.add("authentication")
        elif Path(rel).name.lower() == "package.json":
            if re.search(r'"react"\s*:', content):
                features.add("react")
    return features
|
|
825
|
+
|
|
826
|
+
|
|
827
|
+
|
|
828
|
+
def scanner_config_hash() -> str:
    """Return a digest of the default scanner settings.

    The settings are serialized as JSON with sorted keys and passed to
    ``sha256_text``. The module's default ignore/extension collections are
    sorted first so that their iteration order does not affect the result.
    """
    settings = dict(
        ignored_dirs=sorted(DEFAULT_IGNORED_DIRS),
        ignored_patterns=sorted(DEFAULT_IGNORED_PATTERNS),
        text_extensions=sorted(DEFAULT_TEXT_EXTENSIONS),
        max_file_size=1_000_000,
        include_hidden=False,
        redact=True,
    )
    serialized = json.dumps(settings, sort_keys=True)
    return sha256_text(serialized)
|