ghostcode 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ghostcode/__init__.py +3 -0
- ghostcode/audit/__init__.py +0 -0
- ghostcode/audit/logger.py +149 -0
- ghostcode/cli.py +986 -0
- ghostcode/config.py +187 -0
- ghostcode/mapping/__init__.py +0 -0
- ghostcode/mapping/encryption.py +143 -0
- ghostcode/mapping/ghost_map.py +222 -0
- ghostcode/mapping/token_generator.py +78 -0
- ghostcode/parsers/__init__.py +0 -0
- ghostcode/parsers/base.py +66 -0
- ghostcode/parsers/cpp_parser.py +341 -0
- ghostcode/parsers/python_parser.py +397 -0
- ghostcode/reveal/__init__.py +0 -0
- ghostcode/reveal/code_revealer.py +374 -0
- ghostcode/reveal/diff_analyzer.py +426 -0
- ghostcode/reveal/explanation_translator.py +214 -0
- ghostcode/risk_report.py +467 -0
- ghostcode/transformers/__init__.py +0 -0
- ghostcode/transformers/comment_anonymizer.py +95 -0
- ghostcode/transformers/comment_stripper.py +60 -0
- ghostcode/transformers/isolator.py +312 -0
- ghostcode/transformers/literal_scrubber.py +452 -0
- ghostcode/transformers/multi_file.py +99 -0
- ghostcode/transformers/symbol_renamer.py +64 -0
- ghostcode/utils/__init__.py +0 -0
- ghostcode/utils/clipboard.py +52 -0
- ghostcode/utils/stdlib_registry.py +221 -0
- ghostcode-0.5.0.dist-info/METADATA +92 -0
- ghostcode-0.5.0.dist-info/RECORD +33 -0
- ghostcode-0.5.0.dist-info/WHEEL +5 -0
- ghostcode-0.5.0.dist-info/entry_points.txt +2 -0
- ghostcode-0.5.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,374 @@
|
|
|
1
|
+
"""Code revealer — restores ghost tokens to original names.
|
|
2
|
+
|
|
3
|
+
Handles both pure code files and AI responses containing mixed content
|
|
4
|
+
(prose + code blocks). When processing AI responses, separates zones
|
|
5
|
+
and applies different restoration strategies to each.
|
|
6
|
+
|
|
7
|
+
Zone types:
|
|
8
|
+
CODE_BLOCK — fenced code blocks (```lang ... ```)
|
|
9
|
+
INLINE_CODE — backtick spans in prose (`gv_001`)
|
|
10
|
+
PROSE — natural language text
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
import re
|
|
14
|
+
from dataclasses import dataclass, field
|
|
15
|
+
from enum import Enum
|
|
16
|
+
|
|
17
|
+
from ..mapping.ghost_map import GhostMap
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class ZoneType(Enum):
    """Kinds of segment an AI response is split into by CodeRevealer."""

    PROSE = "prose"              # natural-language text
    CODE_BLOCK = "code_block"    # fenced ``` ... ``` block
    INLINE_CODE = "inline_code"  # single-backtick span inside prose
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclass
class Zone:
    """A segment of an AI response with a specific type."""

    type: ZoneType  # what kind of segment this is
    content: str    # segment text (code blocks/inline spans exclude fences/backticks)
    start: int      # character offset of the segment start in the full response
    end: int        # character offset one past the segment end (exclusive)
    language: str = ""  # for code blocks
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@dataclass
class RevealResult:
    """Result of revealing an AI response."""

    restored_code: str         # revealed fenced code blocks, joined by blank lines
    restored_explanation: str  # full response (prose + code) with tokens restored
    symbols_restored: int      # distinct tokens/spans restored across all zones
    new_symbols: list[str] = field(default_factory=list)       # ghost-style tokens not in the map
    new_dependencies: list[str] = field(default_factory=list)  # import/#include lines seen in code blocks
    annotations: list[dict] = field(default_factory=list)      # reserved; not populated in this module
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
# Ghost token pattern — matches any ghost token format:
# "g" + kind letter (one of v/f/t/c/s/n/m/x) + "_" + exactly three digits,
# e.g. gv_001, gf_042. Word boundaries keep it from matching inside a
# longer identifier such as my_gv_001x.
GHOST_TOKEN_PATTERN = re.compile(r"\bg[vftcsnmx]_\d{3}\b")

# Pattern for token with common suffixes in prose, e.g. "gv_001's",
# "gv_001-related", "gf_001/gf_002". The lookahead keeps the suffix out
# of the match. NOTE(review): not referenced anywhere in this module —
# _reveal_prose builds an equivalent per-token pattern instead; possibly
# used by other modules, so left in place.
GHOST_TOKEN_PROSE_PATTERN = re.compile(
    r"\bg[vftcsnmx]_\d{3}(?='s|[-/]|(?:\b))"
)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
class CodeRevealer:
|
|
57
|
+
"""Restores ghost tokens to original names in code and AI responses."""
|
|
58
|
+
|
|
59
|
+
def __init__(self, ghost_map: GhostMap):
|
|
60
|
+
self._map = ghost_map
|
|
61
|
+
self._forward = ghost_map.forward_map()
|
|
62
|
+
# Sort by token length descending to avoid substring collisions
|
|
63
|
+
self._sorted_tokens = sorted(
|
|
64
|
+
self._forward.keys(), key=len, reverse=True
|
|
65
|
+
)
|
|
66
|
+
|
|
67
|
+
def reveal_code(self, ghost_source: str,
|
|
68
|
+
original_ghost: str | None = None,
|
|
69
|
+
diff_result=None) -> tuple[str, int, list[str]]:
|
|
70
|
+
"""Reveal a pure code file (no prose).
|
|
71
|
+
|
|
72
|
+
Args:
|
|
73
|
+
ghost_source: The AI-modified ghost code to reveal.
|
|
74
|
+
original_ghost: The original ghost file sent to AI (optional).
|
|
75
|
+
If provided, changed blocks are annotated with descriptive
|
|
76
|
+
comments like '# --- AI MODIFIED: desc ---'.
|
|
77
|
+
diff_result: Optional DiffResult from DiffAnalyzer (unused, reserved).
|
|
78
|
+
|
|
79
|
+
Returns:
|
|
80
|
+
Tuple of (restored_source, count_restored, new_symbols).
|
|
81
|
+
"""
|
|
82
|
+
restored = ghost_source
|
|
83
|
+
count = 0
|
|
84
|
+
|
|
85
|
+
for token in self._sorted_tokens:
|
|
86
|
+
original = self._forward[token]
|
|
87
|
+
if token in restored:
|
|
88
|
+
restored = restored.replace(token, original)
|
|
89
|
+
count += 1
|
|
90
|
+
|
|
91
|
+
# Restore anonymized comments from map metadata
|
|
92
|
+
# Tokens appear wrapped in comment syntax: # [gc_001], """[gc_001]""", // [gc_001]
|
|
93
|
+
# We try each wrapper pattern, then fall back to bare token
|
|
94
|
+
original_comments = self._map._metadata.get("original_comments", {})
|
|
95
|
+
for comment_token, original_text in original_comments.items():
|
|
96
|
+
replaced = False
|
|
97
|
+
# Try wrapped patterns first (most specific to least)
|
|
98
|
+
for pattern in [
|
|
99
|
+
f'"""{comment_token}"""',
|
|
100
|
+
f"'''{comment_token}'''",
|
|
101
|
+
f"/* {comment_token} */",
|
|
102
|
+
f"// {comment_token}",
|
|
103
|
+
f"# {comment_token}",
|
|
104
|
+
]:
|
|
105
|
+
if pattern in restored:
|
|
106
|
+
restored = restored.replace(pattern, original_text)
|
|
107
|
+
count += 1
|
|
108
|
+
replaced = True
|
|
109
|
+
break
|
|
110
|
+
# Fall back to bare token
|
|
111
|
+
if not replaced and comment_token in restored:
|
|
112
|
+
restored = restored.replace(comment_token, original_text)
|
|
113
|
+
count += 1
|
|
114
|
+
|
|
115
|
+
new_symbols = self._detect_new_symbols(restored)
|
|
116
|
+
for sym in new_symbols:
|
|
117
|
+
restored = restored.replace(sym, f"NEW_{sym}")
|
|
118
|
+
|
|
119
|
+
# Annotate new/changed lines if original ghost file is provided
|
|
120
|
+
if original_ghost is not None:
|
|
121
|
+
restored = self._annotate_new_lines(restored, original_ghost)
|
|
122
|
+
|
|
123
|
+
return restored, count, new_symbols
|
|
124
|
+
|
|
125
|
+
def _reveal_original(self, original_ghost: str) -> str:
|
|
126
|
+
"""Reveal the original ghost file for fair comparison."""
|
|
127
|
+
original_revealed = original_ghost
|
|
128
|
+
for token in self._sorted_tokens:
|
|
129
|
+
original_name = self._forward[token]
|
|
130
|
+
if token in original_revealed:
|
|
131
|
+
original_revealed = original_revealed.replace(token, original_name)
|
|
132
|
+
|
|
133
|
+
# Restore comments in original too (try wrapped patterns first)
|
|
134
|
+
original_comments = self._map._metadata.get("original_comments", {})
|
|
135
|
+
for comment_token, original_text in original_comments.items():
|
|
136
|
+
for pattern in [
|
|
137
|
+
f'"""{comment_token}"""',
|
|
138
|
+
f"'''{comment_token}'''",
|
|
139
|
+
f"/* {comment_token} */",
|
|
140
|
+
f"// {comment_token}",
|
|
141
|
+
f"# {comment_token}",
|
|
142
|
+
comment_token,
|
|
143
|
+
]:
|
|
144
|
+
if pattern in original_revealed:
|
|
145
|
+
original_revealed = original_revealed.replace(pattern, original_text)
|
|
146
|
+
break
|
|
147
|
+
|
|
148
|
+
return original_revealed
|
|
149
|
+
|
|
150
|
+
def _detect_comment_style(self, code: str) -> str:
|
|
151
|
+
"""Auto-detect comment prefix from surrounding code."""
|
|
152
|
+
for line in code.splitlines():
|
|
153
|
+
stripped = line.strip()
|
|
154
|
+
if stripped.startswith("//"):
|
|
155
|
+
return "//"
|
|
156
|
+
if stripped.startswith("#") and not stripped.startswith("#include"):
|
|
157
|
+
return "#"
|
|
158
|
+
# Fallback based on common patterns
|
|
159
|
+
if "def " in code or "import " in code:
|
|
160
|
+
return "#"
|
|
161
|
+
if "#include" in code or "int main" in code or "::" in code:
|
|
162
|
+
return "//"
|
|
163
|
+
return "#"
|
|
164
|
+
|
|
165
|
+
def _annotate_new_lines(self, revealed: str, original_ghost: str) -> str:
|
|
166
|
+
"""Annotate changed blocks with descriptive AI-change comments.
|
|
167
|
+
|
|
168
|
+
Compares the revealed code against the original ghost file
|
|
169
|
+
(after revealing it too) and inserts block-level annotations like:
|
|
170
|
+
# --- AI MODIFIED: changed '+' to '-' ---
|
|
171
|
+
# --- AI ADDED: null-safety check ---
|
|
172
|
+
"""
|
|
173
|
+
from .diff_analyzer import DiffAnalyzer
|
|
174
|
+
|
|
175
|
+
original_revealed = self._reveal_original(original_ghost)
|
|
176
|
+
|
|
177
|
+
analyzer = DiffAnalyzer()
|
|
178
|
+
blocks = analyzer.detect_change_blocks(original_revealed, revealed)
|
|
179
|
+
|
|
180
|
+
if not blocks:
|
|
181
|
+
return revealed
|
|
182
|
+
|
|
183
|
+
comment_prefix = self._detect_comment_style(revealed)
|
|
184
|
+
lines = revealed.splitlines()
|
|
185
|
+
|
|
186
|
+
# Insert annotations backwards to preserve line numbers
|
|
187
|
+
for block in reversed(blocks):
|
|
188
|
+
description = analyzer.describe_change(block)
|
|
189
|
+
|
|
190
|
+
if block.block_type == "added":
|
|
191
|
+
label = "AI ADDED"
|
|
192
|
+
elif block.block_type == "deleted":
|
|
193
|
+
label = "AI REMOVED"
|
|
194
|
+
else:
|
|
195
|
+
label = "AI MODIFIED"
|
|
196
|
+
|
|
197
|
+
annotation = f"{comment_prefix} --- {label}: {description} ---"
|
|
198
|
+
|
|
199
|
+
# Insert annotation above the block's start line
|
|
200
|
+
insert_at = block.start_line
|
|
201
|
+
if insert_at <= len(lines):
|
|
202
|
+
# Match indentation of the first line of the block
|
|
203
|
+
if insert_at < len(lines) and lines[insert_at].strip():
|
|
204
|
+
indent = len(lines[insert_at]) - len(lines[insert_at].lstrip())
|
|
205
|
+
annotation = " " * indent + annotation
|
|
206
|
+
lines.insert(insert_at, annotation)
|
|
207
|
+
|
|
208
|
+
return "\n".join(lines)
|
|
209
|
+
|
|
210
|
+
def reveal_ai_response(self, response: str) -> RevealResult:
|
|
211
|
+
"""Reveal a full AI response (prose + code blocks).
|
|
212
|
+
|
|
213
|
+
Parses the response into zones, applies zone-specific restoration,
|
|
214
|
+
and produces both restored code and translated explanation.
|
|
215
|
+
"""
|
|
216
|
+
zones = self._parse_zones(response)
|
|
217
|
+
restored_parts = []
|
|
218
|
+
code_blocks = []
|
|
219
|
+
symbols_restored = 0
|
|
220
|
+
|
|
221
|
+
for zone in zones:
|
|
222
|
+
if zone.type == ZoneType.CODE_BLOCK:
|
|
223
|
+
revealed, count, new_syms = self.reveal_code(zone.content)
|
|
224
|
+
code_blocks.append(revealed)
|
|
225
|
+
symbols_restored += count
|
|
226
|
+
# Reconstruct fenced block
|
|
227
|
+
lang = zone.language
|
|
228
|
+
restored_parts.append(f"```{lang}\n{revealed}\n```")
|
|
229
|
+
|
|
230
|
+
elif zone.type == ZoneType.INLINE_CODE:
|
|
231
|
+
revealed = self._reveal_inline(zone.content)
|
|
232
|
+
if revealed != zone.content:
|
|
233
|
+
symbols_restored += 1
|
|
234
|
+
restored_parts.append(f"`{revealed}`")
|
|
235
|
+
|
|
236
|
+
else:
|
|
237
|
+
# Prose — apply token replacement with word boundaries
|
|
238
|
+
revealed = self._reveal_prose(zone.content)
|
|
239
|
+
restored_parts.append(revealed)
|
|
240
|
+
|
|
241
|
+
restored_full = "".join(restored_parts)
|
|
242
|
+
|
|
243
|
+
# Extract just the code blocks for the code output
|
|
244
|
+
restored_code = "\n\n".join(code_blocks) if code_blocks else ""
|
|
245
|
+
|
|
246
|
+
# Detect new symbols across all code blocks
|
|
247
|
+
all_new = []
|
|
248
|
+
for block in code_blocks:
|
|
249
|
+
all_new.extend(self._detect_new_symbols(block))
|
|
250
|
+
all_new = list(set(all_new))
|
|
251
|
+
|
|
252
|
+
# Detect new dependencies
|
|
253
|
+
new_deps = self._detect_new_dependencies(code_blocks)
|
|
254
|
+
|
|
255
|
+
result = RevealResult(
|
|
256
|
+
restored_code=restored_code,
|
|
257
|
+
restored_explanation=restored_full,
|
|
258
|
+
symbols_restored=symbols_restored,
|
|
259
|
+
new_symbols=all_new,
|
|
260
|
+
new_dependencies=new_deps,
|
|
261
|
+
)
|
|
262
|
+
|
|
263
|
+
return result
|
|
264
|
+
|
|
265
|
+
def _parse_zones(self, text: str) -> list[Zone]:
|
|
266
|
+
"""Parse AI response into typed zones."""
|
|
267
|
+
zones = []
|
|
268
|
+
pos = 0
|
|
269
|
+
|
|
270
|
+
# Pattern for fenced code blocks
|
|
271
|
+
code_block_pattern = re.compile(
|
|
272
|
+
r"```(\w*)\n(.*?)```", re.DOTALL
|
|
273
|
+
)
|
|
274
|
+
|
|
275
|
+
for match in code_block_pattern.finditer(text):
|
|
276
|
+
# Add prose before this code block
|
|
277
|
+
if match.start() > pos:
|
|
278
|
+
prose = text[pos:match.start()]
|
|
279
|
+
# Split prose further into inline code and text
|
|
280
|
+
zones.extend(self._parse_inline_zones(prose, pos))
|
|
281
|
+
|
|
282
|
+
# Add the code block
|
|
283
|
+
zones.append(Zone(
|
|
284
|
+
type=ZoneType.CODE_BLOCK,
|
|
285
|
+
content=match.group(2),
|
|
286
|
+
start=match.start(),
|
|
287
|
+
end=match.end(),
|
|
288
|
+
language=match.group(1),
|
|
289
|
+
))
|
|
290
|
+
pos = match.end()
|
|
291
|
+
|
|
292
|
+
# Add remaining prose after last code block
|
|
293
|
+
if pos < len(text):
|
|
294
|
+
zones.extend(self._parse_inline_zones(text[pos:], pos))
|
|
295
|
+
|
|
296
|
+
return zones
|
|
297
|
+
|
|
298
|
+
def _parse_inline_zones(self, text: str, base_offset: int) -> list[Zone]:
|
|
299
|
+
"""Split prose text into PROSE and INLINE_CODE zones."""
|
|
300
|
+
zones = []
|
|
301
|
+
pos = 0
|
|
302
|
+
|
|
303
|
+
for match in re.finditer(r"`([^`]+)`", text):
|
|
304
|
+
# Prose before inline code
|
|
305
|
+
if match.start() > pos:
|
|
306
|
+
zones.append(Zone(
|
|
307
|
+
type=ZoneType.PROSE,
|
|
308
|
+
content=text[pos:match.start()],
|
|
309
|
+
start=base_offset + pos,
|
|
310
|
+
end=base_offset + match.start(),
|
|
311
|
+
))
|
|
312
|
+
|
|
313
|
+
# Inline code
|
|
314
|
+
zones.append(Zone(
|
|
315
|
+
type=ZoneType.INLINE_CODE,
|
|
316
|
+
content=match.group(1),
|
|
317
|
+
start=base_offset + match.start(),
|
|
318
|
+
end=base_offset + match.end(),
|
|
319
|
+
))
|
|
320
|
+
pos = match.end()
|
|
321
|
+
|
|
322
|
+
# Remaining prose
|
|
323
|
+
if pos < len(text):
|
|
324
|
+
zones.append(Zone(
|
|
325
|
+
type=ZoneType.PROSE,
|
|
326
|
+
content=text[pos:],
|
|
327
|
+
start=base_offset + pos,
|
|
328
|
+
end=base_offset + len(text),
|
|
329
|
+
))
|
|
330
|
+
|
|
331
|
+
return zones
|
|
332
|
+
|
|
333
|
+
def _reveal_inline(self, code_span: str) -> str:
|
|
334
|
+
"""Reveal tokens in an inline code span."""
|
|
335
|
+
result = code_span
|
|
336
|
+
for token in self._sorted_tokens:
|
|
337
|
+
if token in result:
|
|
338
|
+
result = result.replace(token, self._forward[token])
|
|
339
|
+
return result
|
|
340
|
+
|
|
341
|
+
def _reveal_prose(self, text: str) -> str:
|
|
342
|
+
"""Reveal tokens in prose with word-boundary matching.
|
|
343
|
+
|
|
344
|
+
Handles common prose patterns:
|
|
345
|
+
gv_001-related → connectionPool-related
|
|
346
|
+
gv_001's value → connectionPool's value
|
|
347
|
+
gf_001/gf_002 → update_matrix/validate_input
|
|
348
|
+
"""
|
|
349
|
+
result = text
|
|
350
|
+
for token in self._sorted_tokens:
|
|
351
|
+
original = self._forward[token]
|
|
352
|
+
# Word boundary match that allows common suffixes
|
|
353
|
+
pattern = re.compile(
|
|
354
|
+
r"\b" + re.escape(token) + r"(?='s|[-/.,;:!?\s\)]|$)"
|
|
355
|
+
)
|
|
356
|
+
result = pattern.sub(original, result)
|
|
357
|
+
return result
|
|
358
|
+
|
|
359
|
+
def _detect_new_symbols(self, code: str) -> list[str]:
|
|
360
|
+
"""Find ghost-pattern tokens not in our map."""
|
|
361
|
+
found = set(GHOST_TOKEN_PATTERN.findall(code))
|
|
362
|
+
known = self._map.all_tokens()
|
|
363
|
+
# Also exclude tokens we already replaced (they shouldn't be here)
|
|
364
|
+
return sorted(found - known)
|
|
365
|
+
|
|
366
|
+
def _detect_new_dependencies(self, code_blocks: list[str]) -> list[str]:
|
|
367
|
+
"""Detect new #include or import statements the AI introduced."""
|
|
368
|
+
deps = []
|
|
369
|
+
for block in code_blocks:
|
|
370
|
+
for line in block.split("\n"):
|
|
371
|
+
stripped = line.strip()
|
|
372
|
+
if stripped.startswith("#include") or stripped.startswith("import ") or stripped.startswith("from "):
|
|
373
|
+
deps.append(stripped)
|
|
374
|
+
return deps
|