@ivannikov-pro/ai-context-surgeon 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +22 -0
- package/README.md +372 -0
- package/bin/catalog.js +153 -0
- package/bin/cli.js +380 -0
- package/bin/installer.js +135 -0
- package/bin/prompts.js +371 -0
- package/checklists/phase-1-analysis.md +58 -0
- package/checklists/phase-2-planning.md +67 -0
- package/checklists/phase-3-restructuring.md +77 -0
- package/checklists/phase-4-documentation.md +111 -0
- package/checklists/phase-5-validation.md +91 -0
- package/examples/before-after/README.md +139 -0
- package/examples/ideal-monorepo/README.md +127 -0
- package/knowledge/agent-context-system/artifacts/guide.md +183 -0
- package/knowledge/agent-context-system/artifacts/knowledge.md +177 -0
- package/knowledge/agent-context-system/artifacts/skills.md +101 -0
- package/knowledge/agent-context-system/artifacts/workflows.md +143 -0
- package/knowledge/agent-context-system/metadata.json +26 -0
- package/knowledge/agent-context-system/timestamps.json +5 -0
- package/knowledge/agent-vulnerabilities/LICENSE +21 -0
- package/knowledge/agent-vulnerabilities/artifacts/stealth_injection.md +110 -0
- package/knowledge/agent-vulnerabilities/artifacts/vulnerabilities.md +232 -0
- package/knowledge/agent-vulnerabilities/metadata.json +14 -0
- package/knowledge/agent-vulnerabilities/timestamps.json +5 -0
- package/knowledge/power-words-dictionary/LICENSE +21 -0
- package/knowledge/power-words-dictionary/artifacts/dictionary.md +231 -0
- package/knowledge/power-words-dictionary/artifacts/prompt_amplifier.py +381 -0
- package/knowledge/power-words-dictionary/metadata.json +14 -0
- package/knowledge/power-words-dictionary/timestamps.json +5 -0
- package/package.json +77 -0
- package/roles/README.md +81 -0
- package/roles/architect.md +203 -0
- package/roles/inspector.md +232 -0
- package/roles/librarian.md +176 -0
- package/roles/scout.md +169 -0
- package/roles/surgeon.md +172 -0
- package/roles/tuner.md +204 -0
- package/skills/annotate-jsdoc/SKILL.md +262 -0
- package/skills/prompt-engineering/LICENSE +21 -0
- package/skills/prompt-engineering/SKILL.md +235 -0
- package/skills/prompt-engineering/scripts/extract_instructions.py +416 -0
- package/skills/prompt-engineering/scripts/prompt_amplifier.py +381 -0
- package/skills/prompt-engineering/scripts/prompt_diff_tracker.py +281 -0
- package/skills/prompt-engineering/scripts/prompt_dna_analyzer.py +692 -0
- package/skills/prompt-engineering/scripts/templates/code_review.md +47 -0
- package/skills/prompt-engineering/scripts/templates/dump_extraction.md +59 -0
- package/skills/prompt-engineering/scripts/templates/multi_agent_orchestration.md +100 -0
- package/skills/prompt-engineering/scripts/templates/prompt_audit.md +106 -0
- package/skills/prompt-engineering/scripts/templates/stealth_injection.md +110 -0
- package/skills/prompt-engineering/scripts/templates/task_automation.md +87 -0
- package/skills/prompt-engineering/workflows/amplify.md +36 -0
- package/skills/prompt-engineering/workflows/audit.md +55 -0
- package/skills/prompt-engineering/workflows/context-dump.md +90 -0
- package/skills/prompt-engineering/workflows/diff.md +44 -0
- package/strategy/bash-guide.md +134 -0
- package/strategy/context-exclusion.md +220 -0
- package/strategy/context-weight-theory.md +49 -0
- package/strategy/file-navigation-header.md +562 -0
- package/strategy/jsdoc-guide.md +596 -0
- package/strategy/monorepo_strategy.md +726 -0
- package/strategy/package-json-guide.md +541 -0
- package/templates/AGENTS.md.template +148 -0
- package/templates/antigravityignore.template +64 -0
- package/templates/cursorrules.template +7 -0
- package/templates/knowledge-item.template +44 -0
- package/templates/package-json-ideal.template +26 -0
- package/templates/package-readme.template +45 -0
- package/templates/publish-meta.template +34 -0
- package/templates/skill.template +50 -0
- package/templates/workflow.template +33 -0
- package/tools/analyze-package-json.sh +213 -0
- package/tools/analyze-structure.sh +101 -0
- package/tools/audit-jsdoc.sh +176 -0
- package/tools/check-fnh-freshness.sh +74 -0
- package/tools/detect-circular-deps.sh +147 -0
- package/tools/detect-god-files.sh +71 -0
- package/tools/enforce-god-files.sh +112 -0
- package/tools/enrich-package-json.js +311 -0
- package/tools/generate-jsdoc-headers.sh +109 -0
- package/tools/generate-source-map.sh +71 -0
- package/tools/lint-imports.sh +123 -0
- package/tools/measure-context-cost.sh +206 -0
- package/tools/scan-fnh.sh +174 -0
- package/tools/shared/config.sh +53 -0
- package/tools/validate-context-hygiene.sh +52 -0
- package/tools/validate-context-weight.sh +100 -0
- package/tools/validate-naming.sh +98 -0
|
@@ -0,0 +1,416 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
extract_instructions.py — Extracts ALL instructions, directives, rules, and their
|
|
4
|
+
variants from system_prompt_logs.txt across multiple dump sessions.
|
|
5
|
+
|
|
6
|
+
Outputs a structured report to /tmp/extracted_instructions_report.md
|
|
7
|
+
|
|
8
|
+
Sections extracted:
|
|
9
|
+
1. Dump timestamps & model info
|
|
10
|
+
2. All XML/HTML tags found (identity, planning_mode, etc.)
|
|
11
|
+
3. CRITICAL INSTRUCTION 1 & 2 variants
|
|
12
|
+
4. MANDATORY RULE variants
|
|
13
|
+
5. CRITICAL REMINDER variants
|
|
14
|
+
6. EPHEMERAL_MESSAGE content variants
|
|
15
|
+
7. Tool schemas (names only, with counts)
|
|
16
|
+
8. Section-by-section diff across dumps
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
import re
|
|
20
|
+
import os
|
|
21
|
+
import hashlib
|
|
22
|
+
from collections import defaultdict, OrderedDict
|
|
23
|
+
from datetime import datetime
|
|
24
|
+
|
|
25
|
+
INPUT_FILE = '/tmp/system_prompt_logs.txt'
|
|
26
|
+
OUTPUT_FILE = '/tmp/extracted_instructions_report.md'
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def read_file(path):
    """Return the entire text of the file at *path*.

    The file is decoded as UTF-8; undecodable bytes are substituted with the
    Unicode replacement character instead of raising.
    """
    with open(path, 'r', encoding='utf-8', errors='replace') as source:
        text = source.read()
    return text
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def split_dumps(content):
    """Break the log text into dump sessions delimited by TIMESTAMP banners.

    Returns a list of ``{'timestamp': ..., 'content': ...}`` dicts in file
    order. Non-blank text that precedes the first banner is reported as a
    session with a placeholder timestamp.
    """
    # re.split with one capture group yields
    # [preamble, stamp1, body1, stamp2, body2, ...].
    parts = re.split(r'={10,}\nTIMESTAMP:\s*(.+?)(?:\n={10,})', content)

    sessions = []
    preamble = parts[0].strip()
    if preamble:
        sessions.append({
            'timestamp': 'INITIAL (no timestamp header)',
            'content': preamble,
        })

    # Odd positions hold timestamps, the following even positions their bodies.
    sessions.extend(
        {'timestamp': stamp.strip(), 'content': body.strip()}
        for stamp, body in zip(parts[1::2], parts[2::2])
    )
    return sessions
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def extract_xml_tags(content):
    """Return the sorted, de-duplicated names of opening tags found in *content*.

    Names that match a common HTML element (compared case-insensitively) are
    dropped; every other name is kept with its original casing.
    """
    html_names = frozenset((
        'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'br', 'hr', 'div',
        'span', 'ul', 'ol', 'li', 'a', 'img', 'code', 'pre', 'em',
        'strong', 'b', 'i', 'table', 'tr', 'td', 'th', 'thead', 'tbody',
    ))
    # Only opening tags are matched: a closing tag starts with "</", which the
    # leading \w of the name pattern rejects.
    kept = {
        name
        for name in re.findall(r'<(\w[\w_-]*)(?:\s[^>]*)?>', content)
        if name.lower() not in html_names
    }
    return sorted(kept)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def extract_tag_content(content, tag_name):
    """Extract the text between each ``<tag_name ...>`` and ``</tag_name>`` pair.

    Args:
        content: Text to search.
        tag_name: Exact tag name; it is escaped, so it is matched literally.

    Returns:
        A list with the whitespace-stripped inner text of every match, in
        order of appearance (empty when the tag does not occur).
    """
    name = re.escape(tag_name)
    pattern = re.compile(
        # The negative lookahead stops the name from matching as a mere prefix
        # of a longer tag: without it, "function" would open on "<functions>"
        # and "planning_mode" on "<planning_mode_artifacts>".
        r'<' + name + r'(?![\w-])[^>]*>(.*?)</' + name + r'>',
        re.DOTALL,
    )
    return [inner.strip() for inner in pattern.findall(content)]
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def extract_critical_instructions(content):
    """Collect the text that follows each CRITICAL INSTRUCTION 1 / 2 marker.

    Returns a dict keyed by the marker label; a label is present only when at
    least one match was found. Each captured text is stripped and cut to 500
    characters.
    """
    specs = (
        # Instruction 1 runs up to the start of instruction 2 (or end of text).
        ('CRITICAL INSTRUCTION 1',
         r'CRITICAL INSTRUCTION 1:\s*(.*?)(?=CRITICAL INSTRUCTION 2:|$)'),
        # Instruction 2 runs up to the next closing tag (or end of text).
        ('CRITICAL INSTRUCTION 2',
         r'CRITICAL INSTRUCTION 2:\s*(.*?)(?=<\/|$)'),
    )

    found = {}
    for label, regex in specs:
        hits = re.findall(regex, content, re.DOTALL)
        if hits:
            found[label] = [hit.strip()[:500] for hit in hits]
    return found
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def extract_mandatory_rules(content):
    """Return the text following every ``MANDATORY RULE:`` marker.

    A rule ends at the first blank line, at a line that starts with ``<``, or
    at the end of the text. Each rule is stripped and cut to 300 characters.
    """
    rule_re = re.compile(r'MANDATORY RULE:\s*(.*?)(?:\n\n|\n<|$)', re.DOTALL)
    return [hit.group(1).strip()[:300] for hit in rule_re.finditer(content)]
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def extract_critical_reminders(content):
    """Return the text following every ``CRITICAL REMINDER:`` marker.

    A reminder ends at a line that starts with ``<`` or at the end of the
    text. Each reminder is stripped and cut to 300 characters.
    """
    reminder_re = re.compile(r'CRITICAL REMINDER:\s*(.*?)(?:\n<|$)', re.DOTALL)
    return [hit.group(1).strip()[:300] for hit in reminder_re.finditer(content)]
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def extract_tool_names(content):
    """Return the sorted, unique values of every ``"name": "..."`` pair.

    Values that start with ``$`` or that equal a JSON-schema keyword/type name
    are skipped, on the expectation that they are not tool names.
    """
    schema_words = ('type', 'string', 'integer', 'boolean', 'array', 'object')
    kept = {
        value
        for value in re.findall(r'"name":\s*"([^"]+)"', content)
        if not value.startswith('$') and value not in schema_words
    }
    return sorted(kept)
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def content_hash(text):
    """Return an 8-hex-character fingerprint of *text*, used to group duplicates.

    The value is the leading part of the MD5 digest of the UTF-8 encoding;
    characters that cannot be encoded are replaced rather than raising.
    """
    digest = hashlib.md5()
    digest.update(text.encode('utf-8', errors='replace'))
    return digest.hexdigest()[:8]
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def extract_section_content(content, section_header):
    """Return the body of every ``=== section_header ===`` section.

    A body starts on the line after the header and runs until the next
    ``=== <word>`` header or the end of the text. Bodies are returned
    stripped, in order of appearance.
    """
    header = re.escape(section_header)
    regex = r'===\s*' + header + r'\s*===\s*\n(.*?)(?====\s*\w|$)'
    return [hit.group(1).strip() for hit in re.finditer(regex, content, re.DOTALL)]
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
# Tag names whose enclosed content is extracted from every dump and compared
# across dumps. Matching is case-sensitive, so both the lower-case and the
# upper-case spelling of a tag are listed where both are tracked.
SYSTEM_TAGS = [
    'identity',
    'web_application_development',
    'ephemeral_message',
    'skills',
    'plugins',
    'persistent_context',
    'artifacts',
    'planning_mode',
    'planning_mode_artifacts',
    'communication_style',
    'user_information',
    'mcp_servers',
    'user_rules',
    'workflows',
    'USER_REQUEST',
    'ADDITIONAL_METADATA',
    'USER_SETTINGS_CHANGE',
    'EPHEMERAL_MESSAGE',
    'WORKFLOW',
    'RULE',
    'function',
    'functions',
]
|
|
162
|
+
|
|
163
|
+
# Titles of the "=== TITLE ===" sections that are looked up in every dump.
SECTION_HEADERS = [
    'PREAMBLE & TOOLS',
    'IDENTITY',
    'WEB APP DEV',
    'EPHEMERAL MESSAGE DEF',
    'SKILLS DEF',
    'PLUGINS DEF',
    'PERSISTENT CONTEXT',
    'ARTIFACTS',
    'PLANNING MODE',
    'PLANNING MODE ARTIFACTS',
    'COMMUNICATION STYLE',
    'INSTRUCTION INVOCATION',
    'USER INFORMATION',
    'MCP SERVERS',
    'USER RULES',
    'WORKFLOWS',
    'ACTIVE SKILLS',
    'CONVERSATION HISTORY',
    'USER REQUEST',
    'ADDITIONAL METADATA',
    'EPHEMERAL INTERRUPT',
    'RUNTIME EPHEMERAL_MESSAGE',
]
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def _analyze_dump(index, dump):
    """Run every extractor over one dump session and return the results as a dict."""
    body = dump['content']
    info = {
        'index': index,
        'timestamp': dump['timestamp'],
        'size': len(body),
        'lines': body.count('\n'),
        'xml_tags': extract_xml_tags(body),
        'critical_instructions': extract_critical_instructions(body),
        'mandatory_rules': extract_mandatory_rules(body),
        'critical_reminders': extract_critical_reminders(body),
        'tool_names': extract_tool_names(body),
        'tag_contents': {},
        'section_contents': {},
    }

    # Only tags / sections that actually occur get an entry.
    for tag in SYSTEM_TAGS:
        contents = extract_tag_content(body, tag)
        if contents:
            info['tag_contents'][tag] = contents

    for header in SECTION_HEADERS:
        sections = extract_section_content(body, header)
        if sections:
            info['section_contents'][header] = sections

    return info


def _group_variants(analysis, texts_of):
    """Group texts by content hash into ``{hash: [(dump index, text), ...]}``.

    *texts_of* maps one per-dump analysis dict to the texts to group.
    """
    variants = defaultdict(list)
    for a in analysis:
        for text in texts_of(a):
            variants[content_hash(text)].append((a['index'], text))
    return variants


def _append_variants(report, variants, limit, missing_note):
    """Append one fenced sample per variant to *report*, or *missing_note* if none."""
    if not variants:
        report.append(missing_note)
        return

    report.append(f"**{len(variants)} unique variant(s):**\n\n")
    for h, occurrences in variants.items():
        dump_ids = [str(o[0]) for o in occurrences]
        report.append(f"**Variant `{h}`** (dumps: {', '.join(dump_ids)}):\n")
        report.append("```\n")
        # The first occurrence stands in for the whole variant.
        report.append(occurrences[0][1][:limit] + "\n")
        report.append("```\n\n")


def _build_report(content, dumps, analysis):
    """Render the Markdown report for the analysed dumps and return it as a string.

    Args:
        content: Full text of the input log (used for the header statistics).
        dumps: Dump sessions as returned by ``split_dumps``.
        analysis: One ``_analyze_dump`` result per dump, in the same order.
    """
    report = []
    report.append("# System Prompt Instructions — Full Extraction Report\n")
    report.append(f"> Generated: {datetime.now().isoformat()}\n")
    # Use the configured input path rather than a second hard-coded copy of it.
    report.append(f"> Source: `{INPUT_FILE}` ({len(content):,} bytes, {content.count(chr(10)):,} lines)\n")
    report.append(f"> Dumps found: **{len(dumps)}**\n\n")

    # --- Overview table ---
    report.append("## 1. Dump Sessions Overview\n\n")
    report.append("| # | Timestamp | Size | Lines | Tools | XML Tags |\n")
    report.append("|---|----|----|----|----|----|\n")
    for a in analysis:
        report.append(f"| {a['index']} | {a['timestamp'][:60]} | {a['size']:,} | {a['lines']} | {len(a['tool_names'])} | {len(a['xml_tags'])} |\n")
    report.append("\n")

    # --- All XML tags across all dumps ---
    report.append("## 2. All System XML Tags Found\n\n")
    all_tags = set()
    for a in analysis:
        all_tags.update(a['xml_tags'])
    for tag in sorted(all_tags):
        occurrences = sum(1 for a in analysis if tag in a['xml_tags'])
        report.append(f"- `<{tag}>` — found in {occurrences}/{len(analysis)} dumps\n")
    report.append("\n")

    # --- Tool names ---
    report.append("## 3. Tool/Function Names\n\n")
    all_tools = set()
    for a in analysis:
        all_tools.update(a['tool_names'])
    report.append(f"**Total unique tools: {len(all_tools)}**\n\n")
    for tool in sorted(all_tools):
        occurrences = sum(1 for a in analysis if tool in a['tool_names'])
        report.append(f"- `{tool}` — in {occurrences}/{len(analysis)} dumps\n")
    report.append("\n")

    # --- CRITICAL INSTRUCTIONs ---
    report.append("## 4. CRITICAL INSTRUCTION Variants\n\n")
    for ci_name in ['CRITICAL INSTRUCTION 1', 'CRITICAL INSTRUCTION 2']:
        report.append(f"### {ci_name}\n\n")
        variants = _group_variants(
            analysis,
            # Bind ci_name now; the lambda is consumed within this iteration.
            lambda a, name=ci_name: a['critical_instructions'].get(name, []),
        )
        _append_variants(report, variants, 400, "*Not found in any dump.*\n\n")

    # --- MANDATORY RULES ---
    report.append("## 5. MANDATORY RULE Variants\n\n")
    variants = _group_variants(analysis, lambda a: a['mandatory_rules'])
    _append_variants(report, variants, 300, "*Not found.*\n\n")

    # --- CRITICAL REMINDERs ---
    report.append("## 6. CRITICAL REMINDER Variants\n\n")
    variants = _group_variants(analysis, lambda a: a['critical_reminders'])
    _append_variants(report, variants, 300, "*Not found.*\n\n")

    # --- Tag content comparison ---
    report.append("## 7. System Tag Content — Cross-Dump Comparison\n\n")
    report.append("For each system tag, shows how many unique content variants exist across dumps.\n\n")

    tag_variants = {}   # tag -> {hash: [dump index, ...]}
    tag_samples = {}    # tag -> {hash: first text seen with that hash}
    for tag in SYSTEM_TAGS:
        variants = defaultdict(list)
        samples = {}
        for a in analysis:
            for c in a['tag_contents'].get(tag, []):
                h = content_hash(c)
                variants[h].append(a['index'])
                # Remember the first text per hash so section 8 does not have
                # to re-hash every dump to find a sample.
                samples.setdefault(h, c)
        if variants:
            tag_variants[tag] = variants
            tag_samples[tag] = samples

    report.append("| Tag | Unique Variants | In Dumps |\n")
    report.append("|----|----|----|\n")
    for tag, variants in sorted(tag_variants.items()):
        all_dumps = set()
        for dump_ids in variants.values():
            all_dumps.update(dump_ids)
        report.append(f"| `<{tag}>` | {len(variants)} | {sorted(all_dumps)} |\n")
    report.append("\n")

    # --- Detailed tag content for key sections ---
    key_tags = ['identity', 'communication_style', 'user_rules', 'EPHEMERAL_MESSAGE']
    report.append("## 8. Key Tag Content Details\n\n")
    for tag in key_tags:
        report.append(f"### `<{tag}>`\n\n")
        if tag not in tag_variants:
            report.append("*Not found.*\n\n")
            continue

        for h, dump_ids in tag_variants[tag].items():
            sample_text = tag_samples[tag].get(h)

            report.append(f"**Variant `{h}`** (dumps: {dump_ids}):\n")
            report.append("```\n")
            # Truncate very long content; an empty sample prints nothing.
            if sample_text and len(sample_text) > 600:
                report.append(sample_text[:600] + "\n... [truncated, " + str(len(sample_text)) + " chars total]\n")
            elif sample_text:
                report.append(sample_text + "\n")
            report.append("```\n\n")

    # --- Section headers comparison ---
    report.append("## 9. Named Sections (=== HEADER ===) Across Dumps\n\n")
    for header in SECTION_HEADERS:
        found_in = [a['index'] for a in analysis if header in a['section_contents']]
        if found_in:
            report.append(f"- **{header}** — in dumps: {found_in}\n")
    report.append("\n")

    # --- All unique directives/imperatives ---
    report.append("## 10. All Directive Keywords Found\n\n")
    report.append("Scanning for imperative phrases across all dumps...\n\n")

    directive_patterns = [
        (r'MUST\s+(?:ALWAYS\s+)?(?:NOT\s+)?[\w\s]+', 'MUST ...'),
        (r'NEVER\s+[\w\s]+', 'NEVER ...'),
        (r'ALWAYS\s+[\w\s]+', 'ALWAYS ...'),
        (r'DO\s+NOT\s+[\w\s]+', 'DO NOT ...'),
        (r'IMPORTANT:\s*[^\n]+', 'IMPORTANT: ...'),
        (r'UNACCEPTABLE', 'UNACCEPTABLE'),
        (r'FAILED!', 'FAILED!'),
        (r'CRITICAL\s+\w+', 'CRITICAL ...'),
        (r'MANDATORY\s+\w+', 'MANDATORY ...'),
    ]

    for pat_str, label in directive_patterns:
        pat = re.compile(pat_str)
        all_matches = set()
        for dump in dumps:
            for m in pat.findall(dump['content']):
                # Matches can span lines; keep at most 120 characters of each.
                all_matches.add(m.strip()[:120])

        if all_matches:
            report.append(f"### `{label}` ({len(all_matches)} unique)\n\n")
            for m in sorted(all_matches):
                report.append(f"- `{m}`\n")
            report.append("\n")

    return ''.join(report)


def main():
    """Read INPUT_FILE, analyse every dump session and write the report to OUTPUT_FILE.

    Progress and summary lines are printed to stdout. Errors from reading the
    input or writing the output are not caught here.
    """
    print(f"Reading {INPUT_FILE}...")
    content = read_file(INPUT_FILE)
    print(f" File size: {len(content):,} bytes, {content.count(chr(10)):,} lines")

    # 1. Split into dumps
    dumps = split_dumps(content)
    print(f" Found {len(dumps)} dump session(s)")

    # 2. Analyze each dump (dump indices in the report are 1-based)
    analysis = [_analyze_dump(i, dump) for i, dump in enumerate(dumps, start=1)]

    # 3. Build and write the report
    report_text = _build_report(content, dumps, analysis)
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        f.write(report_text)

    print(f"\nReport written to: {OUTPUT_FILE}")
    print(f" Size: {len(report_text):,} bytes, {report_text.count(chr(10)):,} lines")
|
|
413
|
+
|
|
414
|
+
|
|
415
|
+
# Run the extraction only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == '__main__':
    main()
|