@holdyourvoice/hyv 2.0.1 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/assets/ai-eliminator-rules.md +130 -0
- package/assets/ai-eliminator-skill.md +63 -0
- package/assets/chatgpt-instructions 2.txt +8 -0
- package/assets/chatgpt-instructions 3.txt +8 -0
- package/assets/chatgpt-instructions.txt +8 -0
- package/assets/claude-code-skill 2.md +24 -0
- package/assets/claude-code-skill.md +24 -0
- package/assets/cursor-rules 2.md +12 -0
- package/assets/cursor-rules 3.md +12 -0
- package/assets/cursor-rules.md +12 -0
- package/assets/economic-drift-voice.md +42 -0
- package/assets/hold-your-voice-skill.md +174 -0
- package/assets/voice-matcher-skill.md +57 -0
- package/assets/voice-profile-schema.json +28 -0
- package/dist/index.js +6484 -315
- package/package.json +8 -8
- package/scripts/hold_voice.py +2013 -0
- package/scripts/hold_voice_sync.py +194 -0
|
@@ -0,0 +1,2013 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
#!/usr/bin/env python3
|
|
3
|
+
"""Portable Hold Your Voice helpers.
|
|
4
|
+
|
|
5
|
+
This script intentionally has no third-party dependencies. It is not the Hold
|
|
6
|
+
Your Voice product backend; it is the reusable local layer for Codex projects:
|
|
7
|
+
build a sample-grounded profile, scan for AI cadence, and generate line-level
|
|
8
|
+
rewrite prompts.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import argparse
|
|
14
|
+
import datetime
|
|
15
|
+
import html
|
|
16
|
+
import json
|
|
17
|
+
import math
|
|
18
|
+
import re
|
|
19
|
+
import sys
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
from typing import Any
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
# File suffixes (lowercase, leading dot) accepted as text samples when a
# directory is walked; compared against Path.suffix.lower().
TEXT_EXTENSIONS = {
    ".adoc", ".csv", ".htm", ".html",
    ".md", ".mdx", ".rst", ".txt",
}
|
|
34
|
+
|
|
35
|
+
# Ordered table of (rule_name, compiled_regex) pairs describing phrasings the
# scanner treats as AI-cadence tells. Every pattern is compiled with re.I
# except "em_dash", which matches a single code point. None of the patterns
# uses re.S, so the bounded ".{m,n}" gaps never span a line break, and a bare
# "." inside a phrase (e.g. "fast.paced") accepts any one character such as a
# hyphen or a space. How the matches are scored is decided by the consumer of
# this table, which is defined elsewhere in the file.
AI_PATTERN_RULES = [
    # --- Binary reframing & negation ---
    ("binary_reframing", re.compile(
        r"\b(?:it'?s|that'?s|this\s+(?:is|was)|here'?s)\s+not\b.{0,80}\b(?:it'?s|that'?s|but)\b|"
        r"\b(?:the\s+)?(?:hard\s+)?(?:part|point)\s+isn'?t\b.{0,80}\b(?:it'?s|but)\b|"
        r"\byou\s+don'?t\s+need\b.{0,80}\byou\s+need\b|"
        r"\b(?:brand|trust|strategy|marketing|pricing|success|growth|content|design)\s+is\s+not\s+(?:just\s+)?about\b",
        re.I,
    )),
    ("not_just_but", re.compile(
        r"\bnot\s+just\b.{3,80}\bbut\s+(?:also\s+)?|"
        r"\bnot\s+only\b.{3,80}\bbut\s+(?:also\s+)?",
        re.I,
    )),
    ("more_than_just", re.compile(
        r"\bmore\s+than\s+just\b|\bit'?s\s+not\s+just\s+about\b",
        re.I,
    )),

    # --- Truth/reality posturing ---
    ("truth_harsh_reality", re.compile(
        r"\b(?:the\s+)?(?:uncomfortable|hard|harsh|brutal|ugly|unsexy|real|honest)\s+(?:truth|reality)\b|"
        r"\bthe\s+truth\s+is\b|\bthe\s+reality\s+is\b|\bhere'?s\s+the\s+truth\b|"
        r"\bthe\s+ugly\s+truth\b|\bthe\s+harsh\s+reality\b|"
        r"\b(?:brutal\s+honesty|real\s+talk)\b|\breality\s+check:",
        re.I,
    )),

    # --- Staccato drama & performance cadence ---
    # Three short fragments in a row, the first two opening with "no"/"not".
    ("staccato_drama", re.compile(
        r"\b(?:no|not)\s+\w[^.!?\n]{0,40}[.!?]\s*(?:no|not)\s+\w[^.!?\n]{0,40}[.!?]\s*(?:no|not|just)\s+\w",
        re.I,
    )),
    ("founder_cadence", re.compile(
        r"\b(?:the\s+)?moment\b.{3,80}\bbecomes?\b|"
        r"\b(?:here'?s|here\s+is)\s+(?:the\s+)?(?:thing|kicker|part\s+most\s+people\s+miss|what\s+nobody'?s\s+saying)\b|"
        r"\b(?:and|but)\s+honestly\?|\bhere'?s\s+the\s+kicker\b|"
        r"\bwhat\s+nobody'?s\s+(?:saying|talking\s+about)\b|"
        r"\bthe\s+part\s+most\s+people\s+miss\b|"
        r"\bthe\s+best\s+part\?|\bthe\s+kicker\?|"
        r"\bsame\s+[^.!?\n]{1,35}[.!?]\s*(?:better|nicer|cleaner|calmer|safer)\s+[^.!?\n]{1,35}[.!?]?",
        re.I,
    )),
    ("restatement_polish", re.compile(
        r"\bwhich\s+is\s+another\s+way\s+of\s+saying\b|"
        r"\bin\s+other\s+words\b|"
        r"\bto\s+put\s+it\s+(?:simply|another\s+way)\b|"
        r"\bin\s+a\s+nutshell\b",
        re.I,
    )),
    ("spoiler_reveal", re.compile(
        r"\bspoiler(?:\s+alert)?:\s*it'?s\s+not\b|"
        r"\b(?:and|but)\s+here'?s\s+the\s+(?:truth|reality)\b",
        re.I,
    )),

    # --- Landscape / era / temporal grandstanding ---
    ("landscape_era", re.compile(
        r"\b(?:in\s+)?(?:today'?s\s+)?(?:fast.paced|ever.evolving|ever.changing|digital)\s+(?:world|age|era|landscape)\b|"
        r"\b(?:ever.evolving|ever.increasing|constantly\s+growing|increasingly)\s+(?:landscape|world)\b|"
        r"\bin\s+today'?s\s+world\b|\bin\s+the\s+digital\s+age\b|"
        r"\bin\s+this\s+era\s+of\b|"
        r"\bnow\s+more\s+than\s+ever\b|"
        r"\b(?:as|like)\s+never\s+before\b|"
        r"\bhas\s+never\s+been\s+more\s+important\b|"
        r"\bthe\s+rise\s+of\s+(?:the\s+)?\w+\b|"
        r"\b(?:a|the)\s+new\s+era\s+of\b|"
        r"\b(?:has\s+been\s+around\s+for\s+centuries|since\s+the\s+dawn\s+of\s+time)\b",
        re.I,
    )),

    # --- Formulaic connectors & transitions ---
    # "in addition" / "in summary" only count when followed by a comma or the
    # end of the scanned string; the other connectors match anywhere.
    ("formulaic_connector", re.compile(
        r"\b(?:firstly|secondly|thirdly|lastly|moreover|furthermore|in\s+addition\b(?:\s*,\s*|$)|"
        r"in\s+conclusion|to\s+summarize|to\s+sum\s+up|to\s+recap|in\s+summary\b(?:\s*,\s*|$)|"
        r"it\s+is\s+important\s+to\s+note|it'?s\s+important\s+to\s+note|it\s+should\s+be\s+noted|"
        r"it'?s\s+worth\s+noting\s+that|it'?s\s+important\s+to\s+remember\b|"
        r"however\s*,\s*it'?s\s+important\s+to\s+remember|"
        r"keep\s+in\s+mind\s+that|remember\s+that\b|"
        r"on\s+top\s+of\s+that\b)",
        re.I,
    )),

    # --- Transitions: balance & contrast ---
    # NOTE(review): "at first, it might seem" is also listed under
    # hedging_noncommittal, so that phrase is reported by both rules.
    ("balanced_contrast", re.compile(
        r"\bon\s+the\s+one\s+hand\b|"
        r"\bon\s+the\s+other\s+hand\b|"
        r"\bon\s+the\s+surface\b.{0,80}\b(?:but\s+)?beneath\b|"
        r"\bat\s+first\s+glance\b|"
        r"\bon\s+the\s+flip\s+side\b|"
        r"\bat\s+first\s*,\s*it\s+might\s+seem\b|"
        r"\bon\s+paper\b.{0,80}\bin\s+practice\b|"
        r"\bwhether\s+you\s+(?:love\s+it\s+or\s+hate\s+it|realize\s+it\s+or\s+not)\b|"
        r"\blike\s+it\s+or\s+not\b|"
        r"\bready\s+or\s+not\b",
        re.I,
    )),

    # --- Hedging & non-committal ---
    ("hedging_noncommittal", re.compile(
        r"\bit\s+depends\b.{0,60}\bbut\b|"
        r"\bno\s+one.size.fits.all\b|"
        r"\btailor\s+(?:it|this|these|them)\b.{0,40}\bto\s+(?:your|the)\s+(?:needs|context|audience)\b|"
        r"\balways\s+tailor\b|"
        r"\b(?:in\s+many\s+ways|from\s+a\s+broader\s+perspective)\b|"
        r"\bin\s+the\s+context\s+of\b|"
        r"\b(?:chances\s+are|more\s+often\s+than\s+not|at\s+first\s*,\s*it\s+might\s+seem)\b|"
        r"\bit\s+can\s+be\s+tempting\s+to\b|\byou\s+might\s+be\s+tempted\s+to\b|"
        r"\bonly\s+time\s+will\s+tell\b|"
        r"\bboth\s+sides\s+have\s+valid\s+points\b|"
        r"\bthat\s+said\b",
        re.I,
    )),

    # --- Let's/X invitation ---
    ("lets_invitation", re.compile(
        r"\blet'?s\s+(?:dive|explore|break\s+(?:it|this)\s+down|delve|be\s+honest)\b|"
        r"\b(?:dive|delv(?:e|ing))\s+(?:deeper|into|the\s+intricacies)\b|"
        r"\bdeep\s+dive\b|\blet'?s\s+dive\s+in\b",
        re.I,
    )),

    # --- Empathy/validation openers ---
    ("empathy_opener", re.compile(
        r"\bit'?s\s+easy\s+to\s+feel\b|\byou'?re\s+not\s+alone\b|"
        r"\bif\s+you'?ve\s+ever\s+felt\b|\byou'?re\s+not\s+imagining\s+it\b|"
        r"\byou'?re\s+not\s+wrong\s+to\s+feel\b|"
        r"\byou\s+deserve\b|\bfear\s+not\b|"
        r"\bshouting\s+into\s+the\s+void\b|"
        r"\bcurious\s+what\s+others\s+think\b",
        re.I,
    )),

    # --- Journey / destination clichés ---
    ("journey_cliche", re.compile(
        r"\b(?:brand|learning|success|life|growth|writing|fitness|business)\s+isn'?t\s+a\s+destination\b.{0,40}\bjourney\b|"
        r"\bit'?s\s+a\s+journey\b.{0,50}\bnot\s+a\s+destination\b|"
        r"\bno\s+matter\s+where\s+you\s+are\s+on\s+your\s+journey\b|"
        r"\bembark\s+on\s+(?:a|your|the|this)\b|"
        r"\byou'?re\s+still\s+early\b|\bit'?s\s+still\s+day\s+one\b|"
        r"\bfrom\s+(?:confusion\s+to\s+clarity|followers\s+to\s+fans|ideas\s+to\s+income)\b|"
        r"\b(?:brand.building|writing|creative|learning)\s+journey\b",
        re.I,
    )),

    # --- Marketing/inflated verbs ---
    ("inflated_verbs", re.compile(
        r"\b(?:unlock|harness|leverage)\s+the\s+power\s+of\b|"
        r"\b(?:unlock|unleash)\s+(?:the\s+)?(?:potential|power)\b|"
        r"\b(?:supercharge|turbocharge|revolutionize\s+the\s+way)\b|"
        r"\b(?:transform|elevate|enhance|boost|improve)\s+your\s+\w+\b|"
        r"\btake\s+(?:your|it|this|them|their)\b.{0,30}\bto\s+(?:the\s+next\s+level|new\s+heights)\b|"
        r"\b(?:game.changer|on\s+steroids)\b|"
        r"\bmaster\s+the\s+art\s+of\b|"
        r"\bdiscover\s+a\s+powerful\s+way\b",
        re.I,
    )),

    # --- Metaphor clusters ---
    ("ai_metaphors", re.compile(
        r"\b(?:beacon|lighthouse)\s+(?:of|for|in)\b|"
        r"\b(?:tapestry|symphony|tides)\s+of\b|"
        r"\b(?:flood|avalanche|tsunami)\s+of\b|"
        r"\b(?:noise|signal)\b.{0,30}\b(?:signal|noise)\b|"
        r"\b(?:north\s+star|double.edged\s+sword|blessing\s+and\s+a\s+curse)\b|"
        r"\b(?:silent\s+killer|hidden\s+gem|hidden\s+lever|low.hanging\s+fruit)\b|"
        r"\b(?:tip\s+of\s+the\s+iceberg|scratch(?:es)?\s+the\s+surface)\b|"
        r"\b(?:skeleton|framework|scaffolding|blueprint|roadmap|playbook)\b\s+(?:for|to|that|as)\b|"
        r"\b(?:wealth\s+of|treasure\s+trove)\b|"
        r"\b(?:the\s+)?power\s+of\b.{0,40}\b(?:cannot|should\s+not|is\s+immense|is\s+real|is\s+undeniable)\b",
        re.I,
    )),

    # --- Inflated importance claims ---
    # NOTE(review): the "the power of ... cannot/should not" alternative is a
    # narrower copy of the last ai_metaphors alternative, so such text is
    # reported by both rules.
    ("inflated_importance", re.compile(
        r"\b(?:crucial|critical|pivotal)\s+role\b|"
        r"\b(?:a\s+testament\s+to|the\s+results\s+speak\s+for\s+themselves)\b|"
        r"\b(?:remarkably|incredibly|highly)\s+\w+\b|"
        r"\b(?:significant\s+milestone|at\s+scale)\b|"
        r"\b(?:at\s+its\s+finest|at\s+the\s+heart\s+of)\b|"
        r"\b(?:the\s+power\s+of\b.{0,40}\b(?:cannot|should\s+not)\b)|"
        r"\b(?:championing|advocating\s+for)\b.{0,40}\b(?:change|reform|transparency)\b",
        re.I,
    )),

    # --- Audience-inclusion triads ---
    ("audience_triad", re.compile(
        r"\bwhether\s+you'?re\s+(?:a\s+)?\w+(?:\s+\w+)?\s*,\s*(?:a\s+)?\w+(?:\s+\w+)?\s*,\s*(?:or|and)\s+(?:a\s+)?\w+\b|"
        r"\bfrom\s+(?:solo\s+)?(?:tiny\s+)?\w+\s+to\s+(?:large\s+)?(?:global\s+)?\w+\s*,\s*everyone\s+\w+\b|"
        r"\bwhether\s+you'?re\s+(?:a\s+)?beginner\b|"
        r"\bno\s+matter\s+where\s+you\s+are\b",
        re.I,
    )),

    # --- SEO / guide framing ---
    ("guide_framing", re.compile(
        r"\byou'?re\s+in\s+the\s+right\s+place\b|"
        r"\bhere'?s\s+a\s+step.by.step\s+guide\b|"
        r"\b(?:step\s+1|step\s+2|step\s+3)\b|"
        r"\b(?:first\s*,\s*second\s*,\s*third)\b|"
        r"\bkey\s+(?:takeaways?|insights?)\b|"
        r"\bactionable\s+tips?\b|"
        r"\bno\s+fluff\b|\bno.nonsense\b",
        re.I,
    )),

    # --- Wrapping/closing patterns ---
    ("wrapping_patterns", re.compile(
        r"\b(?:ultimately|at\s+the\s+end\s+of\s+the\s+day|the\s+bottom\s+line\s+is|"
        r"it\s+all\s+comes\s+down\s+to)\b|"
        r"\b(?:best.case\s+scenario|worst.case\s+scenario)\b|"
        r"\b(?:the\s+good\s+news\s+is|the\s+bad\s+news\s+is)\b|"
        r"\blet\s+that\s+sink\s+in\b|\bif\s+you\s+think\s+about\s+it\b|"
        r"\bthe\s+stakes\s+are\s+high\b|"
        r"\b(?:before\s+you\s+know\s+it|in\s+the\s+blink\s+of\s+an\s+eye)\b|"
        r"\b(?:more\s+often\s+than\s+you\s+think|you\s+won'?t\s+believe)\b|"
        r"\b(?:happens|occur|churn|happen|changes?)\s+(?:faster|quicker|sooner)\s+than\s+you\s+think\b",
        re.I,
    )),

    # --- Temporal / trend clichés ---
    # NOTE(review): the last alternative fires on any "from X to Y," or
    # "from X to Y:" that is followed by a word, which includes ordinary
    # ranges such as "from Monday to Friday, we ..." — confirm that is wanted.
    ("trend_cliches", re.compile(
        r"\b(?:attention|trust|retention|data)\s+is\s+the\s+new\s+(?:currency|growth\s+hack|acquisition|oil)\b|"
        r"\bthe\s+best\s+time\s+(?:to\s+\w+|was)\b.{0,80}\b(?:second.best|is\s+now)\b|"
        r"\b(?:low\s+barrier|high\s+leverage)\b|"
        r"\b(?:quick\s+wins?|silver\s+bullet)\b|"
        r"\bstart\s+small\s+and\s+iterate\b|"
        r"\b(?:from\s+\w+\s+to\s+\w+\s*[,:]\s*)\b",
        re.I,
    )),

    # --- Pain points & problem framing ---
    ("pain_points_framing", re.compile(
        r"\bpain\s+points?\b(?!\s+of)|\baddress\s+(?:the|their|your)\s+pain\s+points\b|"
        r"\bspeak\s+(?:directly\s+)?to\s+(?:their|your)\s+pain\s+points\b",
        re.I,
    )),

    # --- Overly structured / meta patterns ---
    ("meta_structuring", re.compile(
        r"\b(?:in\s+this\s+(?:article|guide|post|piece)|this\s+(?:article|guide|post|piece)\s+(?:explores|will\s+explore|discusses))\b|"
        r"\b(?:this\s+essay\s+will\s+discuss|in\s+conclusion\s*,\s*this\s+essay)\b|"
        r"\blet\s+me\s+know\s+if\s+you\s+need\s+(?:any|more)\s+help\b|"
        r"\bfeel\s+free\s+to\s+ask\b|"
        r"\b(?:if\s+you\s+have\s+follow.up\s+questions|i'?m\s+here\s+to\s+help)\b",
        re.I,
    )),

    # --- Experience / friction words ---
    # Several buzzwords optionally swallow the following word so the reported
    # span covers the whole phrase ("robust framework").
    ("ux_buzzwords", re.compile(
        r"\b(?:seamless(?:\s+experience|\s+journey)?|frictionless(?:\s+journey|\s+experience)?|"
        r"holistic\b(?:\s+\w+)?|comprehensive\b(?:\s+\w+)?|innovative\b(?:\s+\w+)?|"
        r"cutting.edge|state.of.the.art|"
        r"robust(?:\s+\w+)?|scalable(?:\s+\w+)?|best.in.class)\b",
        re.I,
    )),

    # --- Story / narrative templates ---
    ("story_templates", re.compile(
        r"\b(?:little\s+did\s+(?:i|we)\s+know|"
        r"at\s+first\s*,\s*i\s+was\s+skeptical\b.{0,80}\bbut\b|"
        r"imagine\s+this|picture\s+this|"
        r"you\s+wake\s+up\s+to\b)",
        re.I,
    )),

    # --- Specifically AI-vocab density words ---
    # NOTE(review): the group has a leading \b but no closing \b, so most
    # entries also match as prefixes ("align" hits "alignment", "transform"
    # hits "transformer"). "foster" is listed twice; the second entry is
    # redundant. Confirm whether prefix matching is intended.
    ("ai_vocab_density", re.compile(
        r"\b(?:delve|underscore|testament|intricate|multifaceted|cornerstone|landscape|"
        r"foster|harness|tapestry|illuminate|pivotal|elevate|empower|"
        r"seamlessly|revolutionize|supercharge|transformative|holistic|comprehensive|"
        r"innovative|impactful|meaningful|utilize|paradigm|navigate|endeavor|realm|"
        r"profound|encapsulate|synergy|robust|facilitate|bolster|streamline|"
        r"differentiate|myriad|transform|vibrant|dynamic|bustling|ecosystem|"
        r"ever.increasing|constantly\s+growing|increasingly|"
        r"unlock|unleash|(?:re)?imagin(?:e|ing)|curate|iterate|optimize|"
        r"amplify|align|drive\s+\w+|foster|cultivate|shed\s+light\s+on|"
        r"quietly|silently|behind\s+every\b|"
        r"not\s+all\s+\w+\s+are\s+created\s+equal|"
        r"there'?s\s+a\s+fine\s+line\s+between|"
        r"the\s+line\s+between\b.{0,40}\bis\s+blurry\b|"
        r"you\s+don'?t\s+have\s+to\b.{0,40}\b(?:to|you\s+can)\b|"
        r"champion(?:ing|s|ed)\b|advocat(?:ing|e[ds]?)\s+for\b|"
        r"more\s+often\s+than\s+you\s+think\b)",
        re.I,
    )),

    # --- Em dash (typographic tell) ---
    # The raw string hands "\u2014" to the regex engine, which resolves the
    # escape itself; the pattern matches the single character U+2014.
    ("em_dash", re.compile(r"\u2014")),

    # --- Buyer psychology templates ---
    ("buyer_psychology", re.compile(
        r"\bpeople\s+don'?t\s+(?:just\s+)?buy\b.{0,60}\bthey\s+buy\b|"
        r"\bpeople\s+buy\s+the\s+feeling\b|"
        r"\bpeople\s+don'?t\s+read\b.{0,40}\bthey\s+skim\b|"
        r"\b(?:it'?s\s+not\s+about|people\s+don'?t\s+care\s+about)\s+your\s+product\b",
        re.I,
    )),

    # --- The X of Y metaphoric positioning ---
    ("x_of_y_metaphor", re.compile(
        r"\bthe\s+(?:netflix|uber|airbnb|apple|google|spotify|tesla|amazon)\s+of\s+\w+\b|"
        r"\boperating\s+system\s+(?:of|for)\s+(?:your|the)\s+\w+\b",
        re.I,
    )),

    # --- Overwhelm-reassurance ---
    ("overwhelm_reassurance", re.compile(
        r"\b(?:can\s+feel|might\s+seem|can\s+be)\s+overwhelming\b.{0,80}\bbut\s+it\s+doesn'?t\s+have\s+to\s+be\b|"
        r"\b(?:can\s+feel|might\s+seem)\s+(?:intimidating|complex|difficult)\b.{0,80}\bbut\b",
        re.I,
    )),

    # --- Pros/cons framing ---
    ("pros_cons_framing", re.compile(
        r"\b(?:pros\s+and\s+cons|advantages\s+and\s+disadvantages)\s+(?:of|to)\b|"
        r"\bhere\s+are\s+the\s+pros\s+and\s+cons\b",
        re.I,
    )),

    # --- Triple-adjective bloat ---
    ("triple_adjective", re.compile(
        r"\b(?:\w+,\s+\w+,\s+(?:and\s+)?\w+\s+(?:approach|strategy|solution|framework|platform|system|tool|method|plan|process))\b|"
        r"\b(?:simple|clear|easy)\s*,\s*(?:useful|effective|powerful|intuitive)\s*,\s*(?:and\s+)?(?:memorable|sustainable|scalable|actionable)\b",
        re.I,
    )),

    # --- Behind-the-scenes / hidden depth ---
    # The longer "behind ... lies/is" form is listed first so it wins over the
    # bare "behind the scenes" alternative when both could match.
    ("hidden_depth", re.compile(
        r"\bbehind\s+(?:the\s+scenes|every\s+\w+)\b.{0,80}\b(?:lies|is)\b|"
        r"\bbehind\s+the\s+scenes\b|"
        r"\bbeneath\s+the\s+surface\b",
        re.I,
    )),

    # --- Self-referential / AI disclaimer ---
    ("self_referential", re.compile(
        r"\bas\s+an\s+ai\s+(?:language\s+)?model\b|"
        r"\bi\s+(?:can'?t|cannot)\s+provide\s+(?:legal|medical|financial|investment)\s+advice\b|"
        r"\bi\s+don'?t\s+have\s+(?:personal\s+experiences|feelings|opinions)\b",
        re.I,
    )),

    # --- Placeholder brackets ---
    ("placeholder_brackets", re.compile(
        r"\[(?:your\s+(?:brand|product|company|list|audience|name|metric|goal)|"
        r"insert\s+(?:metric|name|number|value|example)|target\s+\w+)\]",
        re.I,
    )),

    # --- Zoom / camera metaphor ---
    ("zoom_camera", re.compile(
        r"\b(?:zooming\s+(?:in|out)|from\s+a\s+broader\s+perspective|let'?s\s+zoom\s+(?:in|out))\b",
        re.I,
    )),

    # --- Core/essence statements (#41, #130) ---
    # NOTE(review): whenever "core"/"heart" ends at a word boundary the first
    # alternative already matches, so the second one only changes the outcome
    # when they are the prefix of a longer word (e.g. "at the heartland is").
    # That looks unintended — confirm.
    ("essence_statements", re.compile(
        r"\bat\s+(?:its|the)\s+(?:core|heart)\b|"
        r"\bat\s+(?:its|the)\s+(?:core|heart)\s*(?:of\s+)?\w+\s+(?:is|lies|are)\b",
        re.I,
    )),

    # --- Analogy / simile invitations (#42-43) ---
    # NOTE(review): in "(?:your|the|a|an|\w+)" the trailing \w+ accepts any
    # word, so the literal alternatives before it add nothing.
    ("ai_analogies", re.compile(
        r"\bthink\s+of\b.{0,30}\bas\s+(?:a|the|your)\b|"
        r"\b(?:your|the|a|an|\w+)\s+(?:is|are)\s+(?:like|kind\s+of\s+like)\s+(?:a|the)\b|"
        r"\bimagine\s+(?:your|the|a|an|\w+)\s+as\b",
        re.I,
    )),

    # --- "Sounds simple but" (#44) and "In fact" (#46) ---
    ("simple_but_infact", re.compile(
        r"\b(?:this|it|that)\s+(?:might|may|can)\s+sound\s+simple\s*[,.]?\s+but\b|"
        r"\b(?:sounds?\s+simple\s*[,.]?\s+but)\b|"
        r"\bin\s+fact\s*,\s*\w+",
        re.I,
    )),

    # --- "The X you didn't know you needed" (#71) ---
    ("clickbait_didnt_know", re.compile(
        r"\bthe\s+\w+(?:\s+\w+)?\s+you\s+didn'?t\s+know\s+you\s+needed\b",
        re.I,
    )),

    # --- Self-referential restatement (#79) ---
    ("self_referential_restatement", re.compile(
        r"\byou\s+(?:asked|wanted\s+to\s+know|wonder(?:ing)?)\s+(?:how|what|why|whether)\b.{0,80}\b(?:let'?s|so|here'?s)\b|"
        r"\byou\s+(?:asked|wanted\s+to\s+know)\s+about\b.{0,80}\b(?:let'?s|so|here'?s)\s+(?:break|walk|dive|explore)\b",
        re.I,
    )),
]
|
|
427
|
+
|
|
428
|
+
# Lowercase abstract / buzzword vocabulary. The code that consults this set is
# outside the part of the file shown here.
# NOTE(review): "tipping point" and the hyphenated entries can never equal a
# single token produced by words(), whose regex stops at spaces and hyphens —
# confirm how membership is tested.
ABSTRACT_STYLE_WORDS = {
    # Original core list
    "alignment", "authenticity", "awareness", "clarity", "confidence",
    "consistency", "differentiation", "execution", "framework", "identity",
    "messaging", "narrative", "personality", "positioning", "preference",
    "presence", "recall", "relevance", "resonance", "signal",
    "strategy", "trust", "utility", "value",
    # Expanded from 220 AI patterns document
    "ecosystem", "landscape", "space", "realm", "sphere",
    "paradigm", "synergy", "holistic", "robust", "scalable",
    "innovative", "transformative", "comprehensive", "sustainable", "impactful",
    "meaningful", "actionable", "seamless", "frictionless", "cutting-edge",
    "state-of-the-art", "best-in-class", "optimization", "efficiency", "productivity",
    "growth", "retention", "acquisition", "engagement", "conversion",
    "monetization", "scalability", "agility", "resilience", "empowerment",
    "transformation", "innovation", "disruption", "evolution", "revolution",
    "iteration", "velocity", "leverage", "amplification", "acceleration",
    "facilitation", "orchestration", "curation", "personalization", "customization",
    "democratization", "accessibility", "inclusivity", "infrastructure", "architecture",
    "foundation", "cornerstone", "pillar", "backbone", "lifeblood",
    "catalyst", "enabler", "driver", "engine", "flywheel",
    "moat", "advantage", "differentiator", "proposition", "promise",
    "mission", "vision", "purpose", "intention", "mindset",
    "mindfulness", "consciousness", "feedback", "vulnerability", "transparency",
    "accountability", "responsibility", "ownership", "agency", "autonomy",
    "sovereignty", "freedom", "liberation", "elevation", "ascension",
    "mastery", "excellence", "greatness", "potential", "possibility",
    "opportunity", "abundance", "prosperity", "fulfillment", "happiness",
    "wellness", "wellbeing", "balance", "harmony", "coherence",
    "congruence", "integrity", "honor", "dignity", "respect",
    "empathy", "compassion", "humanity", "connection", "belonging",
    "tribe", "movement", "renaissance", "awakening", "enlightenment",
    "breakthrough", "tipping point", "inflection", "pivot", "shift",
    "transition", "metamorphosis", "rebirth", "reinvention",
}
|
|
584
|
+
|
|
585
|
+
# Start-anchored opener detectors. Each pattern begins with "^" and is compiled
# without re.M, so it only recognises text that starts with the opener.

# Sweeping "most people ..." / "in today's fast-paced world ..." openings.
GENERIC_OPENERS = re.compile(
    r"^(?:most|many|some|all)\s+(?:brands|teams|people|founders|companies|businesses|organizations|leaders)\b|"
    r"^(?:in\s+)?(?:today'?s|the)\s+(?:fast.paced|ever.evolving|modern|digital|current|contemporary)\s+(?:world|age|era|landscape|economy)\b",
    flags=re.IGNORECASE,
)

# Rhetorical-question openings addressed to the reader.
QUESTION_OPENER = re.compile(
    r"^(?:have you|do you|did you|what if|why do|how do|are you|is your|can you|will you)\b",
    flags=re.IGNORECASE,
)

# "Lesson" / moral-of-the-story openings.
LESSON_OPENER = re.compile(
    r"^(?:the most important thing|the key to|success is|if you want to|what i learned|"
    r"the hard part|the point isn'?t|you don'?t need|the hard(?:est)?\s+(?:part|thing))\b",
    flags=re.IGNORECASE,
)
|
|
599
|
+
|
|
600
|
+
# CTA/engagement bait endings: assistant-style sign-offs and "what do you
# think?" prompts. The pattern is unanchored, so it matches anywhere in the
# text it is searched against.
CTA_ENDINGS = re.compile(
    r"\blet\s+me\s+know\s+if\s+you\s+need\s+(?:any\s+more\s+|any\s+|more\s+)?help\b|"
    r"\bfeel\s+free\s+to\s+(?:ask|reach\s+out|contact|dm|let\s+me\s+know)\b|"
    r"\bcurious\s+what\s+others\s+think\b|"
    r"\bi'?m\s+here\s+to\s+help\b|"
    r"\bif\s+you\s+have\s+follow.up\s+questions\b",
    flags=re.IGNORECASE,
)
|
|
609
|
+
|
|
610
|
+
# Full-string match for a sentence of exactly seven \w+ tokens closed by one
# ".", "!" or "?". Tokens containing apostrophes or hyphens are not \w+ runs,
# so such sentences do not match.
SEVEN_WORD_SENTENCE_PATTERN = re.compile(r"^(?:\w+\s+){6}(?:\w+)[.!?]$")
|
|
613
|
+
|
|
614
|
+
|
|
615
|
+
def strip_markup(text: str, suffix: str = "") -> str:
    """Return *text* with HTML markup removed when *suffix* names an HTML file.

    Args:
        text: Raw file contents.
        suffix: File extension including the dot (for example ``".html"``),
            compared case-insensitively. Any value other than ``.html`` or
            ``.htm`` returns *text* unchanged.

    Returns:
        The text with ``<script>``/``<style>`` elements, HTML comments and
        remaining tags each replaced by a single space, and character
        references (``&amp;`` and friends) decoded.
    """
    if suffix.lower() not in {".html", ".htm"}:
        return text
    # Drop script/style elements together with their bodies.
    text = re.sub(r"(?is)<(script|style).*?>.*?</\1>", " ", text)
    # Remove comments as a unit. The generic tag pattern below stops at the
    # first ">", so a comment such as "<!-- a > b -->" (or a conditional
    # comment wrapping markup) would otherwise leak its tail into the text.
    text = re.sub(r"(?s)<!--.*?-->", " ", text)
    text = re.sub(r"(?s)<[^>]+>", " ", text)
    return html.unescape(text)
|
|
621
|
+
|
|
622
|
+
|
|
623
|
+
def read_text(path: Path) -> str:
    """Read *path* as UTF-8 text and strip HTML markup from HTML files.

    Bytes that cannot be decoded are dropped (``errors="ignore"``) instead of
    raising, and the file suffix decides whether markup is stripped.

    Args:
        path: File to read.

    Returns:
        The decoded, markup-free text.
    """
    # "utf-8-sig" decodes exactly like "utf-8" but also consumes a leading
    # byte-order mark. With plain "utf-8" the BOM survives as U+FEFF in front
    # of the first word, and str.strip() does not remove it.
    raw = path.read_text(encoding="utf-8-sig", errors="ignore")
    return strip_markup(raw, path.suffix)
|
|
626
|
+
|
|
627
|
+
|
|
628
|
+
def iter_text_files(paths: list[str]) -> list[Path]:
    """Expand file and directory arguments into a flat list of sample files.

    A path that is a file is accepted as-is, whatever its extension. Any other
    existing path is walked recursively in sorted order, keeping regular files
    whose suffix is in ``TEXT_EXTENSIONS`` and skipping anything located under
    (or named with) a dot-prefixed component.

    Args:
        paths: User-supplied paths; ``~`` is expanded.

    Returns:
        Matching files in argument order, then sorted walk order.

    Raises:
        SystemExit: If one of the paths does not exist.
    """
    collected: list[Path] = []
    for entry in paths:
        root = Path(entry).expanduser()
        if not root.exists():
            raise SystemExit(f"path not found: {root}")
        if root.is_file():
            collected.append(root)
        else:
            collected.extend(
                candidate
                for candidate in sorted(root.rglob("*"))
                if candidate.is_file()
                and not any(piece.startswith(".") for piece in candidate.relative_to(root).parts)
                and candidate.suffix.lower() in TEXT_EXTENSIONS
            )
    return collected
|
|
645
|
+
|
|
646
|
+
|
|
647
|
+
def words(text: str) -> list[str]:
    """Return the word tokens of *text* in order of appearance.

    A token starts with an ASCII letter and continues through ASCII letters,
    digits and straight apostrophes; everything else separates tokens.
    """
    return [hit.group(0) for hit in re.finditer(r"[a-zA-Z][a-zA-Z0-9']*", text)]
|
|
649
|
+
|
|
650
|
+
|
|
651
|
+
def sentences(text: str) -> list[str]:
    """Split *text* into trimmed sentences.

    Boundaries are whitespace that follows ``.``, ``!`` or ``?``, and runs of
    two or more newlines. Fragments without any word token are dropped.
    """
    fragments = re.split(r"(?<=[.!?])\s+|\n{2,}", text)
    return list(map(str.strip, filter(words, fragments)))
|
|
654
|
+
|
|
655
|
+
|
|
656
|
+
def paragraphs(text: str) -> list[str]:
    """Return the blank-line-separated blocks of *text* that hold six or more words.

    Shorter blocks (headings, captions, stray lines) are discarded; kept
    blocks are returned with surrounding whitespace trimmed.
    """
    kept: list[str] = []
    for block in re.split(r"\n\s*\n", text):
        if len(words(block)) < 6:
            continue
        kept.append(block.strip())
    return kept
|
|
658
|
+
|
|
659
|
+
|
|
660
|
+
def variance_label(lengths: list[int]) -> str:
    """Classify how much *lengths* vary as ``"low"``, ``"medium"`` or ``"high"``.

    The measure is the coefficient of variation: population standard deviation
    divided by the mean. Below 0.35 is "low", above 0.85 is "high", anything
    else is "medium". Fewer than three values, or a non-positive mean, also
    yield "medium".
    """
    count = len(lengths)
    if count < 3:
        return "medium"
    average = sum(lengths) / count
    if average <= 0:
        return "medium"
    squared_error = sum((value - average) ** 2 for value in lengths)
    spread = math.sqrt(squared_error / count) / average
    if spread < 0.35:
        return "low"
    return "high" if spread > 0.85 else "medium"
|
|
673
|
+
|
|
674
|
+
|
|
675
|
+
def infer_case_style(lines: list[str]) -> str:
    """Guess the capitalisation habit of the writer from raw text lines.

    Only lines containing an ASCII letter are counted. For each, the first
    letter is checked for lowercase, and the line is marked "capitalised" if it
    contains a word made of one capital followed by two or more lowercase
    letters.

    Returns:
        ``"mostly lowercase"`` when at least 85% of counted lines start
        lowercase and at most 20% contain a capitalised word,
        ``"standard sentence case"`` when at most 25% start lowercase, and
        ``"mixed"`` otherwise (including when no line has a letter).
    """
    first_letter = re.compile(r"[A-Za-z]")
    capitalized_word = re.compile(r"\b[A-Z][a-z]{2,}\b")
    counted = 0
    lowercase_starts = 0
    capitalized_lines = 0
    for raw_line in lines:
        content = raw_line.strip()
        found = first_letter.search(content)
        if found is None:
            # Blank lines and lines without letters carry no case signal.
            continue
        counted += 1
        if found.group(0).islower():
            lowercase_starts += 1
        if capitalized_word.search(content):
            capitalized_lines += 1
    if counted == 0:
        return "mixed"
    share = lowercase_starts / counted
    if share >= 0.85 and capitalized_lines <= counted * 0.2:
        return "mostly lowercase"
    return "standard sentence case" if share <= 0.25 else "mixed"
|
|
697
|
+
|
|
698
|
+
|
|
699
|
+
def infer_argument_pattern(text: str) -> str:
    """Label the dominant way *text* argues.

    Checks run in a fixed order and the first hit wins:

    1. ``"question-led"`` — more than 18% of sentences end with ``?``.
    2. ``"data-led"`` — numeric tokens reach ``max(4, sentences // 10)``.
    3. ``"narrative"`` — first-person pronouns reach ``max(6, sentences // 4)``.
    4. ``"contrarian"`` — contrast words reach ``max(8, sentences // 3)``.

    Returns ``"mixed"`` when nothing qualifies or the text has no sentences.
    """
    units = sentences(text)
    total = len(units)
    if total == 0:
        return "mixed"
    lowered = text.lower()

    def tally(pattern: str) -> int:
        # Count occurrences of a pattern in the lowercased text.
        return len(re.findall(pattern, lowered))

    questions = sum(unit.rstrip().endswith("?") for unit in units)
    if questions / total > 0.18:
        return "question-led"
    checks = (
        ("data-led", tally(r"\b\d+(?:\.\d+)?%?\b"), max(4, total // 10)),
        ("narrative", tally(r"\b(?:i|we|my|our|me|us)\b"), max(6, total // 4)),
        ("contrarian", tally(r"\b(?:but|actually|instead|not|wrong|real|because)\b"), max(8, total // 3)),
    )
    for label, hits, floor in checks:
        if hits >= floor:
            return label
    return "mixed"
|
|
717
|
+
|
|
718
|
+
|
|
719
|
+
def first_words(text: str, count: int = 7) -> str:
    """Return the first *count* word tokens of *text*, lowercased and space-joined."""
    return " ".join(words(text.lower())[:count])
|
|
722
|
+
|
|
723
|
+
|
|
724
|
+
def top_opening_moves(paragraph_list: list[str], limit: int = 8) -> list[str]:
    """Return the most frequent paragraph openings, most common first.

    An opening is the first six lowercased words of a paragraph; paragraphs
    that yield fewer than three words are ignored. Openings with equal counts
    keep the order in which they were first seen.

    Args:
        paragraph_list: Paragraph texts to inspect.
        limit: Maximum number of openings to return.

    Returns:
        Up to *limit* opening strings ranked by descending frequency.
    """
    counts: dict[str, int] = {}
    for paragraph in paragraph_list:
        move = first_words(paragraph, 6)
        if len(move.split()) < 3:
            continue
        counts[move] = counts.get(move, 0) + 1
    # dicts iterate in insertion order and sorted() is stable (also with
    # reverse=True), so ties stay in first-seen order without the quadratic
    # list.index() tie-breaker.
    ranked = sorted(counts, key=counts.get, reverse=True)
    return ranked[:limit]
|
|
736
|
+
|
|
737
|
+
|
|
738
|
+
def choose_anchors(paragraph_list: list[str], limit: int = 3) -> list[str]:
    """Pick up to `limit` representative paragraphs to use as voice anchors.

    Whitespace runs are collapsed to single spaces. Paragraphs of 80-420
    characters are preferred; if none qualify, every non-blank paragraph is
    used, cut to 360 characters. Paragraphs whose first 80 characters match
    case-insensitively are treated as duplicates, and every returned anchor
    is at most 360 characters long.
    """

    def squash(raw: str) -> str:
        return re.sub(r"\s+", " ", raw).strip()

    pool = [flat for flat in map(squash, paragraph_list) if 80 <= len(flat) <= 420]
    if not pool:
        pool = [squash(raw)[:360] for raw in paragraph_list if raw.strip()]

    picked: list[str] = []
    fingerprints: set[str] = set()
    for entry in pool:
        fingerprint = entry[:80].lower()
        if fingerprint in fingerprints:
            continue
        fingerprints.add(fingerprint)
        picked.append(entry[:360])
        # The limit is checked after appending, so at least one anchor is
        # returned whenever a candidate exists.
        if len(picked) >= limit:
            break
    return picked
|
|
757
|
+
|
|
758
|
+
|
|
759
|
+
def build_profile(paths: list[str], name: str) -> dict[str, Any]:
    """Build a sample-grounded voice profile from the text files under `paths`.

    Files are discovered with `iter_text_files` and read with `read_text`;
    files that are empty after stripping are skipped. The returned dict
    carries source metadata, sentence/paragraph statistics, a "signature"
    section (case style, argument pattern, opening moves, cadence notes,
    anchors, never-list), voice rules and AI-eliminator settings.

    Raises:
        SystemExit: if no sample yields any text.
    """
    samples = []
    for path in iter_text_files(paths):
        text = read_text(path).strip()
        if text:
            samples.append({"path": str(path), "text": text})
    if not samples:
        raise SystemExit("no readable text samples found")

    texts = [sample["text"] for sample in samples]
    combined = "\n\n".join(texts)
    sentence_list = sentences(combined)
    paragraph_list = paragraphs(combined)

    # Word counts per sentence, dropping sentences with no words.
    sentence_lengths = [size for size in (len(words(item)) for item in sentence_list) if size]
    # Every paragraph counts as at least one sentence.
    paragraph_sentence_counts = [max(1, len(sentences(block))) for block in paragraph_list]
    line_list = [line for text in texts for line in text.splitlines()]

    avg_sentence = 0
    if sentence_lengths:
        avg_sentence = round(sum(sentence_lengths) / len(sentence_lengths), 1)
    avg_paragraph = 0
    if paragraph_sentence_counts:
        avg_paragraph = round(sum(paragraph_sentence_counts) / len(paragraph_sentence_counts), 1)

    opening_moves = top_opening_moves(paragraph_list)
    case_style = infer_case_style(line_list)
    argument_pattern = infer_argument_pattern(combined)
    anchors = choose_anchors(paragraph_list)

    cadence = [
        f"average sentence length around {avg_sentence} words",
        f"sentence length variance is {variance_label(sentence_lengths)}",
        f"average paragraph length around {avg_paragraph} sentences",
    ]
    if case_style == "mostly lowercase":
        cadence.append("leans lowercase in visible prose")

    never_list = [
        "here's the thing",
        "let's be honest",
        "at the end of the day",
        "not just x but y",
        "which is another way of saying",
        "in other words",
        "the moment x becomes y",
        "same x. better y.",
    ]

    voice_rules = [
        "trust the supplied samples over generic style advice",
        "open from a concrete observation, scene, mechanism, or quoted line",
        "keep the writer's natural sentence and paragraph rhythm",
        "preserve specific roughness when it carries the voice",
        "repair AI-pattern drift line by line instead of rewriting clean prose",
    ]
    if opening_moves:
        # Only the four most common openings are quoted in the rule text.
        voice_rules.append("study these sample opening moves before drafting: " + "; ".join(opening_moves[:4]))

    signature = {
        "case_style": case_style,
        "argument_pattern": argument_pattern,
        "opening_moves": opening_moves,
        "cadence": cadence,
        "anchors": anchors,
        "never_list": never_list,
    }
    return {
        "profile_version": "hold-your-voice-portable-v1",
        "name": name,
        "source_count": len(samples),
        "sources": [{"path": sample["path"], "chars": len(sample["text"])} for sample in samples],
        "word_count": len(words(combined)),
        "sentence": {"avg_words": avg_sentence, "variance": variance_label(sentence_lengths)},
        "paragraph": {"avg_sentences": avg_paragraph},
        "signature": signature,
        "voice_rules": voice_rules,
        "ai_eliminator": {
            "rewrite_scope": "flagged-lines-only",
            "preserve_surrounding_lines": True,
            "avoid_polished_founder_cadence": True,
        },
    }
|
|
838
|
+
|
|
839
|
+
|
|
840
|
+
def line_style_hits(line: str) -> list[dict[str, str]]:
    """Return style-rule hits for a single line of text.

    Each hit is a dict with "rule" (rule id) and "phrase" (the offending
    text cut to 160 characters, or a fixed description for the opener
    rules). A blank or None line yields no hits.

    Rules checked, in order: abstract_noun_cluster,
    generic_opening_generalization, voice_question_opener,
    voice_lesson_opener, cta_ending, ted_talk_slogan and
    perfect_marketing_sentence (reported at most once per line).
    """
    stripped = (line or "").strip()
    low = stripped.lower()
    if not low:
        return []
    excerpt = stripped[:160]
    hits: list[dict[str, str]] = []

    # Three or more abstract words with no example marker and no digit.
    line_words = re.findall(r"[a-z']+", low)
    abstract_count = sum(1 for word in line_words if word in ABSTRACT_STYLE_WORDS)
    if abstract_count >= 3 and not re.search(r"\b(?:for example|for instance|such as)\b|\d", low):
        hits.append({"rule": "abstract_noun_cluster", "phrase": excerpt})
    if GENERIC_OPENERS.match(low):
        hits.append({"rule": "generic_opening_generalization", "phrase": excerpt})
    if QUESTION_OPENER.match(low):
        hits.append({"rule": "voice_question_opener", "phrase": "opens with a question instead of a concrete observation"})
    if LESSON_OPENER.match(low):
        hits.append({"rule": "voice_lesson_opener", "phrase": "opens with a lesson or inspirational claim"})
    if CTA_ENDINGS.search(low):
        hits.append({"rule": "cta_ending", "phrase": excerpt})
    # TED-talk contrastive slogan: "It's not X, it's Y" within one line.
    if re.search(r"\bit'?s\s+not\b.{0,40}\bit'?s\b", low):
        hits.append({"rule": "ted_talk_slogan", "phrase": excerpt})

    # "Perfect" 6-8 word marketing sentence: generic first word plus a buzzword.
    for part in re.split(r"(?<=[.!?])\s+", stripped):
        word_total = len(re.findall(r"[a-zA-Z']+", part))
        if not (6 <= word_total <= 8 and part and part[-1] in ".!?"):
            continue
        part_low = part.lower()
        # The trailing \b makes the opener match a whole word; without it
        # "a" matched every sentence starting with the letter a ("Apple ...")
        # and "it"/"an"/"the" matched "Italy"/"Answer"/"Then".
        generic_start = re.match(r"(?:the|your|this|a|an|it|our|most|many|some|all)\b", part_low)
        has_buzzword = bool(re.search(r"\b(?:attention|trust|retention|brand|growth|strategy|content|value|customer|product|data)\b", part_low))
        if generic_start and has_buzzword:
            hits.append({"rule": "perfect_marketing_sentence", "phrase": part.strip()[:160]})
            break
    return hits
|
|
872
|
+
|
|
873
|
+
|
|
874
|
+
def _structural_analysis(text: str) -> list[dict[str, Any]]:
    """Analyze structural/rhythmic properties beyond individual word patterns.

    Returns a list of hit dicts with "rule", "phrase" and "line". These are
    document-level findings, so "line" is 0 except for over_structured_lists,
    which reports the 1-based line of the first list item. Returns an empty
    list when the `sentences` helper finds no sentences.

    Rules: low_burstiness, mechanical_paragraphs, over_structured_lists,
    uniform_paragraph_rhythm, low_contractions, formal_hedging_density,
    generic_intensifiers, no_fragments.
    """
    hits: list[dict[str, Any]] = []
    sentence_list = sentences(text)
    paragraph_list = paragraphs(text)

    if not sentence_list:
        return hits
    source = text or ""

    # --- Burstiness (sentence length variance) ---
    lengths = [len(words(s)) for s in sentence_list if words(s)]
    if len(lengths) >= 5:
        mean = sum(lengths) / len(lengths)
        stdev = math.sqrt(sum((n - mean) ** 2 for n in lengths) / len(lengths))
        cv = stdev / mean if mean > 0 else 0
        if cv < 0.35:
            hits.append({
                "rule": "low_burstiness",
                "phrase": f"sentence length variation {cv:.2f} (< 0.35 = AI-flat rhythm)",
                "line": 0,
            })

    # --- Mechanical paragraph structure ---
    if len(paragraph_list) >= 3:
        para_sent_counts = [max(1, len(sentences(p))) for p in paragraph_list]
        para_mean = sum(para_sent_counts) / len(para_sent_counts)
        if para_mean > 0:
            para_stdev = math.sqrt(sum((c - para_mean) ** 2 for c in para_sent_counts) / len(para_sent_counts))
            para_cv = para_stdev / para_mean
            if para_cv < 0.30:
                hits.append({
                    "rule": "mechanical_paragraphs",
                    "phrase": f"paragraphs all similar length (cv={para_cv:.2f}, mean={para_mean:.1f} sentences)",
                    "line": 0,
                })

    # --- Over-structured lists ---
    list_item_pattern = re.compile(r"^[\s]*[-*•]\s+", re.M)
    if len(list_item_pattern.findall(source)) >= 6:
        # Single-sentence paragraphs that contain at least two bullet characters.
        triad_count = sum(1 for p in paragraph_list if len(sentences(p)) == 1 and len(re.findall(r"[-*•]", p)) >= 2)
        if triad_count >= 2:
            # The multiline pattern can match across a newline (\s includes
            # "\n"), so there may be no single line that matches on its own;
            # fall back to 0 instead of indexing an empty list.
            line_num = next(
                (number for number, raw in enumerate(source.split("\n"), 1) if list_item_pattern.match(raw)),
                0,
            )
            hits.append({
                "rule": "over_structured_lists",
                "phrase": "lists follow rigid 3-item pattern throughout",
                "line": line_num,
            })

    # --- Uniform sentence rhythm within paragraphs ---
    ai_rhythm_count = 0
    for para in paragraph_list:
        para_sentences = sentences(para)
        if len(para_sentences) >= 3:
            s_lengths = [len(words(s)) for s in para_sentences if words(s)]
            if s_lengths and all(12 <= n <= 22 for n in s_lengths):
                ai_rhythm_count += 1
    if ai_rhythm_count >= max(1, len(paragraph_list) * 0.6) and len(paragraph_list) >= 2:
        hits.append({
            "rule": "uniform_paragraph_rhythm",
            "phrase": f"{ai_rhythm_count}/{len(paragraph_list)} paragraphs have mechanical 12-22 word sentence uniformity",
            "line": 0,
        })

    # --- Formal/tone analysis: contractions ratio ---
    # The apostrophe stays optional only where dropping it does not produce an
    # ordinary word. "it's", "let's", "we're", "i'll" and "we'll" require it,
    # otherwise "its", "lets", "were", "ill" and "well" are miscounted as
    # contractions and mask genuinely contraction-free text.
    contraction_pattern = re.compile(r"\b(?:don'?t|can'?t|won'?t|isn'?t|aren'?t|wasn'?t|weren'?t|"
                                     r"hasn'?t|haven'?t|hadn'?t|shouldn'?t|wouldn'?t|couldn'?t|"
                                     r"mightn'?t|mustn'?t|it's|that'?s|what'?s|there'?s|"
                                     r"here'?s|who'?s|let's|i'?m|you'?re|we're|they'?re|"
                                     r"i'?ve|you'?ve|we'?ve|they'?ve|i'll|you'?ll|we'll|they'?ll)\b", re.I)
    contractions = len(contraction_pattern.findall(source))
    total_words = len(words(source))
    contraction_ratio = contractions / max(1, total_words / 100)  # per 100 words
    if total_words > 200 and contraction_ratio < 0.8:
        hits.append({
            "rule": "low_contractions",
            "phrase": f"{contraction_ratio:.1f} contractions per 100 words (human average 1.5-3.0; overly formal/rigid)",
            "line": 0,
        })

    # --- Overly formal hedging density ---
    formal_hedges_pattern = re.compile(
        r"\b(?:it\s+is\s+important\s+to\s+note|it\s+should\s+be\s+noted|it\s+is\s+worth\s+noting|"
        r"it\s+is\s+crucial\s+to|it\s+is\s+essential\s+to|it\s+appears\s+that|"
        r"there\s+is\s+a\s+possibility\s+that|one\s+should\s+consider|"
        r"it\s+is\s+imperative\s+to|it\s+is\s+necessary\s+to)\b",
        re.I,
    )
    formal_hedges = len(formal_hedges_pattern.findall(source))
    if formal_hedges >= 2:
        hits.append({
            "rule": "formal_hedging_density",
            "phrase": f"{formal_hedges} formal hedging phrases found (institutional/overly polite tone)",
            "line": 0,
        })

    # --- Non-specific intensifiers density ---
    intensifiers_pattern = re.compile(
        r"\b(?:remarkably|incredibly|amazingly|extraordinarily|exceptionally|"
        r"tremendously|absolutely|completely|thoroughly|utterly)\s+\w+\b",
        re.I,
    )
    intensifiers = len(intensifiers_pattern.findall(source))
    if intensifiers >= 3:
        hits.append({
            "rule": "generic_intensifiers",
            "phrase": f"{intensifiers} generic intensifiers (remarkably/incredibly/amazingly) - marketing tone",
            "line": 0,
        })

    # --- Perfect grammar / no fragments ---
    # A fragment is a punctuated sentence of at most four words that is not a
    # bare interjection (yes/no/hey/...).
    total_sentences = len(sentence_list)
    fragments = sum(1 for s in sentence_list if len(words(s)) <= 4 and s.strip() and s.strip()[-1] in ".!?"
                    and not re.search(r"\b(?:yes|no|hey|hi|ok|bye|wow|oh)\b", s.lower()))
    fragment_ratio = fragments / max(1, total_sentences)
    if total_sentences > 20 and fragment_ratio < 0.02:
        hits.append({
            "rule": "no_fragments",
            "phrase": f"only {fragments} sentence fragments in {total_sentences} sentences - over-polished",
            "line": 0,
        })

    return hits
|
|
998
|
+
|
|
999
|
+
|
|
1000
|
+
def scan_text(text: str) -> list[dict[str, Any]]:
    """Run every deterministic check over `text` and return the hits.

    Combines four sources: regex rules from AI_PATTERN_RULES, per-line
    checks from `line_style_hits`, document-level checks from
    `_structural_analysis`, and a staccato-triplet detector (three
    consecutive short sentences; only the first such run is reported).

    Each hit has "line", "rule" and "phrase"; line-level hits also carry
    "text". The result is sorted by (line, rule).
    """
    # Local import: bisect is only needed here for offset -> line lookup.
    import bisect

    source = text or ""
    lines = source.splitlines()

    # Start offset of every line, using the same line boundaries as
    # str.splitlines(). Rule hits were previously numbered by counting "\n",
    # which disagrees with the splitlines()-based numbering used for all
    # other hits whenever the text contains other line breaks (lone "\r",
    # form feed, U+2028, ...).
    line_starts: list[int] = []
    offset = 0
    for raw in source.splitlines(keepends=True):
        line_starts.append(offset)
        offset += len(raw)

    hits: list[dict[str, Any]] = []
    for rule_id, pattern in AI_PATTERN_RULES:
        for match in pattern.finditer(source):
            snippet = match.group(0).strip()
            if not snippet:
                continue
            # Number of line starts at or before the match start is the
            # 1-based line number.
            line_no = max(1, bisect.bisect_right(line_starts, match.start()))
            hits.append({"line": line_no, "rule": rule_id, "phrase": snippet[:160]})

    for line_no, line in enumerate(lines, 1):
        for hit in line_style_hits(line):
            hits.append({"line": line_no, "rule": hit["rule"], "phrase": hit["phrase"], "text": line.strip()[:240]})

    # Structural / rhythmic analysis
    hits.extend(_structural_analysis(source))

    # Staccato triplet detection: only fire when sentences are clearly performative.
    sentence_hits = []
    for line_no, line in enumerate(lines, 1):
        for sentence in re.split(r"(?<=[.!?])\s+", line):
            found = words(sentence)
            if found:
                sentence_hits.append((line_no, sentence.strip(), len(found)))

    connector_words = {"but", "and", "or", "so", "because", "then", "if", "when", "while"}
    for idx in range(len(sentence_hits) - 2):
        window = sentence_hits[idx : idx + 3]
        if not all(count <= 5 for _, _, count in window):
            continue
        combined = " ".join(entry[1] for entry in window).lower()
        padded = f" {combined} "
        has_connector = any(f" {word} " in padded for word in connector_words)
        # Pure performance staccato: three sentences of <= 3 words, no connectors.
        pure_staccato = all(count <= 3 for _, _, count in window) and not has_connector
        has_i = bool(re.search(r"\b(?:i|we|my|our|me|us)\b", combined))
        if pure_staccato or (not has_connector and not has_i):
            hits.append(
                {
                    "line": window[0][0],
                    "rule": "voice_staccato_triplet",
                    "phrase": "three short sentences in a row reads like performance",
                    "text": window[0][1],
                }
            )
            break

    return sorted(hits, key=lambda item: (item.get("line", 0), item.get("rule", "")))
|
|
1048
|
+
|
|
1049
|
+
|
|
1050
|
+
def format_scan_text(path: str, text: str, hits: list[dict[str, Any]]) -> str:
    """Render scan hits as a plain-text report.

    With no hits a single "no issues" line is returned. Otherwise the result
    is a header with the hit count followed by one "- line N: rule - phrase"
    bullet per hit. `text` is accepted but not used.
    """
    if not hits:
        return f"{path}: no deterministic AI-pattern issues found"
    header = f"{path}: {len(hits)} issue(s)"
    bullets = [
        f"- line {hit.get('line')}: {hit.get('rule')} - {hit.get('phrase', '')}"
        for hit in hits
    ]
    return "\n".join([header, *bullets])
|
|
1058
|
+
|
|
1059
|
+
|
|
1060
|
+
def load_draft(path: str) -> tuple[str, str]:
    """Load a draft and return `(display_name, text)`.

    A path of "-" reads standard input and is labelled "stdin". Any other
    path is user-expanded and read with `read_text`.

    Raises:
        SystemExit: if the path does not exist.
    """
    if path != "-":
        draft_path = Path(path).expanduser()
        if draft_path.exists():
            return str(draft_path), read_text(draft_path)
        raise SystemExit(f"draft not found: {draft_path}")
    return "stdin", sys.stdin.read()
|
|
1067
|
+
|
|
1068
|
+
|
|
1069
|
+
def build_rewrite_prompt(draft_name: str, draft: str, profile_text: str | None, constraints: str = "", meta: dict[str, Any] | None = None) -> str:
    """Build the prompt asking a model to rewrite only the flagged lines.

    The draft is scanned with `scan_text`; when `meta` is truthy the hits are
    passed through `filter_hits_by_weights` first. The prompt contains the
    rules, the voice profile (or a placeholder), extra constraints, the list
    of flagged lines and the draft with 1-based line numbers.
    """
    found = scan_text(draft)
    if meta:
        found = filter_hits_by_weights(found, meta)

    flagged = [f"- line {hit['line']} [{hit['rule']}]: {hit.get('phrase', '')}" for hit in found]
    issue_lines = "\n".join(flagged) or "- none found by deterministic scan"

    numbered = [f"{number}: {content}" for number, content in enumerate(draft.splitlines(), 1)]
    numbered_draft = "\n".join(numbered)

    # Blank or missing sections fall back to explicit placeholders.
    profile_block = (profile_text or "").strip() or "(no voice profile supplied)"
    constraints_block = constraints.strip() or "(none)"

    return f"""Rewrite only the flagged lines. Do not rewrite the whole piece.

Return only valid JSON in this exact shape:
{{"replacements":[{{"line":1,"text":"replacement line"}}]}}

Rules:
- Include only flagged line numbers.
- Preserve unflagged lines exactly by not returning them.
- Preserve the original argument and local meaning.
- Use the voice profile as the benchmark when present.
- Remove AI cadence, polished founder cadence, abstract strategy-deck language, and generic lesson shapes.
- Do not add new sections, hooks, CTAs, markdown, bullets, or commentary.

Voice profile:
{profile_block}

Extra constraints:
{constraints_block}

Flagged lines:
{issue_lines}

Draft with line numbers ({draft_name}):
{numbered_draft}
"""
|
|
1107
|
+
|
|
1108
|
+
|
|
1109
|
+
# Stock phrases and phrase templates a profile should never produce.
DEFAULT_NEVER_LIST = [
    "here's the thing",
    "let's be honest",
    "at the end of the day",
    "not just x but y",
    "which is another way of saying",
    "in other words",
    "the moment x becomes y",
    "same x. better y.",
]

# Version tag written into signal reports.
SIGNAL_VERSION = "hold-your-voice-signal-v1"
# Version tag written into meta once temporal pattern weights are tracked.
META_SIGNAL_VERSION = "hold-your-voice-signal-v2"

# Minimum confidence for a temporal pattern to count as active.
PATTERN_CONFIDENCE_THRESHOLD = 0.30
# Lifecycle states a temporal pattern can be in.
PATTERN_STATUS = ("active", "declining", "stale")
|
|
1125
|
+
|
|
1126
|
+
|
|
1127
|
+
def lines_changed_pct(orig_line: str, acc_line: str) -> bool:
    """Return True when the two lines differ after trimming whitespace.

    Despite the name, the result is a boolean, not a percentage.
    """
    before = orig_line.strip()
    after = acc_line.strip()
    return before != after
|
|
1130
|
+
|
|
1131
|
+
|
|
1132
|
+
def build_signal_report(
    original_path: str,
    accepted_path: str,
    original_text: str,
    accepted_text: str,
    profile: dict[str, Any] | None,
) -> dict[str, Any]:
    """Compare an original draft with its accepted version and extract signals.

    Lines are compared by position, up to the shorter of the two texts. A
    flagged line that changed counts as "accepted" for each rule on it; a
    flagged line left untouched counts as "overridden". Changed lines that
    were not flagged and are longer than 40 characters are surfaced (at most
    ten, de-duplicated) as `new_removals`. `profile` is accepted but not
    used.

    Returns a report dict with the signal version, session info, both
    pattern tallies, `new_removals` and word/sentence/paragraph statistics
    of the accepted text.
    """
    orig_lines = original_text.splitlines(keepends=True)
    acc_lines = accepted_text.splitlines(keepends=True)

    # 1-based line number -> rule ids flagged on that line of the original.
    rules_by_line: dict[int, list[str]] = {}
    for hit in scan_text(original_text):
        rules_by_line.setdefault(hit["line"], []).append(hit["rule"])

    accepted_counts: dict[str, int] = {}
    overridden_counts: dict[str, int] = {}
    unflagged_edits: dict[int, str] = {}
    total_changed = 0

    paired = list(zip(orig_lines, acc_lines))
    for line_no, (before, after) in enumerate(paired, 1):
        changed = lines_changed_pct(before, after)
        if changed:
            total_changed += 1
        rules = rules_by_line.get(line_no, [])
        if rules:
            tally = accepted_counts if changed else overridden_counts
            for rule in rules:
                tally[rule] = tally.get(rule, 0) + 1
        elif changed:
            # The user edited a line nothing flagged: potential new pattern.
            trimmed = before.strip()
            if len(trimmed) > 40:
                unflagged_edits[line_no] = trimmed[:240]

    # More than 80% of compared lines changed (and more than one line).
    full_rewrite = total_changed > max(1, len(paired) * 0.8)

    # Session statistics from the accepted text.
    acc_sentence_lengths = [size for size in (len(words(item)) for item in sentences(accepted_text)) if size]
    acc_paragraph_counts = [max(1, len(sentences(block))) for block in paragraphs(accepted_text)]
    avg_s = 0
    if acc_sentence_lengths:
        avg_s = round(sum(acc_sentence_lengths) / len(acc_sentence_lengths), 1)
    avg_p = 0
    if acc_paragraph_counts:
        avg_p = round(sum(acc_paragraph_counts) / len(acc_paragraph_counts), 1)

    # A de-duplicated sample of changed-but-unflagged lines for review.
    new_removals = []
    seen_keys: set[str] = set()
    for line_no, phrase in sorted(unflagged_edits.items()):
        key = phrase.lower().strip()[:60]
        if key in seen_keys:
            continue
        seen_keys.add(key)
        new_removals.append({"line": line_no, "original_text": phrase, "context": ""})
        if len(new_removals) >= 10:
            break

    return {
        "signal_version": SIGNAL_VERSION,
        "session": {
            "original_path": original_path,
            "accepted_path": accepted_path,
            "full_rewrite": full_rewrite,
        },
        "patterns_accepted": dict(sorted(accepted_counts.items())),
        "patterns_overridden": dict(sorted(overridden_counts.items())),
        "new_removals": new_removals,
        "session_stats": {
            "original_words": len(words(original_text)),
            "accepted_words": len(words(accepted_text)),
            "accepted_avg_sentence": avg_s,
            "accepted_avg_paragraph": avg_p,
            "accepted_sentence_count": len(acc_sentence_lengths),
            "accepted_paragraph_count": len(acc_paragraph_counts),
        },
    }
|
|
1216
|
+
|
|
1217
|
+
|
|
1218
|
+
def _current_date() -> str:
    """Return today's local date as an ISO 8601 string (YYYY-MM-DD)."""
    today = datetime.date.today()
    return today.isoformat()
|
|
1220
|
+
|
|
1221
|
+
|
|
1222
|
+
def init_temporal_pattern(rule_id: str) -> dict[str, Any]:
    """Create a fresh temporal-pattern record for `rule_id`.

    The record starts at confidence 0.0 with status "active"; `first_seen`
    and `last_confirmed` are both set to today's date.
    """
    stamp = _current_date()
    entry: dict[str, Any] = {
        "id": rule_id,
        "confidence": 0.0,
        "first_seen": stamp,
        "last_confirmed": stamp,
        # sample paths that triggered this pattern
        "source_samples": [],
        # dates when the pattern was flagged but overridden by the user
        "contradictions": [],
        "superseded_by": None,
        "status": "active",
    }
    return entry
|
|
1235
|
+
|
|
1236
|
+
|
|
1237
|
+
def evolve_meta_from_signal(
    meta: dict[str, Any],
    patterns_accepted: dict[str, int],
    patterns_overridden: dict[str, int],
    source_samples: list[str] | None = None,
) -> dict[str, Any]:
    """Update temporal pattern weights in `meta` from accept/override signals.

    Accepted rules gain confidence (0.08 per hit, capped at 0.40 per call),
    are re-confirmed today and set back to "active". Overridden rules record
    a contradiction and lose confidence (0.12 per hit, capped at 0.50); with
    enough contradictions and low enough confidence they become "declining"
    or "stale". Patterns last confirmed more than 14 days ago decay slowly
    and turn "stale" once an active pattern drops below
    PATTERN_CONFIDENCE_THRESHOLD.

    `meta` is modified in place and also returned. Entries loaded from
    storage may lack some keys; missing values default the same way a
    freshly initialised entry would.
    """
    now = _current_date()
    today = datetime.date.today()
    temporal = meta.get("temporal_patterns", {})

    def entry_for(rule_id: str) -> dict[str, Any]:
        # Fetch the record for a rule, creating it on first sight.
        tp = temporal.get(rule_id)
        if tp is None:
            tp = init_temporal_pattern(rule_id)
            temporal[rule_id] = tp
        return tp

    for rule_id, count in patterns_accepted.items():
        tp = entry_for(rule_id)
        tp["last_confirmed"] = now
        known_samples = tp.setdefault("source_samples", [])
        for sample in source_samples or ():
            if sample not in known_samples:
                known_samples.append(sample)
        # accepted signals boost confidence; the boost is capped per session
        boost = min(count * 0.08, 0.40)
        tp["confidence"] = min(1.0, tp.get("confidence", 0.0) + boost)
        tp["status"] = "active"

    for rule_id, count in patterns_overridden.items():
        tp = entry_for(rule_id)
        contradictions = tp.setdefault("contradictions", [])
        contradictions.append({"date": now, "count": count})
        # overridden signals decrease confidence faster than accepts raise it
        penalty = min(count * 0.12, 0.50)
        tp["confidence"] = max(0.0, tp.get("confidence", 0.0) - penalty)
        if len(contradictions) >= 3 and tp["confidence"] < 0.30:
            tp["status"] = "declining"
        if len(contradictions) >= 5 and tp["confidence"] < 0.15:
            tp["status"] = "stale"

    # decay patterns whose last_confirmed is more than 14 days ago
    for tp in temporal.values():
        last = tp.get("last_confirmed", now)
        try:
            days_since = (today - datetime.date.fromisoformat(last)).days
        except (ValueError, TypeError):
            # Unparseable or non-string date: treat as just confirmed.
            days_since = 0
        if days_since <= 14:
            continue
        decay = min(days_since * 0.005, 0.15)  # slow decay over time
        tp["confidence"] = max(0.0, tp.get("confidence", 0.0) - decay)
        if tp["confidence"] < PATTERN_CONFIDENCE_THRESHOLD and tp.get("status") == "active":
            tp["status"] = "stale"

    meta["temporal_patterns"] = temporal
    meta["signal_version"] = META_SIGNAL_VERSION
    meta["last_updated"] = now
    meta["signal_count"] = meta.get("signal_count", 0) + sum(patterns_accepted.values()) + sum(patterns_overridden.values())

    return meta
|
|
1302
|
+
|
|
1303
|
+
|
|
1304
|
+
def get_active_patterns(meta: dict[str, Any]) -> list[str]:
    """Return rule ids whose status is "active" and whose confidence meets the threshold.

    A missing confidence is treated as 0; the threshold is
    PATTERN_CONFIDENCE_THRESHOLD.
    """
    active: list[str] = []
    for rid, tp in meta.get("temporal_patterns", {}).items():
        if tp.get("status") == "active" and tp.get("confidence", 0) >= PATTERN_CONFIDENCE_THRESHOLD:
            active.append(rid)
    return active
|
|
1311
|
+
|
|
1312
|
+
|
|
1313
|
+
def get_declining_patterns(meta: dict[str, Any]) -> list[str]:
    """Return rule ids whose status is "declining" or "stale"."""
    fading = ("declining", "stale")
    selected: list[str] = []
    for rid, tp in meta.get("temporal_patterns", {}).items():
        if tp.get("status") in fading:
            selected.append(rid)
    return selected
|
|
1317
|
+
|
|
1318
|
+
|
|
1319
|
+
def filter_hits_by_weights(hits: list[dict[str, Any]], meta: dict[str, Any]) -> list[dict[str, Any]]:
    """Drop hits whose rule has been learned as not applicable to this voice.

    A rule is suppressed when its temporal-pattern status in `meta` is
    "declining" or "stale". When there is nothing to suppress the original
    `hits` list is returned unchanged; otherwise a new list is returned.
    """
    temporal = meta.get("temporal_patterns", {})
    if not temporal:
        return hits
    suppressed = {
        rid for rid, tp in temporal.items()
        if tp.get("status") in ("declining", "stale")
    }
    if not suppressed:
        return hits
    return [hit for hit in hits if hit.get("rule") not in suppressed]
|
|
1331
|
+
|
|
1332
|
+
|
|
1333
|
+
def evolve_profile(
    profile: dict[str, Any],
    meta: dict[str, Any],
    original_text: str,
    accepted_text: str,
    original_path: str = "original",
    accepted_path: str = "accepted",
    new_samples_text: str | None = None,
) -> tuple[dict[str, Any], dict[str, Any]]:
    """One-shot evolution step to run after a writing session.

    1. diff original vs accepted with `build_signal_report` to get the
       accept/override tallies
    2. feed those tallies to `evolve_meta_from_signal`, recording
       `original_path` as the source sample
    3. if `new_samples_text` is non-empty, merge it into the profile with
       `update_profile`

    Returns `(updated_profile, updated_meta)`. Hits are not filtered here.
    """
    signal = build_signal_report(original_path, accepted_path, original_text, accepted_text, profile)
    updated_meta = evolve_meta_from_signal(
        meta,
        signal["patterns_accepted"],
        signal["patterns_overridden"],
        source_samples=[original_path],
    )
    updated_profile = profile
    if new_samples_text:
        updated_profile = update_profile(profile, new_samples_text.strip())
    return updated_profile, updated_meta
|
|
1360
|
+
|
|
1361
|
+
|
|
1362
|
+
def update_profile(profile: dict[str, Any], new_samples_text: str) -> dict[str, Any]:
    """merge new writing samples into an existing profile using rolling averages.

    the existing profile's averages are weighted by its source_count (default 1
    when missing); the new text counts as exactly one additional source. values
    that aren't simple averages (opening_moves, anchors) use a merge strategy
    rather than a formula:
    - opening_moves: new top moves first, then existing ones, de-duplicated, max 8
    - anchors: existing anchors first, then new ones whose first 80 characters
      (case-insensitive) are not already present, max 5

    the profile dict is updated in place and also returned. blank or
    whitespace-only text leaves the profile completely untouched. a profile
    without a "signature" section gets one created.
    """
    if not new_samples_text.strip():
        # nothing to learn from; do not inflate source_count with an empty sample
        return profile

    sentence_list = sentences(new_samples_text)
    paragraph_list = paragraphs(new_samples_text)
    # word count per sentence, skipping sentences that contain no words
    sentence_lengths = [count for count in (len(words(s)) for s in sentence_list) if count]
    paragraph_sentence_counts = [max(1, len(sentences(p))) for p in paragraph_list]

    old_count = profile.get("source_count", 1)
    new_count = 1  # treating this update as one new source
    total_count = old_count + new_count

    def merge_average(old_avg: float, samples: list[int]) -> float:
        """weighted mean of the stored average and the new samples' average."""
        new_avg = round(sum(samples) / len(samples), 1) if samples else 0
        if old_avg and new_avg:
            return round((old_avg * old_count + new_avg * new_count) / total_count, 1)
        # one side has no data: keep whichever side does
        return old_avg or new_avg

    merged_avg_words = merge_average(profile.get("sentence", {}).get("avg_words", 0), sentence_lengths)
    merged_avg_par = merge_average(profile.get("paragraph", {}).get("avg_sentences", 0), paragraph_sentence_counts)

    # setdefault (not .get) so the writes below cannot raise KeyError on a
    # profile that has no signature section yet
    signature = profile.setdefault("signature", {})

    # merge opening moves: new top moves first, then old ones, without duplicates
    new_moves = top_opening_moves(paragraph_list, 4)
    merged_moves = list(dict.fromkeys(new_moves + signature.get("opening_moves", [])))[:8]

    # merge anchors: keep old anchors, add new ones that aren't near-duplicates
    merged_anchors = list(signature.get("anchors", []))
    seen = {anchor[:80].lower() for anchor in merged_anchors}
    for anchor in choose_anchors(paragraph_list, 2):
        key = anchor[:80].lower()
        if key not in seen:
            seen.add(key)
            merged_anchors.append(anchor)
        if len(merged_anchors) >= 5:
            break

    # raw sentence lengths are not stored, so a combined variance cannot be
    # recomputed: the new samples' label wins whenever there are at least 3
    # sentences to judge from, otherwise the stored label is kept
    new_variance = variance_label(sentence_lengths) if len(sentence_lengths) >= 3 else None
    merged_variance = new_variance or profile.get("sentence", {}).get("variance", "medium")

    # cadence notes are regenerated from the merged stats (previous notes are replaced)
    updated_cadence = [
        f"average sentence length around {merged_avg_words} words",
        f"sentence length variance is {merged_variance}",
        f"average paragraph length around {merged_avg_par} sentences",
    ]
    if signature.get("case_style", "mixed") == "mostly lowercase":
        updated_cadence.append("leans lowercase in visible prose")

    # rebuild profile
    profile["source_count"] = total_count
    profile["word_count"] = profile.get("word_count", 0) + len(words(new_samples_text))
    profile["sentence"] = {"avg_words": merged_avg_words, "variance": merged_variance}
    profile["paragraph"] = {"avg_sentences": merged_avg_par}
    signature["opening_moves"] = merged_moves
    signature["anchors"] = merged_anchors[:5]
    signature["cadence"] = updated_cadence

    return profile
|
|
1438
|
+
|
|
1439
|
+
|
|
1440
|
+
def cmd_profile_update(args: argparse.Namespace) -> int:
    """merge new samples into an existing profile.

    reads the profile JSON at args.profile, gathers text from every file found
    under args.paths, merges it with update_profile and writes the result to
    args.out, or back to the profile file itself when --out is omitted (the
    CLI help documents in-place as the default).

    returns 0; exits via SystemExit when the profile file does not exist.
    """
    profile_path = Path(args.profile).expanduser()
    if not profile_path.exists():
        raise SystemExit(f"profile not found: {profile_path}")
    profile = json.loads(profile_path.read_text(encoding="utf-8", errors="ignore"))

    # collect everything first and join once instead of growing a string in a loop
    samples = [
        read_text(path)
        for raw_path in args.paths
        for path in iter_text_files([raw_path])
    ]
    combined_text = "\n\n".join(samples).strip()
    if not combined_text:
        print("no new text samples found; profile unchanged")
        return 0

    profile = update_profile(profile, combined_text)
    rendered = json.dumps(profile, indent=2, ensure_ascii=False)
    # without --out the update is applied in place, as the --out help text promises
    write_or_print(rendered, args.out or str(profile_path))
    return 0
|
|
1461
|
+
|
|
1462
|
+
|
|
1463
|
+
def cmd_profile_export(args: argparse.Namespace) -> int:
    """bundle a profile + optional meta into a portable .hyv file.

    the bundle is a JSON object with "bundle_version", "exported_at" (local
    time, second precision) and "profile"; "meta" is added only when --meta is
    given and that file exists. output goes through write_or_print(args.out).

    returns 0; exits via SystemExit when the profile file does not exist.
    """
    profile_file = Path(args.profile).expanduser()
    if not profile_file.exists():
        raise SystemExit(f"profile not found: {profile_file}")
    loaded_profile = json.loads(profile_file.read_text(encoding="utf-8", errors="ignore"))

    bundle: dict[str, Any] = {
        "bundle_version": "hold-your-voice-bundle-v1",
        "exported_at": datetime.datetime.now().isoformat(timespec="seconds"),
        "profile": loaded_profile,
    }

    # a missing meta file is skipped silently; meta is optional in a bundle
    meta_file = Path(args.meta).expanduser() if args.meta else None
    if meta_file is not None and meta_file.exists():
        bundle["meta"] = json.loads(meta_file.read_text(encoding="utf-8", errors="ignore"))

    write_or_print(json.dumps(bundle, indent=2, ensure_ascii=False), args.out)
    return 0
|
|
1481
|
+
|
|
1482
|
+
|
|
1483
|
+
def cmd_profile_import(args: argparse.Namespace) -> int:
    """import a .hyv bundle into a destination profile.

    behaviour:
    - destination missing or empty: the bundle's profile is copied as-is.
    - otherwise source_count and word_count are summed and the signature's
      opening_moves (max 8) and anchors (max 5) are merged without duplicates.
      the side with the strictly higher source_count is listed first; when the
      source wins, its sentence/paragraph stats replace the destination's and
      the never_list entries of both sides are merged as well.

    the merged profile is written to args.profile and any bundled meta is merged
    through _merge_import_meta, always using the "~"-expanded destination path.

    returns 0; exits via SystemExit when the bundle is missing or its
    bundle_version is not recognised.
    """
    source_path = Path(args.source).expanduser()
    if not source_path.exists():
        raise SystemExit(f"source not found: {source_path}")
    source = json.loads(source_path.read_text(encoding="utf-8", errors="ignore"))
    if source.get("bundle_version") != "hold-your-voice-bundle-v1":
        raise SystemExit(f"unknown bundle version: {source.get('bundle_version')}")

    dest_path = Path(args.profile).expanduser()
    dest_profile: dict[str, Any] = {}
    if dest_path.exists():
        dest_profile = json.loads(dest_path.read_text(encoding="utf-8", errors="ignore"))

    source_profile = source.get("profile", {})

    if not dest_profile:
        # if destination is empty, this is a pure copy of the source profile
        dest_profile = dict(source_profile)
    else:
        src_count = source_profile.get("source_count", 0)
        dest_count = dest_profile.get("source_count", 0)
        # the side with more sources carries more signal and takes precedence
        source_wins = src_count > dest_count

        dest_profile["source_count"] = dest_count + src_count
        dest_profile["word_count"] = dest_profile.get("word_count", 0) + source_profile.get("word_count", 0)
        if source_wins:
            dest_profile["sentence"] = source_profile.get("sentence", {})
            dest_profile["paragraph"] = source_profile.get("paragraph", {})

        dest_sig = dest_profile.get("signature", {})
        src_sig = source_profile.get("signature", {})
        first, second = (src_sig, dest_sig) if source_wins else (dest_sig, src_sig)

        merged_moves = list(dict.fromkeys(first.get("opening_moves", []) + second.get("opening_moves", [])))[:8]
        merged_anchors = list(dict.fromkeys(first.get("anchors", []) + second.get("anchors", [])))[:5]
        dest_sig["opening_moves"] = merged_moves
        dest_sig["anchors"] = merged_anchors
        if source_wins:
            # never_list is only merged when the source takes precedence;
            # otherwise the destination's list is left as it is
            dest_sig["never_list"] = list(
                dict.fromkeys(
                    src_sig.get("never_list", DEFAULT_NEVER_LIST)
                    + dest_sig.get("never_list", DEFAULT_NEVER_LIST)
                )
            )
        dest_profile["signature"] = dest_sig

    write_or_print(json.dumps(dest_profile, indent=2, ensure_ascii=False), args.profile)
    # _merge_import_meta appends " + <meta path>" to this line when it writes meta
    print(f"imported into {args.profile}", end="")
    # pass the expanded path: the default meta location is derived from it
    _merge_import_meta(source, args, str(dest_path))
    print()
    return 0
|
|
1550
|
+
|
|
1551
|
+
|
|
1552
|
+
def _merge_import_meta(source: dict[str, Any], args: argparse.Namespace, dest_profile_path: str) -> None:
    """merge meta from a .hyv bundle into a destination meta file.

    does nothing when the bundle carries no (or empty) "meta". the destination
    is args.meta when given, otherwise dest_profile_path with its suffix
    replaced by ".meta.json"; "~" is expanded in both cases.

    merge rules:
    - pattern_weights: per key, the larger weight wins; keys only present in
      the bundle are added. values that cannot be compared keep the
      destination's value.
    - signal_count: the two counts are summed.

    an unreadable, invalid or non-object destination meta file is treated as
    empty. the result is written as JSON and " + <path>" is printed.
    """
    source_meta = source.get("meta", {})
    if not source_meta:
        return

    if args.meta:
        mpath = Path(args.meta).expanduser()
    else:
        # expand here as well: a caller may hand over the raw CLI value
        mpath = Path(dest_profile_path).expanduser().with_suffix(".meta.json")

    dest_meta: dict[str, Any] = {}
    if mpath.exists():
        try:
            loaded = json.loads(mpath.read_text(encoding="utf-8", errors="ignore"))
        except (json.JSONDecodeError, OSError):
            loaded = {}
        # anything but a JSON object cannot be merged into; start fresh
        if isinstance(loaded, dict):
            dest_meta = loaded

    # merge pattern_weights: take the higher weight per pattern
    dest_weights = dest_meta.get("pattern_weights", {})
    src_weights = source_meta.get("pattern_weights", {})
    for key, src_w in src_weights.items():
        if key not in dest_weights:
            dest_weights[key] = src_w
            continue
        try:
            source_is_higher = src_w > dest_weights[key]
        except TypeError:
            # mixed/non-numeric values: keep what the destination already has
            source_is_higher = False
        if source_is_higher:
            dest_weights[key] = src_w
    if src_weights:
        dest_meta["pattern_weights"] = dest_weights
    dest_meta["signal_count"] = dest_meta.get("signal_count", 0) + source_meta.get("signal_count", 0)

    mpath.parent.mkdir(parents=True, exist_ok=True)
    mpath.write_text(json.dumps(dest_meta, indent=2, ensure_ascii=False), encoding="utf-8")
    print(f" + {mpath}")
|
|
1584
|
+
|
|
1585
|
+
|
|
1586
|
+
def render_voice_md(profile: dict[str, Any], meta: dict[str, Any]) -> str:
    """render voice.md - the human-readable voice profile summary.

    sections are emitted in a fixed order and optional ones are skipped when
    their data is empty: title, evolved pattern weights (sorted by descending
    confidence), voice stats, cadence, opening moves (first 6), banned
    patterns, voice anchors (first 3, cut to 240 chars), sources (first 10)
    and a closing "last updated" line when any signals were processed.
    missing scalar values render as "?".
    """
    sentence_stats = profile.get("sentence", {})
    paragraph_stats = profile.get("paragraph", {})
    signature = profile.get("signature", {})
    pattern_table = meta.get("temporal_patterns", {})
    total_signals = meta.get("signal_count", 0)

    out: list[str] = [
        f"# voice for {profile.get('name', 'unnamed')}",
        "",
        "> continuously learned by hold your voice from your writing signals",
        "",
    ]

    # temporal pattern weights, strongest first
    if pattern_table:
        out += [
            "# evolved pattern weights",
            "",
            "| pattern | confidence | status | confirmed |",
            "|---------|-----------|--------|-----------|",
        ]
        ranked = sorted(pattern_table.items(), key=lambda item: -item[1].get("confidence", 0))
        out += [
            f"| {pattern_id} | {entry.get('confidence', 0):.2f} "
            f"| {entry.get('status', 'active')} | {entry.get('last_confirmed', '?')} |"
            for pattern_id, entry in ranked
        ]
        if total_signals:
            out += ["", f"_(based on {total_signals} signals)_"]
        out.append("")

    # voice stats section (always present)
    out += [
        "# voice stats",
        "",
        f"- sentence length: {sentence_stats.get('avg_words', '?')} words avg "
        f"(`{sentence_stats.get('variance', '?')}` variance)",
        f"- paragraph length: {paragraph_stats.get('avg_sentences', '?')} sentences avg",
        f"- case style: {signature.get('case_style', '?')}",
        f"- argument pattern: {signature.get('argument_pattern', '?')}",
        "",
    ]

    cadence_notes = signature.get("cadence", [])
    if cadence_notes:
        out += ["# cadence", ""]
        out += [f"- {note}" for note in cadence_notes]
        out.append("")

    opening_moves = signature.get("opening_moves", [])
    if opening_moves:
        out += ["# opening moves", ""]
        out += [f'{rank}. "{move}..."' for rank, move in enumerate(opening_moves[:6], 1)]
        out.append("")

    banned = signature.get("never_list", [])
    if banned:
        out += ["# banned patterns", ""]
        out += [f"- {phrase}" for phrase in banned]
        out.append("")

    voice_anchors = signature.get("anchors", [])
    if voice_anchors:
        out += ["# voice anchors", ""]
        out += [f"> {anchor[:240]}" for anchor in voice_anchors[:3]]
        out.append("")

    source_entries = profile.get("sources", [])
    if source_entries:
        out.append("# sources")
        out.append(f"profile built from {profile.get('source_count', len(source_entries))} source(s):")
        out += [
            f"- [{entry.get('path', '?')}]({entry.get('path', '?')}) ({entry.get('chars', 0)} chars)"
            for entry in source_entries[:10]
        ]
        out.append("")

    # closing meta line
    if total_signals:
        out.append(f"*last updated: {meta.get('last_updated', 'unknown')} | signals processed: {total_signals}*")
        out.append("")

    return "\n".join(out)
|
|
1677
|
+
|
|
1678
|
+
|
|
1679
|
+
def cmd_profile_status(args: argparse.Namespace) -> int:
    """pretty-print the learning state of a profile.

    loads the profile JSON at args.profile and, when available, the meta JSON
    (args.meta, or the profile path with its suffix replaced by ".meta.json");
    an unreadable or invalid meta file is treated as empty. prints a plain-text
    report to stdout: header, evolved pattern weights with a 20-cell confidence
    bar, voice stats, opening moves (first 6), banned phrases (first 8) and
    sources (first 5).

    when args.write_voice is set, the markdown from render_voice_md is also
    printed ("-") or written to that path.

    returns 0; exits via SystemExit when the profile file does not exist.
    """
    profile_path = Path(args.profile).expanduser()
    if not profile_path.exists():
        raise SystemExit(f"profile not found: {profile_path}")
    profile = json.loads(profile_path.read_text(encoding="utf-8", errors="ignore"))

    # try to load meta if present
    meta: dict[str, Any] = {}
    meta_path = Path(args.meta).expanduser() if args.meta else profile_path.with_suffix(".meta.json")
    if meta_path.exists():
        try:
            meta = json.loads(meta_path.read_text(encoding="utf-8", errors="ignore"))
        except (json.JSONDecodeError, OSError):
            meta = {}

    lines: list[str] = []

    # header
    name = profile.get("name", "unnamed")
    ver = profile.get("profile_version", "?")
    lines.append(f"voice profile: {name}")
    lines.append(f" version: {ver}")
    lines.append(f" source_count: {profile.get('source_count', 0)}")
    lines.append(f" word_count: {profile.get('word_count', 0)}")
    signal_count = meta.get("signal_count", 0)
    lines.append(f" signals_processed: {signal_count}")
    if meta.get("last_updated"):
        lines.append(f" last_updated: {meta['last_updated']}")
    lines.append("")

    # temporal pattern weights
    temporal = meta.get("temporal_patterns", {})
    if temporal:
        bar_width = 20
        # a row's confidence cell is the bar, a space and the "0.00" value,
        # so the header column must be that wide for the columns to line up
        conf_width = bar_width + 1 + 4
        lines.append("pattern weights (evolved):")
        lines.append(f" {'pattern':<35} {'confidence':<{conf_width}} {'status':<12} {'confirmed':<12}")
        lines.append(f" {'─'*34:<35} {'─'*(conf_width - 1):<{conf_width}} {'─'*11:<12} {'─'*11:<12}")
        sorted_patterns = sorted(temporal.items(), key=lambda x: -x[1].get("confidence", 0))
        for pid, tp in sorted_patterns:
            w = tp.get("confidence", 0)
            # clamp so an out-of-range confidence cannot distort the bar length
            bar_len = max(0, min(bar_width, int(w * bar_width)))
            bar = "█" * bar_len + "░" * (bar_width - bar_len)
            # str(): a JSON null would otherwise crash the width format spec
            status = str(tp.get("status", "active"))
            confirmed = str(tp.get("last_confirmed", "?"))
            lines.append(f" {pid:<35} {bar} {w:.2f} {status:<12} {confirmed:<12}")
        lines.append("")

    # voice stats
    lines.append("voice stats:")
    sentence = profile.get("sentence", {})
    paragraph = profile.get("paragraph", {})
    lines.append(f" sentence length: {sentence.get('avg_words', '?')} words avg ({sentence.get('variance', '?')} variance)")
    lines.append(f" paragraph length: {paragraph.get('avg_sentences', '?')} sentences avg")
    sig = profile.get("signature", {})
    lines.append(f" case style: {sig.get('case_style', '?')}")
    lines.append(f" argument pattern: {sig.get('argument_pattern', '?')}")
    lines.append("")

    # opening moves
    moves = sig.get("opening_moves", [])
    if moves:
        lines.append("top opening moves:")
        for i, move in enumerate(moves[:6], 1):
            lines.append(f' {i}. "{move}..."')
        lines.append("")

    # never_list
    never_list = sig.get("never_list", [])
    if never_list:
        lines.append(f"banned phrases: {len(never_list)}")
        for phrase in never_list[:8]:
            lines.append(f" - {phrase}")
        lines.append("")

    # sources
    sources = profile.get("sources", [])
    if sources:
        lines.append(f"sources ({len(sources)}):")
        for s in sources[:5]:
            lines.append(f" - {s.get('path', '?')} ({s.get('chars', 0)} chars)")
        if len(sources) > 5:
            lines.append(f" ... and {len(sources) - 5} more")
        lines.append("")

    print("\n".join(lines))

    # optionally write the voice markdown
    if args.write_voice:
        voice_md = render_voice_md(profile, meta)
        voice_path = args.write_voice
        if voice_path == "-":
            print("--- voice.md ---")
            print(voice_md)
        else:
            out_path = Path(voice_path).expanduser()
            out_path.parent.mkdir(parents=True, exist_ok=True)
            out_path.write_text(voice_md, encoding="utf-8")
            print(f"\nvoice written to {out_path}")

    return 0
|
|
1784
|
+
|
|
1785
|
+
|
|
1786
|
+
def cmd_reinforce(args: argparse.Namespace) -> int:
    """diff original vs accepted draft and emit a signal report.

    both drafts are loaded first (load_draft), then the optional profile JSON.
    the report from build_signal_report is serialised as JSON and sent to
    write_or_print(args.out).

    returns 0; exits via SystemExit when --profile is given but missing.
    """
    original_name, original_text = load_draft(args.original)
    accepted_name, accepted_text = load_draft(args.accepted)

    voice_profile: dict[str, Any] | None = None
    if args.profile:
        profile_file = Path(args.profile).expanduser()
        if not profile_file.exists():
            raise SystemExit(f"profile not found: {profile_file}")
        voice_profile = json.loads(profile_file.read_text(encoding="utf-8", errors="ignore"))

    signal_report = build_signal_report(
        original_name, accepted_name, original_text, accepted_text, voice_profile
    )
    write_or_print(json.dumps(signal_report, indent=2, ensure_ascii=False), args.out)
    return 0
|
|
1799
|
+
|
|
1800
|
+
|
|
1801
|
+
def cmd_profile_evolve(args: argparse.Namespace) -> int:
    """one-shot evolution: extract signals, update meta, merge profile stats.

    loads the profile (required) and meta (optional; an unreadable or invalid
    file counts as empty), runs evolve_profile on the original/accepted drafts
    plus any --new-samples text, then overwrites both the profile file and the
    meta file. prints a short summary and finally attempts a cloud sync via
    _auto_sync.

    returns 0; exits via SystemExit when the profile file does not exist.
    """
    profile_file = Path(args.profile).expanduser()
    if not profile_file.exists():
        raise SystemExit(f"profile not found: {profile_file}")
    voice_profile = json.loads(profile_file.read_text(encoding="utf-8", errors="ignore"))

    if args.meta:
        meta_file = Path(args.meta).expanduser()
    else:
        meta_file = profile_file.with_suffix(".meta.json")
    learned: dict[str, Any] = {}
    if meta_file.exists():
        try:
            learned = json.loads(meta_file.read_text(encoding="utf-8", errors="ignore"))
        except (json.JSONDecodeError, OSError):
            learned = {}

    original_name, original_text = load_draft(args.original)
    accepted_name, accepted_text = load_draft(args.accepted)

    # optional extra samples: every text file found under each given path
    extra_text: str | None = None
    if args.new_samples:
        sample_texts = [
            read_text(found)
            for raw in args.new_samples
            for found in iter_text_files([raw])
        ]
        if sample_texts:
            extra_text = "\n\n".join(sample_texts)

    voice_profile, learned = evolve_profile(
        voice_profile,
        learned,
        original_text,
        accepted_text,
        original_path=original_name,
        accepted_path=accepted_name,
        new_samples_text=extra_text,
    )

    profile_file.write_text(json.dumps(voice_profile, indent=2, ensure_ascii=False), encoding="utf-8")
    meta_file.parent.mkdir(parents=True, exist_ok=True)
    meta_file.write_text(json.dumps(learned, indent=2, ensure_ascii=False), encoding="utf-8")

    active_patterns = get_active_patterns(learned)
    fading_patterns = get_declining_patterns(learned)
    print(f"evolved {profile_file}")
    print(f" meta: {meta_file}")
    print(f" active patterns: {len(active_patterns)}")
    print(f" declining/stale: {len(fading_patterns)}")
    print(f" total signals: {learned.get('signal_count', 0)}")

    if _auto_sync(profile_file, meta_file):
        print(" synced to cloud (R2)")
    return 0
|
|
1851
|
+
|
|
1852
|
+
|
|
1853
|
+
def write_or_print(value: str, out: str | None) -> None:
    """write `value` to the file `out`, or print it when `out` is empty/None.

    when writing, "~" is expanded, missing parent directories are created, the
    text is stored as UTF-8 and the resulting path is printed to stdout.
    """
    if not out:
        print(value)
        return
    target = Path(out).expanduser()
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(value, encoding="utf-8")
    print(target)
|
|
1861
|
+
|
|
1862
|
+
|
|
1863
|
+
def cmd_profile(args: argparse.Namespace) -> int:
    """build a voice profile from args.paths and emit it as indented JSON.

    the profile comes from build_profile(args.paths, args.name); the JSON goes
    to write_or_print(args.out). returns 0.
    """
    write_or_print(
        json.dumps(build_profile(args.paths, args.name), indent=2, ensure_ascii=False),
        args.out,
    )
    return 0
|
|
1868
|
+
|
|
1869
|
+
|
|
1870
|
+
def _auto_sync(profile_path: Path, meta_path: Path) -> bool:
    """best-effort cloud sync through the sibling hold_voice_sync.py script.

    runs that script with the current interpreter, passing --profile and
    --meta, with its output captured and a 30 second timeout. returns True only
    when the script exits with status 0; a missing script, a timeout or an
    OSError all yield False without raising.

    NOTE(review): nothing here limits how often a sync runs; any throttling
    (e.g. "at most once per ~23h") and the handling of missing credentials or
    libraries would have to live in the sync script - confirm there.
    """
    helper = Path(__file__).resolve().parent / "hold_voice_sync.py"
    if not helper.exists():
        return False

    # imported lazily: only needed once a sync script is actually present
    import subprocess

    command = [
        sys.executable,
        str(helper),
        "--profile",
        str(profile_path),
        "--meta",
        str(meta_path),
    ]
    try:
        completed = subprocess.run(command, capture_output=True, text=True, timeout=30)
    except (subprocess.TimeoutExpired, OSError):
        return False
    return completed.returncode == 0
|
|
1885
|
+
|
|
1886
|
+
|
|
1887
|
+
def cmd_scan(args: argparse.Namespace) -> int:
    """scan each draft in args.paths for AI-writing patterns and print the result.

    when --meta points at a readable JSON file, hits are additionally passed
    through filter_hits_by_weights; a missing or invalid meta file is ignored.
    output is either one JSON document ({"files": [...]}) or the per-file text
    reports separated by blank lines, depending on args.format.

    returns 2 when --fail-on-hit is set and at least one hit remained, else 0.
    """
    learned: dict[str, Any] = {}
    if args.meta:
        meta_file = Path(args.meta).expanduser()
        if meta_file.exists():
            try:
                learned = json.loads(meta_file.read_text(encoding="utf-8", errors="ignore"))
            except (json.JSONDecodeError, OSError):
                # best effort: scan without learned filtering
                pass

    file_reports = []
    text_reports = []
    for raw_path in args.paths:
        draft_name, draft_text = load_draft(raw_path)
        issues = scan_text(draft_text)
        if learned:
            issues = filter_hits_by_weights(issues, learned)
        file_reports.append({"path": draft_name, "issue_count": len(issues), "issues": issues})
        text_reports.append(format_scan_text(draft_name, draft_text, issues))

    if args.format == "json":
        print(json.dumps({"files": file_reports}, indent=2, ensure_ascii=False))
    else:
        print("\n\n".join(text_reports))

    any_hits = any(report["issue_count"] for report in file_reports)
    if args.fail_on_hit and any_hits:
        return 2
    return 0
|
|
1915
|
+
|
|
1916
|
+
|
|
1917
|
+
def cmd_rewrite_prompt(args: argparse.Namespace) -> int:
    """generate a line-level rewrite prompt for a draft.

    the draft is loaded first; the optional profile is passed on as raw text
    (not parsed) and the optional meta as a parsed dict, or None when the file
    is missing, unreadable or invalid. the prompt from build_rewrite_prompt is
    sent to write_or_print(args.out).

    returns 0; exits via SystemExit when --profile is given but missing.
    """
    draft_name, draft_text = load_draft(args.draft)

    raw_profile = None
    if args.profile:
        profile_file = Path(args.profile).expanduser()
        if not profile_file.exists():
            raise SystemExit(f"profile not found: {profile_file}")
        raw_profile = profile_file.read_text(encoding="utf-8", errors="ignore")

    learned: dict[str, Any] | None = None
    meta_file = Path(args.meta).expanduser() if args.meta else None
    if meta_file is not None and meta_file.exists():
        try:
            learned = json.loads(meta_file.read_text(encoding="utf-8", errors="ignore"))
        except (json.JSONDecodeError, OSError):
            learned = None

    rewrite_prompt = build_rewrite_prompt(
        draft_name, draft_text, raw_profile, args.constraints or "", meta=learned
    )
    write_or_print(rewrite_prompt, args.out)
    return 0
|
|
1936
|
+
|
|
1937
|
+
|
|
1938
|
+
def build_parser() -> argparse.ArgumentParser:
    """create the command-line parser with one sub-command per cmd_* handler.

    every sub-parser stores its handler under `func` (via set_defaults), which
    is how the selected command gets dispatched. a sub-command is required.
    """
    root = argparse.ArgumentParser(description="Portable Hold Your Voice helpers")
    commands = root.add_subparsers(dest="command", required=True)

    # profile: build a fresh profile from samples
    build_cmd = commands.add_parser("profile", help="build a voice profile from sample files or directories")
    build_cmd.add_argument("paths", nargs="+", help="sample files or directories")
    build_cmd.add_argument("--name", default="project voice", help="profile name")
    build_cmd.add_argument("--out", help="write profile JSON to this path")
    build_cmd.set_defaults(func=cmd_profile)

    # scan: detect AI-writing patterns in drafts
    scan_cmd = commands.add_parser("scan", help="scan drafts for AI-writing patterns")
    scan_cmd.add_argument("paths", nargs="+", help="draft files, or '-' for stdin")
    scan_cmd.add_argument("--format", choices=["json", "text"], default="json")
    scan_cmd.add_argument("--fail-on-hit", action="store_true", help="exit 2 when issues are found")
    scan_cmd.add_argument("--meta", help="meta JSON file for learned pattern filtering")
    scan_cmd.set_defaults(func=cmd_scan)

    # rewrite-prompt: produce a prompt for line-level rewriting
    rewrite_cmd = commands.add_parser("rewrite-prompt", help="generate a line-level rewrite prompt")
    rewrite_cmd.add_argument("draft", help="draft file, or '-' for stdin")
    rewrite_cmd.add_argument("--profile", help="voice profile JSON file")
    rewrite_cmd.add_argument("--constraints", default="", help="extra rewrite constraints")
    rewrite_cmd.add_argument("--out", help="write prompt to this path")
    rewrite_cmd.add_argument("--meta", help="meta JSON file for learned pattern filtering")
    rewrite_cmd.set_defaults(func=cmd_rewrite_prompt)

    # profile-update: fold new samples into an existing profile
    update_cmd = commands.add_parser(
        "profile-update",
        help="merge new writing samples into an existing profile using rolling averages",
    )
    update_cmd.add_argument("--profile", required=True, help="existing profile JSON file")
    update_cmd.add_argument("paths", nargs="+", help="new sample files or directories")
    update_cmd.add_argument("--out", help="write updated profile to this path (default: in-place)")
    update_cmd.set_defaults(func=cmd_profile_update)

    # profile-export: write a portable bundle
    export_cmd = commands.add_parser("profile-export", help="bundle a voice profile into a portable .hyv file")
    export_cmd.add_argument("--profile", required=True, help="voice profile JSON file")
    export_cmd.add_argument("--meta", help="optional signal meta JSON file to include")
    export_cmd.add_argument("--out", required=True, help="output .hyv file path")
    export_cmd.set_defaults(func=cmd_profile_export)

    # profile-import: merge a bundle into a profile
    import_cmd = commands.add_parser("profile-import", help="import a .hyv bundle into a destination profile")
    import_cmd.add_argument("--profile", required=True, help="destination profile JSON file (will be updated)")
    import_cmd.add_argument("--meta", help="destination meta JSON file path (default: profile path with .meta.json)")
    import_cmd.add_argument("--source", required=True, help=".hyv bundle file to import from")
    import_cmd.set_defaults(func=cmd_profile_import)

    # profile-status: human-readable learning state
    status_cmd = commands.add_parser("profile-status", help="pretty-print the learning state of a profile")
    status_cmd.add_argument("--profile", required=True, help="voice profile JSON file")
    status_cmd.add_argument("--meta", help="signal meta JSON file (default: profile path with .meta.json)")
    status_cmd.add_argument(
        "--write-voice",
        nargs="?",
        const="-",
        default=None,
        help="write voice.md (optional path; no arg = stdout)",
    )
    status_cmd.set_defaults(func=cmd_profile_status)

    # reinforce: extract learning signals from an edit
    reinforce_cmd = commands.add_parser("reinforce", help="diff original vs accepted draft to extract learning signals")
    reinforce_cmd.add_argument("--original", required=True, help="original draft file, or '-' for stdin")
    reinforce_cmd.add_argument("--accepted", required=True, help="accepted/final draft file, or '-' for stdin")
    reinforce_cmd.add_argument("--profile", help="voice profile JSON file (optional)")
    reinforce_cmd.add_argument("--out", help="write signal report to this path")
    reinforce_cmd.set_defaults(func=cmd_reinforce)

    # profile-evolve: signals + meta + profile merge in one go
    evolve_cmd = commands.add_parser(
        "profile-evolve",
        help="one-shot evolution: signal extraction + meta update + profile merge",
    )
    evolve_cmd.add_argument("--original", required=True, help="original (AI) draft file, or '-' for stdin")
    evolve_cmd.add_argument("--accepted", required=True, help="accepted (user-edited) draft file, or '-' for stdin")
    evolve_cmd.add_argument("--profile", required=True, help="voice profile JSON file")
    evolve_cmd.add_argument("--meta", help="meta JSON file path (default: profile path with .meta.json)")
    evolve_cmd.add_argument(
        "--new-samples",
        nargs="*",
        default=None,
        help="additional new writing samples to merge (optional)",
    )
    evolve_cmd.set_defaults(func=cmd_profile_evolve)

    return root
|
|
2004
|
+
|
|
2005
|
+
|
|
2006
|
+
def main(argv: list[str] | None = None) -> int:
    """parse `argv` (sys.argv[1:] when None) and run the selected sub-command.

    returns whatever the sub-command handler stored under `func` returns,
    which is used as the process exit status.
    """
    namespace = build_parser().parse_args(argv)
    handler = namespace.func
    return handler(namespace)
|
|
2010
|
+
|
|
2011
|
+
|
|
2012
|
+
if __name__ == "__main__":
    # script entry point: the return value of main() becomes the exit status
    sys.exit(main())
|