@holdyourvoice/hyv 2.3.1 → 2.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -582,6 +582,49 @@ ABSTRACT_STYLE_WORDS = {
582
582
  "reinvention",
583
583
  }
584
584
 
585
# --- Expanded AI vocabulary for 2025-2026 models ---
# All entries are lower-case. Entries containing a space are phrases that the
# scanner looks up as plain substrings of a lower-cased line; single words are
# matched with regex word boundaries.
AI_VOCAB_EXPANDED = {
    # GPT-4o / Claude fingerprint words
    "inherently", "underscores", "arguably", "notably", "intrinsically",
    "fundamentally", "nuanced", "multifaceted", "encapsulate",
    "delve", "tapestry", "underscore", "testament",
    # Phrase-level compounds (checked as substrings)
    "in the realm of", "it's worth diving into", "the intersection of",
    "a nuanced understanding", "the broader implications", "shed light on",
    "robust framework", "it's important to note", "worth noting that",
    "at the end of the day", "the reality is", "here's the thing",
    # 2025-2026 model fingerprints
    "it's worth mentioning", "let's unpack", "let's break down",
    "to put it simply", "in a nutshell", "the bottom line",
    "what's fascinating", "what's interesting", "what's remarkable",
    "the key takeaway", "the key insight", "the key difference",
}
602
+
603
# --- Writing craft signals (from Magnetic Email principles) ---
# Case-insensitive markers of anecdotal writing. Each tuple entry is one
# alternative of the final pattern.
STORYTELLING_SIGNALS = re.compile(
    "|".join(
        (
            # Recent-time anchors
            r"\b(?:yesterday|last\s+(?:week|month|year|night)|this\s+morning|earlier\s+today)\b",
            # First-person scene setting
            r"\b(?:i\s+was\s+(?:sitting|standing|walking|driving|lying)|we\s+were\s+(?:enjoying|having|drinking))\b",
            # A named relation speaking or reacting
            r"\b(?:my\s+(?:wife|husband|friend|mother|father|brother|sister|colleague)\s+(?:said|told|asked|laughed))\b",
            # Recollection
            r"\b(?:i\s+remember|i\s+recall|i\s+once|i\s+used\s+to)\b",
            # "the sort of X you/that ..." descriptions
            r"\b(?:the\s+sort\s+of|the\s+kind\s+of)\s+\w+\s+(?:you|that)\b",
        )
    ),
    re.IGNORECASE,
)
612
+
613
# Case-insensitive markers of conversational tone.
# The tag questions ("right?", "see?", ...) end in "?" and therefore must NOT be
# followed by \b: a word boundary after "?" only exists when a word character
# comes next, which would exclude the normal "right? " / end-of-line cases.
CONVERSATIONAL_SIGNALS = re.compile(
    r"\b(?:let'?s\s+be\s+real|look|listen|here'?s\s+what|here'?s\s+why|think\s+about\s+it)\b|"
    r"\byou\s+know\b|\b(?:right|see|get\s+it|makes\s+sense)\?|"
    r"\b(?:i'?m\s+not\s+(?:gonna|going\s+to)\s+lie|i'?ll\s+be\s+honest|real\s+talk)\b|"
    r"\b(?:picture\s+this|imagine\s+this|close\s+your\s+eyes)\b|"
    r"\b(?:by\s+the\s+way|btw|funny\s+thing|random\s+thought)\b",
    re.I,
)
621
+
622
# Case-sensitive markers of concrete detail: numbers, month-day dates and
# multi-word capitalised names.
SPECIFICITY_SIGNALS = re.compile(
    # Numbers: either comma-grouped ("1,200") or a plain digit run of any
    # length ("2024"), optional decimals, optional unit suffix. "%" is matched
    # without a trailing \b because no word boundary follows "%" before a
    # space or end of text; the letter suffixes still require one.
    r"\b(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?(?:%|(?:percent|k|K|M|B)\b|\b)|"
    r"\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2}\b|"
    r"\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)+\b",  # Proper nouns
)
627
+
585
628
  GENERIC_OPENERS = re.compile(
586
629
  r"^(?:most|many|some|all)\s+(?:brands|teams|people|founders|companies|businesses|organizations|leaders)\b|"
587
630
  r"^(?:in\s+)?(?:today'?s|the)\s+(?:fast.paced|ever.evolving|modern|digital|current|contemporary)\s+(?:world|age|era|landscape|economy)\b",
@@ -716,6 +759,535 @@ def infer_argument_pattern(text: str) -> str:
716
759
  return "mixed"
717
760
 
718
761
 
762
+ # =============================================================================
763
+ # VOICE-FIRST ANALYSIS FUNCTIONS
764
+ # =============================================================================
765
+
766
def vocabulary_fingerprint(text: str, limit: int = 50) -> dict[str, Any]:
    """Extract a vocabulary fingerprint from ``text``.

    The fingerprint consists of frequency tables built from the lower-cased
    tokens returned by ``words``:

    * ``distinctive_words``: words longer than two characters that occur at
      least twice and are not in the local function-word list, most frequent
      first, capped at ``limit`` entries.
    * ``signature_phrases``: bigrams seen 3+ times whose first word is not a
      function word, plus trigrams seen 2+ times; top 20 by count.
    * ``sentence_starters``: the first (up to) three words of every sentence
      that has at least two words; top 10 by count.

    ``total_words`` and ``unique_words`` are always present. Texts with fewer
    than 10 words return empty tables but the same set of keys, so callers can
    index the result without checking which path was taken.
    """
    word_list = [w.lower() for w in words(text)]
    total = len(word_list)
    if total < 10:
        return {
            "distinctive_words": [],
            "signature_phrases": [],
            "sentence_starters": [],
            "total_words": total,
            "unique_words": len(set(word_list)),
        }

    # Word frequency
    freq: dict[str, int] = {}
    for w in word_list:
        freq[w] = freq.get(w, 0) + 1

    # Function words excluded from the distinctive-word and bigram tables.
    COMMON_WORDS = {
        "the", "a", "an", "is", "are", "was", "were", "be", "been", "being",
        "have", "has", "had", "do", "does", "did", "will", "would", "could",
        "should", "may", "might", "shall", "can", "to", "of", "in", "for",
        "on", "with", "at", "by", "from", "as", "into", "through", "during",
        "before", "after", "above", "below", "between", "and", "but", "or",
        "nor", "not", "so", "yet", "both", "either", "neither", "each",
        "every", "all", "any", "few", "more", "most", "other", "some", "such",
        "no", "only", "own", "same", "than", "too", "very", "just", "because",
        "if", "when", "where", "how", "what", "which", "who", "whom", "this",
        "that", "these", "those", "i", "me", "my", "we", "our", "you", "your",
        "he", "him", "his", "she", "her", "it", "its", "they", "them", "their",
    }
    distinctive = sorted(
        [(w, c) for w, c in freq.items() if c >= 2 and w not in COMMON_WORDS and len(w) > 2],
        key=lambda x: -x[1],
    )[:limit]

    # Signature phrases: recurring 2- and 3-word combinations
    bigrams: dict[str, int] = {}
    for first, second in zip(word_list, word_list[1:]):
        bg = f"{first} {second}"
        bigrams[bg] = bigrams.get(bg, 0) + 1
    trigrams: dict[str, int] = {}
    for first, second, third in zip(word_list, word_list[1:], word_list[2:]):
        tg = f"{first} {second} {third}"
        trigrams[tg] = trigrams.get(tg, 0) + 1

    signature_phrases = []
    for phrase, count in sorted(bigrams.items(), key=lambda x: -x[1]):
        # Bigrams that merely start with a function word ("of the") are noise.
        if count >= 3 and phrase.split()[0] not in COMMON_WORDS:
            signature_phrases.append({"phrase": phrase, "count": count})
    for phrase, count in sorted(trigrams.items(), key=lambda x: -x[1]):
        if count >= 2:
            signature_phrases.append({"phrase": phrase, "count": count})
    # Stable sort: on equal counts bigrams stay ahead of trigrams.
    signature_phrases = sorted(signature_phrases, key=lambda x: -x["count"])[:20]

    # Sentence starters: first 2-3 words of sentences
    starters: dict[str, int] = {}
    for sent in sentences(text):
        sw = words(sent.lower())[:3]
        if len(sw) >= 2:
            key = " ".join(sw)
            starters[key] = starters.get(key, 0) + 1
    top_starters = sorted(starters.items(), key=lambda x: -x[1])[:10]

    return {
        "distinctive_words": [{"word": w, "count": c} for w, c in distinctive],
        "signature_phrases": signature_phrases,
        "sentence_starters": [{"phrase": p, "count": c} for p, c in top_starters],
        "total_words": total,
        "unique_words": len(freq),
    }
833
+
834
+
835
def rhythm_markov(text: str) -> dict[str, Any]:
    """Build a first-order Markov model of sentence-length buckets.

    Each non-empty sentence is bucketed by word count into ``short`` (1-8),
    ``medium`` (9-16), ``long`` (17-25) or ``very_long`` (26+). The result
    captures how the writer moves between those buckets.

    Returns a dict with:

    * ``transitions``: ``{src: {dst: probability}}`` for consecutive
      sentences, rounded to 3 decimals.
    * ``distribution``: share of sentences per bucket, rounded to 3 decimals.
    * ``pattern``: ``"uniform_medium"``, ``"punchy_mixed"``, ``"varied"``, the
      dominant bucket name, or ``"insufficient_data"``.
    * ``avg_length``: mean sentence length in words.
    * ``length_variance``: despite the name, the population *standard
      deviation* of sentence length (the key is kept for compatibility).

    With fewer than 5 non-empty sentences the same keys are returned with
    empty/zero values, plus the legacy ``length_buckets`` key.
    """
    # Tokenise each sentence once; drop sentences with no words.
    lengths = [n for n in (len(words(s)) for s in sentences(text)) if n]
    if len(lengths) < 5:
        return {
            "transitions": {},
            "distribution": {},
            "length_buckets": [],  # legacy key, kept for existing callers
            "pattern": "insufficient_data",
            "avg_length": 0.0,
            "length_variance": 0.0,
        }

    def bucket(length: int) -> str:
        if length <= 8:
            return "short"
        if length <= 16:
            return "medium"
        if length <= 25:
            return "long"
        return "very_long"

    bucketed = [bucket(n) for n in lengths]

    # Count transitions between consecutive sentences.
    transitions: dict[str, dict[str, int]] = {}
    for src, dst in zip(bucketed, bucketed[1:]):
        row = transitions.setdefault(src, {})
        row[dst] = row.get(dst, 0) + 1

    # Normalize each row to probabilities.
    transition_probs: dict[str, dict[str, float]] = {}
    for src, dsts in transitions.items():
        row_total = sum(dsts.values())
        transition_probs[src] = {dst: round(count / row_total, 3) for dst, count in dsts.items()}

    # Bucket distribution.
    bucket_counts: dict[str, int] = {}
    for b in bucketed:
        bucket_counts[b] = bucket_counts.get(b, 0) + 1
    bucket_dist = {b: round(c / len(bucketed), 3) for b, c in bucket_counts.items()}

    # Classify the dominant rhythm; earlier branches take precedence.
    if bucket_dist.get("medium", 0) > 0.6:
        pattern = "uniform_medium"  # AI-like
    elif bucket_dist.get("short", 0) > 0.4 and bucket_dist.get("long", 0) + bucket_dist.get("very_long", 0) > 0.2:
        pattern = "punchy_mixed"  # Human-like conversational
    elif len(set(bucketed)) >= 3:
        pattern = "varied"  # Human-like diverse
    else:
        pattern = max(bucket_dist, key=bucket_dist.get)

    mean = sum(lengths) / len(lengths)
    spread = math.sqrt(sum((n - mean) ** 2 for n in lengths) / len(lengths))

    return {
        "transitions": transition_probs,
        "distribution": bucket_dist,
        "pattern": pattern,
        "avg_length": round(mean, 1),
        "length_variance": round(spread, 1),
    }
894
+
895
+
896
def emotional_tone(text: str) -> dict[str, float]:
    """Score text on four keyword-based emotional axes.

    Returns ``formality``, ``energy``, ``cynicism`` and ``warmth``, each
    clamped to 0-10 and rounded to one decimal. Every axis starts from a
    fixed baseline and is shifted by keyword/punctuation counts normalised by
    the number of words or sentences.
    """
    low = text.lower()
    word_list = [w.lower() for w in words(low)]
    total = max(1, len(word_list))
    sentence_list = sentences(text)
    sent_count = max(1, len(sentence_list))

    # Formality: formal words vs casual words
    FORMAL = {"therefore", "furthermore", "moreover", "consequently", "nevertheless", "hence",
              "accordingly", "thus", "whereby", "herein", "thereof", "wherein", "shall", "henceforth"}
    CASUAL = {"gonna", "wanna", "gotta", "kinda", "sorta", "yeah", "nah", "yep", "nope",
              "ok", "okay", "cool", "awesome", "stuff", "things", "basically", "honestly",
              "literally", "totally", "pretty", "super", "really", "damn", "hell", "crap"}
    formal_count = sum(1 for w in word_list if w in FORMAL)
    casual_count = sum(1 for w in word_list if w in CASUAL)
    # Contraction suffixes must directly follow a word character. A leading \b
    # cannot be used: there is no word boundary between the "o" and "n" of
    # "don't", so "n't" would never match. Straight and curly apostrophes are
    # both accepted.
    contractions = len(re.findall(r"(?<=\w)(?:n['\u2019]t|['\u2019](?:re|ve|ll|d|m|s))\b", low))
    formality = max(0, min(10, 5 + (formal_count - casual_count - contractions * 0.3) * 10 / total))

    # Energy: exclamation marks, short sentences, action verbs
    exclamations = text.count("!")
    short_sents = sum(1 for s in sentence_list if len(words(s)) <= 6)
    ACTION_VERBS = {"go", "run", "build", "create", "make", "do", "get", "take", "start",
                    "stop", "push", "pull", "drive", "hit", "crush", "nail", "smash", "kill"}
    action_count = sum(1 for w in word_list if w in ACTION_VERBS)
    energy = max(0, min(10, 3 + exclamations * 2 / sent_count + short_sents / sent_count * 3 + action_count * 5 / total))

    # Cynicism: negative qualifiers, hedging, dismissive words
    CYNICAL = {"but", "however", "unfortunately", "sadly", "honestly", "actually", "look",
               "listen", "truth", "reality", "problem", "issue", "broken", "failed", "wrong",
               "terrible", "awful", "garbage", "rubbish", "crap", "bullshit", "stupid"}
    cyn_count = sum(1 for w in word_list if w in CYNICAL)
    cynicism = max(0, min(10, 2 + cyn_count * 8 / total))

    # Warmth: personal pronouns, empathy words, inclusive language
    WARMTH = {"we", "us", "our", "together", "friend", "love", "care", "hope", "wish",
              "happy", "glad", "grateful", "thankful", "appreciate", "welcome", "please"}
    warmth_count = sum(1 for w in word_list if w in WARMTH)
    first_person = sum(1 for w in word_list if w in {"i", "me", "my", "we", "us", "our"})
    warmth = max(0, min(10, 3 + warmth_count * 8 / total + first_person * 3 / total))

    return {
        "formality": round(formality, 1),
        "energy": round(energy, 1),
        "cynicism": round(cynicism, 1),
        "warmth": round(warmth, 1),
    }
943
+
944
+
945
def vocabulary_diversity(text: str) -> dict[str, float]:
    """Compute vocabulary diversity metrics for ``text``.

    Returns:

    * ``ttr``: type-token ratio (unique words / total words).
    * ``hapax_ratio``: words occurring exactly once / total words.
    * ``yules_k``: Yule's characteristic K,
      ``10^4 * (sum(count^2 over word types) - N) / N^2``; lower means a
      more diverse vocabulary.
    * ``total_words`` and ``unique_words``.

    Texts with fewer than 20 words return zeroed metrics with the same keys.
    """
    word_list = [w.lower() for w in words(text)]
    total = len(word_list)
    if total < 20:
        return {
            "ttr": 0,
            "yules_k": 0,
            "hapax_ratio": 0,
            "total_words": total,
            "unique_words": len(set(word_list)),
        }

    freq: dict[str, int] = {}
    for w in word_list:
        freq[w] = freq.get(w, 0) + 1

    # Type-Token Ratio (unique / total)
    ttr = len(freq) / total

    # Hapax legomena ratio (words appearing once / total)
    hapax = sum(1 for c in freq.values() if c == 1)
    hapax_ratio = hapax / total

    # Yule's K. Summing count^2 over word types equals sum(i^2 * V_i) over the
    # frequency spectrum; subtracting N makes a text with no repeated word
    # score exactly 0.
    second_moment = sum(c * c for c in freq.values())
    yules_k = 10000 * (second_moment - total) / (total * total)

    return {
        "ttr": round(ttr, 3),
        "yules_k": round(yules_k, 1),
        "hapax_ratio": round(hapax_ratio, 3),
        "total_words": total,
        "unique_words": len(freq),
    }
976
+
977
+
978
def ngram_repetition(text: str) -> dict[str, Any]:
    """Detect repeated n-gram patterns that indicate AI-like repetition.

    Returns:

    * ``repeated_trigrams``: up to 20 trigrams seen 3+ times, most frequent
      first.
    * ``repeated_fourgrams``: up to 10 four-grams seen 2+ times.
    * ``echo_score``: ``3 * occurrences`` summed over the reported trigrams,
      divided by the word count and capped at 1.0. Overlapping trigrams are
      counted separately, so this is an upper-bound estimate.

    Texts with fewer than 20 words return empty lists and a zero score, with
    the same keys as the full result.
    """
    word_list = [w.lower() for w in words(text)]
    if len(word_list) < 20:
        return {"repeated_trigrams": [], "repeated_fourgrams": [], "echo_score": 0}

    # Trigram frequency
    trigrams: dict[str, int] = {}
    for a, b, c in zip(word_list, word_list[1:], word_list[2:]):
        tg = f"{a} {b} {c}"
        trigrams[tg] = trigrams.get(tg, 0) + 1

    # Repeated trigrams (3+ times)
    repeated = sorted(
        [(tg, n) for tg, n in trigrams.items() if n >= 3],
        key=lambda x: -x[1],
    )[:20]

    # 4-gram frequency
    fourgrams: dict[str, int] = {}
    for a, b, c, d in zip(word_list, word_list[1:], word_list[2:], word_list[3:]):
        fg = f"{a} {b} {c} {d}"
        fourgrams[fg] = fourgrams.get(fg, 0) + 1
    repeated_4 = sorted(
        [(fg, n) for fg, n in fourgrams.items() if n >= 2],
        key=lambda x: -x[1],
    )[:10]

    # Echo score: proportion of words that are part of repeated trigrams
    words_in_repeats = sum(n * 3 for _, n in repeated)
    echo_score = min(1.0, words_in_repeats / max(1, len(word_list)))

    return {
        "repeated_trigrams": [{"phrase": t, "count": n} for t, n in repeated],
        "repeated_fourgrams": [{"phrase": f, "count": n} for f, n in repeated_4],
        "echo_score": round(echo_score, 3),
    }
1015
+
1016
+
1017
def perplexity_proxy(text: str) -> dict[str, Any]:
    """Estimate predictability of ``text`` from its own bigram statistics.

    A bigram table is built from the whole text; each sentence with at least
    three words is then scored by the mean probability of each word given the
    word before it. Higher scores mean more predictable (more AI-like) text.

    Returns:

    * ``avg_predictability`` / ``score``: mean sentence score (same value).
    * ``low_perplexity_sentences``: up to 10 sentences scoring above 0.7,
      highest first, each with its 1-based ``line`` (0 if the sentence cannot
      be located verbatim in ``text``), ``score`` and first 120 characters.

    Texts with fewer than 10 words return zero scores and an empty list.
    """
    word_list = [w.lower() for w in words(text)]
    if len(word_list) < 10:
        return {"avg_predictability": 0, "low_perplexity_sentences": [], "score": 0}

    # Build bigram frequencies from the text itself
    bigrams: dict[str, dict[str, int]] = {}
    for w1, w2 in zip(word_list, word_list[1:]):
        followers = bigrams.setdefault(w1, {})
        followers[w2] = followers.get(w2, 0) + 1

    # Score each sentence for predictability
    sentence_scores: list[tuple[int, float, str]] = []
    cursor = 0  # search position, so a repeated sentence maps to its own line
    for sent in sentences(text):
        # Locate every sentence (even ones skipped below) to keep the cursor
        # in step with the sentence order.
        start = text.find(sent, cursor)
        if start == -1:
            start = text.find(sent)  # out-of-order fallback: first occurrence
        else:
            cursor = start + len(sent)

        sw = [w.lower() for w in words(sent)]
        if len(sw) < 3:
            continue
        predictability = 0.0
        count = 0
        for w1, w2 in zip(sw, sw[1:]):
            if w1 in bigrams:
                total_transitions = sum(bigrams[w1].values())
                predictability += bigrams[w1].get(w2, 0) / total_transitions
                count += 1
        if count > 0:
            line_no = text.count("\n", 0, start) + 1 if start != -1 else 0
            sentence_scores.append((line_no, predictability / count, sent.strip()[:120]))

    # Flag sentences with unusually high predictability (> 0.7)
    low_perplexity = [entry for entry in sentence_scores if entry[1] > 0.7]
    low_perplexity.sort(key=lambda x: -x[1])

    overall_avg = sum(s for _, s, _ in sentence_scores) / max(1, len(sentence_scores))

    return {
        "avg_predictability": round(overall_avg, 3),
        "low_perplexity_sentences": [
            {"line": line, "score": round(s, 3), "text": t}
            for line, s, t in low_perplexity[:10]
        ],
        "score": round(overall_avg, 3),  # Higher = more predictable = more AI-like
    }
1067
+
1068
+
1069
def cross_pattern_density(hits: list[dict[str, Any]], text: str) -> list[dict[str, Any]]:
    """Report paragraphs where pattern hits are unusually dense.

    A hit belongs to a paragraph when its ``line`` lies within the paragraph's
    line span. Paragraphs with at least 20 words and more than 0.05 hits per
    word are reported; the 10 densest are returned, densest first.
    """
    blocks = paragraphs(text)
    if not blocks:
        return []

    flagged = []
    cursor = 0
    for block in blocks:
        start = text.find(block, cursor)
        if start == -1:
            # Paragraph text not found verbatim from here on; nudge and move on.
            cursor += 1
            continue

        first_line = text[:start].count("\n") + 1
        last_line = first_line + block.count("\n")
        n_words = len(words(block))
        matching = [h for h in hits if first_line <= h.get("line", 0) <= last_line]

        if n_words >= 20:
            density = len(matching) / n_words
            if density > 0.05:  # more than 5 hits per 100 words
                flagged.append({
                    "line": first_line,
                    "density": round(density, 3),
                    "hits": len(matching),
                    "words": n_words,
                    "text": block.strip()[:160],
                })

        cursor = start + len(block)

    flagged.sort(key=lambda entry: -entry["density"])
    return flagged[:10]
1106
+
1107
+
1108
def storytelling_score(text: str) -> dict[str, Any]:
    """Score text for storytelling elements (TLS: Time, Location, Senses).

    Based on Kieran Drew's Magnetic Email principles. Counts time, location
    and sensory keywords plus quotation marks, and relates their sum to 30% of
    the sentence count; the score is capped at 1.0. Also reports whether the
    first 500 characters contain a story-style opener.
    """
    lowered = text.lower()
    sentence_total = max(1, len(sentences(text)))

    def tally(pattern: str) -> int:
        # Case-insensitive count of non-overlapping matches in the lowered text.
        return len(re.findall(pattern, lowered, re.I))

    # Time references
    time_hits = tally(
        r"\b(?:yesterday|last\s+(?:week|month|year|night)|this\s+morning|earlier\s+today|"
        r"monday|tuesday|wednesday|thursday|friday|saturday|sunday|"
        r"\d{1,2}(?:am|pm)|o'?clock|morning|evening|afternoon)\b"
    )

    # Location references
    location_hits = tally(
        r"\b(?:at\s+the|in\s+the|on\s+the|inside|outside|upstairs|downstairs|"
        r"kitchen|office|gym|cafe|coffee\s+shop|restaurant|car|train|plane|bed)\b"
    )

    # Sensory words
    senses_hits = tally(
        r"\b(?:saw|heard|felt|tasted|smelled|smelt|touch|touched|"
        r"bright|dark|loud|quiet|warm|cold|hot|sweet|bitter|sour|"
        r"soft|hard|smooth|rough|wet|dry|sharp|dull)\b"
    )

    # Dialogue: every straight or curly double-quote character counts once.
    dialogue_hits = len(re.findall(r'[""\u201c\u201d]', text))

    # Story opener (snapshot pattern) within the first 500 characters
    opener_found = STORYTELLING_SIGNALS.search(text[:500]) is not None

    signal_total = time_hits + location_hits + senses_hits + dialogue_hits
    tls_score = min(1.0, signal_total / max(1, sentence_total * 0.3))

    return {
        "score": round(tls_score, 3),
        "time_references": time_hits,
        "location_references": location_hits,
        "sensory_words": senses_hits,
        "dialogue_markers": dialogue_hits,
        "has_story_opener": opener_found,
    }
1155
+
1156
+
1157
def conversational_score(text: str) -> dict[str, Any]:
    """Score text for conversational tone vs. lecture/speech tone.

    Based on the 'Write conversations not speeches' principle. Direct address,
    questions (weighted x3), contractions, first-person pronouns and
    conversational phrases (weighted x2) are summed and divided by the word
    count; a passive-voice penalty per sentence is subtracted. The final
    ``score`` is clamped to 0-1. The raw counts are returned alongside it.
    """
    low = text.lower()
    sentence_list = sentences(text)
    total_sents = max(1, len(sentence_list))

    # Direct address (you/your)
    direct_address = len(re.findall(r"\b(?:you|your|you're|you've|you'll)\b", low))

    # Questions (conversational marker)
    questions = sum(1 for s in sentence_list if s.strip().endswith("?"))

    # Contractions (casual tone). The suffix must directly follow a word
    # character; a leading \b cannot be used because there is no word boundary
    # between the "o" and "n" of "don't", so "n't" would never match. Straight
    # and curly apostrophes are both accepted.
    contractions = len(re.findall(r"(?<=\w)(?:n['\u2019]t|['\u2019](?:re|ve|ll|d|m|s))\b", low))

    # First person (personal)
    first_person = len(re.findall(r"\b(?:i|me|my|we|us|our)\b", low))

    # Conversational phrases
    conv_hits = len(CONVERSATIONAL_SIGNALS.findall(low))

    # Passive voice (anti-conversational): "be"-verb followed by an -ed word
    passive = len(re.findall(r"\b(?:is|are|was|were|been|being|be)\s+\w+ed\b", low))

    # Compute score
    total_words = max(1, len(words(text)))
    conv_ratio = (direct_address + questions * 3 + contractions + first_person + conv_hits * 2) / total_words
    passive_ratio = passive / total_sents
    score = min(1.0, conv_ratio * 10 - passive_ratio * 0.5)

    return {
        "score": round(max(0, score), 3),
        "direct_address": direct_address,
        "questions": questions,
        "contractions": contractions,
        "first_person": first_person,
        "conversational_phrases": conv_hits,
        "passive_voice": passive,
    }
1197
+
1198
+
1199
def specificity_score(text: str) -> dict[str, Any]:
    """Score text for specificity: proper nouns, numbers, dates, quotes.

    AI text is vague. Human text is specific. The four counts are summed and
    divided by the word count to give ``ratio``; ``score`` is ``ratio * 15``
    capped at 1.0.
    """
    token_total = max(1, len(words(text)))

    # Numbers, optionally with decimals and a unit suffix
    number_count = len(re.findall(r"\b\d+(?:\.\d+)?(?:%|k|K|M|B)?\b", text))

    # Proper nouns: capitalised words that are not the first word of their
    # sentence and not one of a few common capitalised function words.
    not_names = {"I", "The", "A", "An"}
    proper_count = 0
    for sent in sentences(text):
        proper_count += sum(
            1 for tok in words(sent)[1:] if tok[0].isupper() and tok not in not_names
        )

    # Dates: numeric d/m/y forms, "Mon DD[, YYYY]" forms, or a bare 4-digit year
    date_pattern = (
        r"\b(?:\d{1,2}[/-]\d{1,2}[/-]\d{2,4}|"
        r"(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\w*\s+\d{1,2}(?:,?\s+\d{4})?|"
        r"\d{4})\b"
    )
    date_count = len(re.findall(date_pattern, text))

    # Quotes (specific attribution): two quote characters make one quote
    quote_count = len(re.findall(r'[""\u201c\u201d]', text)) // 2

    ratio = (number_count + proper_count + date_count + quote_count) / token_total

    return {
        "score": round(min(1.0, ratio * 15), 3),
        "numbers": number_count,
        "proper_nouns": proper_count,
        "dates": date_count,
        "quotes": quote_count,
        "ratio": round(ratio, 4),
    }
1239
+
1240
+
1241
def profile_strength(profile: dict[str, Any]) -> dict[str, Any]:
    """Compute a profile strength score (0-100) with a label and breakdown.

    Components:

    * sources (0-30): 3 points per entry of ``profile["source_count"]``.
    * words (0-30): 1 point per 100 words of ``profile["word_count"]``.
    * diversity (0-20): 2 points per opening move or anchor in
      ``profile["signature"]``.
    * cadence (0-10): 2.5 points per entry of ``signature["cadence"]``.
    * recency: fixed at 5, since modification times are not evaluated here.

    Missing keys and keys explicitly set to ``None`` are treated as empty.

    Returns a dict with ``score`` (rounded int, capped at 100), ``label``
    (``"strong"`` >= 80, ``"moderate"`` >= 50, ``"weak"`` >= 25, otherwise
    ``"insufficient"``) and the per-component ``breakdown``.
    """
    # `or` fallbacks cover both an absent key and an explicit None value.
    source_count = profile.get("source_count") or 0
    word_count = profile.get("word_count") or 0
    signature = profile.get("signature") or {}

    # Source count score (0-30)
    source_score = min(30, source_count * 3)

    # Word count score (0-30)
    word_score = min(30, word_count / 100)

    # Diversity score (0-20): opening moves + anchors
    opening_moves = len(signature.get("opening_moves") or [])
    anchors = len(signature.get("anchors") or [])
    diversity_score = min(20, (opening_moves + anchors) * 2)

    # Cadence score (0-10): has rhythm data
    cadence = signature.get("cadence") or []
    cadence_score = min(10, len(cadence) * 2.5)

    # Recency score: constant placeholder
    recency_score = 5

    total = source_score + word_score + diversity_score + cadence_score + recency_score

    if total >= 80:
        label = "strong"
    elif total >= 50:
        label = "moderate"
    elif total >= 25:
        label = "weak"
    else:
        label = "insufficient"

    return {
        "score": round(min(100, total)),
        "label": label,
        "breakdown": {
            "sources": source_score,
            "words": word_score,
            "diversity": diversity_score,
            "cadence": cadence_score,
            "recency": recency_score,
        },
    }
1289
+
1290
+
719
1291
  def first_words(text: str, count: int = 7) -> str:
720
1292
  found = words(text.lower())
721
1293
  return " ".join(found[:count])
@@ -813,7 +1385,7 @@ def build_profile(paths: list[str], name: str) -> dict[str, Any]:
813
1385
  voice_rules.append("study these sample opening moves before drafting: " + "; ".join(opening_moves[:4]))
814
1386
 
815
1387
  return {
816
- "profile_version": "hold-your-voice-portable-v1",
1388
+ "profile_version": "hold-your-voice-portable-v2",
817
1389
  "name": name,
818
1390
  "source_count": len(samples),
819
1391
  "sources": [{"path": sample["path"], "chars": len(sample["text"])} for sample in samples],
@@ -828,6 +1400,11 @@ def build_profile(paths: list[str], name: str) -> dict[str, Any]:
828
1400
  "anchors": anchors,
829
1401
  "never_list": never_list,
830
1402
  },
1403
+ "voice_fingerprint": vocabulary_fingerprint(combined),
1404
+ "rhythm": rhythm_markov(combined),
1405
+ "emotional_tone": emotional_tone(combined),
1406
+ "voice_diversity": vocabulary_diversity(combined),
1407
+ "voice_strength": None, # computed separately via profile_strength()
831
1408
  "voice_rules": voice_rules,
832
1409
  "ai_eliminator": {
833
1410
  "rewrite_scope": "flagged-lines-only",
@@ -1007,6 +1584,19 @@ def scan_text(text: str) -> list[dict[str, Any]]:
1007
1584
  line_no = text[: match.start()].count("\n") + 1
1008
1585
  hits.append({"line": line_no, "rule": rule_id, "phrase": snippet[:160]})
1009
1586
 
1587
+ # Expanded AI vocabulary detection (2025-2026 model fingerprints)
1588
+ for line_no, line in enumerate((text or "").splitlines(), 1):
1589
+ low = line.lower()
1590
+ for term in AI_VOCAB_EXPANDED:
1591
+ if " " in term:
1592
+ # Multi-word phrase
1593
+ if term in low:
1594
+ hits.append({"line": line_no, "rule": "ai_vocab_expanded", "phrase": term})
1595
+ else:
1596
+ # Single word — match with word boundaries
1597
+ if re.search(rf"\b{re.escape(term)}\b", low):
1598
+ hits.append({"line": line_no, "rule": "ai_vocab_expanded", "phrase": term})
1599
+
1010
1600
  for line_no, line in enumerate((text or "").splitlines(), 1):
1011
1601
  for hit in line_style_hits(line):
1012
1602
  hits.append({"line": line_no, "rule": hit["rule"], "phrase": hit["phrase"], "text": line.strip()[:240]})
@@ -1015,6 +1605,17 @@ def scan_text(text: str) -> list[dict[str, Any]]:
1015
1605
  for structural_hit in _structural_analysis(text):
1016
1606
  hits.append(structural_hit)
1017
1607
 
1608
+ # Voice craft signals (from Magnetic Email principles)
1609
+ # Lack of storytelling in long text
1610
+ story_hits = STORYTELLING_SIGNALS.findall(text or "")
1611
+ conv_hits = CONVERSATIONAL_SIGNALS.findall(text or "")
1612
+ word_count = len(words(text or ""))
1613
+ if word_count > 200:
1614
+ if len(story_hits) == 0:
1615
+ hits.append({"line": 0, "rule": "voice_no_storytelling", "phrase": f"no storytelling signals in {word_count} words — text reads like a lecture, not a conversation"})
1616
+ if len(conv_hits) == 0 and word_count > 300:
1617
+ hits.append({"line": 0, "rule": "voice_no_conversation", "phrase": f"no conversational signals in {word_count} words — text speaks at reader, not with them"})
1618
+
1018
1619
  # Staccato triplet detection — only fire when sentences are clearly performative
1019
1620
  sentence_hits = []
1020
1621
  for line_no, line in enumerate((text or "").splitlines(), 1):
@@ -1066,44 +1667,303 @@ def load_draft(path: str) -> tuple[str, str]:
1066
1667
  return str(draft_path), read_text(draft_path)
1067
1668
 
1068
1669
 
1670
# --- Pattern fix guidance: tells the LLM HOW to fix each pattern type ---
# Maps a scan rule id to a one-line rewrite instruction that gets embedded in
# the rewrite prompt. Each rule id must appear exactly once: a repeated key in
# a dict literal is not an error, the last value silently wins.
PATTERN_FIX_GUIDANCE: dict[str, str] = {
    "landscape_era": "Replace temporal grandstanding with a concrete observation or remove entirely.",
    "formulaic_connector": "Replace formal transitions (Moreover, Furthermore, Additionally) with natural flow or short sentences.",
    "lets_invitation": "Remove the invitation to dive/explore. Just start with the point.",
    "inflated_verbs": "Replace marketing verbs (unlock, leverage, supercharge) with plain verbs (use, build, get).",
    "truth_harsh_reality": "Remove the 'reality/truth is' framing. State the point directly.",
    "ai_vocab_density": "Replace AI-buzzwords with specific, concrete language from the writer's vocabulary.",
    "ai_vocab_expanded": "Replace with plain language. If the phrase is 'it's important to note', just state the point.",
    # This key used to be listed twice; this is the value that took effect.
    "abstract_noun_cluster": "Too many abstract nouns. Replace with concrete examples or actions.",
    "ux_buzzwords": "Replace buzzwords (robust, seamless, holistic) with specific descriptions of what the thing actually does.",
    "binary_reframing": "Remove the 'it's not X, it's Y' structure. State the positive claim directly.",
    "not_just_but": "Remove the 'not just X but Y' structure. Pick the stronger point and lead with it.",
    "more_than_just": "Remove 'more than just'. State what it actually is.",
    "founder_cadence": "Remove the performative cadence (here's the thing, the moment X becomes Y). Write plainly.",
    "staccato_drama": "Break the staccato pattern. Vary sentence length. Add a longer sentence.",
    "restatement_polish": "Remove 'in other words' / 'which is another way of saying'. Say it once, clearly.",
    "spoiler_reveal": "Remove 'spoiler alert' and 'here's the truth' framing.",
    "hedging_noncommittal": "Remove hedging (it depends, no one-size-fits-all). Take a position or cut the sentence.",
    "balanced_contrast": "Remove 'on the other hand' / 'on the flip side'. Pick a side or use 'but' briefly.",
    "empathy_opener": "Remove empathy validation (you're not alone, it's easy to feel). Start with the substance.",
    "journey_cliche": "Remove journey/destination metaphors. State the actual point.",
    "ai_metaphors": "Replace metaphor clusters (beacon, tapestry, north star) with concrete language.",
    "guide_framing": "Remove guide framing (step-by-step, key takeaways, actionable tips). Just write the thing.",
    "wrapping_patterns": "Remove conclusion patterns (at the end of the day, the bottom line). End on a specific detail or thought.",
    "buyer_psychology": "Remove 'people don't buy X, they buy Y' templates. State the point directly.",
    "overwhelm_reassurance": "Remove 'it can feel overwhelming but it doesn't have to be'. Just help.",
    "pros_cons_framing": "Remove pros/cons structure. Make an argument, don't list.",
    "triple_adjective": "Remove triple-adjective stacks. Pick the one that matters.",
    "hidden_depth": "Remove 'behind the scenes' / 'beneath the surface'. State the insight directly.",
    "self_referential": "Remove AI disclaimers (as an AI model, I can't provide).",
    "placeholder_brackets": "Replace [your brand] placeholders with specific examples or remove.",
    "story_templates": "Remove 'imagine this / picture this' templates. Use a real scene or observation.",
    "clickbait_didnt_know": "Remove 'the X you didn't know you needed' framing.",
    "self_referential_restatement": "Remove 'you asked about X, let's break it down'. Just answer.",
    "ted_talk_slogan": "Remove the TED-talk contrastive slogan. State the point plainly.",
    "perfect_marketing_sentence": "This sentence is too polished and generic. Make it specific or cut it.",
    "generic_opening_generalization": "Opens with a sweeping generalization. Start with a specific observation or scene.",
    "voice_question_opener": "Opens with a question. Start with a statement, scene, or observation instead.",
    "voice_lesson_opener": "Opens with a lesson/inspiration claim. Start with a specific moment or example.",
    "cta_ending": "Remove the engagement-bait CTA (let me know if you need help). End on substance.",
    "voice_no_storytelling": "No storytelling signals found. Add a personal scene, specific moment, or concrete example.",
    "voice_no_conversation": "Text reads like a lecture. Address the reader directly (you/your), add a question, or use contractions.",
    "low_burstiness": "Sentence lengths are too uniform. Add a very short sentence (under 6 words) or break a long one.",
    "mechanical_paragraphs": "Paragraphs are all the same length. Combine some, split others, or add a one-liner.",
    "uniform_paragraph_rhythm": "Sentences within paragraphs are all 12-22 words. Vary: some 5 words, some 25.",
    "low_contractions": "Too few contractions. Use don't, can't, it's, you're to sound natural.",
    "formal_hedging_density": "Too many formal hedges (it is important to note). State things directly.",
    "generic_intensifiers": "Too many intensifiers (remarkably, incredibly). Cut them or use specifics.",
    "no_fragments": "No sentence fragments at all — reads over-polished. Add a fragment for texture.",
    "over_structured_lists": "Lists follow a rigid 3-item pattern. Vary list length or break the pattern.",
}
1723
+
1724
+
1725
+ def _dedupe_hits(hits: list[dict[str, Any]]) -> list[dict[str, Any]]:
1726
+ """Merge multiple rules per line into one entry with combined rules."""
1727
+ by_line: dict[int, dict[str, Any]] = {}
1728
+ for hit in hits:
1729
+ line = hit.get("line", 0)
1730
+ if line not in by_line:
1731
+ by_line[line] = {"line": line, "rules": [], "phrases": [], "text": hit.get("text", "")}
1732
+ by_line[line]["rules"].append(hit.get("rule", "unknown"))
1733
+ phrase = hit.get("phrase", "")
1734
+ if phrase and phrase not in by_line[line]["phrases"]:
1735
+ by_line[line]["phrases"].append(phrase)
1736
+ return sorted(by_line.values(), key=lambda x: x["line"])
1737
+
1738
+
1739
+ def _compress_profile_for_prompt(profile: dict[str, Any] | None) -> str:
1740
+ """Extract only the actionable voice data from a profile for the LLM prompt.
1741
+ Strips out structural metadata, sources, and raw analysis data."""
1742
+ if not profile:
1743
+ return ""
1744
+
1745
+ sig = profile.get("signature", {})
1746
+ tone = profile.get("emotional_tone", {})
1747
+ fp = profile.get("voice_fingerprint", {})
1748
+
1749
+ lines = []
1750
+
1751
+ # Voice anchors — the single most important thing
1752
+ anchors = sig.get("anchors", [])
1753
+ if anchors:
1754
+ lines.append("SOUND LIKE THIS:")
1755
+ lines.append(f' "{anchors[0][:200]}"')
1756
+ if len(anchors) > 1:
1757
+ lines.append(f' "{anchors[1][:200]}"')
1758
+ lines.append("")
1759
+
1760
+ # Rhythm + tone in one line
1761
+ cadence = sig.get("cadence", [])
1762
+ rhythm_line = cadence[0] if cadence else ""
1763
+ tone_parts = []
1764
+ if tone:
1765
+ if tone.get("formality", 5) < 4:
1766
+ tone_parts.append("casual")
1767
+ elif tone.get("formality", 5) > 6:
1768
+ tone_parts.append("formal")
1769
+ if tone.get("energy", 5) > 6:
1770
+ tone_parts.append("high-energy")
1771
+ if tone.get("cynicism", 5) > 5:
1772
+ tone_parts.append("cynical")
1773
+ if tone.get("warmth", 5) > 5:
1774
+ tone_parts.append("warm")
1775
+ tone_str = ", ".join(tone_parts) if tone_parts else "neutral"
1776
+ if rhythm_line:
1777
+ lines.append(f"RHYTHM: {rhythm_line}. Tone: {tone_str}.")
1778
+ else:
1779
+ lines.append(f"TONE: {tone_str}.")
1780
+ lines.append("")
1781
+
1782
+ # Never list — compact
1783
+ never = sig.get("never_list", [])
1784
+ if never:
1785
+ lines.append("BANNED: " + " | ".join(never[:6]))
1786
+ lines.append("")
1787
+
1788
+ return "\n".join(lines)
1789
+
1790
+
1791
+ def _flagged_line_to_instruction(entry: dict[str, Any]) -> str:
1792
+ """Convert a deduped hit entry into a compact instruction the LLM will actually follow."""
1793
+ line = entry["line"]
1794
+ rules = entry["rules"]
1795
+ phrases = entry["phrases"]
1796
+ phrase_str = phrases[0] if phrases else ""
1797
+
1798
+ # Pick the single most specific fix guidance
1799
+ guidance = ""
1800
+ for rule in rules:
1801
+ if rule in PATTERN_FIX_GUIDANCE:
1802
+ guidance = PATTERN_FIX_GUIDANCE[rule]
1803
+ break
1804
+
1805
+ # Compress: line number + what's wrong + what to do
1806
+ if line == 0:
1807
+ return f"- STRUCTURAL: {guidance}"
1808
+ if guidance:
1809
+ return f"- L{line} \"{phrase_str[:60]}\": {guidance}"
1810
+ return f"- L{line} \"{phrase_str[:60]}\""
1811
+
1812
+
1813
def apply_replacements(draft: str, replacements_json: str) -> str:
    """Apply LLM-returned line replacements to a draft and return the patched text.

    The expected payload is ``{"replacements": [{"line": 1, "text": "..."}]}``
    with 1-based line numbers. The payload comes from a model, so anything
    malformed is skipped rather than raised:

    * unparseable JSON, a non-object payload, or a non-list ``replacements``
      returns ``draft`` unchanged;
    * entries that are not objects, whose ``line`` is not an integer, whose
      ``text`` is not a string, or whose ``line`` is out of range are ignored.

    When no entry is applied the original ``draft`` is returned as-is, so a
    caller can detect "nothing changed" by comparing the result with its
    input. When at least one line is replaced the lines are re-joined with
    ``"\\n"`` and the draft's trailing newline, if it had one, is kept.
    """
    try:
        data = json.loads(replacements_json)
    except (ValueError, TypeError):  # JSONDecodeError is a ValueError
        return draft
    if not isinstance(data, dict):
        return draft
    replacements = data.get("replacements")
    if not isinstance(replacements, list):
        return draft

    lines = draft.splitlines()
    applied = False
    for rep in replacements:
        if not isinstance(rep, dict):
            continue
        line_no = rep.get("line", 0)
        text = rep.get("text", "")
        # bool is an int subclass; `true` must not be read as line 1.
        if isinstance(line_no, bool) or not isinstance(line_no, int):
            continue
        if not isinstance(text, str):
            continue
        if 1 <= line_no <= len(lines):
            lines[line_no - 1] = text
            applied = True

    if not applied:
        return draft

    patched = "\n".join(lines)
    # splitlines() discards the final line break; put it back.
    if draft.endswith(("\n", "\r")):
        patched += "\n"
    return patched
1828
+
1829
+
1830
def rewrite_with_verification(
    draft: str,
    profile_text: str | None = None,
    constraints: str = "",
    meta: dict[str, Any] | None = None,
    max_passes: int = 3,
    rewrite_fn=None,
) -> dict[str, Any]:
    """Run a scan -> rewrite -> rescan loop for at most ``max_passes`` rounds.

    Every round builds a rewrite prompt for the current text, scans it, and
    records the hit count. The loop ends early when the scan is clean, when no
    ``rewrite_fn`` was supplied, or when applying the LLM response leaves the
    text unchanged.

    Args:
        draft: the original draft text.
        profile_text: voice profile JSON string (optional).
        constraints: extra rewrite constraints.
        meta: signal meta for learned pattern filtering; when truthy, every
            scan result is passed through ``filter_hits_by_weights``.
        max_passes: maximum number of rounds (default 3).
        rewrite_fn: callable ``(draft, prompt) -> str`` returning the LLM's
            JSON response. When ``None`` nothing is rewritten; the loop stops
            after the first round and the generated prompt is available in
            the returned ``prompts`` list for external execution.

    Returns:
        A dict with:
        - ``final_text``: the text after the last applied rewrite
        - ``initial_hits``: hit count of the untouched draft
        - ``final_hits``: hit count of ``final_text``
        - ``passes_used``: number of rounds started (length of ``pass_details``)
        - ``prompts``: one prompt per round started
        - ``pass_details``: ``{"pass": n, "hits": count}`` per round
    """

    def _scan(text: str) -> list[dict[str, Any]]:
        # Scan, then apply learned-weight filtering only when meta was given.
        found = scan_text(text)
        return filter_hits_by_weights(found, meta) if meta else found

    hits_before = _scan(draft)

    working = draft
    prompts: list[str] = []
    pass_details: list[dict[str, int]] = []

    for pass_index in range(1, max_passes + 1):
        prompt = build_rewrite_prompt("draft", working, profile_text, constraints, meta)
        prompts.append(prompt)

        remaining = _scan(working)
        pass_details.append({"pass": pass_index, "hits": len(remaining)})

        # Nothing left to fix, or nobody to fix it: stop here.
        if not remaining or rewrite_fn is None:
            break

        candidate = apply_replacements(working, rewrite_fn(working, prompt))
        if candidate == working:
            # The LLM response changed nothing, so another round would not help.
            break
        working = candidate

    hits_after = _scan(working)

    return {
        "final_text": working,
        "initial_hits": len(hits_before),
        "final_hits": len(hits_after),
        "passes_used": len(pass_details),
        "prompts": prompts,
        "pass_details": pass_details,
    }
1903
+
1904
+
1069
1905
def build_rewrite_prompt(draft_name: str, draft: str, profile_text: str | None, constraints: str = "", meta: dict[str, Any] | None = None) -> str:
    """Build the compact "fix only the flagged lines" prompt for an LLM.

    The draft is scanned (and filtered through ``filter_hits_by_weights`` when
    ``meta`` is truthy), hits are merged per line, and each merged entry
    becomes one instruction bullet. The prompt asks for a JSON object of the
    form ``{"replacements":[{"line":1,"text":"fixed line"}]}``.

    Args:
        draft_name: label shown next to the numbered draft.
        draft: the text to fix; it is embedded with 1-based line numbers.
        profile_text: voice profile. A JSON object is compressed to its
            prompt-relevant parts; any other non-empty text is embedded
            verbatim under a ``VOICE PROFILE:`` heading instead of being
            silently discarded.
        constraints: optional extra constraints, appended to the rules.
        meta: optional signal meta for learned pattern filtering.

    Returns:
        The prompt text.
    """
    hits = scan_text(draft)
    if meta:
        hits = filter_hits_by_weights(hits, meta)

    # Build compact issue lines with fix guidance embedded
    issue_lines = [_flagged_line_to_instruction(entry) for entry in _dedupe_hits(hits)]
    issue_block = "\n".join(issue_lines) or "- none found"

    numbered_draft = "\n".join(f"{idx}: {line}" for idx, line in enumerate(draft.splitlines(), 1))

    # Compress profile
    profile_block = ""
    raw_profile = profile_text.strip() if profile_text else ""
    if raw_profile:
        try:
            profile = json.loads(raw_profile)
        except (json.JSONDecodeError, TypeError):
            profile = None
        if isinstance(profile, dict):
            profile_block = _compress_profile_for_prompt(profile)
        else:
            # Not a JSON object (free-text profile, or JSON of another shape):
            # the compressor cannot read it, so pass the text through as given.
            profile_block = f"VOICE PROFILE:\n{raw_profile}\n\n"

    constraints_line = f"\nCONSTRAINTS: {constraints.strip()}" if constraints and constraints.strip() else ""

    # Compact prompt — everything the LLM needs, nothing it doesn't
    prompt = f"""Fix only the flagged lines. Return JSON: {{"replacements":[{{"line":1,"text":"fixed line"}}]}}

RULES:
- Only return flagged line numbers. Leave everything else untouched.
- Keep the original argument. Remove AI patterns — write like a real person.
- No hooks, CTAs, summaries, or new sections.{constraints_line}

{profile_block}FIX THESE:
{issue_block}

DRAFT ({draft_name}):
{numbered_draft}"""

    return prompt
1944
+
1945
+
1946
+ def build_voice_draft_prompt(draft: str, profile: dict[str, Any] | None, angle: str = "", constraints: str = "") -> str:
1947
+ """Generate a prompt for rewriting an entire draft in the writer's voice."""
1948
+ profile_block = _compress_profile_for_prompt(profile) if profile else ""
1949
+
1950
+ angle_line = f"\nANGLE: {angle}" if angle else ""
1951
+ constraints_line = f"\nCONSTRAINTS: {constraints}" if constraints else ""
1952
+
1953
+ prompt = f"""Rewrite this draft in the voice below. Return the full text only — no commentary.
1954
+
1955
+ RULES:
1956
+ - Keep the argument and key points. Match the voice anchors and rhythm.
1957
+ - Open with a specific observation or scene, not a generalization.
1958
+ - Use contractions. Vary sentence length. Write to one person ("you").
1959
+ - No AI patterns (let's dive in, robust, holistic, moreover, furthermore).
1960
+ - No hooks, CTAs, summaries, or motivational closings.
1961
+ - End on a specific detail or quiet thought.{angle_line}{constraints_line}
1962
+
1963
+ {profile_block}DRAFT:
1964
+ {draft}"""
1965
+
1966
+ return prompt
1107
1967
 
1108
1968
 
1109
1969
  DEFAULT_NEVER_LIST = [
@@ -1935,6 +2795,104 @@ def cmd_rewrite_prompt(args: argparse.Namespace) -> int:
1935
2795
  return 0
1936
2796
 
1937
2797
 
2798
def cmd_voice_score(args: argparse.Namespace) -> int:
    """CLI handler: print voice-quality scores for a draft.

    Loads the draft named by ``args.draft``, runs the storytelling,
    conversation, specificity, tone, vocabulary-diversity, perplexity-proxy
    and n-gram scorers on it, and blends five of those results into a single
    ``voice_quality`` number rounded to three decimals. Output is JSON when
    ``args.format == "json"``, otherwise a short text report.

    Returns:
        Always 0.
    """
    name, text = load_draft(args.draft)

    story = storytelling_score(text)
    conv = conversational_score(text)
    spec = specificity_score(text)
    tone = emotional_tone(text)
    diversity = vocabulary_diversity(text)
    perplexity = perplexity_proxy(text)
    ngrams = ngram_repetition(text)

    word_total = len(words(text))
    # Weights: storytelling 0.25, conversation 0.25, specificity 0.2,
    # inverted perplexity 0.15, type-token ratio 0.15.
    blended = (
        story["score"] * 0.25 + conv["score"] * 0.25 + spec["score"] * 0.2 +
        (1 - perplexity["score"]) * 0.15 + diversity["ttr"] * 0.15
    )

    result = {
        "file": name,
        "word_count": word_total,
        "storytelling": story,
        "conversation": conv,
        "specificity": spec,
        "emotional_tone": tone,
        "vocabulary_diversity": diversity,
        "perplexity_proxy": perplexity,
        "ngram_repetition": ngrams,
        "voice_quality": round(blended, 3),
    }

    if args.format == "json":
        print(json.dumps(result, indent=2, ensure_ascii=False))
        return 0

    print(f"Voice Score for: {name}")
    print(f" Words: {result['word_count']}")
    print(f" Overall voice quality: {result['voice_quality']:.2f}")
    print(f" Storytelling: {story['score']:.2f} (time={story['time_references']}, location={story['location_references']}, senses={story['sensory_words']}, dialogue={story['dialogue_markers']})")
    print(f" Conversation: {conv['score']:.2f} (you/your={conv['direct_address']}, questions={conv['questions']}, contractions={conv['contractions']})")
    print(f" Specificity: {spec['score']:.2f} (numbers={spec['numbers']}, proper_nouns={spec['proper_nouns']}, quotes={spec['quotes']})")
    print(f" Tone: formality={tone['formality']}, energy={tone['energy']}, cynicism={tone['cynicism']}, warmth={tone['warmth']}")
    print(f" Diversity: TTR={diversity['ttr']}, Yule's K={diversity['yules_k']}, hapax={diversity['hapax_ratio']}")
    print(f" Perplexity: {perplexity['score']:.3f} (higher = more predictable = more AI-like)")
    print(f" N-gram echo: {ngrams['echo_score']:.3f}")
    return 0
2839
+
2840
+
2841
def cmd_verify(args: argparse.Namespace) -> int:
    """CLI handler: scan a draft and report the pattern hits grouped by rule.

    When ``args.meta`` names a JSON file containing an object, the hits are
    first passed through ``filter_hits_by_weights``. A meta file that is
    missing, unreadable, not valid JSON, or not a JSON object is ignored, but
    a warning goes to stderr so the user knows filtering was not applied
    (stdout stays reserved for the report, which may be JSON).

    Returns:
        2 when ``args.fail_on_hit`` is set and at least one hit remains,
        otherwise 0.
    """
    import sys  # local import: only needed for the warning path below

    name, text = load_draft(args.draft)
    hits = scan_text(text)

    meta: dict[str, Any] = {}
    if args.meta:
        meta_path = Path(args.meta).expanduser()
        try:
            loaded = json.loads(meta_path.read_text(encoding="utf-8", errors="ignore"))
        except (json.JSONDecodeError, OSError) as exc:
            print(f"warning: ignoring meta file {meta_path}: {exc}", file=sys.stderr)
        else:
            if isinstance(loaded, dict):
                meta = loaded
            else:
                print(f"warning: ignoring meta file {meta_path}: not a JSON object", file=sys.stderr)
    if meta:
        hits = filter_hits_by_weights(hits, meta)

    # Group hits by rule
    rule_counts: dict[str, int] = {}
    for hit in hits:
        rule = hit.get("rule", "unknown")
        rule_counts[rule] = rule_counts.get(rule, 0) + 1
    # Most frequent rule first; the sort is stable, so ties keep scan order.
    ranked = sorted(rule_counts.items(), key=lambda item: -item[1])

    if args.format == "json":
        print(json.dumps({
            "file": name,
            "total_hits": len(hits),
            "by_rule": dict(ranked),
            "hits": hits,
        }, indent=2, ensure_ascii=False))
    else:
        print(f"Verification: {name}")
        print(f" Total patterns: {len(hits)}")
        if ranked:
            print(" By rule:")
            for rule, count in ranked:
                print(f" {rule}: {count}")
        else:
            print(" No AI patterns detected.")
    return 2 if args.fail_on_hit and hits else 0
2880
+
2881
+
2882
def cmd_voice_draft_prompt(args: argparse.Namespace) -> int:
    """CLI handler: generate a full-draft voice rewrite prompt.

    Loads the draft named by ``args.draft`` and, when ``args.profile`` is
    given, the voice profile JSON file. The resulting prompt is handed to
    ``write_or_print`` together with ``args.out``.

    Raises:
        SystemExit: when the profile file does not exist, is not valid JSON,
            or does not contain a JSON object. All three are reported with a
            one-line message rather than a traceback.

    Returns:
        Always 0.
    """
    _, draft = load_draft(args.draft)

    profile = None
    if args.profile:
        profile_path = Path(args.profile).expanduser()
        if not profile_path.exists():
            raise SystemExit(f"profile not found: {profile_path}")
        try:
            profile = json.loads(profile_path.read_text(encoding="utf-8", errors="ignore"))
        except json.JSONDecodeError as exc:
            raise SystemExit(f"profile is not valid JSON: {profile_path} ({exc})") from exc
        # The prompt builder reads the profile with .get(), so it must be an object.
        if not isinstance(profile, dict):
            raise SystemExit(f"profile must be a JSON object: {profile_path}")

    prompt = build_voice_draft_prompt(draft, profile, args.angle or "", args.constraints or "")
    write_or_print(prompt, args.out)
    return 0
2894
+
2895
+
1938
2896
  def build_parser() -> argparse.ArgumentParser:
1939
2897
  parser = argparse.ArgumentParser(description="Portable Hold Your Voice helpers")
1940
2898
  sub = parser.add_subparsers(dest="command", required=True)
@@ -2000,6 +2958,27 @@ def build_parser() -> argparse.ArgumentParser:
2000
2958
  pev.add_argument("--new-samples", nargs="*", default=None, help="additional new writing samples to merge (optional)")
2001
2959
  pev.set_defaults(func=cmd_profile_evolve)
2002
2960
 
2961
+ # NEW: voice-first commands
2962
+ vs = sub.add_parser("voice-score", help="score text for voice quality: storytelling, conversation, specificity, tone")
2963
+ vs.add_argument("draft", help="draft file, or '-' for stdin")
2964
+ vs.add_argument("--format", choices=["json", "text"], default="text")
2965
+ vs.set_defaults(func=cmd_voice_score)
2966
+
2967
+ vf = sub.add_parser("verify", help="scan and report pattern breakdown by rule")
2968
+ vf.add_argument("draft", help="draft file, or '-' for stdin")
2969
+ vf.add_argument("--format", choices=["json", "text"], default="text")
2970
+ vf.add_argument("--fail-on-hit", action="store_true", help="exit 2 when issues are found")
2971
+ vf.add_argument("--meta", help="meta JSON file for learned pattern filtering")
2972
+ vf.set_defaults(func=cmd_verify)
2973
+
2974
+ vdp = sub.add_parser("voice-draft-prompt", help="generate a full-draft voice rewrite prompt")
2975
+ vdp.add_argument("draft", help="draft file, or '-' for stdin")
2976
+ vdp.add_argument("--profile", help="voice profile JSON file")
2977
+ vdp.add_argument("--angle", default="", help="writing angle or intent")
2978
+ vdp.add_argument("--constraints", default="", help="extra constraints")
2979
+ vdp.add_argument("--out", help="write prompt to this path")
2980
+ vdp.set_defaults(func=cmd_voice_draft_prompt)
2981
+
2003
2982
  return parser
2004
2983
 
2005
2984