@holdyourvoice/hyv 2.4.0 → 2.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -582,49 +582,6 @@ ABSTRACT_STYLE_WORDS = {
582
582
  "reinvention",
583
583
  }
584
584
 
585
- # --- Expanded AI vocabulary for 2025-2026 models ---
586
# Lower-case words and phrases treated as AI-writing fingerprints.
# Entries containing a space are phrase-level compounds (checked as
# substrings); the remaining entries are single words. Each term is listed
# exactly once -- repeating a literal inside a set has no effect and only hides
# what the list really contains.
AI_VOCAB_EXPANDED = {
    # GPT-4o / Claude fingerprint words
    "inherently", "underscores", "arguably", "notably", "intrinsically",
    "fundamentally", "nuanced", "multifaceted", "encapsulate",
    "delve", "tapestry", "underscore", "testament",
    # Phrase-level compounds (checked as substrings)
    "in the realm of", "it's worth diving into", "the intersection of",
    "a nuanced understanding", "the broader implications", "shed light on",
    "robust framework", "it's important to note", "worth noting that",
    "at the end of the day", "the reality is", "here's the thing",
    # 2025-2026 model fingerprints
    "it's worth mentioning", "let's unpack", "let's break down",
    "to put it simply", "in a nutshell", "the bottom line",
    "what's fascinating", "what's interesting", "what's remarkable",
    "the key takeaway", "the key insight", "the key difference",
}
602
-
603
- # --- Writing craft signals (from Magnetic Email principles) ---
604
# Case-insensitive detector for narrative cues. One alternative per line:
# recent-time references, "I was <doing>" / "we were <doing>" scene setters,
# reported speech from a named relation, memory openers, and
# "the sort/kind of X you/that" constructions.
STORYTELLING_SIGNALS = re.compile(
    "|".join(
        (
            r"\b(?:yesterday|last\s+(?:week|month|year|night)|this\s+morning|earlier\s+today)\b",
            r"\b(?:i\s+was\s+(?:sitting|standing|walking|driving|lying)|we\s+were\s+(?:enjoying|having|drinking))\b",
            r"\b(?:my\s+(?:wife|husband|friend|mother|father|brother|sister|colleague)\s+(?:said|told|asked|laughed))\b",
            r"\b(?:i\s+remember|i\s+recall|i\s+once|i\s+used\s+to)\b",
            r"\b(?:the\s+sort\s+of|the\s+kind\s+of)\s+\w+\s+(?:you|that)\b",
        )
    ),
    re.IGNORECASE,
)
612
-
613
# Case-insensitive detector for conversational asides and direct-address cues.
# The question tags ("right?", "see?", "get it?", "makes sense?") end in "?",
# which is not a word character, so they must NOT be followed by \b: a trailing
# \b after "?" only matches when a letter or digit comes immediately next,
# which made those tags unmatchable before whitespace or end of text.
CONVERSATIONAL_SIGNALS = re.compile(
    r"\b(?:let'?s\s+be\s+real|look|listen|here'?s\s+what|here'?s\s+why|think\s+about\s+it)\b|"
    r"\b(?:you\s+know\b|(?:right|see|get\s+it|makes\s+sense)\?)|"
    r"\b(?:i'?m\s+not\s+(?:gonna|going\s+to)\s+lie|i'?ll\s+be\s+honest|real\s+talk)\b|"
    r"\b(?:picture\s+this|imagine\s+this|close\s+your\s+eyes)\b|"
    r"\b(?:by\s+the\s+way|btw|funny\s+thing|random\s+thought)\b",
    re.I,
)
621
-
622
# Case-sensitive detector for concrete details. Alternatives, in order:
# grouped numbers with an optional unit suffix, "<Month> <day>" dates, and
# runs of two or more capitalized words (proper nouns).
SPECIFICITY_SIGNALS = re.compile(
    "|".join(
        (
            r"\b\d{1,3}(?:,\d{3})*(?:\.\d+)?(?:%|percent|k|K|M|B)?\b",
            r"\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2}\b",
            r"\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)+\b",  # Proper nouns
        )
    )
)
627
-
628
585
  GENERIC_OPENERS = re.compile(
629
586
  r"^(?:most|many|some|all)\s+(?:brands|teams|people|founders|companies|businesses|organizations|leaders)\b|"
630
587
  r"^(?:in\s+)?(?:today'?s|the)\s+(?:fast.paced|ever.evolving|modern|digital|current|contemporary)\s+(?:world|age|era|landscape|economy)\b",
@@ -759,535 +716,6 @@ def infer_argument_pattern(text: str) -> str:
759
716
  return "mixed"
760
717
 
761
718
 
762
- # =============================================================================
763
- # VOICE-FIRST ANALYSIS FUNCTIONS
764
- # =============================================================================
765
-
766
def vocabulary_fingerprint(text: str, limit: int = 50) -> dict[str, Any]:
    """Summarize a writer's recurring vocabulary.

    Args:
        text: Sample text to analyze.
        limit: Maximum number of distinctive words to report.

    Returns:
        A dict with ``distinctive_words``, ``signature_phrases``,
        ``sentence_starters``, ``total_words`` and ``unique_words``. Texts with
        fewer than 10 words return empty lists but the same set of keys, so
        callers can read every field without checking the sample size first.
    """
    word_list = [w.lower() for w in words(text)]
    total = len(word_list)
    if total < 10:
        return {
            "distinctive_words": [],
            "signature_phrases": [],
            "sentence_starters": [],
            "total_words": total,
            "unique_words": len(set(word_list)),
        }

    # Word frequency (insertion-ordered, so ties keep first-seen order below).
    freq: dict[str, int] = {}
    for w in word_list:
        freq[w] = freq.get(w, 0) + 1

    # Function words that say nothing about an individual writer's voice.
    COMMON_WORDS = {
        "the", "a", "an", "is", "are", "was", "were", "be", "been", "being",
        "have", "has", "had", "do", "does", "did", "will", "would", "could",
        "should", "may", "might", "shall", "can", "to", "of", "in", "for",
        "on", "with", "at", "by", "from", "as", "into", "through", "during",
        "before", "after", "above", "below", "between", "and", "but", "or",
        "nor", "not", "so", "yet", "both", "either", "neither", "each",
        "every", "all", "any", "few", "more", "most", "other", "some", "such",
        "no", "only", "own", "same", "than", "too", "very", "just", "because",
        "if", "when", "where", "how", "what", "which", "who", "whom", "this",
        "that", "these", "those", "i", "me", "my", "we", "our", "you", "your",
        "he", "him", "his", "she", "her", "it", "its", "they", "them", "their",
    }

    # Distinctive words: used at least twice, longer than two characters, and
    # not a common function word.
    distinctive = sorted(
        [(w, c) for w, c in freq.items() if c >= 2 and w not in COMMON_WORDS and len(w) > 2],
        key=lambda x: -x[1],
    )[:limit]

    # Signature phrases: recurring two- and three-word combinations.
    bigrams: dict[str, int] = {}
    for first, second in zip(word_list, word_list[1:]):
        bg = f"{first} {second}"
        bigrams[bg] = bigrams.get(bg, 0) + 1
    trigrams: dict[str, int] = {}
    for first, second, third in zip(word_list, word_list[1:], word_list[2:]):
        tg = f"{first} {second} {third}"
        trigrams[tg] = trigrams.get(tg, 0) + 1

    # Bigrams need 3+ uses and a non-common first word; trigrams need 2+ uses.
    # The final sort is stable, so equal counts keep bigrams ahead of trigrams
    # and first-seen order within each group.
    signature_phrases = [
        {"phrase": phrase, "count": count}
        for phrase, count in bigrams.items()
        if count >= 3 and phrase.split()[0] not in COMMON_WORDS
    ]
    signature_phrases += [
        {"phrase": phrase, "count": count}
        for phrase, count in trigrams.items()
        if count >= 2
    ]
    signature_phrases = sorted(signature_phrases, key=lambda x: -x["count"])[:20]

    # Sentence starters: the first (up to) three words of each sentence;
    # sentences with fewer than two words are ignored.
    starters: dict[str, int] = {}
    for sent in sentences(text):
        sw = words(sent.lower())[:3]
        if len(sw) >= 2:
            key = " ".join(sw)
            starters[key] = starters.get(key, 0) + 1
    top_starters = sorted(starters.items(), key=lambda x: -x[1])[:10]

    return {
        "distinctive_words": [{"word": w, "count": c} for w, c in distinctive],
        "signature_phrases": signature_phrases,
        "sentence_starters": [{"phrase": p, "count": c} for p, c in top_starters],
        "total_words": total,
        "unique_words": len(freq),
    }
833
-
834
-
835
def rhythm_markov(text: str) -> dict[str, Any]:
    """Build a Markov transition matrix over sentence-length buckets.

    Each non-empty sentence is bucketed by word count as ``short`` (<= 8),
    ``medium`` (9-16), ``long`` (17-25) or ``very_long`` (26+); the function
    then measures how often each bucket follows each other bucket.

    Args:
        text: Sample text to analyze.

    Returns:
        A dict with:
          * ``transitions``: ``{src_bucket: {dst_bucket: probability}}``.
          * ``distribution``: share of sentences per bucket.
          * ``pattern``: ``"uniform_medium"``, ``"punchy_mixed"``,
            ``"varied"``, the dominant bucket name, or
            ``"insufficient_data"`` when fewer than 5 sentences have words.
          * ``avg_length``: mean words per sentence.
          * ``length_variance``: despite the name, the population *standard
            deviation* of sentence length (the key is kept for compatibility).
        The insufficient-data result carries the same keys with empty/zero
        values, plus the legacy ``length_buckets`` key.
    """
    # Tokenize each sentence once and drop sentences without any words.
    lengths = [len(tokens) for tokens in (words(s) for s in sentences(text)) if tokens]
    if len(lengths) < 5:
        return {
            "transitions": {},
            "length_buckets": [],  # legacy key from the original short-text result
            "distribution": {},
            "pattern": "insufficient_data",
            "avg_length": 0.0,
            "length_variance": 0.0,
        }

    def bucket(word_count: int) -> str:
        """Map a sentence word count to its length bucket."""
        if word_count <= 8:
            return "short"
        if word_count <= 16:
            return "medium"
        if word_count <= 25:
            return "long"
        return "very_long"

    bucketed = [bucket(n) for n in lengths]

    # Count bucket -> next-bucket transitions between consecutive sentences.
    transitions: dict[str, dict[str, int]] = {}
    for src, dst in zip(bucketed, bucketed[1:]):
        row = transitions.setdefault(src, {})
        row[dst] = row.get(dst, 0) + 1

    # Normalize each row to probabilities.
    transition_probs: dict[str, dict[str, float]] = {}
    for src, dsts in transitions.items():
        row_total = sum(dsts.values())
        transition_probs[src] = {dst: round(count / row_total, 3) for dst, count in dsts.items()}

    # Share of sentences in each bucket.
    bucket_counts: dict[str, int] = {}
    for b in bucketed:
        bucket_counts[b] = bucket_counts.get(b, 0) + 1
    bucket_dist = {b: round(c / len(bucketed), 3) for b, c in bucket_counts.items()}

    # Classify the dominant rhythm.
    dominant = max(bucket_dist, key=bucket_dist.get) if bucket_dist else "mixed"
    if bucket_dist.get("medium", 0) > 0.6:
        pattern = "uniform_medium"  # AI-like
    elif bucket_dist.get("short", 0) > 0.4 and bucket_dist.get("long", 0) + bucket_dist.get("very_long", 0) > 0.2:
        pattern = "punchy_mixed"  # Human-like conversational
    elif len(set(bucketed)) >= 3:
        pattern = "varied"  # Human-like diverse
    else:
        pattern = dominant

    # Compute the mean once; recomputing it per element made the spread O(n^2).
    mean_length = sum(lengths) / len(lengths)
    spread = math.sqrt(sum((n - mean_length) ** 2 for n in lengths) / len(lengths))

    return {
        "transitions": transition_probs,
        "distribution": bucket_dist,
        "pattern": pattern,
        "avg_length": round(mean_length, 1),
        "length_variance": round(spread, 1),
    }
894
-
895
-
896
def emotional_tone(text: str) -> dict[str, float]:
    """Score text on four keyword-based tone axes.

    Args:
        text: Text to score.

    Returns:
        A dict with ``formality``, ``energy``, ``cynicism`` and ``warmth``,
        each clamped to the range 0-10 and rounded to one decimal place.
    """
    low = text.lower()
    word_list = [w.lower() for w in words(low)]
    total = max(1, len(word_list))  # avoid division by zero on empty text
    sentence_list = sentences(text)

    # Formality: formal connectives push the score up; casual words and
    # contractions pull it down from the neutral midpoint of 5.
    FORMAL = {"therefore", "furthermore", "moreover", "consequently", "nevertheless", "hence",
              "accordingly", "thus", "whereby", "herein", "thereof", "wherein", "shall", "henceforth"}
    CASUAL = {"gonna", "wanna", "gotta", "kinda", "sorta", "yeah", "nah", "yep", "nope",
              "ok", "okay", "cool", "awesome", "stuff", "things", "basically", "honestly",
              "literally", "totally", "pretty", "super", "really", "damn", "hell", "crap"}
    formal_count = sum(1 for w in word_list if w in FORMAL)
    casual_count = sum(1 for w in word_list if w in CASUAL)
    # A contraction suffix is attached to the preceding word, so anchor on a
    # preceding word character. The earlier form `\bn't` required a word
    # boundary *before* the "n", which never exists inside "don't"/"can't",
    # so negative contractions were silently left out of the count.
    contractions = len(re.findall(r"(?<=\w)(?:n't|'(?:re|ve|ll|d|m|s))\b", low))
    formality = max(0, min(10, 5 + (formal_count - casual_count - contractions * 0.3) * 10 / total))

    # Energy: exclamation marks, short sentences (<= 6 words), action verbs.
    exclamations = text.count("!")
    short_sents = sum(1 for s in sentence_list if len(words(s)) <= 6)
    ACTION_VERBS = {"go", "run", "build", "create", "make", "do", "get", "take", "start",
                    "stop", "push", "pull", "drive", "hit", "crush", "nail", "smash", "kill"}
    action_count = sum(1 for w in word_list if w in ACTION_VERBS)
    sent_count = max(1, len(sentence_list))
    energy = max(0, min(10, 3 + exclamations * 2 / sent_count + short_sents / sent_count * 3 + action_count * 5 / total))

    # Cynicism: negative qualifiers, hedging, dismissive words.
    CYNICAL = {"but", "however", "unfortunately", "sadly", "honestly", "actually", "look",
               "listen", "truth", "reality", "problem", "issue", "broken", "failed", "wrong",
               "terrible", "awful", "garbage", "rubbish", "crap", "bullshit", "stupid"}
    cyn_count = sum(1 for w in word_list if w in CYNICAL)
    cynicism = max(0, min(10, 2 + cyn_count * 8 / total))

    # Warmth: empathy/inclusive vocabulary plus first-person pronouns.
    WARMTH = {"we", "us", "our", "together", "friend", "love", "care", "hope", "wish",
              "happy", "glad", "grateful", "thankful", "appreciate", "welcome", "please"}
    warmth_count = sum(1 for w in word_list if w in WARMTH)
    first_person = sum(1 for w in word_list if w in {"i", "me", "my", "we", "us", "our"})
    warmth = max(0, min(10, 3 + warmth_count * 8 / total + first_person * 3 / total))

    return {
        "formality": round(formality, 1),
        "energy": round(energy, 1),
        "cynicism": round(cynicism, 1),
        "warmth": round(warmth, 1),
    }
943
-
944
-
945
def vocabulary_diversity(text: str) -> dict[str, float]:
    """Compute vocabulary diversity metrics for a text.

    Args:
        text: Text to analyze.

    Returns:
        A dict with:
          * ``ttr``: type-token ratio (unique words / total words).
          * ``yules_k``: Yule's characteristic K,
            ``10^4 * (sum(m^2 * V_m) - N) / N^2``, where ``V_m`` is the number
            of distinct words used exactly ``m`` times and ``N`` is the total
            word count. Lower means more diverse.
          * ``hapax_ratio``: words used exactly once / total words.
          * ``total_words`` and ``unique_words``.
        Texts with fewer than 20 words return zeros for the three ratios but
        the same set of keys.
    """
    word_list = [w.lower() for w in words(text)]
    total = len(word_list)
    if total < 20:
        return {
            "ttr": 0,
            "yules_k": 0,
            "hapax_ratio": 0,
            "total_words": total,
            "unique_words": len(set(word_list)),
        }

    freq: dict[str, int] = {}
    for w in word_list:
        freq[w] = freq.get(w, 0) + 1

    # Type-token ratio.
    ttr = len(freq) / total

    # Hapax legomena ratio.
    hapax = sum(1 for c in freq.values() if c == 1)
    hapax_ratio = hapax / total

    # Frequency spectrum: how many distinct words occur exactly m times.
    spectrum: dict[int, int] = {}
    for c in freq.values():
        spectrum[c] = spectrum.get(c, 0) + 1

    # Yule's K subtracts N from the second moment; without that term a text
    # with no repeated words would score 10^4 / N instead of 0.
    second_moment = sum(m * m * v for m, v in spectrum.items())
    yules_k = 10000 * (second_moment - total) / (total * total)

    return {
        "ttr": round(ttr, 3),
        "yules_k": round(yules_k, 1),
        "hapax_ratio": round(hapax_ratio, 3),
        "total_words": total,
        "unique_words": len(freq),
    }
976
-
977
-
978
def ngram_repetition(text: str) -> dict[str, Any]:
    """Detect repeated three- and four-word sequences.

    Args:
        text: Text to analyze.

    Returns:
        A dict with:
          * ``repeated_trigrams``: up to 20 trigrams seen 3+ times, most
            frequent first, as ``{"phrase", "count"}`` dicts.
          * ``repeated_fourgrams``: up to 10 four-grams seen 2+ times.
          * ``echo_score``: share (0-1) of word positions that lie inside at
            least one occurrence of a trigram seen 3+ times.
        Texts with fewer than 20 words return empty lists and a zero score
        under the same keys.
    """
    word_list = [w.lower() for w in words(text)]
    if len(word_list) < 20:
        return {"repeated_trigrams": [], "repeated_fourgrams": [], "echo_score": 0}

    # Trigram starting at each position, plus its frequency.
    trigram_at = [
        f"{word_list[i]} {word_list[i+1]} {word_list[i+2]}"
        for i in range(len(word_list) - 2)
    ]
    trigrams: dict[str, int] = {}
    for tg in trigram_at:
        trigrams[tg] = trigrams.get(tg, 0) + 1

    # Repeated trigrams (3+ times)
    repeated = sorted(
        [(tg, c) for tg, c in trigrams.items() if c >= 3],
        key=lambda x: -x[1],
    )[:20]

    # 4-gram frequency
    fourgrams: dict[str, int] = {}
    for i in range(len(word_list) - 3):
        fg = f"{word_list[i]} {word_list[i+1]} {word_list[i+2]} {word_list[i+3]}"
        fourgrams[fg] = fourgrams.get(fg, 0) + 1
    repeated_4 = sorted(
        [(fg, c) for fg, c in fourgrams.items() if c >= 2],
        key=lambda x: -x[1],
    )[:10]

    # Echo score: mark every word position covered by a repeated trigram.
    # Counting `occurrences * 3` instead would count a word once per
    # overlapping trigram and inflate the score for any repeated run longer
    # than three words.
    covered: set[int] = set()
    for start, tg in enumerate(trigram_at):
        if trigrams[tg] >= 3:
            covered.update((start, start + 1, start + 2))
    echo_score = len(covered) / len(word_list)

    return {
        "repeated_trigrams": [{"phrase": t, "count": c} for t, c in repeated],
        "repeated_fourgrams": [{"phrase": f, "count": c} for f, c in repeated_4],
        "echo_score": round(echo_score, 3),
    }
1015
-
1016
-
1017
def perplexity_proxy(text: str) -> dict[str, Any]:
    """Estimate how predictable the text's word transitions are.

    Bigram statistics are built from the text itself; each sentence is then
    scored by the average probability of its word-to-word transitions under
    those statistics. Higher scores mean more predictable (more AI-like) text.

    Args:
        text: Text to analyze.

    Returns:
        A dict with:
          * ``avg_predictability``: mean sentence score (0-1).
          * ``low_perplexity_sentences``: up to 10 sentences scoring above
            0.7, most predictable first, each as ``{"line", "score", "text"}``
            where ``line`` is 1-based (0 if the sentence cannot be located in
            ``text``) and ``text`` is truncated to 120 characters.
          * ``score``: same value as ``avg_predictability``.
        Texts with fewer than 10 words return zeros and an empty list.
    """
    word_list = [w.lower() for w in words(text)]
    if len(word_list) < 10:
        return {"avg_predictability": 0, "low_perplexity_sentences": [], "score": 0}

    # Bigram counts: first word -> {next word -> occurrences}.
    bigrams: dict[str, dict[str, int]] = {}
    for w1, w2 in zip(word_list, word_list[1:]):
        followers = bigrams.setdefault(w1, {})
        followers[w2] = followers.get(w2, 0) + 1

    # Outgoing-transition totals, computed once rather than per lookup.
    transition_totals = {w1: sum(followers.values()) for w1, followers in bigrams.items()}

    sentence_scores: list[tuple[int, float, str]] = []
    cursor = 0  # search start, so a repeated sentence maps to its own occurrence
    for sent in sentences(text):
        sw = [w.lower() for w in words(sent)]
        if len(sw) < 3:
            continue
        predictability = 0.0
        count = 0
        for w1, w2 in zip(sw, sw[1:]):
            if w1 in bigrams:
                predictability += bigrams[w1].get(w2, 0) / transition_totals[w1]
                count += 1
        if count == 0:
            continue

        # Prefer the next occurrence after the previous match; fall back to a
        # search from the start, then to line 0 when the sentence text does not
        # appear verbatim in `text`.
        pos = text.find(sent, cursor)
        if pos == -1:
            pos = text.find(sent)
        if pos == -1:
            line_no = 0
        else:
            line_no = text.count("\n", 0, pos) + 1
            cursor = max(cursor, pos + len(sent))
        sentence_scores.append((line_no, predictability / count, sent.strip()[:120]))

    # Flag sentences with unusually high predictability (> 0.7)
    low_perplexity = [entry for entry in sentence_scores if entry[1] > 0.7]
    low_perplexity.sort(key=lambda x: -x[1])

    overall_avg = sum(s for _, s, _ in sentence_scores) / max(1, len(sentence_scores))

    return {
        "avg_predictability": round(overall_avg, 3),
        "low_perplexity_sentences": [
            {"line": line_no, "score": round(s, 3), "text": t}
            for line_no, s, t in low_perplexity[:10]
        ],
        "score": round(overall_avg, 3),  # Higher = more predictable = more AI-like
    }
1067
-
1068
-
1069
def cross_pattern_density(hits: list[dict[str, Any]], text: str) -> list[dict[str, Any]]:
    """Report paragraphs where pattern hits are unusually dense.

    Args:
        hits: Hit dicts; each hit's ``"line"`` value (default 0) is compared
            against the line span of every paragraph.
        text: The text the hits refer to.

    Returns:
        Up to 10 dicts (``line``, ``density``, ``hits``, ``words``, ``text``),
        densest first, for paragraphs of at least 20 words whose
        hits-per-word ratio exceeds 0.05.
    """
    blocks = paragraphs(text)
    if not blocks:
        return []

    dense: list[dict[str, Any]] = []
    cursor = 0
    for block in blocks:
        start = text.find(block, cursor)
        if start < 0:
            # Paragraph text not found from here on; nudge the cursor and move on.
            cursor += 1
            continue
        cursor = start + len(block)

        first_line = text[:start].count("\n") + 1
        last_line = first_line + block.count("\n")
        n_words = len(words(block))

        # Hits whose line number falls inside this paragraph's line span.
        matched = [h for h in hits if first_line <= h.get("line", 0) <= last_line]

        if n_words < 20:
            continue
        density = len(matched) / n_words
        if density <= 0.05:  # report only when more than 5% of words trigger patterns
            continue
        dense.append({
            "line": first_line,
            "density": round(density, 3),
            "hits": len(matched),
            "words": n_words,
            "text": block.strip()[:160],
        })

    dense.sort(key=lambda entry: -entry["density"])
    return dense[:10]
1106
-
1107
-
1108
def storytelling_score(text: str) -> dict[str, Any]:
    """Score text for storytelling elements (TLS: Time, Location, Senses).

    Based on Kieran Drew's Magnetic Email principles.

    Returns:
        A dict with the overall ``score`` (0-1), the individual counts
        (``time_references``, ``location_references``, ``sensory_words``,
        ``dialogue_markers``) and ``has_story_opener``, which is true when a
        storytelling signal appears in the first 500 characters.
    """
    lowered = text.lower()
    sentence_total = max(1, len(sentences(text)))

    # Time, location and sensory vocabularies, each counted on lowered text.
    time_re = re.compile(
        r"\b(?:yesterday|last\s+(?:week|month|year|night)|this\s+morning|earlier\s+today|"
        r"monday|tuesday|wednesday|thursday|friday|saturday|sunday|"
        r"\d{1,2}(?:am|pm)|o'?clock|morning|evening|afternoon)\b", re.I
    )
    place_re = re.compile(
        r"\b(?:at\s+the|in\s+the|on\s+the|inside|outside|upstairs|downstairs|"
        r"kitchen|office|gym|cafe|coffee\s+shop|restaurant|car|train|plane|bed)\b", re.I
    )
    senses_re = re.compile(
        r"\b(?:saw|heard|felt|tasted|smelled|smelt|touch|touched|"
        r"bright|dark|loud|quiet|warm|cold|hot|sweet|bitter|sour|"
        r"soft|hard|smooth|rough|wet|dry|sharp|dull)\b", re.I
    )
    time_count = len(time_re.findall(lowered))
    place_count = len(place_re.findall(lowered))
    senses_count = len(senses_re.findall(lowered))

    # Dialogue: every quote character counts as one marker.
    quote_count = len(re.findall(r'[""\u201c\u201d]', text))

    # Story opener (snapshot pattern) near the start of the text.
    opens_with_story = bool(STORYTELLING_SIGNALS.search(text[:500]))

    # Signals per sentence, saturating at 0.3 signals per sentence.
    signal_total = time_count + place_count + senses_count + quote_count
    tls = min(1.0, signal_total / max(1, sentence_total * 0.3))

    return {
        "score": round(tls, 3),
        "time_references": time_count,
        "location_references": place_count,
        "sensory_words": senses_count,
        "dialogue_markers": quote_count,
        "has_story_opener": opens_with_story,
    }
1155
-
1156
-
1157
def conversational_score(text: str) -> dict[str, Any]:
    """Score text for conversational tone vs. lecture/speech tone.

    Based on the 'Write conversations not speeches' principle.

    Args:
        text: Text to score.

    Returns:
        A dict with ``score`` (0-1) and the raw counts behind it:
        ``direct_address``, ``questions``, ``contractions``, ``first_person``,
        ``conversational_phrases`` and ``passive_voice``.
    """
    low = text.lower()
    sentence_list = sentences(text)
    total_sents = max(1, len(sentence_list))

    # Direct address (you/your)
    direct_address = len(re.findall(r"\b(?:you|your|you're|you've|you'll)\b", low))

    # Questions (conversational marker)
    questions = sum(1 for s in sentence_list if s.strip().endswith("?"))

    # Contractions (casual tone). The suffix is attached to the preceding
    # word, so anchor on a preceding word character. The earlier form `\bn't`
    # required a word boundary before the "n", which never exists inside
    # "don't"/"can't", so negative contractions were never counted.
    contractions = len(re.findall(r"(?<=\w)(?:n't|'(?:re|ve|ll|d|m|s))\b", low))

    # First person (personal)
    first_person = len(re.findall(r"\b(?:i|me|my|we|us|our)\b", low))

    # Conversational phrases
    conv_hits = len(CONVERSATIONAL_SIGNALS.findall(low))

    # Passive voice (anti-conversational): a form of "be" followed by an -ed word.
    passive = len(re.findall(r"\b(?:is|are|was|were|been|being|be)\s+\w+ed\b", low))

    # Conversational markers per word (questions and set phrases weighted up),
    # minus a penalty for passive constructions per sentence.
    total_words = max(1, len(words(text)))
    conv_ratio = (direct_address + questions * 3 + contractions + first_person + conv_hits * 2) / total_words
    passive_ratio = passive / total_sents
    score = min(1.0, conv_ratio * 10 - passive_ratio * 0.5)

    return {
        "score": round(max(0, score), 3),
        "direct_address": direct_address,
        "questions": questions,
        "contractions": contractions,
        "first_person": first_person,
        "conversational_phrases": conv_hits,
        "passive_voice": passive,
    }
1197
-
1198
-
1199
def specificity_score(text: str) -> dict[str, Any]:
    """Score how concrete a text is.

    Counts numbers, capitalized words that are not sentence-initial, dates and
    quote pairs, and relates their sum to the total word count.

    Returns:
        A dict with ``score`` (``ratio * 15`` capped at 1.0), the four counts
        (``numbers``, ``proper_nouns``, ``dates``, ``quotes``) and the raw
        ``ratio`` of specific items per word.
    """
    word_total = max(1, len(words(text)))

    # Numbers, optionally carrying a %/k/K/M/B suffix.
    number_count = len(re.findall(r"\b\d+(?:\.\d+)?(?:%|k|K|M|B)?\b", text))

    # Proper nouns: capitalized words that do not open a sentence, ignoring a
    # few capitalized function words.
    ignored_capitals = {"I", "The", "A", "An"}
    proper_count = sum(
        1
        for sent in sentences(text)
        for position, token in enumerate(words(sent))
        if position > 0 and token[0].isupper() and token not in ignored_capitals
    )

    # Dates: numeric d/m/y forms, "<Mon...> <day>[, <year>]", or a bare 4-digit year.
    date_count = len(re.findall(
        r"\b(?:\d{1,2}[/-]\d{1,2}[/-]\d{2,4}|"
        r"(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\w*\s+\d{1,2}(?:,?\s+\d{4})?|"
        r"\d{4})\b", text
    ))

    # Quotes (specific attribution): two quote characters make one quote.
    quote_count = len(re.findall(r'[""\u201c\u201d]', text)) // 2

    ratio = (number_count + proper_count + date_count + quote_count) / word_total

    return {
        "score": round(min(1.0, ratio * 15), 3),
        "numbers": number_count,
        "proper_nouns": proper_count,
        "dates": date_count,
        "quotes": quote_count,
        "ratio": round(ratio, 4),
    }
1239
-
1240
-
1241
def profile_strength(profile: dict[str, Any]) -> dict[str, Any]:
    """Compute a 0-100 strength score for a voice profile.

    Args:
        profile: Profile dict. Reads ``source_count``, ``word_count`` and
            ``signature`` (with its ``opening_moves``, ``anchors`` and
            ``cadence`` lists). Missing or ``None`` values count as empty.

    Returns:
        A dict with ``score`` (rounded, capped at 100), ``label``
        (``"strong"`` >= 80, ``"moderate"`` >= 50, ``"weak"`` >= 25, else
        ``"insufficient"``) and a per-component ``breakdown``.
    """
    # `or` fallbacks tolerate profiles that store an explicit None.
    source_count = profile.get("source_count") or 0
    word_count = profile.get("word_count") or 0
    signature = profile.get("signature") or {}

    # Source count score (0-30): 3 points per source.
    source_score = min(30, source_count * 3)

    # Word count score (0-30): 1 point per 100 words.
    word_score = min(30, word_count / 100)

    # Diversity score (0-20): 2 points per opening move or anchor.
    opening_moves = len(signature.get("opening_moves") or [])
    anchors = len(signature.get("anchors") or [])
    diversity_score = min(20, (opening_moves + anchors) * 2)

    # Cadence score (0-10): 2.5 points per cadence entry.
    cadence = signature.get("cadence") or []
    cadence_score = min(10, len(cadence) * 2.5)

    # Recency score (0-10): fixed midpoint; source timestamps are not examined here.
    recency_score = 5

    total = source_score + word_score + diversity_score + cadence_score + recency_score

    if total >= 80:
        label = "strong"
    elif total >= 50:
        label = "moderate"
    elif total >= 25:
        label = "weak"
    else:
        label = "insufficient"

    return {
        "score": round(min(100, total)),
        "label": label,
        "breakdown": {
            "sources": source_score,
            "words": word_score,
            "diversity": diversity_score,
            "cadence": cadence_score,
            "recency": recency_score,
        },
    }
1289
-
1290
-
1291
719
  def first_words(text: str, count: int = 7) -> str:
1292
720
  found = words(text.lower())
1293
721
  return " ".join(found[:count])
@@ -1385,7 +813,7 @@ def build_profile(paths: list[str], name: str) -> dict[str, Any]:
1385
813
  voice_rules.append("study these sample opening moves before drafting: " + "; ".join(opening_moves[:4]))
1386
814
 
1387
815
  return {
1388
- "profile_version": "hold-your-voice-portable-v2",
816
+ "profile_version": "hold-your-voice-portable-v1",
1389
817
  "name": name,
1390
818
  "source_count": len(samples),
1391
819
  "sources": [{"path": sample["path"], "chars": len(sample["text"])} for sample in samples],
@@ -1400,11 +828,6 @@ def build_profile(paths: list[str], name: str) -> dict[str, Any]:
1400
828
  "anchors": anchors,
1401
829
  "never_list": never_list,
1402
830
  },
1403
- "voice_fingerprint": vocabulary_fingerprint(combined),
1404
- "rhythm": rhythm_markov(combined),
1405
- "emotional_tone": emotional_tone(combined),
1406
- "voice_diversity": vocabulary_diversity(combined),
1407
- "voice_strength": None, # computed separately via profile_strength()
1408
831
  "voice_rules": voice_rules,
1409
832
  "ai_eliminator": {
1410
833
  "rewrite_scope": "flagged-lines-only",
@@ -1584,19 +1007,6 @@ def scan_text(text: str) -> list[dict[str, Any]]:
1584
1007
  line_no = text[: match.start()].count("\n") + 1
1585
1008
  hits.append({"line": line_no, "rule": rule_id, "phrase": snippet[:160]})
1586
1009
 
1587
- # Expanded AI vocabulary detection (2025-2026 model fingerprints)
1588
- for line_no, line in enumerate((text or "").splitlines(), 1):
1589
- low = line.lower()
1590
- for term in AI_VOCAB_EXPANDED:
1591
- if " " in term:
1592
- # Multi-word phrase
1593
- if term in low:
1594
- hits.append({"line": line_no, "rule": "ai_vocab_expanded", "phrase": term})
1595
- else:
1596
- # Single word — match with word boundaries
1597
- if re.search(rf"\b{re.escape(term)}\b", low):
1598
- hits.append({"line": line_no, "rule": "ai_vocab_expanded", "phrase": term})
1599
-
1600
1010
  for line_no, line in enumerate((text or "").splitlines(), 1):
1601
1011
  for hit in line_style_hits(line):
1602
1012
  hits.append({"line": line_no, "rule": hit["rule"], "phrase": hit["phrase"], "text": line.strip()[:240]})
@@ -1605,17 +1015,6 @@ def scan_text(text: str) -> list[dict[str, Any]]:
1605
1015
  for structural_hit in _structural_analysis(text):
1606
1016
  hits.append(structural_hit)
1607
1017
 
1608
- # Voice craft signals (from Magnetic Email principles)
1609
- # Lack of storytelling in long text
1610
- story_hits = STORYTELLING_SIGNALS.findall(text or "")
1611
- conv_hits = CONVERSATIONAL_SIGNALS.findall(text or "")
1612
- word_count = len(words(text or ""))
1613
- if word_count > 200:
1614
- if len(story_hits) == 0:
1615
- hits.append({"line": 0, "rule": "voice_no_storytelling", "phrase": f"no storytelling signals in {word_count} words — text reads like a lecture, not a conversation"})
1616
- if len(conv_hits) == 0 and word_count > 300:
1617
- hits.append({"line": 0, "rule": "voice_no_conversation", "phrase": f"no conversational signals in {word_count} words — text speaks at reader, not with them"})
1618
-
1619
1018
  # Staccato triplet detection — only fire when sentences are clearly performative
1620
1019
  sentence_hits = []
1621
1020
  for line_no, line in enumerate((text or "").splitlines(), 1):
@@ -1667,303 +1066,44 @@ def load_draft(path: str) -> tuple[str, str]:
1667
1066
  return str(draft_path), read_text(draft_path)
1668
1067
 
1669
1068
 
1670
- # --- Pattern fix guidance: tells the LLM HOW to fix each pattern type ---
1671
- PATTERN_FIX_GUIDANCE = {
1672
- "landscape_era": "Replace temporal grandstanding with a concrete observation or remove entirely.",
1673
- "formulaic_connector": "Replace formal transitions (Moreover, Furthermore, Additionally) with natural flow or short sentences.",
1674
- "lets_invitation": "Remove the invitation to dive/explore. Just start with the point.",
1675
- "inflated_verbs": "Replace marketing verbs (unlock, leverage, supercharge) with plain verbs (use, build, get).",
1676
- "truth_harsh_reality": "Remove the 'reality/truth is' framing. State the point directly.",
1677
- "ai_vocab_density": "Replace AI-buzzwords with specific, concrete language from the writer's vocabulary.",
1678
- "ai_vocab_expanded": "Replace with plain language. If the phrase is 'it's important to note', just state the point.",
1679
- "abstract_noun_cluster": "Replace abstract nouns with concrete examples, scenes, or specific actions.",
1680
- "ux_buzzwords": "Replace buzzwords (robust, seamless, holistic) with specific descriptions of what the thing actually does.",
1681
- "binary_reframing": "Remove the 'it's not X, it's Y' structure. State the positive claim directly.",
1682
- "not_just_but": "Remove the 'not just X but Y' structure. Pick the stronger point and lead with it.",
1683
- "more_than_just": "Remove 'more than just'. State what it actually is.",
1684
- "founder_cadence": "Remove the performative cadence (here's the thing, the moment X becomes Y). Write plainly.",
1685
- "staccato_drama": "Break the staccato pattern. Vary sentence length. Add a longer sentence.",
1686
- "restatement_polish": "Remove 'in other words' / 'which is another way of saying'. Say it once, clearly.",
1687
- "spoiler_reveal": "Remove 'spoiler alert' and 'here's the truth' framing.",
1688
- "hedging_noncommittal": "Remove hedging (it depends, no one-size-fits-all). Take a position or cut the sentence.",
1689
- "balanced_contrast": "Remove 'on the other hand' / 'on the flip side'. Pick a side or use 'but' briefly.",
1690
- "empathy_opener": "Remove empathy validation (you're not alone, it's easy to feel). Start with the substance.",
1691
- "journey_cliche": "Remove journey/destination metaphors. State the actual point.",
1692
- "ai_metaphors": "Replace metaphor clusters (beacon, tapestry, north star) with concrete language.",
1693
- "guide_framing": "Remove guide framing (step-by-step, key takeaways, actionable tips). Just write the thing.",
1694
- "wrapping_patterns": "Remove conclusion patterns (at the end of the day, the bottom line). End on a specific detail or thought.",
1695
- "buyer_psychology": "Remove 'people don't buy X, they buy Y' templates. State the point directly.",
1696
- "overwhelm_reassurance": "Remove 'it can feel overwhelming but it doesn't have to be'. Just help.",
1697
- "pros_cons_framing": "Remove pros/cons structure. Make an argument, don't list.",
1698
- "triple_adjective": "Remove triple-adjective stacks. Pick the one that matters.",
1699
- "hidden_depth": "Remove 'behind the scenes' / 'beneath the surface'. State the insight directly.",
1700
- "self_referential": "Remove AI disclaimers (as an AI model, I can't provide).",
1701
- "placeholder_brackets": "Replace [your brand] placeholders with specific examples or remove.",
1702
- "story_templates": "Remove 'imagine this / picture this' templates. Use a real scene or observation.",
1703
- "clickbait_didnt_know": "Remove 'the X you didn't know you needed' framing.",
1704
- "self_referential_restatement": "Remove 'you asked about X, let's break it down'. Just answer.",
1705
- "ted_talk_slogan": "Remove the TED-talk contrastive slogan. State the point plainly.",
1706
- "perfect_marketing_sentence": "This sentence is too polished and generic. Make it specific or cut it.",
1707
- "abstract_noun_cluster": "Too many abstract nouns. Replace with concrete examples or actions.",
1708
- "generic_opening_generalization": "Opens with a sweeping generalization. Start with a specific observation or scene.",
1709
- "voice_question_opener": "Opens with a question. Start with a statement, scene, or observation instead.",
1710
- "voice_lesson_opener": "Opens with a lesson/inspiration claim. Start with a specific moment or example.",
1711
- "cta_ending": "Remove the engagement-bait CTA (let me know if you need help). End on substance.",
1712
- "voice_no_storytelling": "No storytelling signals found. Add a personal scene, specific moment, or concrete example.",
1713
- "voice_no_conversation": "Text reads like a lecture. Address the reader directly (you/your), add a question, or use contractions.",
1714
- "low_burstiness": "Sentence lengths are too uniform. Add a very short sentence (under 6 words) or break a long one.",
1715
- "mechanical_paragraphs": "Paragraphs are all the same length. Combine some, split others, or add a one-liner.",
1716
- "uniform_paragraph_rhythm": "Sentences within paragraphs are all 12-22 words. Vary: some 5 words, some 25.",
1717
- "low_contractions": "Too few contractions. Use don't, can't, it's, you're to sound natural.",
1718
- "formal_hedging_density": "Too many formal hedges (it is important to note). State things directly.",
1719
- "generic_intensifiers": "Too many intensifiers (remarkably, incredibly). Cut them or use specifics.",
1720
- "no_fragments": "No sentence fragments at all — reads over-polished. Add a fragment for texture.",
1721
- "over_structured_lists": "Lists follow a rigid 3-item pattern. Vary list length or break the pattern.",
1722
- }
1723
-
1724
-
1725
- def _dedupe_hits(hits: list[dict[str, Any]]) -> list[dict[str, Any]]:
1726
- """Merge multiple rules per line into one entry with combined rules."""
1727
- by_line: dict[int, dict[str, Any]] = {}
1728
- for hit in hits:
1729
- line = hit.get("line", 0)
1730
- if line not in by_line:
1731
- by_line[line] = {"line": line, "rules": [], "phrases": [], "text": hit.get("text", "")}
1732
- by_line[line]["rules"].append(hit.get("rule", "unknown"))
1733
- phrase = hit.get("phrase", "")
1734
- if phrase and phrase not in by_line[line]["phrases"]:
1735
- by_line[line]["phrases"].append(phrase)
1736
- return sorted(by_line.values(), key=lambda x: x["line"])
1737
-
1738
-
1739
- def _compress_profile_for_prompt(profile: dict[str, Any] | None) -> str:
1740
- """Extract only the actionable voice data from a profile for the LLM prompt.
1741
- Strips out structural metadata, sources, and raw analysis data."""
1742
- if not profile:
1743
- return ""
1744
-
1745
- sig = profile.get("signature", {})
1746
- tone = profile.get("emotional_tone", {})
1747
- fp = profile.get("voice_fingerprint", {})
1748
-
1749
- lines = []
1750
-
1751
- # Voice anchors — the single most important thing
1752
- anchors = sig.get("anchors", [])
1753
- if anchors:
1754
- lines.append("SOUND LIKE THIS:")
1755
- lines.append(f' "{anchors[0][:200]}"')
1756
- if len(anchors) > 1:
1757
- lines.append(f' "{anchors[1][:200]}"')
1758
- lines.append("")
1759
-
1760
- # Rhythm + tone in one line
1761
- cadence = sig.get("cadence", [])
1762
- rhythm_line = cadence[0] if cadence else ""
1763
- tone_parts = []
1764
- if tone:
1765
- if tone.get("formality", 5) < 4:
1766
- tone_parts.append("casual")
1767
- elif tone.get("formality", 5) > 6:
1768
- tone_parts.append("formal")
1769
- if tone.get("energy", 5) > 6:
1770
- tone_parts.append("high-energy")
1771
- if tone.get("cynicism", 5) > 5:
1772
- tone_parts.append("cynical")
1773
- if tone.get("warmth", 5) > 5:
1774
- tone_parts.append("warm")
1775
- tone_str = ", ".join(tone_parts) if tone_parts else "neutral"
1776
- if rhythm_line:
1777
- lines.append(f"RHYTHM: {rhythm_line}. Tone: {tone_str}.")
1778
- else:
1779
- lines.append(f"TONE: {tone_str}.")
1780
- lines.append("")
1781
-
1782
- # Never list — compact
1783
- never = sig.get("never_list", [])
1784
- if never:
1785
- lines.append("BANNED: " + " | ".join(never[:6]))
1786
- lines.append("")
1787
-
1788
- return "\n".join(lines)
1789
-
1790
-
1791
- def _flagged_line_to_instruction(entry: dict[str, Any]) -> str:
1792
- """Convert a deduped hit entry into a compact instruction the LLM will actually follow."""
1793
- line = entry["line"]
1794
- rules = entry["rules"]
1795
- phrases = entry["phrases"]
1796
- phrase_str = phrases[0] if phrases else ""
1797
-
1798
- # Pick the single most specific fix guidance
1799
- guidance = ""
1800
- for rule in rules:
1801
- if rule in PATTERN_FIX_GUIDANCE:
1802
- guidance = PATTERN_FIX_GUIDANCE[rule]
1803
- break
1804
-
1805
- # Compress: line number + what's wrong + what to do
1806
- if line == 0:
1807
- return f"- STRUCTURAL: {guidance}"
1808
- if guidance:
1809
- return f"- L{line} \"{phrase_str[:60]}\": {guidance}"
1810
- return f"- L{line} \"{phrase_str[:60]}\""
1811
-
1812
-
1813
- def apply_replacements(draft: str, replacements_json: str) -> str:
1814
- """Apply LLM-returned replacements to a draft. Returns the patched text."""
1815
- try:
1816
- data = json.loads(replacements_json)
1817
- replacements = data.get("replacements", [])
1818
- except (json.JSONDecodeError, TypeError):
1819
- return draft
1820
-
1821
- lines = draft.splitlines()
1822
- for rep in replacements:
1823
- line_no = rep.get("line", 0)
1824
- text = rep.get("text", "")
1825
- if 1 <= line_no <= len(lines):
1826
- lines[line_no - 1] = text
1827
- return "\n".join(lines)
1828
-
1829
-
1830
- def rewrite_with_verification(
1831
- draft: str,
1832
- profile_text: str | None = None,
1833
- constraints: str = "",
1834
- meta: dict[str, Any] | None = None,
1835
- max_passes: int = 3,
1836
- rewrite_fn=None,
1837
- ) -> dict[str, Any]:
1838
- """Scan → rewrite → rescan loop. Up to max_passes iterations.
1839
-
1840
- Args:
1841
- draft: the original draft text
1842
- profile_text: voice profile JSON string (optional)
1843
- constraints: extra rewrite constraints
1844
- meta: signal meta for learned pattern filtering
1845
- max_passes: maximum rewrite attempts (default 3)
1846
- rewrite_fn: callable(draft, prompt) -> str that returns the LLM's JSON response.
1847
- If None, returns the prompt only (for external LLM execution).
1848
-
1849
- Returns dict with:
1850
- - final_text: the rewritten draft after all passes
1851
- - initial_hits: pattern count before any rewriting
1852
- - final_hits: pattern count after last pass
1853
- - passes_used: how many passes were executed
1854
- - prompts: list of prompts generated (one per pass)
1855
- - pass_details: per-pass hit counts
1856
- """
1857
- initial_hits = scan_text(draft)
1858
- if meta:
1859
- initial_hits = filter_hits_by_weights(initial_hits, meta)
1860
-
1861
- current_text = draft
1862
- prompts = []
1863
- pass_details = []
1864
-
1865
- for pass_num in range(max_passes):
1866
- prompt = build_rewrite_prompt("draft", current_text, profile_text, constraints, meta)
1867
- prompts.append(prompt)
1868
-
1869
- hits = scan_text(current_text)
1870
- if meta:
1871
- hits = filter_hits_by_weights(hits, meta)
1872
-
1873
- pass_details.append({"pass": pass_num + 1, "hits": len(hits)})
1874
-
1875
- if not hits:
1876
- break # clean — no more patterns
1877
-
1878
- if rewrite_fn is None:
1879
- # No LLM available — return prompt for external execution
1880
- break
1881
-
1882
- # Call the LLM
1883
- llm_response = rewrite_fn(current_text, prompt)
1884
- patched = apply_replacements(current_text, llm_response)
1885
-
1886
- if patched == current_text:
1887
- break # LLM didn't change anything — stop
1888
-
1889
- current_text = patched
1890
-
1891
- final_hits = scan_text(current_text)
1892
- if meta:
1893
- final_hits = filter_hits_by_weights(final_hits, meta)
1894
-
1895
- return {
1896
- "final_text": current_text,
1897
- "initial_hits": len(initial_hits),
1898
- "final_hits": len(final_hits),
1899
- "passes_used": len(pass_details),
1900
- "prompts": prompts,
1901
- "pass_details": pass_details,
1902
- }
1903
-
1904
-
1905
1069
  def build_rewrite_prompt(draft_name: str, draft: str, profile_text: str | None, constraints: str = "", meta: dict[str, Any] | None = None) -> str:
1906
1070
  hits = scan_text(draft)
1907
1071
  if meta:
1908
1072
  hits = filter_hits_by_weights(hits, meta)
1909
-
1910
- deduped = _dedupe_hits(hits)
1911
-
1912
- # Build compact issue lines with fix guidance embedded
1913
- issue_lines = [_flagged_line_to_instruction(entry) for entry in deduped]
1914
- issue_block = "\n".join(issue_lines) or "- none found"
1073
+ issue_lines = "\n".join(
1074
+ f"- line {hit['line']} [{hit['rule']}]: {hit.get('phrase', '')}"
1075
+ for hit in hits
1076
+ ) or "- none found by deterministic scan"
1915
1077
 
1916
1078
  numbered_draft = "\n".join(f"{idx}: {line}" for idx, line in enumerate(draft.splitlines(), 1))
1079
+ profile_block = profile_text.strip() if profile_text and profile_text.strip() else "(no voice profile supplied)"
1080
+ constraints_block = constraints.strip() if constraints.strip() else "(none)"
1917
1081
 
1918
- # Compress profile
1919
- profile_block = ""
1920
- if profile_text and profile_text.strip():
1921
- try:
1922
- profile = json.loads(profile_text)
1923
- profile_block = _compress_profile_for_prompt(profile)
1924
- except (json.JSONDecodeError, TypeError):
1925
- profile_block = ""
1926
-
1927
- constraints_line = f"\nCONSTRAINTS: {constraints.strip()}" if constraints and constraints.strip() else ""
1082
+ return f"""Rewrite only the flagged lines. Do not rewrite the whole piece.
1928
1083
 
1929
- # Compact prompt everything the LLM needs, nothing it doesn't
1930
- prompt = f"""Fix only the flagged lines. Return JSON: {{"replacements":[{{"line":1,"text":"fixed line"}}]}}
1084
+ Return only valid JSON in this exact shape:
1085
+ {{"replacements":[{{"line":1,"text":"replacement line"}}]}}
1931
1086
 
1932
- RULES:
1933
- - Only return flagged line numbers. Leave everything else untouched.
1934
- - Keep the original argument. Remove AI patterns — write like a real person.
1935
- - No hooks, CTAs, summaries, or new sections.{constraints_line}
1087
+ Rules:
1088
+ - Include only flagged line numbers.
1089
+ - Preserve unflagged lines exactly by not returning them.
1090
+ - Preserve the original argument and local meaning.
1091
+ - Use the voice profile as the benchmark when present.
1092
+ - Remove AI cadence, polished founder cadence, abstract strategy-deck language, and generic lesson shapes.
1093
+ - Do not add new sections, hooks, CTAs, markdown, bullets, or commentary.
1936
1094
 
1937
- {profile_block}FIX THESE:
1938
- {issue_block}
1095
+ Voice profile:
1096
+ {profile_block}
1939
1097
 
1940
- DRAFT ({draft_name}):
1941
- {numbered_draft}"""
1098
+ Extra constraints:
1099
+ {constraints_block}
1942
1100
 
1943
- return prompt
1101
+ Flagged lines:
1102
+ {issue_lines}
1944
1103
 
1945
-
1946
- def build_voice_draft_prompt(draft: str, profile: dict[str, Any] | None, angle: str = "", constraints: str = "") -> str:
1947
- """Generate a prompt for rewriting an entire draft in the writer's voice."""
1948
- profile_block = _compress_profile_for_prompt(profile) if profile else ""
1949
-
1950
- angle_line = f"\nANGLE: {angle}" if angle else ""
1951
- constraints_line = f"\nCONSTRAINTS: {constraints}" if constraints else ""
1952
-
1953
- prompt = f"""Rewrite this draft in the voice below. Return the full text only — no commentary.
1954
-
1955
- RULES:
1956
- - Keep the argument and key points. Match the voice anchors and rhythm.
1957
- - Open with a specific observation or scene, not a generalization.
1958
- - Use contractions. Vary sentence length. Write to one person ("you").
1959
- - No AI patterns (let's dive in, robust, holistic, moreover, furthermore).
1960
- - No hooks, CTAs, summaries, or motivational closings.
1961
- - End on a specific detail or quiet thought.{angle_line}{constraints_line}
1962
-
1963
- {profile_block}DRAFT:
1964
- {draft}"""
1965
-
1966
- return prompt
1104
+ Draft with line numbers ({draft_name}):
1105
+ {numbered_draft}
1106
+ """
1967
1107
 
1968
1108
 
1969
1109
  DEFAULT_NEVER_LIST = [
@@ -2795,104 +1935,6 @@ def cmd_rewrite_prompt(args: argparse.Namespace) -> int:
2795
1935
  return 0
2796
1936
 
2797
1937
 
2798
- def cmd_voice_score(args: argparse.Namespace) -> int:
2799
- """Score text for voice quality: storytelling, conversation, specificity, tone."""
2800
- name, text = load_draft(args.draft)
2801
- story = storytelling_score(text)
2802
- conv = conversational_score(text)
2803
- spec = specificity_score(text)
2804
- tone = emotional_tone(text)
2805
- diversity = vocabulary_diversity(text)
2806
- perplexity = perplexity_proxy(text)
2807
- ngrams = ngram_repetition(text)
2808
-
2809
- result = {
2810
- "file": name,
2811
- "word_count": len(words(text)),
2812
- "storytelling": story,
2813
- "conversation": conv,
2814
- "specificity": spec,
2815
- "emotional_tone": tone,
2816
- "vocabulary_diversity": diversity,
2817
- "perplexity_proxy": perplexity,
2818
- "ngram_repetition": ngrams,
2819
- "voice_quality": round(
2820
- (story["score"] * 0.25 + conv["score"] * 0.25 + spec["score"] * 0.2 +
2821
- (1 - perplexity["score"]) * 0.15 + diversity["ttr"] * 0.15), 3
2822
- ),
2823
- }
2824
-
2825
- if args.format == "json":
2826
- print(json.dumps(result, indent=2, ensure_ascii=False))
2827
- else:
2828
- print(f"Voice Score for: {name}")
2829
- print(f" Words: {result['word_count']}")
2830
- print(f" Overall voice quality: {result['voice_quality']:.2f}")
2831
- print(f" Storytelling: {story['score']:.2f} (time={story['time_references']}, location={story['location_references']}, senses={story['sensory_words']}, dialogue={story['dialogue_markers']})")
2832
- print(f" Conversation: {conv['score']:.2f} (you/your={conv['direct_address']}, questions={conv['questions']}, contractions={conv['contractions']})")
2833
- print(f" Specificity: {spec['score']:.2f} (numbers={spec['numbers']}, proper_nouns={spec['proper_nouns']}, quotes={spec['quotes']})")
2834
- print(f" Tone: formality={tone['formality']}, energy={tone['energy']}, cynicism={tone['cynicism']}, warmth={tone['warmth']}")
2835
- print(f" Diversity: TTR={diversity['ttr']}, Yule's K={diversity['yules_k']}, hapax={diversity['hapax_ratio']}")
2836
- print(f" Perplexity: {perplexity['score']:.3f} (higher = more predictable = more AI-like)")
2837
- print(f" N-gram echo: {ngrams['echo_score']:.3f}")
2838
- return 0
2839
-
2840
-
2841
- def cmd_verify(args: argparse.Namespace) -> int:
2842
- """Scan a draft, report before/after pattern counts."""
2843
- name, text = load_draft(args.draft)
2844
- hits = scan_text(text)
2845
-
2846
- meta: dict[str, Any] = {}
2847
- if args.meta:
2848
- meta_path = Path(args.meta).expanduser()
2849
- if meta_path.exists():
2850
- try:
2851
- meta = json.loads(meta_path.read_text(encoding="utf-8", errors="ignore"))
2852
- except (json.JSONDecodeError, OSError):
2853
- pass
2854
- if meta:
2855
- hits = filter_hits_by_weights(hits, meta)
2856
-
2857
- # Group hits by rule
2858
- rule_counts: dict[str, int] = {}
2859
- for hit in hits:
2860
- rule = hit.get("rule", "unknown")
2861
- rule_counts[rule] = rule_counts.get(rule, 0) + 1
2862
-
2863
- if args.format == "json":
2864
- print(json.dumps({
2865
- "file": name,
2866
- "total_hits": len(hits),
2867
- "by_rule": dict(sorted(rule_counts.items(), key=lambda x: -x[1])),
2868
- "hits": hits,
2869
- }, indent=2, ensure_ascii=False))
2870
- else:
2871
- print(f"Verification: {name}")
2872
- print(f" Total patterns: {len(hits)}")
2873
- if rule_counts:
2874
- print(f" By rule:")
2875
- for rule, count in sorted(rule_counts.items(), key=lambda x: -x[1]):
2876
- print(f" {rule}: {count}")
2877
- else:
2878
- print(f" No AI patterns detected.")
2879
- return 2 if args.fail_on_hit and hits else 0
2880
-
2881
-
2882
- def cmd_voice_draft_prompt(args: argparse.Namespace) -> int:
2883
- """Generate a full-draft voice rewrite prompt."""
2884
- name, draft = load_draft(args.draft)
2885
- profile = None
2886
- if args.profile:
2887
- profile_path = Path(args.profile).expanduser()
2888
- if not profile_path.exists():
2889
- raise SystemExit(f"profile not found: {profile_path}")
2890
- profile = json.loads(profile_path.read_text(encoding="utf-8", errors="ignore"))
2891
- prompt = build_voice_draft_prompt(draft, profile, args.angle or "", args.constraints or "")
2892
- write_or_print(prompt, args.out)
2893
- return 0
2894
-
2895
-
2896
1938
  def build_parser() -> argparse.ArgumentParser:
2897
1939
  parser = argparse.ArgumentParser(description="Portable Hold Your Voice helpers")
2898
1940
  sub = parser.add_subparsers(dest="command", required=True)
@@ -2958,27 +2000,6 @@ def build_parser() -> argparse.ArgumentParser:
2958
2000
  pev.add_argument("--new-samples", nargs="*", default=None, help="additional new writing samples to merge (optional)")
2959
2001
  pev.set_defaults(func=cmd_profile_evolve)
2960
2002
 
2961
- # NEW: voice-first commands
2962
- vs = sub.add_parser("voice-score", help="score text for voice quality: storytelling, conversation, specificity, tone")
2963
- vs.add_argument("draft", help="draft file, or '-' for stdin")
2964
- vs.add_argument("--format", choices=["json", "text"], default="text")
2965
- vs.set_defaults(func=cmd_voice_score)
2966
-
2967
- vf = sub.add_parser("verify", help="scan and report pattern breakdown by rule")
2968
- vf.add_argument("draft", help="draft file, or '-' for stdin")
2969
- vf.add_argument("--format", choices=["json", "text"], default="text")
2970
- vf.add_argument("--fail-on-hit", action="store_true", help="exit 2 when issues are found")
2971
- vf.add_argument("--meta", help="meta JSON file for learned pattern filtering")
2972
- vf.set_defaults(func=cmd_verify)
2973
-
2974
- vdp = sub.add_parser("voice-draft-prompt", help="generate a full-draft voice rewrite prompt")
2975
- vdp.add_argument("draft", help="draft file, or '-' for stdin")
2976
- vdp.add_argument("--profile", help="voice profile JSON file")
2977
- vdp.add_argument("--angle", default="", help="writing angle or intent")
2978
- vdp.add_argument("--constraints", default="", help="extra constraints")
2979
- vdp.add_argument("--out", help="write prompt to this path")
2980
- vdp.set_defaults(func=cmd_voice_draft_prompt)
2981
-
2982
2003
  return parser
2983
2004
 
2984
2005