humanizer-rb 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,637 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Humanizer
4
+ module Patterns
5
+ Pattern = Struct.new(:id, :name, :category, :description, :weight, :detector, keyword_init: true)
6
+
7
+ # ── Phrase Lists ─────────────────────────────────────
8
+
9
+ SIGNIFICANCE_PHRASES = [
10
+ /marking a pivotal/i, /pivotal moment/i, /pivotal role/i,
11
+ /key role/i, /crucial role/i, /vital role/i, /significant role/i,
12
+ /is a testament/i, /stands as a testament/i, /serves as a testament/i,
13
+ /serves as a reminder/i, /reflects broader/i, /broader trends/i,
14
+ /broader movement/i, /evolving landscape/i, /evolving world/i,
15
+ /setting the stage for/i, /marking a shift/i, /key turning point/i,
16
+ /indelible mark/i, /deeply rooted/i, /focal point/i,
17
+ /symbolizing its ongoing/i, /enduring legacy/i, /lasting impact/i,
18
+ /contributing to the/i, /underscores the importance/i,
19
+ /highlights the significance/i, /represents a shift/i,
20
+ /shaping the future/i, /the evolution of/i, /rich tapestry/i,
21
+ /rich heritage/i, /stands as a beacon/i, /marks a milestone/i,
22
+ /paving the way/i, /charting a course/i,
23
+ ].freeze
24
+
25
+ PROMOTIONAL_WORDS = [
26
+ /\bnestled\b/i, /\bin the heart of\b/i, /\bbreathtaking\b/i,
27
+ /\bmust-visit\b/i, /\bstunning\b/i, /\brenowned\b/i,
28
+ /\bnatural beauty\b/i, /\brich cultural heritage\b/i,
29
+ /\brich history\b/i, /\bcommitment to\b/i, /\bexemplifies\b/i,
30
+ /\bworld-class\b/i, /\bstate-of-the-art\b/i, /\bgame-changing\b/i,
31
+ /\bgame changer\b/i, /\bunparalleled\b/i, /\bprofound\b/i,
32
+ /\bbest-in-class\b/i, /\btrailblazing\b/i, /\bvisionary\b/i,
33
+ /\bcutting-edge\b/i, /\bworldwide recognition\b/i,
34
+ ].freeze
35
+
36
+ VAGUE_ATTRIBUTION_PHRASES = [
37
+ /\bexperts (?:believe|argue|say|suggest|note|agree|contend|have noted)\b/i,
38
+ /\bindustry (?:reports|observers|experts|analysts|leaders|insiders)\b/i,
39
+ /\bobservers have (?:cited|noted|pointed out)\b/i,
40
+ /\bsome critics argue\b/i,
41
+ /\bsome experts (?:say|believe|suggest)\b/i,
42
+ /\bseveral sources\b/i, /\baccording to reports\b/i,
43
+ /\bwidely (?:regarded|considered|recognized|acknowledged)\b/i,
44
+ /\bit is widely (?:known|believed|accepted)\b/i,
45
+ /\bmany (?:experts|scholars|researchers|analysts) (?:believe|argue|suggest)\b/i,
46
+ /\bstudies (?:show|suggest|indicate|have shown)\b/i,
47
+ /\bresearch (?:shows|suggests|indicates|has shown)\b/i,
48
+ /\bsources close to\b/i, /\bpeople familiar with\b/i,
49
+ ].freeze
50
+
51
+ CHALLENGES_PHRASES = [
52
+ /despite (?:its|these|the|their) (?:challenges|setbacks|obstacles|difficulties|limitations)/i,
53
+ /faces (?:several|many|numerous|various) challenges/i,
54
+ /continues to thrive/i, /continues to grow/i,
55
+ /future (?:outlook|prospects) (?:remain|look|appear)/i,
56
+ /challenges and (?:future|legacy|opportunities)/i,
57
+ /despite these (?:challenges|hurdles|obstacles)/i,
58
+ /overcoming (?:obstacles|challenges|adversity)/i,
59
+ /weather(?:ing|ed) the storm/i,
60
+ ].freeze
61
+
62
+ COPULA_AVOIDANCE = [
63
+ /\bserves as(?: a)?\b/i, /\bstands as(?: a)?\b/i,
64
+ /\bmarks a\b/i, /\brepresents a\b/i,
65
+ /\bboasts (?:a|an|over|more)\b/i, /\bfeatures (?:a|an|over|more)\b/i,
66
+ /\boffers (?:a|an)\b/i, /\bfunctions as\b/i,
67
+ /\bacts as(?: a)?\b/i, /\boperates as(?: a)?\b/i,
68
+ ].freeze
69
+
70
+ # ── Helper Methods ───────────────────────────────────
71
+
72
+ module_function
73
+
74
# Scans +text+ for every entry in +word_list+ and returns all matches found.
#
# Each word is compiled into a whole-word regex via TextUtils.word_regex and
# located with TextUtils.find_matches; the suggestion text quotes the
# offending word and is labelled with +suggestion_prefix+.
#
# @param text [String] the text to scan
# @param word_list [Array] words/phrases to look for
# @param suggestion_prefix [String] label prepended to each suggestion
# @param confidence [String] confidence level attached to every match
# @return [Array<Hash>] match hashes as produced by TextUtils.find_matches
def scan_word_list(text, word_list, suggestion_prefix, confidence: "high")
  word_list.flat_map do |word|
    pattern = TextUtils.word_regex(word)
    hint = "#{suggestion_prefix}: \"#{word}\". Use a simpler, more specific alternative."
    TextUtils.find_matches(text, pattern, hint, confidence: confidence)
  end
end
87
+
88
# Scans +text+ against a list of phrase hashes (as defined in Vocabulary),
# optionally restricted to a single tier.
#
# Each phrase hash supplies :pattern (Regexp), :fix (String) and :tier
# (Integer). A fix beginning with "(" is shown verbatim (it is an
# instruction, e.g. "(remove)"); otherwise it is rendered as
# "Replace with: <fix>". Tier 1 maps to "high" confidence, tier 2 to
# "medium", anything else to "low".
#
# @param text [String] the text to scan
# @param phrases [Array<Hash>] phrase descriptors
# @param tier_filter [Integer, nil] when set, only phrases of that tier run
# @return [Array<Hash>] match hashes as produced by TextUtils.find_matches
def scan_phrases(text, phrases, tier_filter: nil)
  relevant = phrases.reject { |phrase| tier_filter && phrase[:tier] != tier_filter }
  relevant.flat_map do |phrase|
    fix = phrase[:fix]
    advice = fix.start_with?("(") ? fix : "Replace with: #{fix}"
    level =
      if phrase[:tier] == 1
        "high"
      elsif phrase[:tier] == 2
        "medium"
      else
        "low"
      end
    TextUtils.find_matches(text, phrase[:pattern], advice, confidence: level)
  end
end
106
+
107
+ # ── Pattern Definitions ──────────────────────────────
108
+
109
+ PATTERNS = [
110
+ # ── CONTENT PATTERNS (1-6) ────────────────────────
111
+
112
+ Pattern.new(
113
+ id: 1, name: "Significance inflation", category: :content, weight: 4,
114
+ description: "Inflated claims about significance, legacy, or broader trends. LLMs puff up importance of mundane things.",
115
+ detector: ->(text) {
116
+ SIGNIFICANCE_PHRASES.flat_map { |re|
117
+ TextUtils.find_matches(text, re, "Remove inflated significance claim. State concrete facts instead.", confidence: "high")
118
+ }
119
+ }
120
+ ),
121
+
122
+ Pattern.new(
123
+ id: 2, name: "Notability name-dropping", category: :content, weight: 3,
124
+ description: "Listing media outlets or sources to claim notability without providing context or specific claims.",
125
+ detector: ->(text) {
126
+ media_list = /\b(?:cited|featured|covered|mentioned|reported|published|recognized|highlighted) (?:in|by) .{0,20}(?:The New York Times|BBC|CNN|The Washington Post|The Guardian|Wired|Forbes|Reuters|Bloomberg|Financial Times|The Verge|TechCrunch|The Hindu|Al Jazeera|Time|Newsweek|The Economist|Nature|Science).{0,100}(?:,\s*(?:and\s+)?(?:The New York Times|BBC|CNN|The Washington Post|The Guardian|Wired|Forbes|Reuters|Bloomberg|Financial Times|The Verge|TechCrunch|The Hindu|Al Jazeera|Time|Newsweek|The Economist|Nature|Science))+/i
127
+ results = TextUtils.find_matches(text, media_list, "Instead of listing outlets, cite one specific claim from one source.", confidence: "high")
128
+ results.concat TextUtils.find_matches(text, /\bactive social media presence\b/i, "Remove — not meaningful without specific context.", confidence: "high")
129
+ results.concat TextUtils.find_matches(text, /\bwritten by a leading expert\b/i, "Name the expert and their specific credential.", confidence: "medium")
130
+ results.concat TextUtils.find_matches(text, /\bhas been (?:featured|recognized|acknowledged) (?:by|in)\b/i, "Cite the specific feature with a concrete claim.", confidence: "medium")
131
+ results
132
+ }
133
+ ),
134
+
135
+ Pattern.new(
136
+ id: 3, name: "Superficial -ing analyses", category: :content, weight: 4,
137
+ description: 'Tacking "-ing" participial phrases onto sentences to fake depth.',
138
+ detector: ->(text) {
139
+ ing_phrases = /,\s*(?:highlighting|underscoring|emphasizing|ensuring|reflecting|symbolizing|contributing to|cultivating|fostering|encompassing|showcasing|demonstrating|illustrating|representing|signaling|indicating|solidifying|reinforcing|cementing|underscoring|bolstering|reaffirming|illuminating|epitomizing)\b[^.]{5,}/i
140
+ TextUtils.find_matches(text, ing_phrases, "Remove trailing -ing phrase. If the point matters, give it its own sentence with specifics.", confidence: "high")
141
+ }
142
+ ),
143
+
144
+ Pattern.new(
145
+ id: 4, name: "Promotional language", category: :content, weight: 3,
146
+ description: "Ad-copy language that sounds like a tourism brochure or press release.",
147
+ detector: ->(text) {
148
+ PROMOTIONAL_WORDS.flat_map { |re|
149
+ TextUtils.find_matches(text, re, "Replace promotional language with neutral, factual description.", confidence: "high")
150
+ }
151
+ }
152
+ ),
153
+
154
+ Pattern.new(
155
+ id: 5, name: "Vague attributions", category: :content, weight: 4,
156
+ description: "Attributing claims to unnamed experts, industry reports, or vague authorities.",
157
+ detector: ->(text) {
158
+ VAGUE_ATTRIBUTION_PHRASES.flat_map { |re|
159
+ TextUtils.find_matches(text, re, "Name the specific source, study, or person. If you can't, remove the claim.", confidence: "high")
160
+ }
161
+ }
162
+ ),
163
+
164
+ Pattern.new(
165
+ id: 6, name: "Formulaic challenges", category: :content, weight: 3,
166
+ description: 'Boilerplate "Despite challenges... continues to thrive" sections.',
167
+ detector: ->(text) {
168
+ CHALLENGES_PHRASES.flat_map { |re|
169
+ TextUtils.find_matches(text, re, "Replace with specific challenges and concrete outcomes.", confidence: "high")
170
+ }
171
+ }
172
+ ),
173
+
174
+ # ── LANGUAGE PATTERNS (7-12) ──────────────────────
175
+
176
+ Pattern.new(
177
+ id: 7, name: "AI vocabulary", category: :language, weight: 5,
178
+ description: "Words and phrases that appear far more frequently in AI-generated text. 500+ words tracked across 3 tiers.",
179
+ detector: ->(text) {
180
+ results = []
181
+ words = TextUtils.word_count(text)
182
+
183
+ # Tier 1: always flag
184
+ all_tier1 = Vocabulary::TIER_1 + Vocabulary::TIER_1_PHRASES
185
+ results.concat Patterns.scan_word_list(text, all_tier1, "Tier 1 AI word", confidence: "high")
186
+
187
+ # Tier 2: flag if 2+ tier-2 words appear
188
+ tier2_matches = Patterns.scan_word_list(text, Vocabulary::TIER_2, "Tier 2 AI word", confidence: "medium")
189
+ results.concat(tier2_matches) if tier2_matches.length >= 2
190
+
191
+ # Tier 3: flag only at high density (>3% of words are tier-3)
192
+ if words > 50
193
+ tier3_count = Vocabulary::TIER_3.sum { |word|
194
+ TextUtils.count_matches(text, TextUtils.word_regex(word))
195
+ }
196
+ density = tier3_count.to_f / words
197
+ if density > 0.03
198
+ results.concat Patterns.scan_word_list(text, Vocabulary::TIER_3, "Tier 3 AI word (high density)", confidence: "low")
199
+ end
200
+ end
201
+
202
+ # AI phrases (non-remove, non-simple-replacement)
203
+ filtered = Vocabulary::AI_PHRASES.select { |p|
204
+ p[:fix] &&
205
+ !p[:fix].start_with?("(remove") &&
206
+ !%w[to because now if can first finally].include?(p[:fix])
207
+ }
208
+ results.concat Patterns.scan_phrases(text, filtered)
209
+
210
+ results
211
+ }
212
+ ),
213
+
214
+ Pattern.new(
215
+ id: 8, name: "Copula avoidance", category: :language, weight: 3,
216
+ description: 'Using "serves as", "functions as", "boasts" instead of simple "is", "has", "are".',
217
+ detector: ->(text) {
218
+ COPULA_AVOIDANCE.flat_map { |re|
219
+ TextUtils.find_matches(text, re, 'Use simple "is", "are", or "has" instead.', confidence: "high")
220
+ }
221
+ }
222
+ ),
223
+
224
+ Pattern.new(
225
+ id: 9, name: "Negative parallelisms", category: :language, weight: 3,
226
+ description: '"It\'s not just X, it\'s Y" or "Not only X but Y" constructions — overused by LLMs.',
227
+ detector: ->(text) {
228
+ neg_parallel = /\b(?:it'?s|this is) not (?:just|merely|only|simply) .{3,60}(?:,|;|\u2014)\s*(?:it'?s|this is|but)\b/i
229
+ not_only = /\bnot only .{3,60} but (?:also )?\b/i
230
+ results = TextUtils.find_matches(text, neg_parallel, 'Rewrite directly. State what the thing IS, not what it "isn\'t just".', confidence: "high")
231
+ results.concat TextUtils.find_matches(text, not_only, 'Simplify. Remove the "not only...but also" frame.', confidence: "medium")
232
+ results
233
+ }
234
+ ),
235
+
236
+ Pattern.new(
237
+ id: 10, name: "Rule of three", category: :language, weight: 2,
238
+ description: 'Forcing ideas into groups of three. LLMs love triads that sound "comprehensive".',
239
+ detector: ->(text) {
240
+ # Abstract noun triads
241
+ buzzy_triad = /\b(\w+tion|\w+ity|\w+ment|\w+ness|\w+ance|\w+ence),\s+(\w+tion|\w+ity|\w+ment|\w+ness|\w+ance|\w+ence),\s+and\s+(\w+tion|\w+ity|\w+ment|\w+ness|\w+ance|\w+ence)\b/i
242
+ results = TextUtils.find_matches(text, buzzy_triad, "Rule of three with abstract nouns. Pick the one or two that actually matter.", confidence: "medium")
243
+
244
+ buzz_adj = %w[seamless intuitive powerful innovative dynamic robust comprehensive cutting-edge scalable agile efficient effective engaging impactful meaningful transformative sustainable resilient inclusive accessible]
245
+ adj_pattern = buzz_adj.join("|")
246
+ adj_triad = /\b(#{adj_pattern}),\s+(#{adj_pattern}),\s+and\s+(#{adj_pattern})\b/i
247
+ results.concat TextUtils.find_matches(text, adj_triad, "Buzzy adjective triad. Pick one and make it specific.", confidence: "medium")
248
+
249
+ results
250
+ }
251
+ ),
252
+
253
+ Pattern.new(
254
+ id: 11, name: "Synonym cycling", category: :language, weight: 2,
255
+ description: "Referring to the same thing by different names in consecutive sentences to avoid repetition.",
256
+ detector: ->(text) {
257
+ synonym_sets = [
258
+ %w[protagonist main\ character central\ figure hero lead\ character lead],
259
+ %w[company firm organization enterprise corporation establishment entity],
260
+ %w[city metropolis urban\ center municipality locale township],
261
+ %w[building structure edifice facility complex establishment],
262
+ %w[tool instrument mechanism apparatus device utility],
263
+ %w[country nation state republic sovereign\ state],
264
+ %w[problem challenge issue obstacle hurdle difficulty],
265
+ %w[solution approach methodology framework strategy paradigm],
266
+ ]
267
+
268
+ results = []
269
+ sentences = text.split(/[.!?]+/).select { |s| s.strip.length > 0 }
270
+
271
+ synonym_sets.each do |synonyms|
272
+ (0...sentences.length - 1).each do |i|
273
+ found = []
274
+ (i...[i + 4, sentences.length].min).each do |j|
275
+ lower = sentences[j].downcase
276
+ synonyms.each do |syn|
277
+ if lower.include?(syn) && !found.include?(syn)
278
+ found << syn
279
+ end
280
+ end
281
+ end
282
+ if found.length >= 3
283
+ idx = text.index(sentences[i]) || 0
284
+ line_num = text[0...idx].count("\n") + 1
285
+ results << {
286
+ match: "Synonym cycling: #{found.join(' → ')}",
287
+ index: idx,
288
+ line: line_num,
289
+ column: 1,
290
+ suggestion: "Pick one term and stick with it. Found \"#{found.join('", "')}\" used as synonyms in nearby sentences.",
291
+ confidence: "medium",
292
+ }
293
+ break
294
+ end
295
+ end
296
+ end
297
+ results
298
+ }
299
+ ),
300
+
301
+ Pattern.new(
302
+ id: 12, name: "False ranges", category: :language, weight: 2,
303
+ description: '"From X to Y" where X and Y aren\'t on a meaningful scale.',
304
+ detector: ->(text) {
305
+ double_range = /\bfrom .{3,40} to .{3,40},\s*from .{3,40} to .{3,40}/i
306
+ results = TextUtils.find_matches(text, double_range, "False range — X and Y probably aren't on a meaningful scale. Just list the topics.", confidence: "high")
307
+
308
+ abstract_range = /\bfrom (?:the )?(?:dawn|birth|inception|beginning|advent|emergence|rise|earliest) .{3,60} to (?:the )?(?:modern|current|present|contemporary|latest|cutting-edge|digital|future)/i
309
+ results.concat TextUtils.find_matches(text, abstract_range, "Unnecessarily broad range. Be specific about what you're actually covering.", confidence: "medium")
310
+
311
+ results
312
+ }
313
+ ),
314
+
315
+ # ── STYLE PATTERNS (13-18) ────────────────────────
316
+
317
+ Pattern.new(
318
+ id: 13, name: "Em dash overuse", category: :style, weight: 2,
319
+ description: "LLMs overuse em dashes (\u2014) as a crutch for punchy writing.",
320
+ detector: ->(text) {
321
+ em_dashes = text.scan(/\u2014/).length
322
+ words = TextUtils.word_count(text)
323
+ ratio = words > 0 ? em_dashes.to_f / (words / 100.0) : 0
324
+
325
+ if ratio > 1.0 && em_dashes >= 2
326
+ TextUtils.find_matches(text, /\u2014/, "High em dash density (#{em_dashes} in #{words} words). Replace most with commas, periods, or parentheses.", confidence: "medium")
327
+ else
328
+ []
329
+ end
330
+ }
331
+ ),
332
+
333
+ Pattern.new(
334
+ id: 14, name: "Boldface overuse", category: :style, weight: 2,
335
+ description: "Mechanical emphasis of phrases in bold. AI uses **bold** as a highlighting crutch.",
336
+ detector: ->(text) {
337
+ bold_matches = text.scan(/\*\*[^*]+\*\*/)
338
+ if bold_matches.length >= 3
339
+ TextUtils.find_matches(text, /\*\*[^*]+\*\*/, "Excessive boldface. Remove emphasis — let the writing carry the weight.", confidence: "medium")
340
+ else
341
+ []
342
+ end
343
+ }
344
+ ),
345
+
346
+ Pattern.new(
347
+ id: 15, name: "Inline-header lists", category: :style, weight: 3,
348
+ description: "Lists where each item starts with a bolded header followed by a colon.",
349
+ detector: ->(text) {
350
+ inline_headers = /^[*\-]\s+\*\*[^*]+:\*\*\s/m
351
+ matches = text.scan(inline_headers)
352
+ if matches.length >= 2
353
+ TextUtils.find_matches(text, inline_headers, "Inline-header list pattern. Convert to a paragraph or use a simpler list.", confidence: "high")
354
+ else
355
+ []
356
+ end
357
+ }
358
+ ),
359
+
360
+ Pattern.new(
361
+ id: 16, name: "Title Case headings", category: :style, weight: 1,
362
+ description: "Capitalizing Every Main Word In Headings. AI chatbots default to this.",
363
+ detector: ->(text) {
364
+ results = []
365
+ skip_words = /^(?:I|AI|API|CLI|URL|HTML|CSS|JS|TS|NPM|NYC|USA|UK|EU|LLM|GPT|SaaS|IoT|CEO|CTO|VP|PR|HR|IT|UI|UX)\b/
366
+
367
+ text.scan(/^(\#{1,6}\s+(.+))$/m) do
368
+ m = Regexp.last_match
369
+ heading = m[2].strip
370
+ words = heading.split(/\s+/)
371
+ if words.length >= 3
372
+ capitalized_count = words.count { |w| w =~ /^[A-Z]/ && w !~ skip_words }
373
+ if capitalized_count.to_f / words.length > 0.7
374
+ idx = text.index(m[0]) || 0
375
+ line_num = text[0...idx].count("\n") + 1
376
+ results << {
377
+ match: m[0],
378
+ index: idx,
379
+ line: line_num,
380
+ column: 1,
381
+ suggestion: "Use sentence case for headings (only capitalize first word and proper nouns).",
382
+ confidence: "medium",
383
+ }
384
+ end
385
+ end
386
+ end
387
+ results
388
+ }
389
+ ),
390
+
391
+ Pattern.new(
392
+ id: 17, name: "Emoji overuse", category: :style, weight: 2,
393
+ description: "Decorating headings or bullet points with emojis in professional/technical text.",
394
+ detector: ->(text) {
395
+ emoji_count = TextUtils.count_matches(text, /[\u{1F300}-\u{1F9FF}\u{2600}-\u{27BF}]/u)
396
+ if emoji_count >= 3
397
+ TextUtils.find_matches(text, /[\u{1F300}-\u{1F9FF}\u{2600}-\u{27BF}\u{2300}-\u{23FF}\u{2B50}]/u, "Remove emoji decoration from professional text.", confidence: "high")
398
+ else
399
+ []
400
+ end
401
+ }
402
+ ),
403
+
404
+ Pattern.new(
405
+ id: 18, name: "Curly quotes", category: :style, weight: 1,
406
+ description: "ChatGPT uses Unicode curly quotes (\u201C\u201D\u2018\u2019) instead of straight quotes.",
407
+ detector: ->(text) {
408
+ TextUtils.find_matches(text, /[\u201C\u201D\u2018\u2019]/, "Replace curly quotes with straight quotes.", confidence: "high")
409
+ }
410
+ ),
411
+
412
+ # ── COMMUNICATION PATTERNS (19-21) ─────────────────
413
+
414
+ Pattern.new(
415
+ id: 19, name: "Chatbot artifacts", category: :communication, weight: 5,
416
+ description: 'Leftover chatbot phrases: "I hope this helps!", "Let me know if...", "Here is an overview".',
417
+ detector: ->(text) {
418
+ filtered = Vocabulary::AI_PHRASES.select { |p|
419
+ p[:fix] == "(remove)" || p[:fix] == "(remove — start with the content)"
420
+ }
421
+ Patterns.scan_phrases(text, filtered)
422
+ }
423
+ ),
424
+
425
+ Pattern.new(
426
+ id: 20, name: "Cutoff disclaimers", category: :communication, weight: 4,
427
+ description: "AI knowledge-cutoff disclaimers left in text.",
428
+ detector: ->(text) {
429
+ filtered = Vocabulary::AI_PHRASES.select { |p|
430
+ p[:fix] == "(remove)" &&
431
+ (p[:pattern].source.include?("training") ||
432
+ p[:pattern].source.include?("details are") ||
433
+ p[:pattern].source.include?("available"))
434
+ }
435
+ Patterns.scan_phrases(text, filtered)
436
+ }
437
+ ),
438
+
439
+ Pattern.new(
440
+ id: 21, name: "Sycophantic tone", category: :communication, weight: 4,
441
+ description: 'Overly positive, people-pleasing language: "Great question!", "You\'re absolutely right!".',
442
+ detector: ->(text) {
443
+ filtered = Vocabulary::AI_PHRASES.select { |p|
444
+ p[:fix] &&
445
+ (p[:fix].include?("(remove)") || p[:fix].include?("address the substance")) &&
446
+ (p[:pattern].source.include?("question") ||
447
+ p[:pattern].source.include?("point") ||
448
+ p[:pattern].source.include?("right") ||
449
+ p[:pattern].source.include?("observation"))
450
+ }
451
+ Patterns.scan_phrases(text, filtered)
452
+ }
453
+ ),
454
+
455
+ # ── FILLER & HEDGING (22-24) ──────────────────────
456
+
457
+ Pattern.new(
458
+ id: 22, name: "Filler phrases", category: :filler, weight: 3,
459
+ description: 'Wordy filler that can be shortened: "in order to" → "to", "due to the fact that" → "because".',
460
+ detector: ->(text) {
461
+ simple_fixes = %w[to because now if can first finally].push("to / for", "for / regarding", "because / since")
462
+ filtered = Vocabulary::AI_PHRASES.select { |p|
463
+ p[:fix] && !p[:fix].start_with?("(") && simple_fixes.include?(p[:fix])
464
+ }
465
+ Patterns.scan_phrases(text, filtered)
466
+ }
467
+ ),
468
+
469
+ Pattern.new(
470
+ id: 23, name: "Excessive hedging", category: :filler, weight: 3,
471
+ description: 'Stacking qualifiers: "could potentially possibly", "might arguably perhaps".',
472
+ detector: ->(text) {
473
+ filtered = Vocabulary::AI_PHRASES.select { |p|
474
+ p[:fix] &&
475
+ (p[:fix].include?("could") || p[:fix].include?("might") ||
476
+ p[:fix].include?("may") || p[:fix].include?("perhaps") ||
477
+ p[:fix].include?("maybe"))
478
+ }
479
+ Patterns.scan_phrases(text, filtered)
480
+ }
481
+ ),
482
+
483
+ Pattern.new(
484
+ id: 24, name: "Generic conclusions", category: :filler, weight: 3,
485
+ description: 'Vague upbeat endings: "The future looks bright", "Exciting times lie ahead".',
486
+ detector: ->(text) {
487
+ filtered = Vocabulary::AI_PHRASES.select { |p|
488
+ p[:fix] &&
489
+ (p[:fix].include?("specific fact") || p[:fix].include?("concrete") ||
490
+ p[:fix].include?("cite evidence") || p[:fix].include?("what you do know") ||
491
+ p[:fix].include?("what happens next"))
492
+ }
493
+ Patterns.scan_phrases(text, filtered)
494
+ }
495
+ ),
496
+
497
+ # ── NEW PATTERNS (v2.2) ───────────────────────────
498
+
499
+ Pattern.new(
500
+ id: 25, name: "Reasoning chain artifacts", category: :communication, weight: 4,
501
+ description: 'Exposed chain-of-thought reasoning: "Let me think...", "Step 1:", "Breaking this down..."',
502
+ detector: ->(text) {
503
+ reasoning_patterns = [
504
+ /\blet me think(?: about this| through this| step by step)?\b/i,
505
+ /\blet's (?:think|reason|work) (?:about|through|this out)\b/i,
506
+ /\bbreaking (?:this|it) down\b/i,
507
+ /\bto approach this (?:systematically|methodically|logically)\b/i,
508
+ /\reasoning through (?:this|the problem|it)\b/i,
509
+ /\bworking through the logic\b/i,
510
+ /\bstep (?:[1-9]|one|two|three|four|five):/i,
511
+ /\bfirst,? let'?s consider\b/i,
512
+ /\bthinking about this (?:carefully|logically|systematically)\b/i,
513
+ /\bhere'?s my (?:thought process|reasoning|thinking)\b/i,
514
+ ]
515
+ reasoning_patterns.flat_map { |re|
516
+ TextUtils.find_matches(text, re, "Hide reasoning or make it natural: \"Here's my take:\" instead of \"Let me think step by step:\"")
517
+ }
518
+ }
519
+ ),
520
+
521
+ Pattern.new(
522
+ id: 26, name: "Excessive structure", category: :style, weight: 3,
523
+ description: "Over-formatted responses: too many headers, nested bullets, or numbered lists for simple content.",
524
+ detector: ->(text) {
525
+ results = []
526
+ words = TextUtils.word_count(text)
527
+
528
+ headers = (text.scan(/^\#{1,6}\s+.+$/m) || []).length
529
+ bullets = (text.scan(/^\s*[-*+]\s+/m) || []).length
530
+ numbered = (text.scan(/^\s*\d+\.\s+/m) || []).length
531
+
532
+ if words < 300 && headers >= 3
533
+ results << {
534
+ match: "#{headers} headers in #{words} words",
535
+ index: 0, line: 1, column: 1,
536
+ suggestion: "Too many headers for short content. Use prose instead.",
537
+ confidence: "medium",
538
+ }
539
+ end
540
+
541
+ if words < 200 && (bullets + numbered) >= 8
542
+ results << {
543
+ match: "#{bullets + numbered} list items in #{words} words",
544
+ index: 0, line: 1, column: 1,
545
+ suggestion: "Excessive lists. Could this be a paragraph instead?",
546
+ confidence: "medium",
547
+ }
548
+ end
549
+
550
+ structure_headers = /^#+\s*(?:overview|key (?:points|takeaways)|summary|conclusion|introduction|background)\s*:?\s*$/im
551
+ results.concat TextUtils.find_matches(text, structure_headers, "Formulaic structure. Let content flow naturally.", confidence: "medium")
552
+
553
+ results
554
+ }
555
+ ),
556
+
557
+ Pattern.new(
558
+ id: 27, name: "Confidence calibration", category: :communication, weight: 3,
559
+ description: 'Artificially hedged or over-confident phrasing: "I\'m confident that...", "It\'s worth noting..."',
560
+ detector: ->(text) {
561
+ calibration_patterns = [
562
+ { regex: /\bI'?m confident (?:that|in)\b/i, fix: "State the fact without prefacing confidence" },
563
+ { regex: /\bit'?s worth (?:noting|mentioning|pointing out) that\b/i, fix: "Just say it" },
564
+ { regex: /\binterestingly (?:enough)?,?\b/i, fix: "Let reader decide if interesting" },
565
+ { regex: /\bsurprisingly,?\s/i, fix: "State the fact; surprise is implied" },
566
+ { regex: /\bimportantly,?\s/i, fix: "Let reader judge importance" },
567
+ { regex: /\bsignificantly,?\s/i, fix: "Be specific about the significance" },
568
+ { regex: /\bnotably,?\s/i, fix: "Just state the notable thing" },
569
+ { regex: /\bcertainly,?\s/i, fix: "Remove or state with evidence" },
570
+ { regex: /\bundoubtedly,?\s/i, fix: "Remove or cite evidence" },
571
+ { regex: /\bwithout (?:a )?doubt,?\s/i, fix: "Remove or cite evidence" },
572
+ ]
573
+ calibration_patterns.flat_map { |p|
574
+ TextUtils.find_matches(text, p[:regex], p[:fix])
575
+ }
576
+ }
577
+ ),
578
+
579
+ Pattern.new(
580
+ id: 28, name: "Acknowledgment loops", category: :communication, weight: 4,
581
+ description: 'Restating the question before answering: "You\'re asking about X. X is..."',
582
+ detector: ->(text) {
583
+ ack_patterns = [
584
+ /\byou'?re asking (?:about|whether|if|how|why|what)\b/i,
585
+ /\bthe question of (?:whether|how|why|what)\b/i,
586
+ /\bwhen it comes to your question\b/i,
587
+ /\bin (?:terms of|response to|answer to) your question\b/i,
588
+ /\bto (?:answer|address) your question\b/i,
589
+ /\byour question (?:about|regarding|concerning)\b/i,
590
+ /\bthat'?s a (?:great|good|interesting) question\. (?:the|it|so)\b/i,
591
+ /\bI understand you'?re (?:asking|wondering|curious)\b/i,
592
+ ]
593
+ ack_patterns.flat_map { |re|
594
+ TextUtils.find_matches(text, re, "Just answer. Don't restate the question.")
595
+ }
596
+ }
597
+ ),
598
+ ].freeze
599
+
600
+ # ── Registry ─────────────────────────────────────────
601
+
602
# Holds the active set of patterns and provides lookup, listing, and
# mutation helpers. Each Registry starts with its own copy of the built-in
# PATTERNS, so adding/removing patterns never affects other instances.
class Registry
  def initialize
    # Shallow copy: the Pattern structs themselves are shared, the array
    # membership is per-registry.
    @patterns = PATTERNS.dup
  end

  # @return [Array<Pattern>] every registered pattern
  def all
    @patterns
  end

  # Looks up a single pattern by numeric id.
  # @return [Pattern, nil]
  def find(id)
    @patterns.find { |pattern| pattern.id == id }
  end

  # All patterns belonging to the given category symbol.
  # @return [Array<Pattern>]
  def by_category(category)
    @patterns.select { |pattern| pattern.category == category }
  end

  # Distinct category symbols currently registered, in first-seen order.
  # @return [Array<Symbol>]
  def categories
    @patterns.map(&:category).uniq
  end

  # Lightweight summary of each pattern (no description/detector).
  # @return [Array<Hash>]
  def list
    @patterns.map do |pattern|
      { id: pattern.id, name: pattern.name, category: pattern.category, weight: pattern.weight }
    end
  end

  # Registers a custom pattern.
  # @raise [ArgumentError] when id, name, or detector is missing
  def add(pattern)
    complete = pattern.id && pattern.name && pattern.detector
    raise ArgumentError, "Pattern must have id, name, and detector" unless complete

    @patterns << pattern
  end

  # Removes the pattern with the given id, if present.
  # @return [Array<Pattern>, nil] the array, or nil when nothing matched
  def remove(id)
    @patterns.reject! { |pattern| pattern.id == id }
  end
end
636
+ end
637
+ end