humanizer-rb 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +12 -0
- data/LICENSE +21 -0
- data/README.md +133 -0
- data/bin/humanizer +388 -0
- data/lib/humanizer/analyzer.rb +319 -0
- data/lib/humanizer/humanizer_engine.rb +192 -0
- data/lib/humanizer/patterns.rb +637 -0
- data/lib/humanizer/stats.rb +198 -0
- data/lib/humanizer/text_utils.rb +53 -0
- data/lib/humanizer/version.rb +5 -0
- data/lib/humanizer/vocabulary.rb +260 -0
- data/lib/humanizer.rb +33 -0
- metadata +61 -0
|
@@ -0,0 +1,637 @@
|
|
|
1
|
+
# frozen_string_literal: true

module Humanizer
  # Detection rules and shared phrase lists used to spot hallmarks of
  # AI-generated prose. Each detector returns an array of match hashes
  # built by TextUtils.find_matches.
  module Patterns
    # A single detection rule.
    #   id          - stable Integer identifier (used by Registry#find / #remove)
    #   name        - short human-readable label
    #   category    - Symbol grouping (:content, :language, :style, :communication, :filler)
    #   description - what the rule flags and why
    #   weight      - Integer severity used for scoring
    #   detector    - lambda taking the full text, returning an Array of match hashes
    Pattern = Struct.new(:id, :name, :category, :description, :weight, :detector, keyword_init: true)

    # ── Phrase Lists ─────────────────────────────────────

    # Phrases that inflate the importance of mundane subjects (pattern 1).
    SIGNIFICANCE_PHRASES = [
      /marking a pivotal/i, /pivotal moment/i, /pivotal role/i,
      /key role/i, /crucial role/i, /vital role/i, /significant role/i,
      /is a testament/i, /stands as a testament/i, /serves as a testament/i,
      /serves as a reminder/i, /reflects broader/i, /broader trends/i,
      /broader movement/i, /evolving landscape/i, /evolving world/i,
      /setting the stage for/i, /marking a shift/i, /key turning point/i,
      /indelible mark/i, /deeply rooted/i, /focal point/i,
      /symbolizing its ongoing/i, /enduring legacy/i, /lasting impact/i,
      /contributing to the/i, /underscores the importance/i,
      /highlights the significance/i, /represents a shift/i,
      /shaping the future/i, /the evolution of/i, /rich tapestry/i,
      /rich heritage/i, /stands as a beacon/i, /marks a milestone/i,
      /paving the way/i, /charting a course/i,
    ].freeze

    # Tourism-brochure / press-release vocabulary (pattern 4).
    PROMOTIONAL_WORDS = [
      /\bnestled\b/i, /\bin the heart of\b/i, /\bbreathtaking\b/i,
      /\bmust-visit\b/i, /\bstunning\b/i, /\brenowned\b/i,
      /\bnatural beauty\b/i, /\brich cultural heritage\b/i,
      /\brich history\b/i, /\bcommitment to\b/i, /\bexemplifies\b/i,
      /\bworld-class\b/i, /\bstate-of-the-art\b/i, /\bgame-changing\b/i,
      /\bgame changer\b/i, /\bunparalleled\b/i, /\bprofound\b/i,
      /\bbest-in-class\b/i, /\btrailblazing\b/i, /\bvisionary\b/i,
      /\bcutting-edge\b/i, /\bworldwide recognition\b/i,
    ].freeze

    # Claims credited to unnamed experts, reports, or vague authorities (pattern 5).
    VAGUE_ATTRIBUTION_PHRASES = [
      /\bexperts (?:believe|argue|say|suggest|note|agree|contend|have noted)\b/i,
      /\bindustry (?:reports|observers|experts|analysts|leaders|insiders)\b/i,
      /\bobservers have (?:cited|noted|pointed out)\b/i,
      /\bsome critics argue\b/i,
      /\bsome experts (?:say|believe|suggest)\b/i,
      /\bseveral sources\b/i, /\baccording to reports\b/i,
      /\bwidely (?:regarded|considered|recognized|acknowledged)\b/i,
      /\bit is widely (?:known|believed|accepted)\b/i,
      /\bmany (?:experts|scholars|researchers|analysts) (?:believe|argue|suggest)\b/i,
      /\bstudies (?:show|suggest|indicate|have shown)\b/i,
      /\bresearch (?:shows|suggests|indicates|has shown)\b/i,
      /\bsources close to\b/i, /\bpeople familiar with\b/i,
    ].freeze

    # Boilerplate "despite challenges ... continues to thrive" framing (pattern 6).
    CHALLENGES_PHRASES = [
      /despite (?:its|these|the|their) (?:challenges|setbacks|obstacles|difficulties|limitations)/i,
      /faces (?:several|many|numerous|various) challenges/i,
      /continues to thrive/i, /continues to grow/i,
      /future (?:outlook|prospects) (?:remain|look|appear)/i,
      /challenges and (?:future|legacy|opportunities)/i,
      /despite these (?:challenges|hurdles|obstacles)/i,
      /overcoming (?:obstacles|challenges|adversity)/i,
      /weather(?:ing|ed) the storm/i,
    ].freeze

    # Verb phrases used to dodge a plain "is"/"has"/"are" (pattern 8).
    COPULA_AVOIDANCE = [
      /\bserves as(?: a)?\b/i, /\bstands as(?: a)?\b/i,
      /\bmarks a\b/i, /\brepresents a\b/i,
      /\bboasts (?:a|an|over|more)\b/i, /\bfeatures (?:a|an|over|more)\b/i,
      /\boffers (?:a|an)\b/i, /\bfunctions as\b/i,
      /\bacts as(?: a)?\b/i, /\boperates as(?: a)?\b/i,
    ].freeze

    # ── Helper Methods ───────────────────────────────────

    # The helpers below are module functions: callable as
    # Patterns.scan_word_list / Patterns.scan_phrases, and also available
    # as private instance methods when this module is mixed in.
    module_function
|
|
73
|
+
|
|
74
|
+
def scan_word_list(text, word_list, suggestion_prefix, confidence: "high")
|
|
75
|
+
results = []
|
|
76
|
+
word_list.each do |word|
|
|
77
|
+
regex = TextUtils.word_regex(word)
|
|
78
|
+
matches = TextUtils.find_matches(
|
|
79
|
+
text, regex,
|
|
80
|
+
"#{suggestion_prefix}: \"#{word}\". Use a simpler, more specific alternative.",
|
|
81
|
+
confidence: confidence
|
|
82
|
+
)
|
|
83
|
+
results.concat(matches)
|
|
84
|
+
end
|
|
85
|
+
results
|
|
86
|
+
end
|
|
87
|
+
|
|
88
|
+
def scan_phrases(text, phrases, tier_filter: nil)
|
|
89
|
+
results = []
|
|
90
|
+
phrases.each do |phrase|
|
|
91
|
+
next if tier_filter && phrase[:tier] != tier_filter
|
|
92
|
+
|
|
93
|
+
fix = phrase[:fix]
|
|
94
|
+
suggestion = fix.start_with?("(") ? fix : "Replace with: #{fix}"
|
|
95
|
+
confidence = case phrase[:tier]
|
|
96
|
+
when 1 then "high"
|
|
97
|
+
when 2 then "medium"
|
|
98
|
+
else "low"
|
|
99
|
+
end
|
|
100
|
+
|
|
101
|
+
matches = TextUtils.find_matches(text, phrase[:pattern], suggestion, confidence: confidence)
|
|
102
|
+
results.concat(matches)
|
|
103
|
+
end
|
|
104
|
+
results
|
|
105
|
+
end
|
|
106
|
+
|
|
107
|
+
    # ── Pattern Definitions ──────────────────────────────

    # All built-in detection rules, grouped by category. Each :detector
    # lambda receives the full text and returns an Array of match hashes.
    PATTERNS = [
      # ── CONTENT PATTERNS (1-6) ────────────────────────

      Pattern.new(
        id: 1, name: "Significance inflation", category: :content, weight: 4,
        description: "Inflated claims about significance, legacy, or broader trends. LLMs puff up importance of mundane things.",
        detector: ->(text) {
          # Flag every occurrence of any phrase in SIGNIFICANCE_PHRASES.
          SIGNIFICANCE_PHRASES.flat_map { |re|
            TextUtils.find_matches(text, re, "Remove inflated significance claim. State concrete facts instead.", confidence: "high")
          }
        }
      ),

      Pattern.new(
        id: 2, name: "Notability name-dropping", category: :content, weight: 3,
        description: "Listing media outlets or sources to claim notability without providing context or specific claims.",
        detector: ->(text) {
          # Requires at least two outlet names chained after a "featured in/by"
          # style verb, so a single legitimate citation is not flagged.
          media_list = /\b(?:cited|featured|covered|mentioned|reported|published|recognized|highlighted) (?:in|by) .{0,20}(?:The New York Times|BBC|CNN|The Washington Post|The Guardian|Wired|Forbes|Reuters|Bloomberg|Financial Times|The Verge|TechCrunch|The Hindu|Al Jazeera|Time|Newsweek|The Economist|Nature|Science).{0,100}(?:,\s*(?:and\s+)?(?:The New York Times|BBC|CNN|The Washington Post|The Guardian|Wired|Forbes|Reuters|Bloomberg|Financial Times|The Verge|TechCrunch|The Hindu|Al Jazeera|Time|Newsweek|The Economist|Nature|Science))+/i
          results = TextUtils.find_matches(text, media_list, "Instead of listing outlets, cite one specific claim from one source.", confidence: "high")
          # Additional stock notability claims, each with tailored advice.
          results.concat TextUtils.find_matches(text, /\bactive social media presence\b/i, "Remove — not meaningful without specific context.", confidence: "high")
          results.concat TextUtils.find_matches(text, /\bwritten by a leading expert\b/i, "Name the expert and their specific credential.", confidence: "medium")
          results.concat TextUtils.find_matches(text, /\bhas been (?:featured|recognized|acknowledged) (?:by|in)\b/i, "Cite the specific feature with a concrete claim.", confidence: "medium")
          results
        }
      ),
|
|
134
|
+
|
|
135
|
+
Pattern.new(
|
|
136
|
+
id: 3, name: "Superficial -ing analyses", category: :content, weight: 4,
|
|
137
|
+
description: 'Tacking "-ing" participial phrases onto sentences to fake depth.',
|
|
138
|
+
detector: ->(text) {
|
|
139
|
+
ing_phrases = /,\s*(?:highlighting|underscoring|emphasizing|ensuring|reflecting|symbolizing|contributing to|cultivating|fostering|encompassing|showcasing|demonstrating|illustrating|representing|signaling|indicating|solidifying|reinforcing|cementing|underscoring|bolstering|reaffirming|illuminating|epitomizing)\b[^.]{5,}/i
|
|
140
|
+
TextUtils.find_matches(text, ing_phrases, "Remove trailing -ing phrase. If the point matters, give it its own sentence with specifics.", confidence: "high")
|
|
141
|
+
}
|
|
142
|
+
),
|
|
143
|
+
|
|
144
|
+
      Pattern.new(
        id: 4, name: "Promotional language", category: :content, weight: 3,
        description: "Ad-copy language that sounds like a tourism brochure or press release.",
        detector: ->(text) {
          # Flag every occurrence of any phrase in PROMOTIONAL_WORDS.
          PROMOTIONAL_WORDS.flat_map { |re|
            TextUtils.find_matches(text, re, "Replace promotional language with neutral, factual description.", confidence: "high")
          }
        }
      ),

      Pattern.new(
        id: 5, name: "Vague attributions", category: :content, weight: 4,
        description: "Attributing claims to unnamed experts, industry reports, or vague authorities.",
        detector: ->(text) {
          # Flag every occurrence of any phrase in VAGUE_ATTRIBUTION_PHRASES.
          VAGUE_ATTRIBUTION_PHRASES.flat_map { |re|
            TextUtils.find_matches(text, re, "Name the specific source, study, or person. If you can't, remove the claim.", confidence: "high")
          }
        }
      ),

      Pattern.new(
        id: 6, name: "Formulaic challenges", category: :content, weight: 3,
        description: 'Boilerplate "Despite challenges... continues to thrive" sections.',
        detector: ->(text) {
          # Flag every occurrence of any phrase in CHALLENGES_PHRASES.
          CHALLENGES_PHRASES.flat_map { |re|
            TextUtils.find_matches(text, re, "Replace with specific challenges and concrete outcomes.", confidence: "high")
          }
        }
      ),
|
|
173
|
+
|
|
174
|
+
      # ── LANGUAGE PATTERNS (7-12) ──────────────────────

      Pattern.new(
        id: 7, name: "AI vocabulary", category: :language, weight: 5,
        description: "Words and phrases that appear far more frequently in AI-generated text. 500+ words tracked across 3 tiers.",
        detector: ->(text) {
          results = []
          words = TextUtils.word_count(text)

          # Tier 1: always flag, single words and multi-word phrases alike.
          all_tier1 = Vocabulary::TIER_1 + Vocabulary::TIER_1_PHRASES
          results.concat Patterns.scan_word_list(text, all_tier1, "Tier 1 AI word", confidence: "high")

          # Tier 2: only flag when 2+ tier-2 matches co-occur, to avoid
          # penalizing a single incidental use.
          tier2_matches = Patterns.scan_word_list(text, Vocabulary::TIER_2, "Tier 2 AI word", confidence: "medium")
          results.concat(tier2_matches) if tier2_matches.length >= 2

          # Tier 3: weakest signal — flag only at high density (>3% of words
          # are tier-3), and only in texts long enough for density to be
          # meaningful (>50 words).
          if words > 50
            tier3_count = Vocabulary::TIER_3.sum { |word|
              TextUtils.count_matches(text, TextUtils.word_regex(word))
            }
            density = tier3_count.to_f / words
            if density > 0.03
              results.concat Patterns.scan_word_list(text, Vocabulary::TIER_3, "Tier 3 AI word (high density)", confidence: "low")
            end
          end

          # AI phrases with substantive rewrites. Entries whose fix is outright
          # removal or a trivial one-word swap are handled by patterns 19 and 22.
          filtered = Vocabulary::AI_PHRASES.select { |p|
            p[:fix] &&
            !p[:fix].start_with?("(remove") &&
            !%w[to because now if can first finally].include?(p[:fix])
          }
          results.concat Patterns.scan_phrases(text, filtered)

          results
        }
      ),
|
|
213
|
+
|
|
214
|
+
      Pattern.new(
        id: 8, name: "Copula avoidance", category: :language, weight: 3,
        description: 'Using "serves as", "functions as", "boasts" instead of simple "is", "has", "are".',
        detector: ->(text) {
          # Flag every occurrence of any phrase in COPULA_AVOIDANCE.
          COPULA_AVOIDANCE.flat_map { |re|
            TextUtils.find_matches(text, re, 'Use simple "is", "are", or "has" instead.', confidence: "high")
          }
        }
      ),

      Pattern.new(
        id: 9, name: "Negative parallelisms", category: :language, weight: 3,
        description: '"It\'s not just X, it\'s Y" or "Not only X but Y" constructions — overused by LLMs.',
        detector: ->(text) {
          # "\u2014" lets an em dash separate the two halves of the frame.
          neg_parallel = /\b(?:it'?s|this is) not (?:just|merely|only|simply) .{3,60}(?:,|;|\u2014)\s*(?:it'?s|this is|but)\b/i
          not_only = /\bnot only .{3,60} but (?:also )?\b/i
          results = TextUtils.find_matches(text, neg_parallel, 'Rewrite directly. State what the thing IS, not what it "isn\'t just".', confidence: "high")
          results.concat TextUtils.find_matches(text, not_only, 'Simplify. Remove the "not only...but also" frame.', confidence: "medium")
          results
        }
      ),
|
|
235
|
+
|
|
236
|
+
      Pattern.new(
        id: 10, name: "Rule of three", category: :language, weight: 2,
        description: 'Forcing ideas into groups of three. LLMs love triads that sound "comprehensive".',
        detector: ->(text) {
          # Abstract noun triads: "X, Y, and Z" where all three end in a
          # nominalizing suffix (-tion, -ity, -ment, -ness, -ance, -ence).
          buzzy_triad = /\b(\w+tion|\w+ity|\w+ment|\w+ness|\w+ance|\w+ence),\s+(\w+tion|\w+ity|\w+ment|\w+ness|\w+ance|\w+ence),\s+and\s+(\w+tion|\w+ity|\w+ment|\w+ness|\w+ance|\w+ence)\b/i
          results = TextUtils.find_matches(text, buzzy_triad, "Rule of three with abstract nouns. Pick the one or two that actually matter.", confidence: "medium")

          # Triads built from marketing adjectives.
          buzz_adj = %w[seamless intuitive powerful innovative dynamic robust comprehensive cutting-edge scalable agile efficient effective engaging impactful meaningful transformative sustainable resilient inclusive accessible]
          adj_pattern = buzz_adj.join("|")
          adj_triad = /\b(#{adj_pattern}),\s+(#{adj_pattern}),\s+and\s+(#{adj_pattern})\b/i
          results.concat TextUtils.find_matches(text, adj_triad, "Buzzy adjective triad. Pick one and make it specific.", confidence: "medium")

          results
        }
      ),
|
|
252
|
+
|
|
253
|
+
      Pattern.new(
        id: 11, name: "Synonym cycling", category: :language, weight: 2,
        description: "Referring to the same thing by different names in consecutive sentences to avoid repetition.",
        detector: ->(text) {
          # Clusters of interchangeable terms; "\ " keeps multi-word entries
          # as single %w elements.
          synonym_sets = [
            %w[protagonist main\ character central\ figure hero lead\ character lead],
            %w[company firm organization enterprise corporation establishment entity],
            %w[city metropolis urban\ center municipality locale township],
            %w[building structure edifice facility complex establishment],
            %w[tool instrument mechanism apparatus device utility],
            %w[country nation state republic sovereign\ state],
            %w[problem challenge issue obstacle hurdle difficulty],
            %w[solution approach methodology framework strategy paradigm],
          ]

          results = []
          # Crude sentence split on terminal punctuation; empty fragments dropped.
          sentences = text.split(/[.!?]+/).select { |s| s.strip.length > 0 }

          synonym_sets.each do |synonyms|
            (0...sentences.length - 1).each do |i|
              found = []
              # Sliding window of up to four sentences starting at i; collect
              # each distinct synonym seen (case-insensitive substring match).
              (i...[i + 4, sentences.length].min).each do |j|
                lower = sentences[j].downcase
                synonyms.each do |syn|
                  if lower.include?(syn) && !found.include?(syn)
                    found << syn
                  end
                end
              end
              # Three or more distinct synonyms in one window counts as cycling.
              if found.length >= 3
                # Locate the window's first sentence for line reporting; fall
                # back to offset 0 if splitting altered the text.
                idx = text.index(sentences[i]) || 0
                line_num = text[0...idx].count("\n") + 1
                results << {
                  match: "Synonym cycling: #{found.join(' → ')}",
                  index: idx,
                  line: line_num,
                  column: 1,
                  suggestion: "Pick one term and stick with it. Found \"#{found.join('", "')}\" used as synonyms in nearby sentences.",
                  confidence: "medium",
                }
                # Report each synonym set at most once per text.
                break
              end
            end
          end
          results
        }
      ),
|
|
300
|
+
|
|
301
|
+
      Pattern.new(
        id: 12, name: "False ranges", category: :language, weight: 2,
        description: '"From X to Y" where X and Y aren\'t on a meaningful scale.',
        detector: ->(text) {
          # Two "from ... to ..." spans chained with a comma.
          double_range = /\bfrom .{3,40} to .{3,40},\s*from .{3,40} to .{3,40}/i
          results = TextUtils.find_matches(text, double_range, "False range — X and Y probably aren't on a meaningful scale. Just list the topics.", confidence: "high")

          # "From the dawn of X to the modern Y" style historical sweep.
          abstract_range = /\bfrom (?:the )?(?:dawn|birth|inception|beginning|advent|emergence|rise|earliest) .{3,60} to (?:the )?(?:modern|current|present|contemporary|latest|cutting-edge|digital|future)/i
          results.concat TextUtils.find_matches(text, abstract_range, "Unnecessarily broad range. Be specific about what you're actually covering.", confidence: "medium")

          results
        }
      ),
|
|
314
|
+
|
|
315
|
+
      # ── STYLE PATTERNS (13-18) ────────────────────────

      Pattern.new(
        id: 13, name: "Em dash overuse", category: :style, weight: 2,
        description: "LLMs overuse em dashes (\u2014) as a crutch for punchy writing.",
        detector: ->(text) {
          em_dashes = text.scan(/\u2014/).length
          words = TextUtils.word_count(text)
          # Em dashes per 100 words; guard against division by zero on empty text.
          ratio = words > 0 ? em_dashes.to_f / (words / 100.0) : 0

          # Flag only past 1 dash per 100 words AND at least two dashes total,
          # so one dash in a short text doesn't trip the detector.
          if ratio > 1.0 && em_dashes >= 2
            TextUtils.find_matches(text, /\u2014/, "High em dash density (#{em_dashes} in #{words} words). Replace most with commas, periods, or parentheses.", confidence: "medium")
          else
            []
          end
        }
      ),

      Pattern.new(
        id: 14, name: "Boldface overuse", category: :style, weight: 2,
        description: "Mechanical emphasis of phrases in bold. AI uses **bold** as a highlighting crutch.",
        detector: ->(text) {
          bold_matches = text.scan(/\*\*[^*]+\*\*/)
          # Three or more **bold** spans triggers the flag; every span is reported.
          if bold_matches.length >= 3
            TextUtils.find_matches(text, /\*\*[^*]+\*\*/, "Excessive boldface. Remove emphasis — let the writing carry the weight.", confidence: "medium")
          else
            []
          end
        }
      ),

      Pattern.new(
        id: 15, name: "Inline-header lists", category: :style, weight: 3,
        description: "Lists where each item starts with a bolded header followed by a colon.",
        detector: ->(text) {
          # Bullet items of the form "- **Header:** text".
          inline_headers = /^[*\-]\s+\*\*[^*]+:\*\*\s/m
          matches = text.scan(inline_headers)
          # Two or more such items establishes the list pattern.
          if matches.length >= 2
            TextUtils.find_matches(text, inline_headers, "Inline-header list pattern. Convert to a paragraph or use a simpler list.", confidence: "high")
          else
            []
          end
        }
      ),
|
|
359
|
+
|
|
360
|
+
      Pattern.new(
        id: 16, name: "Title Case headings", category: :style, weight: 1,
        description: "Capitalizing Every Main Word In Headings. AI chatbots default to this.",
        detector: ->(text) {
          results = []
          # Acronyms and abbreviations that are legitimately capitalized and
          # should not count toward the Title Case ratio.
          skip_words = /^(?:I|AI|API|CLI|URL|HTML|CSS|JS|TS|NPM|NYC|USA|UK|EU|LLM|GPT|SaaS|IoT|CEO|CTO|VP|PR|HR|IT|UI|UX)\b/

          # Markdown headings: group 1 is the whole heading line, group 2 the
          # title text after the "#" markers.
          text.scan(/^(\#{1,6}\s+(.+))$/m) do
            m = Regexp.last_match
            heading = m[2].strip
            words = heading.split(/\s+/)
            # Ignore very short headings — too little signal.
            if words.length >= 3
              capitalized_count = words.count { |w| w =~ /^[A-Z]/ && w !~ skip_words }
              # More than 70% of words capitalized reads as Title Case.
              if capitalized_count.to_f / words.length > 0.7
                # Locate the heading for line reporting; 0 as a safe fallback.
                idx = text.index(m[0]) || 0
                line_num = text[0...idx].count("\n") + 1
                results << {
                  match: m[0],
                  index: idx,
                  line: line_num,
                  column: 1,
                  suggestion: "Use sentence case for headings (only capitalize first word and proper nouns).",
                  confidence: "medium",
                }
              end
            end
          end
          results
        }
      ),
|
|
390
|
+
|
|
391
|
+
      Pattern.new(
        id: 17, name: "Emoji overuse", category: :style, weight: 2,
        description: "Decorating headings or bullet points with emojis in professional/technical text.",
        detector: ->(text) {
          # Count with the narrower emoji class; once the threshold is met,
          # report with a wider class that also covers technical symbols
          # (U+2300-23FF) and the star (U+2B50).
          emoji_count = TextUtils.count_matches(text, /[\u{1F300}-\u{1F9FF}\u{2600}-\u{27BF}]/u)
          if emoji_count >= 3
            TextUtils.find_matches(text, /[\u{1F300}-\u{1F9FF}\u{2600}-\u{27BF}\u{2300}-\u{23FF}\u{2B50}]/u, "Remove emoji decoration from professional text.", confidence: "high")
          else
            []
          end
        }
      ),

      Pattern.new(
        id: 18, name: "Curly quotes", category: :style, weight: 1,
        description: "ChatGPT uses Unicode curly quotes (\u201C\u201D\u2018\u2019) instead of straight quotes.",
        detector: ->(text) {
          # Every curly double/single quote character is flagged individually.
          TextUtils.find_matches(text, /[\u201C\u201D\u2018\u2019]/, "Replace curly quotes with straight quotes.", confidence: "high")
        }
      ),
|
|
411
|
+
|
|
412
|
+
      # ── COMMUNICATION PATTERNS (19-21) ─────────────────

      Pattern.new(
        id: 19, name: "Chatbot artifacts", category: :communication, weight: 5,
        description: 'Leftover chatbot phrases: "I hope this helps!", "Let me know if...", "Here is an overview".',
        detector: ->(text) {
          # Select the AI_PHRASES entries whose canonical fix is outright removal.
          filtered = Vocabulary::AI_PHRASES.select { |p|
            p[:fix] == "(remove)" || p[:fix] == "(remove — start with the content)"
          }
          Patterns.scan_phrases(text, filtered)
        }
      ),

      Pattern.new(
        id: 20, name: "Cutoff disclaimers", category: :communication, weight: 4,
        description: "AI knowledge-cutoff disclaimers left in text.",
        detector: ->(text) {
          # Heuristic: removable entries whose regex source mentions
          # cutoff-related keywords ("training", "details are", "available").
          filtered = Vocabulary::AI_PHRASES.select { |p|
            p[:fix] == "(remove)" &&
            (p[:pattern].source.include?("training") ||
            p[:pattern].source.include?("details are") ||
            p[:pattern].source.include?("available"))
          }
          Patterns.scan_phrases(text, filtered)
        }
      ),

      Pattern.new(
        id: 21, name: "Sycophantic tone", category: :communication, weight: 4,
        description: 'Overly positive, people-pleasing language: "Great question!", "You\'re absolutely right!".',
        detector: ->(text) {
          # Heuristic: removable/redirect entries whose regex source mentions
          # flattery keywords ("question", "point", "right", "observation").
          filtered = Vocabulary::AI_PHRASES.select { |p|
            p[:fix] &&
            (p[:fix].include?("(remove)") || p[:fix].include?("address the substance")) &&
            (p[:pattern].source.include?("question") ||
            p[:pattern].source.include?("point") ||
            p[:pattern].source.include?("right") ||
            p[:pattern].source.include?("observation"))
          }
          Patterns.scan_phrases(text, filtered)
        }
      ),
|
|
454
|
+
|
|
455
|
+
      # ── FILLER & HEDGING (22-24) ──────────────────────

      Pattern.new(
        id: 22, name: "Filler phrases", category: :filler, weight: 3,
        description: 'Wordy filler that can be shortened: "in order to" → "to", "due to the fact that" → "because".',
        detector: ->(text) {
          # Entries whose fix is a trivial one-word (or slash-alternative)
          # replacement; substantive rewrites are handled by pattern 7.
          simple_fixes = %w[to because now if can first finally].push("to / for", "for / regarding", "because / since")
          filtered = Vocabulary::AI_PHRASES.select { |p|
            p[:fix] && !p[:fix].start_with?("(") && simple_fixes.include?(p[:fix])
          }
          Patterns.scan_phrases(text, filtered)
        }
      ),

      Pattern.new(
        id: 23, name: "Excessive hedging", category: :filler, weight: 3,
        description: 'Stacking qualifiers: "could potentially possibly", "might arguably perhaps".',
        detector: ->(text) {
          # Heuristic: hedging entries are recognized by modal keywords in
          # their suggested fixes.
          filtered = Vocabulary::AI_PHRASES.select { |p|
            p[:fix] &&
            (p[:fix].include?("could") || p[:fix].include?("might") ||
            p[:fix].include?("may") || p[:fix].include?("perhaps") ||
            p[:fix].include?("maybe"))
          }
          Patterns.scan_phrases(text, filtered)
        }
      ),

      Pattern.new(
        id: 24, name: "Generic conclusions", category: :filler, weight: 3,
        description: 'Vague upbeat endings: "The future looks bright", "Exciting times lie ahead".',
        detector: ->(text) {
          # Heuristic: conclusion entries carry "be concrete"-style advice
          # in their fixes.
          filtered = Vocabulary::AI_PHRASES.select { |p|
            p[:fix] &&
            (p[:fix].include?("specific fact") || p[:fix].include?("concrete") ||
            p[:fix].include?("cite evidence") || p[:fix].include?("what you do know") ||
            p[:fix].include?("what happens next"))
          }
          Patterns.scan_phrases(text, filtered)
        }
      ),
|
|
496
|
+
|
|
497
|
+
# ── NEW PATTERNS (v2.2) ───────────────────────────
|
|
498
|
+
|
|
499
|
+
Pattern.new(
|
|
500
|
+
id: 25, name: "Reasoning chain artifacts", category: :communication, weight: 4,
|
|
501
|
+
description: 'Exposed chain-of-thought reasoning: "Let me think...", "Step 1:", "Breaking this down..."',
|
|
502
|
+
detector: ->(text) {
|
|
503
|
+
reasoning_patterns = [
|
|
504
|
+
/\blet me think(?: about this| through this| step by step)?\b/i,
|
|
505
|
+
/\blet's (?:think|reason|work) (?:about|through|this out)\b/i,
|
|
506
|
+
/\bbreaking (?:this|it) down\b/i,
|
|
507
|
+
/\bto approach this (?:systematically|methodically|logically)\b/i,
|
|
508
|
+
/\reasoning through (?:this|the problem|it)\b/i,
|
|
509
|
+
/\bworking through the logic\b/i,
|
|
510
|
+
/\bstep (?:[1-9]|one|two|three|four|five):/i,
|
|
511
|
+
/\bfirst,? let'?s consider\b/i,
|
|
512
|
+
/\bthinking about this (?:carefully|logically|systematically)\b/i,
|
|
513
|
+
/\bhere'?s my (?:thought process|reasoning|thinking)\b/i,
|
|
514
|
+
]
|
|
515
|
+
reasoning_patterns.flat_map { |re|
|
|
516
|
+
TextUtils.find_matches(text, re, "Hide reasoning or make it natural: \"Here's my take:\" instead of \"Let me think step by step:\"")
|
|
517
|
+
}
|
|
518
|
+
}
|
|
519
|
+
),
|
|
520
|
+
|
|
521
|
+
Pattern.new(
|
|
522
|
+
id: 26, name: "Excessive structure", category: :style, weight: 3,
|
|
523
|
+
description: "Over-formatted responses: too many headers, nested bullets, or numbered lists for simple content.",
|
|
524
|
+
detector: ->(text) {
|
|
525
|
+
results = []
|
|
526
|
+
words = TextUtils.word_count(text)
|
|
527
|
+
|
|
528
|
+
headers = (text.scan(/^\#{1,6}\s+.+$/m) || []).length
|
|
529
|
+
bullets = (text.scan(/^\s*[-*+]\s+/m) || []).length
|
|
530
|
+
numbered = (text.scan(/^\s*\d+\.\s+/m) || []).length
|
|
531
|
+
|
|
532
|
+
if words < 300 && headers >= 3
|
|
533
|
+
results << {
|
|
534
|
+
match: "#{headers} headers in #{words} words",
|
|
535
|
+
index: 0, line: 1, column: 1,
|
|
536
|
+
suggestion: "Too many headers for short content. Use prose instead.",
|
|
537
|
+
confidence: "medium",
|
|
538
|
+
}
|
|
539
|
+
end
|
|
540
|
+
|
|
541
|
+
if words < 200 && (bullets + numbered) >= 8
|
|
542
|
+
results << {
|
|
543
|
+
match: "#{bullets + numbered} list items in #{words} words",
|
|
544
|
+
index: 0, line: 1, column: 1,
|
|
545
|
+
suggestion: "Excessive lists. Could this be a paragraph instead?",
|
|
546
|
+
confidence: "medium",
|
|
547
|
+
}
|
|
548
|
+
end
|
|
549
|
+
|
|
550
|
+
structure_headers = /^#+\s*(?:overview|key (?:points|takeaways)|summary|conclusion|introduction|background)\s*:?\s*$/im
|
|
551
|
+
results.concat TextUtils.find_matches(text, structure_headers, "Formulaic structure. Let content flow naturally.", confidence: "medium")
|
|
552
|
+
|
|
553
|
+
results
|
|
554
|
+
}
|
|
555
|
+
),
|
|
556
|
+
|
|
557
|
+
      Pattern.new(
        id: 27, name: "Confidence calibration", category: :communication, weight: 3,
        description: 'Artificially hedged or over-confident phrasing: "I\'m confident that...", "It\'s worth noting..."',
        detector: ->(text) {
          # Each entry pairs a trigger regex with tailored advice.
          calibration_patterns = [
            { regex: /\bI'?m confident (?:that|in)\b/i, fix: "State the fact without prefacing confidence" },
            { regex: /\bit'?s worth (?:noting|mentioning|pointing out) that\b/i, fix: "Just say it" },
            { regex: /\binterestingly (?:enough)?,?\b/i, fix: "Let reader decide if interesting" },
            { regex: /\bsurprisingly,?\s/i, fix: "State the fact; surprise is implied" },
            { regex: /\bimportantly,?\s/i, fix: "Let reader judge importance" },
            { regex: /\bsignificantly,?\s/i, fix: "Be specific about the significance" },
            { regex: /\bnotably,?\s/i, fix: "Just state the notable thing" },
            { regex: /\bcertainly,?\s/i, fix: "Remove or state with evidence" },
            { regex: /\bundoubtedly,?\s/i, fix: "Remove or cite evidence" },
            { regex: /\bwithout (?:a )?doubt,?\s/i, fix: "Remove or cite evidence" },
          ]
          # No explicit confidence: TextUtils.find_matches supplies its default.
          calibration_patterns.flat_map { |p|
            TextUtils.find_matches(text, p[:regex], p[:fix])
          }
        }
      ),
|
|
578
|
+
|
|
579
|
+
      Pattern.new(
        id: 28, name: "Acknowledgment loops", category: :communication, weight: 4,
        description: 'Restating the question before answering: "You\'re asking about X. X is..."',
        detector: ->(text) {
          # Stock question-restating openers left over from chat transcripts.
          ack_patterns = [
            /\byou'?re asking (?:about|whether|if|how|why|what)\b/i,
            /\bthe question of (?:whether|how|why|what)\b/i,
            /\bwhen it comes to your question\b/i,
            /\bin (?:terms of|response to|answer to) your question\b/i,
            /\bto (?:answer|address) your question\b/i,
            /\byour question (?:about|regarding|concerning)\b/i,
            /\bthat'?s a (?:great|good|interesting) question\. (?:the|it|so)\b/i,
            /\bI understand you'?re (?:asking|wondering|curious)\b/i,
          ]
          # No explicit confidence: TextUtils.find_matches supplies its default.
          ack_patterns.flat_map { |re|
            TextUtils.find_matches(text, re, "Just answer. Don't restate the question.")
          }
        }
      ),
    ].freeze
|
|
599
|
+
|
|
600
|
+
# ── Registry ─────────────────────────────────────────
|
|
601
|
+
|
|
602
|
+
class Registry
|
|
603
|
+
def initialize
|
|
604
|
+
@patterns = PATTERNS.dup
|
|
605
|
+
end
|
|
606
|
+
|
|
607
|
+
def all
|
|
608
|
+
@patterns
|
|
609
|
+
end
|
|
610
|
+
|
|
611
|
+
def find(id)
|
|
612
|
+
@patterns.find { |p| p.id == id }
|
|
613
|
+
end
|
|
614
|
+
|
|
615
|
+
def by_category(category)
|
|
616
|
+
@patterns.select { |p| p.category == category }
|
|
617
|
+
end
|
|
618
|
+
|
|
619
|
+
def categories
|
|
620
|
+
@patterns.map(&:category).uniq
|
|
621
|
+
end
|
|
622
|
+
|
|
623
|
+
def list
|
|
624
|
+
@patterns.map { |p| { id: p.id, name: p.name, category: p.category, weight: p.weight } }
|
|
625
|
+
end
|
|
626
|
+
|
|
627
|
+
def add(pattern)
|
|
628
|
+
raise ArgumentError, "Pattern must have id, name, and detector" unless pattern.id && pattern.name && pattern.detector
|
|
629
|
+
@patterns << pattern
|
|
630
|
+
end
|
|
631
|
+
|
|
632
|
+
def remove(id)
|
|
633
|
+
@patterns.reject! { |p| p.id == id }
|
|
634
|
+
end
|
|
635
|
+
end
|
|
636
|
+
end
|
|
637
|
+
end
|