humanizer-rb 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,319 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Humanizer
4
+ module Analyzer
5
# Maps each pattern-category key to the human-readable label shown in reports.
CATEGORY_LABELS = {
  content:       "Content patterns",
  language:      "Language & grammar",
  style:         "Style patterns",
  communication: "Communication artifacts",
  filler:        "Filler & hedging",
}.freeze
12
+
13
# Keyword-initialised value object carrying everything one analysis produces.
Result = Struct.new(
  :score,
  :pattern_score,
  :uniformity_score,
  :total_matches,
  :word_count,
  :stats,
  :categories,
  :findings,
  :summary,
  keyword_init: true
)
18
+
19
+ module_function
20
+
21
# Run every selected pattern detector over +text+ and fold the hits, together
# with the optional text statistics, into a Result.
#
# @param text [String, nil] text to inspect; nil, non-String and blank input
#   all yield the empty result
# @param verbose [Boolean] keep every match per finding when true, otherwise
#   only the first five (the finding is then flagged as truncated)
# @param patterns_to_check [#include?, nil] pattern ids to run; nil runs all
#   of Patterns::PATTERNS
# @param include_stats [Boolean] when false no statistics are computed and the
#   uniformity score is 0
# @return [Result]
def analyze(text, verbose: false, patterns_to_check: nil, include_stats: true)
  return empty_result unless text.is_a?(String)

  body = text.strip
  return empty_result if body.empty?

  word_total = TextUtils.word_count(body)

  # Uniformity is only scored when there is enough text (20+ words, 3+ sentences).
  stats = include_stats ? Stats.compute(body) : nil
  enough_text = stats && stats.word_count >= 20 && stats.sentence_count >= 3
  uniformity = enough_text ? Stats.uniformity_score(stats) : 0

  # One running tally per known category.
  tallies = CATEGORY_LABELS.each_key.each_with_object({}) do |cat, acc|
    acc[cat] = { matches: 0, weighted_score: 0, patterns: [] }
  end

  selected = Patterns::PATTERNS
  selected = selected.select { |pattern| patterns_to_check.include?(pattern.id) } if patterns_to_check

  findings = []
  selected.each do |pattern|
    hits = pattern.detector.call(body)
    next if hits.empty?

    hit_count = hits.length
    findings << {
      pattern_id: pattern.id,
      pattern_name: pattern.name,
      category: pattern.category,
      description: pattern.description,
      weight: pattern.weight,
      match_count: hit_count,
      matches: verbose ? hits : hits.first(5),
      truncated: !verbose && hit_count > 5,
    }

    tally = tallies[pattern.category]
    tally[:matches] += hit_count
    tally[:weighted_score] += hit_count * pattern.weight
    tally[:patterns] << pattern.name
  end

  pattern_score = calculate_pattern_score(findings, word_total)
  composite = calculate_composite_score(pattern_score, uniformity, findings)

  # Per-category summary, in CATEGORY_LABELS order.
  categories = CATEGORY_LABELS.each_with_object({}) do |(cat, label), acc|
    tally = tallies[cat]
    acc[cat] = {
      label: label,
      matches: tally[:matches],
      weighted_score: tally[:weighted_score],
      patterns_detected: tally[:patterns],
    }
  end

  match_total = findings.sum { |finding| finding[:match_count] }

  Result.new(
    score: composite,
    pattern_score: pattern_score,
    uniformity_score: uniformity,
    total_matches: match_total,
    word_count: word_total,
    stats: stats,
    categories: categories,
    findings: findings,
    summary: build_summary(composite, match_total, findings, word_total, stats),
  )
end
100
+
101
# Shortcut that analyzes +text+ with the default options and returns only the
# composite score.
#
# @param text [String, nil]
# @return [Integer]
def score(text)
  report = analyze(text)
  report.score
end
104
+
105
+ # ── Scoring ──────────────────────────────────────────
106
+
107
# Score the pattern findings on a 0-100 scale from three capped components:
# weighted hit density (max 65), number of distinct patterns (max 20) and
# number of distinct categories (max 15).
#
# @param findings [Array<Hash>] each with :match_count, :weight and :category
# @param words [Integer] word count of the analyzed text
# @return [Integer] 0 when there are no words or no findings
def calculate_pattern_score(findings, words)
  return 0 if words == 0 || findings.empty?

  cap = ->(value, limit) { value > limit ? limit : value }

  # Weighted hits per 100 words, compressed on a log scale.
  weighted_hits = findings.inject(0) { |acc, f| acc + f[:match_count] * f[:weight] }
  per_hundred = weighted_hits.to_f / words * 100
  density_part = cap.call(Math.log2(per_hundred + 1) * 13, 65)

  # Two points per distinct pattern type.
  breadth_part = cap.call(findings.length * 2, 20)

  # Three points per distinct category hit.
  distinct_categories = findings.map { |f| f[:category] }.uniq.length
  category_part = cap.call(distinct_categories * 3, 15)

  cap.call((density_part + breadth_part + category_part).round, 100)
end
125
+
126
# Blend the pattern score with the uniformity score into the final 0-100 score.
#
# With no findings, uniformity alone contributes at most 15 points; otherwise
# the blend is 70% pattern score and 30% uniformity score.
#
# @param pattern_score [Numeric]
# @param uniformity_score [Numeric]
# @param findings [Array] only its emptiness is consulted
# @return [Integer]
def calculate_composite_score(pattern_score, uniformity_score, findings)
  return 0 if pattern_score == 0 && uniformity_score == 0

  weighted, ceiling =
    if findings.empty?
      # Uniformity on its own is not enough evidence, so it is heavily discounted.
      [uniformity_score * 0.15, 15]
    else
      [pattern_score * 0.7 + uniformity_score * 0.3, 100]
    end

  rounded = weighted.round
  rounded < ceiling ? rounded : ceiling
end
138
+
139
# Compose the one-paragraph, human-readable summary of an analysis.
#
# @param final_score [Integer] composite score
# @param total_matches [Integer] total number of pattern hits
# @param findings [Array<Hash>] each with :match_count, :weight, :pattern_name
# @param words [Integer] word count of the analyzed text
# @param stats [#sentence_count, #burstiness, #type_token_ratio, nil]
# @return [String]
def build_summary(final_score, total_matches, findings, words, stats)
  if total_matches == 0 && final_score < 10
    return "No significant AI writing patterns detected. The text looks human-written."
  end

  tiers = [
    [70, "heavily AI-generated"],
    [45, "moderately AI-influenced"],
    [20, "lightly AI-touched"],
  ]
  tier = tiers.find { |minimum, _| final_score >= minimum }
  level = tier ? tier.last : "mostly human-sounding"

  # The three findings with the largest weighted hit counts.
  worst = findings
    .sort_by { |f| -(f[:match_count] * f[:weight]) }
    .first(3)
    .map { |f| f[:pattern_name] }

  parts = ["Score: #{final_score}/100 (#{level}). Found #{total_matches} matches across #{findings.length} pattern types in #{words} words."]
  parts << " Top issues: #{worst.join(', ')}." if worst.any?

  if stats && stats.sentence_count > 3
    parts << " Sentence rhythm is very uniform (low burstiness) — typical of AI text." if stats.burstiness < 0.25
    parts << " Vocabulary diversity is low." if stats.type_token_ratio < 0.4 && words > 100
  end

  parts.join
end
172
+
173
# Build the placeholder Result used when there is no text to analyze: every
# numeric field is 0, the collections are empty and stats is nil.
#
# @return [Result]
def empty_result
  blank = Result.new(stats: nil, categories: {}, findings: [], summary: "No text provided.")
  %i[score pattern_score uniformity_score total_matches word_count].each do |field|
    blank[field] = 0
  end
  blank
end
180
+
181
+ # ── Formatting ───────────────────────────────────────
182
+
183
# Render an analysis result as a plain-text report.
#
# @param result [#score, #pattern_score, #uniformity_score, #total_matches,
#   #word_count, #summary, #stats, #categories, #findings] analysis result
# @return [String] the report, lines joined with "\n"
def format_report(result)
  lines = []
  lines << ""
  lines << "\u2554#{'═' * 50}\u2557"
  # Pad the title to the 50-column width of the borders so the box lines up.
  lines << "\u2551#{'AI WRITING PATTERN ANALYSIS'.center(50)}\u2551"
  lines << "\u255A#{'═' * 50}\u255D"
  lines << ""

  # 20-cell bar, one cell per 5 points. Clamped so an out-of-range score can
  # never produce a negative repeat count (String#* would raise).
  filled = (result.score / 5.0).round.clamp(0, 20)
  bar = "\u2588" * filled + "\u2591" * (20 - filled)
  lines << " Score: #{result.score}/100 [#{bar}]"
  lines << " Words: #{result.word_count} | Matches: #{result.total_matches} | Pattern: #{result.pattern_score} | Uniformity: #{result.uniformity_score}"
  lines << ""
  lines << " #{result.summary}"
  lines << ""

  if result.stats
    s = result.stats
    lines << "── Text Statistics ─────────────────────────────────"
    lines << " Sentences: #{s.sentence_count} | Paragraphs: #{s.paragraph_count}"
    lines << " Avg sentence length: #{s.avg_sentence_length} words (σ #{s.sentence_length_std_dev})"
    lines << " Burstiness: #{s.burstiness}"
    lines << " Vocabulary diversity (TTR): #{s.type_token_ratio}"
    lines << " Function word ratio: #{s.function_word_ratio}"
    lines << " Trigram repetition: #{s.trigram_repetition}"
    lines << " Readability (FK grade): #{s.flesch_kincaid}"
    lines << ""
  end

  lines << "── Categories ──────────────────────────────────────"
  result.categories.each_value do |data|
    next unless data[:matches] > 0

    lines << " #{data[:label]}: #{data[:matches]} matches (#{data[:patterns_detected].join(', ')})"
  end
  lines << ""

  if result.findings.any?
    lines << "── Findings ────────────────────────────────────────"
    result.findings.each do |finding|
      lines << ""
      lines << " [#{finding[:pattern_id]}] #{finding[:pattern_name]} (×#{finding[:match_count]}, weight: #{finding[:weight]})"
      lines << " #{finding[:description]}"
      finding[:matches].each do |match|
        # "L3:5: " with a column, "L3: " without one, and no prefix at all when
        # the line is unknown, so no stray "::" or leading ":" is printed.
        location = ["L#{match[:line]}", match[:column]].compact.join(":") if match[:line]
        prefix = location ? "#{location}: " : ""

        text = match[:match]
        preview = text.is_a?(String) ? text[0, 80] : ""
        preview += "..." if text.is_a?(String) && text.length > 80
        conf = match[:confidence] ? " [#{match[:confidence]}]" : ""

        lines << " #{prefix}\"#{preview}\"#{conf}"
        lines << " → #{match[:suggestion]}" if match[:suggestion]
      end
      if finding[:truncated]
        lines << " ... and #{finding[:match_count] - finding[:matches].length} more"
      end
    end
  end

  lines << ""
  lines << "════════════════════════════════════════════════════"
  lines.join("\n")
end
244
+
245
# Render an analysis result as a Markdown document.
#
# @param result [#score, #pattern_score, #uniformity_score, #total_matches,
#   #word_count, #summary, #stats, #findings] analysis result
# @return [String] Markdown text, lines joined with "\n"
def format_markdown(result)
  lines = []
  lines << "# AI writing pattern analysis"
  lines << ""

  level = if result.score >= 70 then "Heavily AI-generated"
          elsif result.score >= 45 then "Moderately AI-influenced"
          elsif result.score >= 20 then "Lightly AI-touched"
          else "Mostly human-sounding"
          end

  lines << "**Score: #{result.score}/100** — #{level}"
  lines << ""
  lines << "Words: #{result.word_count} | Matches: #{result.total_matches} | Pattern score: #{result.pattern_score} | Uniformity score: #{result.uniformity_score}"
  lines << ""
  lines << result.summary
  lines << ""

  if result.stats
    s = result.stats

    length_note = if s.avg_sentence_length > 25 then "Long"
                  elsif s.avg_sentence_length < 12 then "Short"
                  else "Normal"
                  end
    variation_note = if s.sentence_length_std_dev > 8 then "High (human-like)"
                     elsif s.sentence_length_std_dev < 4 then "Low (AI-like)"
                     else "Moderate"
                     end
    trigram_note = s.trigram_repetition > 0.1 ? "High (AI-like)" : "Normal"
    grade_note = if s.flesch_kincaid > 12 then "Academic"
                 elsif s.flesch_kincaid > 8 then "Standard"
                 else "Easy"
                 end

    lines << "## Text statistics"
    lines << ""
    lines << "| Metric | Value | Assessment |"
    lines << "|--------|-------|------------|"
    lines << "| Avg sentence length | #{s.avg_sentence_length} words | #{length_note} |"
    lines << "| Sentence variation | σ #{s.sentence_length_std_dev} | #{variation_note} |"
    lines << "| Burstiness | #{s.burstiness} | #{burstiness_label(s.burstiness)} |"
    lines << "| Vocabulary diversity | #{s.type_token_ratio} | #{ttr_label(s.type_token_ratio, s.word_count)} |"
    lines << "| Trigram repetition | #{s.trigram_repetition} | #{trigram_note} |"
    lines << "| Readability | FK grade #{s.flesch_kincaid} | #{grade_note} |"
    lines << ""
  end

  if result.findings.any?
    lines << "## Findings"
    lines << ""
    result.findings.each do |finding|
      lines << "### #{finding[:pattern_id]}. #{finding[:pattern_name]} (×#{finding[:match_count]})"
      lines << "*#{finding[:description]}*"
      lines << ""
      finding[:matches].each do |match|
        # Only print a location when one is known; otherwise the bullet would
        # start with a dangling ": ".
        prefix = match[:line] ? "Line #{match[:line]}: " : ""
        preview = match[:match].is_a?(String) ? match[:match][0, 80] : ""
        lines << "- #{prefix}`#{preview}`"
        # Two-space indent so Markdown nests the suggestion under its match.
        lines << "  - #{match[:suggestion]}" if match[:suggestion]
      end
      # Tell the reader when the match list was cut short.
      if finding[:truncated]
        lines << "- ... and #{finding[:match_count] - finding[:matches].length} more"
      end
      lines << ""
    end
  end

  lines.join("\n")
end
297
+
298
# Serialise an analysis result as pretty-printed JSON.
#
# @param result [#to_h] analysis result; its hash form may hold a :stats object
# @return [String] JSON document
def format_json(result)
  require "json" # loaded lazily: only needed when JSON output is requested

  payload = result.to_h
  stats = payload[:stats]
  # Struct#to_h is shallow. A stats object left as-is would be emitted through
  # its #to_s (e.g. "#<struct ...>") instead of as a JSON object, so expand it.
  # merge returns a new hash, leaving the caller's data untouched.
  if stats && !stats.is_a?(Hash) && stats.respond_to?(:to_h)
    payload = payload.merge(stats: stats.to_h)
  end

  JSON.pretty_generate(payload)
end
302
+
303
# Describe a burstiness value in words.
#
# @param b [Numeric] burstiness value
# @return [String] parenthesised assessment
def burstiness_label(b)
  return "(high — human-like)" if b >= 0.7
  return "(moderate)" if b >= 0.45
  return "(low — somewhat uniform)" if b >= 0.25

  "(very low — AI-like uniformity)"
end
310
+
311
# Describe a type-token ratio in words; texts under 100 words are not assessed.
#
# @param ttr [Numeric] type-token ratio
# @param wc [Integer] word count of the text the ratio was computed from
# @return [String] parenthesised assessment
def ttr_label(ttr, wc)
  return "(too short to assess)" if wc < 100
  return "(high — diverse vocabulary)" if ttr >= 0.6
  return "(moderate)" if ttr >= 0.45

  "(low — repetitive vocabulary)"
end
318
+ end
319
+ end
@@ -0,0 +1,192 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Humanizer
4
+ module HumanizerEngine
5
+ module_function
6
+
7
# Apply safe, mechanical fixes that don't require judgment.
#
# Three passes run in order: curly quotes are straightened, a fixed list of
# wordy phrases is shortened, and chatbot boilerplate is stripped from the very
# start and the very end of the text.
#
# @param text [String, nil] text to clean up; nil is treated as empty text
# @return [Hash] +{ text: String, fixes: Array<String> }+ with the cleaned,
#   stripped text and one label per kind of fix that was applied
def auto_fix(text)
  result = text.to_s.dup
  fixes = []

  # gsub!/sub! return nil when nothing was replaced, so the return value doubles
  # as the "did this fix apply?" test and the text is scanned only once per rule.

  # Curly quotes → straight quotes
  if result.gsub!(/[\u201C\u201D]/, '"')
    fixes << "Replaced curly double quotes with straight quotes"
  end
  if result.gsub!(/[\u2018\u2019]/, "'")
    fixes << "Replaced curly single quotes with straight quotes"
  end

  # Filler phrase replacements (unambiguous)
  safe_fills = [
    { from: /\bin order to\b/i, to: "to", label: '"in order to" → "to"' },
    { from: /\bdue to the fact that\b/i, to: "because", label: '"due to the fact that" → "because"' },
    { from: /\bat this point in time\b/i, to: "now", label: '"at this point in time" → "now"' },
    { from: /\bin the event that\b/i, to: "if", label: '"in the event that" → "if"' },
    { from: /\bhas the ability to\b/i, to: "can", label: '"has the ability to" → "can"' },
    { from: /\bfor the purpose of\b/i, to: "to", label: '"for the purpose of" → "to"' },
    { from: /\bfirst and foremost\b/i, to: "first", label: '"first and foremost" → "first"' },
    { from: /\bin light of the fact that\b/i, to: "because", label: '"in light of the fact that" → "because"' },
    { from: /\bin the realm of\b/i, to: "in", label: '"in the realm of" → "in"' },
    { from: /\butilize\b/i, to: "use", label: '"utilize" → "use"' },
    { from: /\butilizing\b/i, to: "using", label: '"utilizing" → "using"' },
    { from: /\butilization\b/i, to: "use", label: '"utilization" → "use"' },
  ]

  safe_fills.each do |fill|
    # The patterns are case-insensitive, so keep a leading capital: a sentence
    # that began "In order to" must begin "To", not "to".
    replaced = result.gsub!(fill[:from]) do |phrase|
      phrase.match?(/\A[[:upper:]]/) ? fill[:to].capitalize : fill[:to]
    end
    fixes << fill[:label] if replaced
  end

  # Chatbot artifact removal (start of text). Anchored with \A rather than ^:
  # ^ matches at the start of every line and would delete mid-document lines.
  chatbot_start = [
    /\A\s*(?:Here is|Here's) (?:a |an |the )?(?:comprehensive |brief |quick )?(?:overview|summary|breakdown|list|guide|explanation|look)[^.]*\.\s*/i,
    /\A\s*(?:Of course|Certainly|Absolutely|Sure)!\s*/i,
    /\A\s*(?:Great|Excellent|Good|Wonderful|Fantastic) question!\s*/i,
    /\A\s*(?:That's|That is) a (?:great|excellent|good|wonderful|fantastic) (?:question|point)!\s*/i,
  ]
  chatbot_start.each do |regex|
    fixes << "Removed chatbot opening artifact" if result.sub!(regex, "")
  end

  # Chatbot artifact removal (end of text). Anchored with \z rather than $,
  # which matches at the end of every line.
  chatbot_end = [
    /\s*(?:I hope this helps|Let me know if you(?:'d| would) like|Feel free to|Don't hesitate to|Is there anything else)[^.]*[.!]\s*\z/i,
    /\s*Happy to help[.!]?\s*\z/i,
  ]
  chatbot_end.each do |regex|
    fixes << "Removed chatbot closing artifact" if result.sub!(regex, "")
  end

  { text: result.strip, fixes: fixes }
end
73
+
74
# Generate humanization suggestions.
#
# Every match from the analysis becomes one suggestion, filed by the weight of
# the pattern that produced it: 4 and above is critical, 2 and above is
# important, anything lower is minor.
#
# @param text [String, nil] text to review
# @param autofix [Boolean] also run auto_fix on the text and include its result
# @param verbose [Boolean] accepted but not read here; the analysis is always
#   requested with +verbose: true+
# @param include_stats [Boolean] forwarded to the analyzer; also gates style tips
# @return [Hash] scores, counts, stats, the three suggestion lists, the autofix
#   result (nil unless requested), guidance and style tips
def humanize(text, autofix: false, verbose: false, include_stats: true)
  analysis = Analyzer.analyze(text, verbose: true, include_stats: include_stats)

  tiers = { critical: [], important: [], minor: [] }

  analysis.findings.each do |finding|
    weight = finding[:weight]
    tier = if weight >= 4 then :critical
           elsif weight >= 2 then :important
           else :minor
           end

    finding[:matches].each do |m|
      tiers[tier] << {
        pattern: finding[:pattern_name],
        pattern_id: finding[:pattern_id],
        category: finding[:category],
        weight: weight,
        text: m[:match],
        line: m[:line],
        column: m[:column],
        suggestion: m[:suggestion],
        confidence: m[:confidence] || "high",
      }
    end
  end

  fix_result = nil
  fix_result = auto_fix(text) if autofix

  guidance = build_guidance(analysis)

  stats = analysis.stats
  style_tips = include_stats && stats ? build_style_tips(stats) : []

  {
    score: analysis.score,
    pattern_score: analysis.pattern_score,
    uniformity_score: analysis.uniformity_score,
    word_count: analysis.word_count,
    total_issues: analysis.total_matches,
    stats: stats,
    critical: tiers[:critical],
    important: tiers[:important],
    minor: tiers[:minor],
    autofix: fix_result,
    guidance: guidance,
    style_tips: style_tips,
  }
end
126
+
127
# Turn the set of detected pattern ids into a list of rewriting tips.
#
# Each rule pairs the pattern ids that trigger it with its tip; a rule fires
# when any of its ids was detected. Tips come out in rule order, followed by a
# "rewrite from scratch" tip when the score is 50 or higher.
#
# @param analysis [#findings, #score] findings are hashes with :pattern_id
# @return [Array<String>]
def build_guidance(analysis)
  seen = analysis.findings.map { |f| f[:pattern_id] }.to_set

  rules = [
    [[1, 4], "Replace inflated/promotional language with concrete facts. What specifically happened? Give dates, numbers, names."],
    [[3], "Cut trailing -ing phrases. If the point matters enough to mention, give it its own sentence."],
    [[5], "Name your sources. \"Experts say\" means nothing — who said it, when, and where?"],
    [[6], "Replace formulaic \"despite challenges\" sections with specific problems and concrete outcomes."],
    [[7], "Swap AI vocabulary for plainer words. \"Delve\" → \"look at\". \"Tapestry\" → (be specific). \"Showcase\" → \"show\"."],
    [[8], "Use \"is\" and \"has\" freely. \"Serves as\" and \"boasts\" are needlessly fancy."],
    [[9], "Drop \"not just X, it's Y\" frames. Just say what the thing is."],
    [[10], "Break up triads. You don't always need three of everything."],
    [[13], "Ease up on em dashes. Use commas, periods, or parentheses for variety."],
    [[14, 15], "Strip mechanical bold formatting and inline-header lists. Let prose do the work."],
    [[17], "Remove emojis from professional text. They signal chatbot output."],
    [[19, 21], "Remove chatbot filler (\"I hope this helps!\", \"Great question!\"). Just deliver the content."],
    [[20], "Delete knowledge-cutoff disclaimers. Either research it or leave it out."],
    [[22, 23], "Trim filler and hedging. \"In order to\" → \"to\". One qualifier per claim is enough."],
    [[24], "Cut generic conclusions. End with a specific fact instead of \"the future looks bright\"."],
  ]

  tips = rules
    .select { |trigger_ids, _| trigger_ids.any? { |id| seen.include?(id) } }
    .map { |_, tip| tip }

  if analysis.score >= 50
    tips << "Consider rewriting from scratch. When AI patterns are this dense, patching individual phrases isn't enough — the structure itself needs rethinking."
  end

  tips
end
153
+
154
# Derive style advice from the text statistics.
#
# Each tip is a hash with :metric, :value and :tip. When two or more
# metric-specific tips fire, two general tips (with a nil value) are appended.
#
# @param stats [#burstiness, #sentence_count, #sentence_length_variation,
#   #avg_sentence_length, #type_token_ratio, #word_count, #trigram_repetition]
# @return [Array<Hash>]
def build_style_tips(stats)
  advice = []
  note = ->(metric, value, tip) { advice << { metric: metric, value: value, tip: tip } }

  if stats.burstiness < 0.25 && stats.sentence_count > 4
    note.call("burstiness", stats.burstiness,
              "Sentence rhythm is very uniform. Mix short punchy sentences (3-8 words) with longer flowing ones (20+). Fragments work too. Like this.")
  end

  if stats.sentence_length_variation < 0.3 && stats.sentence_count > 4
    note.call("sentence_length_variation", stats.sentence_length_variation,
              "Sentences are all roughly #{stats.avg_sentence_length.round} words. Vary your rhythm — alternate between short and long.")
  end

  if stats.avg_sentence_length > 28
    note.call("avg_sentence_length", stats.avg_sentence_length,
              "Average sentence is quite long. Break some into shorter ones. Not every thought needs a subordinate clause.")
  end

  if stats.type_token_ratio < 0.4 && stats.word_count > 100
    note.call("type_token_ratio", stats.type_token_ratio,
              "Vocabulary is repetitive. Try using more varied word choices — but don't synonym-cycle (that's also an AI tell).")
  end

  if stats.trigram_repetition > 0.1 && stats.word_count > 100
    note.call("trigram_repetition", stats.trigram_repetition,
              "Repeated 3-word phrases detected. Vary your sentence structures.")
  end

  # Several metrics firing at once earns the broader, non-metric advice as well.
  if advice.length >= 2
    note.call("general", nil,
              "Try the read-aloud test: read the text out loud. If it sounds weird or robotic, rewrite those parts until they sound like something you'd actually say.")
    note.call("general", nil,
              "Add first-person perspective where it fits: \"I found\", \"We noticed\", \"In my experience\". Real humans write from a point of view.")
  end

  advice
end
191
+ end
192
+ end