RubyGems - humanizer-rb - Versions diffs - 0.1.0 - Mend

humanizer-rb 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

checksums.yaml +7 -0
data/CHANGELOG.md +12 -0
data/LICENSE +21 -0
data/README.md +133 -0
data/bin/humanizer +388 -0
data/lib/humanizer/analyzer.rb +319 -0
data/lib/humanizer/humanizer_engine.rb +192 -0
data/lib/humanizer/patterns.rb +637 -0
data/lib/humanizer/stats.rb +198 -0
data/lib/humanizer/text_utils.rb +53 -0
data/lib/humanizer/version.rb +5 -0
data/lib/humanizer/vocabulary.rb +260 -0
data/lib/humanizer.rb +33 -0
metadata +61 -0

data/lib/humanizer/analyzer.rb ADDED Viewed

@@ -0,0 +1,319 @@
+# frozen_string_literal: true
+module Humanizer
+  module Analyzer
+    # Human-readable label for each pattern category. The keys are the symbols
+    # used to bucket findings in `analyze`; the labels are printed by
+    # `format_report`.
+    CATEGORY_LABELS = {
+      content: "Content patterns",
+      language: "Language & grammar",
+      style: "Style patterns",
+      communication: "Communication artifacts",
+      filler: "Filler & hedging",
+    }.freeze
+    # Value object returned by `analyze` and `empty_result`; constructed with
+    # keyword arguments only (keyword_init).
+    Result = Struct.new(
+      :score, :pattern_score, :uniformity_score, :total_matches,
+      :word_count, :stats, :categories, :findings, :summary,
+      keyword_init: true
+    )
+    module_function
+    def analyze(text, verbose: false, patterns_to_check: nil, include_stats: true)
+      return empty_result if text.nil? || !text.is_a?(String)
+      trimmed = text.strip
+      return empty_result if trimmed.empty?
+      words = TextUtils.word_count(trimmed)
+      # Compute text statistics
+      stats = include_stats ? Stats.compute(trimmed) : nil
+      uniformity = if stats && stats.word_count >= 20 && stats.sentence_count >= 3
+                     Stats.uniformity_score(stats)
+                   else
+                     0
+                   end
+      # Run pattern detectors
+      findings = []
+      category_scores = {}
+      CATEGORY_LABELS.each_key { |cat| category_scores[cat] = { matches: 0, weighted_score: 0, patterns: [] } }
+      active_patterns = if patterns_to_check
+                          Patterns::PATTERNS.select { |p| patterns_to_check.include?(p.id) }
+                        else
+                          Patterns::PATTERNS
+                        end
+      active_patterns.each do |pattern|
+        matches = pattern.detector.call(trimmed)
+        next if matches.empty?
+        finding = {
+          pattern_id: pattern.id,
+          pattern_name: pattern.name,
+          category: pattern.category,
+          description: pattern.description,
+          weight: pattern.weight,
+          match_count: matches.length,
+          matches: verbose ? matches : matches.first(5),
+          truncated: !verbose && matches.length > 5,
+        }
+        findings << finding
+        cat = pattern.category
+        category_scores[cat][:matches] += matches.length
+        category_scores[cat][:weighted_score] += matches.length * pattern.weight
+        category_scores[cat][:patterns] << pattern.name
+      end
+      # Calculate composite score
+      pattern_score = calculate_pattern_score(findings, words)
+      composite_score = calculate_composite_score(pattern_score, uniformity, findings)
+      # Build category summary
+      categories = {}
+      CATEGORY_LABELS.each do |cat, label|
+        data = category_scores[cat]
+        categories[cat] = {
+          label: label,
+          matches: data[:matches],
+          weighted_score: data[:weighted_score],
+          patterns_detected: data[:patterns],
+        }
+      end
+      total_matches = findings.sum { |f| f[:match_count] }
+      Result.new(
+        score: composite_score,
+        pattern_score: pattern_score,
+        uniformity_score: uniformity,
+        total_matches: total_matches,
+        word_count: words,
+        stats: stats,
+        categories: categories,
+        findings: findings,
+        summary: build_summary(composite_score, total_matches, findings, words, stats),
+      )
+    end
+    def score(text)
+      analyze(text).score
+    end
+    # ── Scoring ──────────────────────────────────────────
+    def calculate_pattern_score(findings, words)
+      return 0 if words == 0 || findings.empty?
+      weighted_total = findings.sum { |f| f[:match_count] * f[:weight] }
+      # Density: weighted hits per 100 words (log scale)
+      density = (weighted_total.to_f / words) * 100
+      density_score = [Math.log2(density + 1) * 13, 65].min
+      # Breadth: unique pattern types (max 20)
+      breadth_bonus = [findings.length * 2, 20].min
+      # Category diversity (max 15)
+      categories_hit = findings.map { |f| f[:category] }.uniq.length
+      category_bonus = [categories_hit * 3, 15].min
+      [((density_score + breadth_bonus + category_bonus).round), 100].min
+    end
+    def calculate_composite_score(pattern_score, uniformity_score, findings)
+      return 0 if pattern_score == 0 && uniformity_score == 0
+      # If no patterns detected, uniformity alone isn't enough to accuse
+      if findings.empty?
+        return [(uniformity_score * 0.15).round, 15].min
+      end
+      # Weighted blend: patterns dominate, stats supplement
+      blended = pattern_score * 0.7 + uniformity_score * 0.3
+      [blended.round, 100].min
+    end
+    def build_summary(final_score, total_matches, findings, words, stats)
+      if total_matches == 0 && final_score < 10
+        return "No significant AI writing patterns detected. The text looks human-written."
+      end
+      level = if final_score >= 70 then "heavily AI-generated"
+              elsif final_score >= 45 then "moderately AI-influenced"
+              elsif final_score >= 20 then "lightly AI-touched"
+              else "mostly human-sounding"
+              end
+      top_patterns = findings
+        .sort_by { |f| -(f[:match_count] * f[:weight]) }
+        .first(3)
+        .map { |f| f[:pattern_name] }
+      summary = "Score: #{final_score}/100 (#{level}). Found #{total_matches} matches across #{findings.length} pattern types in #{words} words."
+      if top_patterns.any?
+        summary += " Top issues: #{top_patterns.join(', ')}."
+      end
+      if stats && stats.sentence_count > 3
+        if stats.burstiness < 0.25
+          summary += " Sentence rhythm is very uniform (low burstiness) — typical of AI text."
+        end
+        if stats.type_token_ratio < 0.4 && words > 100
+          summary += " Vocabulary diversity is low."
+        end
+      end
+      summary
+    end
+    def empty_result
+      Result.new(
+        score: 0, pattern_score: 0, uniformity_score: 0,
+        total_matches: 0, word_count: 0, stats: nil,
+        categories: {}, findings: [], summary: "No text provided.",
+      )
+    end
+    # ── Formatting ───────────────────────────────────────
+    def format_report(result)
+      lines = []
+      lines << ""
+      lines << "\u2554#{'═' * 50}\u2557"
+      lines << "\u2551          AI WRITING PATTERN ANALYSIS             \u2551"
+      lines << "\u255A#{'═' * 50}\u255D"
+      lines << ""
+      filled = (result.score / 5.0).round
+      bar = "\u2588" * filled + "\u2591" * (20 - filled)
+      lines << "  Score: #{result.score}/100  [#{bar}]"
+      lines << "  Words: #{result.word_count}  |  Matches: #{result.total_matches}  |  Pattern: #{result.pattern_score}  |  Uniformity: #{result.uniformity_score}"
+      lines << ""
+      lines << "  #{result.summary}"
+      lines << ""
+      if result.stats
+        s = result.stats
+        lines << "── Text Statistics ─────────────────────────────────"
+        lines << "  Sentences: #{s.sentence_count}  |  Paragraphs: #{s.paragraph_count}"
+        lines << "  Avg sentence length: #{s.avg_sentence_length} words (σ #{s.sentence_length_std_dev})"
+        lines << "  Burstiness: #{s.burstiness}"
+        lines << "  Vocabulary diversity (TTR): #{s.type_token_ratio}"
+        lines << "  Function word ratio: #{s.function_word_ratio}"
+        lines << "  Trigram repetition: #{s.trigram_repetition}"
+        lines << "  Readability (FK grade): #{s.flesch_kincaid}"
+        lines << ""
+      end
+      lines << "── Categories ──────────────────────────────────────"
+      result.categories.each do |_, data|
+        if data[:matches] > 0
+          lines << "  #{data[:label]}: #{data[:matches]} matches (#{data[:patterns_detected].join(', ')})"
+        end
+      end
+      lines << ""
+      if result.findings.any?
+        lines << "── Findings ────────────────────────────────────────"
+        result.findings.each do |finding|
+          lines << ""
+          lines << "  [#{finding[:pattern_id]}] #{finding[:pattern_name]} (×#{finding[:match_count]}, weight: #{finding[:weight]})"
+          lines << "      #{finding[:description]}"
+          finding[:matches].each do |match|
+            loc = match[:line] ? "L#{match[:line]}:#{match[:column] || ''}" : ""
+            preview = match[:match].is_a?(String) ? match[:match][0, 80] : ""
+            preview += "..." if match[:match].is_a?(String) && match[:match].length > 80
+            conf = match[:confidence] ? " [#{match[:confidence]}]" : ""
+            lines << "      #{loc}: \"#{preview}\"#{conf}"
+            lines << "            → #{match[:suggestion]}" if match[:suggestion]
+          end
+          if finding[:truncated]
+            lines << "      ... and #{finding[:match_count] - finding[:matches].length} more"
+          end
+        end
+      end
+      lines << ""
+      lines << "════════════════════════════════════════════════════"
+      lines.join("\n")
+    end
+    def format_markdown(result)
+      lines = []
+      lines << "# AI writing pattern analysis"
+      lines << ""
+      level = if result.score >= 70 then "Heavily AI-generated"
+              elsif result.score >= 45 then "Moderately AI-influenced"
+              elsif result.score >= 20 then "Lightly AI-touched"
+              else "Mostly human-sounding"
+              end
+      lines << "**Score: #{result.score}/100** — #{level}"
+      lines << ""
+      lines << "Words: #{result.word_count} | Matches: #{result.total_matches} | Pattern score: #{result.pattern_score} | Uniformity score: #{result.uniformity_score}"
+      lines << ""
+      lines << result.summary
+      lines << ""
+      if result.stats
+        s = result.stats
+        lines << "## Text statistics"
+        lines << ""
+        lines << "| Metric | Value | Assessment |"
+        lines << "|--------|-------|------------|"
+        lines << "| Avg sentence length | #{s.avg_sentence_length} words | #{s.avg_sentence_length > 25 ? 'Long' : s.avg_sentence_length < 12 ? 'Short' : 'Normal'} |"
+        lines << "| Sentence variation | σ #{s.sentence_length_std_dev} | #{s.sentence_length_std_dev > 8 ? 'High (human-like)' : s.sentence_length_std_dev < 4 ? 'Low (AI-like)' : 'Moderate'} |"
+        lines << "| Burstiness | #{s.burstiness} | #{burstiness_label(s.burstiness)} |"
+        lines << "| Vocabulary diversity | #{s.type_token_ratio} | #{ttr_label(s.type_token_ratio, s.word_count)} |"
+        lines << "| Trigram repetition | #{s.trigram_repetition} | #{s.trigram_repetition > 0.1 ? 'High (AI-like)' : 'Normal'} |"
+        lines << "| Readability | FK grade #{s.flesch_kincaid} | #{s.flesch_kincaid > 12 ? 'Academic' : s.flesch_kincaid > 8 ? 'Standard' : 'Easy'} |"
+        lines << ""
+      end
+      if result.findings.any?
+        lines << "## Findings"
+        lines << ""
+        result.findings.each do |finding|
+          lines << "### #{finding[:pattern_id]}. #{finding[:pattern_name]} (×#{finding[:match_count]})"
+          lines << "*#{finding[:description]}*"
+          lines << ""
+          finding[:matches].each do |match|
+            loc = match[:line] ? "Line #{match[:line]}" : ""
+            preview = match[:match].is_a?(String) ? match[:match][0, 80] : ""
+            lines << "- #{loc}: `#{preview}`"
+            lines << "  - #{match[:suggestion]}" if match[:suggestion]
+          end
+          lines << ""
+        end
+      end
+      lines.join("\n")
+    end
+    def format_json(result)
+      require "json"
+      JSON.pretty_generate(result.to_h)
+    end
+    def burstiness_label(b)
+      if b >= 0.7 then "(high — human-like)"
+      elsif b >= 0.45 then "(moderate)"
+      elsif b >= 0.25 then "(low — somewhat uniform)"
+      else "(very low — AI-like uniformity)"
+      end
+    end
+    def ttr_label(ttr, wc)
+      if wc < 100 then "(too short to assess)"
+      elsif ttr >= 0.6 then "(high — diverse vocabulary)"
+      elsif ttr >= 0.45 then "(moderate)"
+      else "(low — repetitive vocabulary)"
+      end
+    end
+  end
+end

data/lib/humanizer/humanizer_engine.rb ADDED Viewed

@@ -0,0 +1,192 @@
+# frozen_string_literal: true
+module Humanizer
+  module HumanizerEngine
+    module_function
+    # Apply safe, mechanical fixes that don't require judgment.
+    def auto_fix(text)
+      result = text.dup
+      fixes = []
+      # Curly quotes → straight quotes
+      if result.match?(/[\u201C\u201D]/)
+        result.gsub!(/[\u201C\u201D]/, '"')
+        fixes << "Replaced curly double quotes with straight quotes"
+      end
+      if result.match?(/[\u2018\u2019]/)
+        result.gsub!(/[\u2018\u2019]/, "'")
+        fixes << "Replaced curly single quotes with straight quotes"
+      end
+      # Filler phrase replacements (unambiguous)
+      safe_fills = [
+        { from: /\bin order to\b/i, to: "to", label: '"in order to" → "to"' },
+        { from: /\bdue to the fact that\b/i, to: "because", label: '"due to the fact that" → "because"' },
+        { from: /\bat this point in time\b/i, to: "now", label: '"at this point in time" → "now"' },
+        { from: /\bin the event that\b/i, to: "if", label: '"in the event that" → "if"' },
+        { from: /\bhas the ability to\b/i, to: "can", label: '"has the ability to" → "can"' },
+        { from: /\bfor the purpose of\b/i, to: "to", label: '"for the purpose of" → "to"' },
+        { from: /\bfirst and foremost\b/i, to: "first", label: '"first and foremost" → "first"' },
+        { from: /\bin light of the fact that\b/i, to: "because", label: '"in light of the fact that" → "because"' },
+        { from: /\bin the realm of\b/i, to: "in", label: '"in the realm of" → "in"' },
+        { from: /\butilize\b/i, to: "use", label: '"utilize" → "use"' },
+        { from: /\butilizing\b/i, to: "using", label: '"utilizing" → "using"' },
+        { from: /\butilization\b/i, to: "use", label: '"utilization" → "use"' },
+      ]
+      safe_fills.each do |fill|
+        if result.match?(fill[:from])
+          result.gsub!(fill[:from], fill[:to])
+          fixes << fill[:label]
+        end
+      end
+      # Chatbot artifact removal (start of text)
+      chatbot_start = [
+        /^(?:Here is|Here's) (?:a |an |the )?(?:comprehensive |brief |quick )?(?:overview|summary|breakdown|list|guide|explanation|look)[^.]*\.\s*/i,
+        /^(?:Of course|Certainly|Absolutely|Sure)!\s*/i,
+        /^(?:Great|Excellent|Good|Wonderful|Fantastic) question!\s*/i,
+        /^(?:That's|That is) a (?:great|excellent|good|wonderful|fantastic) (?:question|point)!\s*/i,
+      ]
+      chatbot_start.each do |regex|
+        if result.match?(regex)
+          result.sub!(regex, "")
+          fixes << "Removed chatbot opening artifact"
+        end
+      end
+      # Chatbot artifact removal (end of text)
+      chatbot_end = [
+        /\s*(?:I hope this helps|Let me know if you(?:'d| would) like|Feel free to|Don't hesitate to|Is there anything else)[^.]*[.!]\s*$/i,
+        /\s*Happy to help[.!]?\s*$/i,
+      ]
+      chatbot_end.each do |regex|
+        if result.match?(regex)
+          result.sub!(regex, "")
+          fixes << "Removed chatbot closing artifact"
+        end
+      end
+      { text: result.strip, fixes: fixes }
+    end
+    # Generate humanization suggestions
+    def humanize(text, autofix: false, verbose: false, include_stats: true)
+      analysis = Analyzer.analyze(text, verbose: true, include_stats: include_stats)
+      critical = []
+      important = []
+      minor = []
+      analysis.findings.each do |finding|
+        suggestions = finding[:matches].map { |m|
+          {
+            pattern: finding[:pattern_name],
+            pattern_id: finding[:pattern_id],
+            category: finding[:category],
+            weight: finding[:weight],
+            text: m[:match],
+            line: m[:line],
+            column: m[:column],
+            suggestion: m[:suggestion],
+            confidence: m[:confidence] || "high",
+          }
+        }
+        if finding[:weight] >= 4
+          critical.concat(suggestions)
+        elsif finding[:weight] >= 2
+          important.concat(suggestions)
+        else
+          minor.concat(suggestions)
+        end
+      end
+      fix_result = autofix ? auto_fix(text) : nil
+      guidance = build_guidance(analysis)
+      style_tips = include_stats && analysis.stats ? build_style_tips(analysis.stats) : []
+      {
+        score: analysis.score,
+        pattern_score: analysis.pattern_score,
+        uniformity_score: analysis.uniformity_score,
+        word_count: analysis.word_count,
+        total_issues: analysis.total_matches,
+        stats: analysis.stats,
+        critical: critical,
+        important: important,
+        minor: minor,
+        autofix: fix_result,
+        guidance: guidance,
+        style_tips: style_tips,
+      }
+    end
+    def build_guidance(analysis)
+      tips = []
+      ids = analysis.findings.map { |f| f[:pattern_id] }.to_set
+      tips << "Replace inflated/promotional language with concrete facts. What specifically happened? Give dates, numbers, names." if ids.include?(1) || ids.include?(4)
+      tips << "Cut trailing -ing phrases. If the point matters enough to mention, give it its own sentence." if ids.include?(3)
+      tips << "Name your sources. \"Experts say\" means nothing — who said it, when, and where?" if ids.include?(5)
+      tips << "Replace formulaic \"despite challenges\" sections with specific problems and concrete outcomes." if ids.include?(6)
+      tips << "Swap AI vocabulary for plainer words. \"Delve\" → \"look at\". \"Tapestry\" → (be specific). \"Showcase\" → \"show\"." if ids.include?(7)
+      tips << "Use \"is\" and \"has\" freely. \"Serves as\" and \"boasts\" are needlessly fancy." if ids.include?(8)
+      tips << "Drop \"not just X, it's Y\" frames. Just say what the thing is." if ids.include?(9)
+      tips << "Break up triads. You don't always need three of everything." if ids.include?(10)
+      tips << "Ease up on em dashes. Use commas, periods, or parentheses for variety." if ids.include?(13)
+      tips << "Strip mechanical bold formatting and inline-header lists. Let prose do the work." if ids.include?(14) || ids.include?(15)
+      tips << "Remove emojis from professional text. They signal chatbot output." if ids.include?(17)
+      tips << "Remove chatbot filler (\"I hope this helps!\", \"Great question!\"). Just deliver the content." if ids.include?(19) || ids.include?(21)
+      tips << "Delete knowledge-cutoff disclaimers. Either research it or leave it out." if ids.include?(20)
+      tips << "Trim filler and hedging. \"In order to\" → \"to\". One qualifier per claim is enough." if ids.include?(22) || ids.include?(23)
+      tips << "Cut generic conclusions. End with a specific fact instead of \"the future looks bright\"." if ids.include?(24)
+      if analysis.score >= 50
+        tips << "Consider rewriting from scratch. When AI patterns are this dense, patching individual phrases isn't enough — the structure itself needs rethinking."
+      end
+      tips
+    end
+    def build_style_tips(stats)
+      tips = []
+      if stats.burstiness < 0.25 && stats.sentence_count > 4
+        tips << { metric: "burstiness", value: stats.burstiness,
+                  tip: "Sentence rhythm is very uniform. Mix short punchy sentences (3-8 words) with longer flowing ones (20+). Fragments work too. Like this." }
+      end
+      if stats.sentence_length_variation < 0.3 && stats.sentence_count > 4
+        tips << { metric: "sentence_length_variation", value: stats.sentence_length_variation,
+                  tip: "Sentences are all roughly #{stats.avg_sentence_length.round} words. Vary your rhythm — alternate between short and long." }
+      end
+      if stats.avg_sentence_length > 28
+        tips << { metric: "avg_sentence_length", value: stats.avg_sentence_length,
+                  tip: "Average sentence is quite long. Break some into shorter ones. Not every thought needs a subordinate clause." }
+      end
+      if stats.type_token_ratio < 0.4 && stats.word_count > 100
+        tips << { metric: "type_token_ratio", value: stats.type_token_ratio,
+                  tip: "Vocabulary is repetitive. Try using more varied word choices — but don't synonym-cycle (that's also an AI tell)." }
+      end
+      if stats.trigram_repetition > 0.1 && stats.word_count > 100
+        tips << { metric: "trigram_repetition", value: stats.trigram_repetition,
+                  tip: "Repeated 3-word phrases detected. Vary your sentence structures." }
+      end
+      if tips.length >= 2
+        tips << { metric: "general", value: nil,
+                  tip: "Try the read-aloud test: read the text out loud. If it sounds weird or robotic, rewrite those parts until they sound like something you'd actually say." }
+        tips << { metric: "general", value: nil,
+                  tip: "Add first-person perspective where it fits: \"I found\", \"We noticed\", \"In my experience\". Real humans write from a point of view." }
+      end
+      tips
+    end
+  end
+end