moderate 0.1.0 → 1.0.0.beta1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +8 -0
- data/.simplecov +62 -0
- data/AGENTS.md +7 -0
- data/Appraisals +16 -0
- data/CHANGELOG.md +71 -1
- data/CLAUDE.md +7 -0
- data/README.md +376 -29
- data/Rakefile +28 -2
- data/app/controllers/concerns/moderate/moderation.rb +161 -0
- data/app/controllers/moderate/appeals_controller.rb +190 -0
- data/app/controllers/moderate/application_controller.rb +45 -0
- data/app/controllers/moderate/notices_controller.rb +382 -0
- data/app/controllers/moderate/transparency_reports_controller.rb +30 -0
- data/app/helpers/moderate/engine_helper.rb +151 -0
- data/app/views/moderate/appeals/new.html.erb +78 -0
- data/app/views/moderate/notices/new.html.erb +255 -0
- data/app/views/moderate/transparency_reports/_summary_card.html.erb +20 -0
- data/app/views/moderate/transparency_reports/show.html.erb +52 -0
- data/config/moderate/blocklists/en.yml +81 -0
- data/config/moderate/blocklists/es.yml +40 -0
- data/config/routes.rb +36 -0
- data/docs/compliance.md +178 -0
- data/docs/configuration.md +326 -0
- data/docs/dsa-notice-form.md +371 -0
- data/docs/madmin.md +490 -0
- data/docs/notifications.md +363 -0
- data/examples/aws_rekognition_adapter.rb +140 -0
- data/examples/openai_moderation_adapter.rb +111 -0
- data/gemfiles/rails_7.1.gemfile +36 -0
- data/gemfiles/rails_7.2.gemfile +36 -0
- data/gemfiles/rails_8.1.gemfile +36 -0
- data/lib/generators/moderate/install_generator.rb +56 -0
- data/lib/generators/moderate/templates/create_moderate_tables.rb.erb +237 -0
- data/lib/generators/moderate/templates/initializer.rb +198 -0
- data/lib/generators/moderate/views_generator.rb +63 -0
- data/lib/moderate/configuration.rb +341 -0
- data/lib/moderate/engine.rb +138 -0
- data/lib/moderate/errors.rb +26 -0
- data/lib/moderate/event.rb +75 -0
- data/lib/moderate/filters/base.rb +126 -0
- data/lib/moderate/filters/wordlist.rb +255 -0
- data/lib/moderate/jobs/classify_job.rb +158 -0
- data/lib/moderate/label.rb +111 -0
- data/lib/moderate/macros.rb +90 -0
- data/lib/moderate/models/appeal.rb +154 -0
- data/lib/moderate/models/application_record.rb +31 -0
- data/lib/moderate/models/block.rb +203 -0
- data/lib/moderate/models/concerns/actor.rb +174 -0
- data/lib/moderate/models/concerns/content_filterable.rb +155 -0
- data/lib/moderate/models/concerns/reportable.rb +282 -0
- data/lib/moderate/models/flag.rb +136 -0
- data/lib/moderate/models/report.rb +620 -0
- data/lib/moderate/result.rb +176 -0
- data/lib/moderate/services/intake_appeal.rb +89 -0
- data/lib/moderate/services/intake_notice.rb +132 -0
- data/lib/moderate/services/intake_report.rb +132 -0
- data/lib/moderate/services/resolve_appeal.rb +134 -0
- data/lib/moderate/services/resolve_flag.rb +101 -0
- data/lib/moderate/services/resolve_report.rb +291 -0
- data/lib/moderate/version.rb +1 -1
- data/lib/moderate.rb +365 -18
- data/log/development.log +0 -0
- data/log/test.log +0 -0
- metadata +154 -15
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Moderate
|
|
4
|
+
# The built-in filter adapters live under `Moderate::Filters`. They're the
|
|
5
|
+
# gem's own implementations of the ONE adapter contract the whole filtering
|
|
6
|
+
# design hinges on:
|
|
7
|
+
#
|
|
8
|
+
# adapter.classify(value) -> Moderate::Result
|
|
9
|
+
#
|
|
10
|
+
# ...where `value` is a piece of user content (a String of text, or an image
|
|
11
|
+
# reference) and the returned `Moderate::Result` answers "is this allowed?" and,
|
|
12
|
+
# if not, "why?" (the per-`Moderate::Label` detail, mapped onto the gem's single
|
|
13
|
+
# canonical taxonomy — see `Moderate::Label`).
|
|
14
|
+
#
|
|
15
|
+
# ── How adapters are invoked ────────────────────────────────────────────────
|
|
16
|
+
# The configuration registry (`Moderate::Configuration#adapters`) stores each
|
|
17
|
+
# adapter as EITHER a live object the host registered, OR a class-NAME String the
|
|
18
|
+
# gem constantizes lazily. The one built-in is seeded as the string
|
|
19
|
+
# "Moderate::Filters::Wordlist", and `Configuration#resolve_adapter` returns the
|
|
20
|
+
# CLASS itself for a String/Class entry. That means the gem calls
|
|
21
|
+
# `SomeAdapterClass.classify(value)` and `SomeAdapterClass.synchronous?` — i.e. the
|
|
22
|
+
# built-in exposes CLASS methods, not instance methods. (A host's own adapter
|
|
23
|
+
# registered as an instance — including the reference adapters in `examples/` —
|
|
24
|
+
# exposes the same `#classify`/`#synchronous?` on that instance — same duck type,
|
|
25
|
+
# both work.)
|
|
26
|
+
#
|
|
27
|
+
# `Base` gives the built-in both halves of that duck type from a single source:
|
|
28
|
+
# subclasses implement the work as an INSTANCE method (`#classify`), and `Base`
|
|
29
|
+
# provides the CLASS-level `classify`/`synchronous?`/`async?` that the registry
|
|
30
|
+
# resolution path calls, delegating the class call to a fresh instance. So one
|
|
31
|
+
# implementation satisfies both call styles and there's no copy-paste. (It's the
|
|
32
|
+
# base for the bundled wordlist; the `examples/` reference adapters don't need it —
|
|
33
|
+
# any object answering `#classify` is a valid adapter.)
|
|
34
|
+
#
|
|
35
|
+
# ── Sync vs. async (why it matters for :block) ──────────────────────────────
|
|
36
|
+
# `Configuration#validate!` enforces the README's rule: a `:block`-mode filter
|
|
37
|
+
# MUST use a synchronous adapter, because you can't reject a save on a result
|
|
38
|
+
# that's still computing in a background job. The validator probes the adapter
|
|
39
|
+
# with `synchronous?` and treats anything that doesn't answer, or answers truthy,
|
|
40
|
+
# as synchronous (the safe default that keeps simple adapters working); only an
|
|
41
|
+
# adapter that explicitly returns `synchronous? == false` is rejected for :block.
|
|
42
|
+
#
|
|
43
|
+
# We model this once here as `async?` (default `false` — the built-in wordlist is
|
|
44
|
+
# sync) and derive `synchronous?` from it, so a subclass flips ONE flag
|
|
45
|
+
# (`def self.async? = true`) to declare itself background-only. The wordlist leaves
|
|
46
|
+
# the default; a network-backed reference adapter (see `examples/`) declares itself
|
|
47
|
+
# async via its own `synchronous? == false`, which the spine honors regardless of
|
|
48
|
+
# whether the adapter inherits from `Base`.
|
|
49
|
+
module Filters
|
|
50
|
+
class Base
  class << self
    # The class-level entry point used when an adapter is resolved to its CLASS
    # (rather than registered as a live instance). Builds a fresh instance per
    # call and delegates to its `#classify`, so a subclass can keep
    # per-classification state in instance variables without sharing it
    # between calls.
    #
    # @param value [Object] the piece of content to classify
    # @return [Object] whatever the subclass's `#classify` returns (expected to
    #   be a Moderate::Result)
    # @raise [NotImplementedError] when the subclass has not implemented `#classify`
    def classify(value)
      new.classify(value)
    end

    # Is this adapter background-only? Default `false`. A subclass whose
    # `classify` does blocking I/O overrides this with `def self.async? = true`.
    #
    # @return [Boolean]
    def async?
      false
    end

    # The synchronous-ness predicate, derived from `async?` so there is a single
    # source of truth: an async adapter is, by definition, not synchronous.
    #
    # @return [Boolean]
    def synchronous?
      !async?
    end
  end

  # Subclasses MUST implement `#classify(value) -> Moderate::Result`. Raising
  # here (instead of returning nil) makes a half-built adapter fail loudly
  # rather than silently "allowing" everything.
  #
  # @param _value [Object] ignored by the base implementation
  # @raise [NotImplementedError] always
  def classify(_value)
    raise NotImplementedError, "#{self.class} must implement #classify(value) and return a Moderate::Result"
  end

  # Instance-level mirrors of the class predicates, so a `Base` subclass that is
  # registered as an *instance* answers the same duck type as one resolved from
  # a class name. These MUST be public: a private method does not answer
  # `respond_to?` and raises NoMethodError when called with an explicit
  # receiver, so a probe of `adapter.synchronous?` on an instance would never
  # see an async adapter's `false`.
  #
  # @return [Boolean]
  def async?
    self.class.async?
  end

  # @return [Boolean] the class-level `synchronous?` verdict
  def synchronous?
    self.class.synchronous?
  end

  private

  # ── Shared helpers for subclasses ────────────────────────────────────────

  # The canonical "nothing matched" Result, stamped with this adapter's
  # `source_name` so an allowed verdict is still attributable.
  #
  # @param raw [Object, nil] optional raw provider payload to carry along
  # @return [Moderate::Result]
  def allowed_result(raw: nil)
    Moderate::Result.allowed(source: source_name, raw: raw)
  end

  # Build a flagged Result from a list of labels. Thin wrapper so subclasses
  # don't repeat the `source:`/`allowed:` bookkeeping.
  #
  # @param labels [Array] the labels that tripped
  # @param raw [Object, nil] optional raw provider payload to carry along
  # @return [Moderate::Result]
  def flagged_result(labels:, raw: nil)
    Moderate::Result.new(allowed: false, labels: labels, source: source_name, raw: raw)
  end

  # The adapter's `source` string. Defaults to the last segment of the class
  # name, underscored at each lowercase→uppercase transition and downcased
  # ("Wordlist" -> "wordlist"). Subclasses override it to return the value
  # that fits them.
  #
  # An anonymous class has no name; the trailing `to_s` makes that case yield
  # "" instead of raising NoMethodError on nil.
  #
  # @return [String]
  def source_name
    self.class.name.to_s.split("::").last.to_s.gsub(/([a-z])([A-Z])/, '\1_\2').downcase
  end
end
|
|
125
|
+
end
|
|
126
|
+
end
|
|
@@ -0,0 +1,255 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "yaml"
|
|
4
|
+
require "set"
|
|
5
|
+
|
|
6
|
+
module Moderate
|
|
7
|
+
module Filters
|
|
8
|
+
# The default, built-in TEXT adapter: a fast, offline, multilingual,
|
|
9
|
+
# zero-dependency wordlist matcher. It is the ONE built-in adapter the gem ships
|
|
10
|
+
# — registered by the spine under the name :wordlist (config seed
|
|
11
|
+
# "Moderate::Filters::Wordlist", constantized lazily). Synchronous, so it's valid
|
|
12
|
+
# in :block mode.
|
|
13
|
+
#
|
|
14
|
+
# ── What it's for, and what it's NOT ─────────────────────────────────────────
|
|
15
|
+
# This satisfies the store bar of "a method for filtering objectionable UGC
|
|
16
|
+
# before it's posted" (Apple Guideline 1.2:
|
|
17
|
+
# https://developer.apple.com/app-store/review/guidelines/#user-generated-content;
|
|
18
|
+
# Google Play UGC: https://support.google.com/googleplay/android-developer/answer/9876937)
|
|
19
|
+
# with no network call and no external service. It is NOT a full trust-&-safety
|
|
20
|
+
# classifier — it can't read context. For nuance (or for images), register a
|
|
21
|
+
# reference adapter from `examples/` (OpenAI, AWS Rekognition, …) or your own.
|
|
22
|
+
# The bar this clears is "obvious slurs/threats/spam don't sail straight
|
|
23
|
+
# through", at zero latency and zero cost.
|
|
24
|
+
#
|
|
25
|
+
# ── Evasion resistance (the part that matters) ───────────────────────────────
|
|
26
|
+
# Naive substring matching is trivially defeated ("f.u.c.k", "FÜCK", "f u c k",
|
|
27
|
+
# "fuuuck", "fμck"). Before matching, text is NORMALIZED:
|
|
28
|
+
# 1. Unicode NFKD decomposition + stripping combining marks — folds accented
|
|
29
|
+
# and look-alike forms ("FÜCK" -> "fuck") to a plain base form. NFKD
|
|
30
|
+
# (compatibility decomposition) also flattens many homoglyph/fullwidth
|
|
31
|
+
# tricks. (Ruby String#unicode_normalize; \p{Mn} = Unicode "Mark,
|
|
32
|
+
# nonspacing", the combining accents.)
|
|
33
|
+
# 2. Leetspeak transliteration (0->o, 1->i, 3->e, 4->a, 5->s, 7->t, @->a,
|
|
34
|
+
# $->s) — folds the common letter/number/symbol swaps.
|
|
35
|
+
# 3. Lowercasing + collapsing every run of non-alphanumerics to a single space
|
|
36
|
+
# — kills punctuation-as-separator evasion ("f.u.c.k" -> "f u c k").
|
|
37
|
+
# The normalized text is then matched in TWO forms: the single-spaced form
|
|
38
|
+
# (so word-boundary patterns like "\bkill yourself\b" work), AND a space-removed
|
|
39
|
+
# "compact" form (so spacing evasion "f u c k" -> "fuck" is caught). A pattern
|
|
40
|
+
# hits if it matches EITHER form. This is a fast offline baseline matching
|
|
41
|
+
# strategy the adapter relies on.
|
|
42
|
+
#
|
|
43
|
+
# ── Output ───────────────────────────────────────────────────────────────────
|
|
44
|
+
# On a hit, returns a flagged Moderate::Result whose labels are the canonical
|
|
45
|
+
# categories that matched, each with score 1.0 — a deterministic matcher has no
|
|
46
|
+
# probability, so a trip is "certain". `source` is "text_filter" to match the
|
|
47
|
+
# `moderate_flags_source_check` migration constraint.
|
|
48
|
+
class Wordlist < Base
  # Unicode general category "Mark, nonspacing": the accents NFKD splits off a
  # base letter. Deleting them after decomposition turns "é" into "e".
  COMBINING_MARKS = /\p{Mn}/

  # Leetspeak / symbol swaps folded back to plain letters before matching.
  # Key order matters: `fold` joins keys and values positionally for `String#tr`.
  LEETSPEAK = {
    "0" => "o",
    "1" => "i",
    "3" => "e",
    "4" => "a",
    "5" => "s",
    "7" => "t",
    "@" => "a",
    "$" => "s"
  }.freeze

  # Any run of characters outside a-z / 0-9; each run is replaced by one space.
  NON_ALNUM = /[^a-z0-9]+/

  # Glob for the bundled per-locale blocklist YAML files; all matches are merged.
  BLOCKLISTS_GLOB = File.expand_path("../../../config/moderate/blocklists/*.yml", __dir__)

  # Immutable pair of matchable forms of one piece of text: `spaced`
  # (single-space separated tokens) and `compact` (the same with spaces removed).
  Normalized = Data.define(:spaced, :compact)

  class << self
    # The compiled bundled patterns, built on first use and memoized on the
    # class.
    #
    # @return [Hash{String => Array<Array(Regexp, Regexp)>}] slug => list of
    #   [spaced_regex, compact_regex] pairs
    def patterns
      return @patterns if @patterns

      @patterns = compile_bundled_patterns
    end

    # Drop the memoized patterns so the next `patterns` call recompiles.
    #
    # @return [nil]
    def reset!
      @patterns = nil
    end

    # Load every file matched by BLOCKLISTS_GLOB and compile it into
    # { slug_string => [[spaced_regex, compact_regex], ...] }. A category that
    # appears in several files accumulates all of their patterns in one list.
    #
    # @return [Hash{String => Array<Array(Regexp, Regexp)>}]
    def compile_bundled_patterns
      compiled = {}
      Dir.glob(BLOCKLISTS_GLOB).each do |file|
        entries = YAML.safe_load_file(file) || {}
        entries.each do |category, sources|
          key = category.to_s
          pairs = Array(sources).map { |src| compile_pair(src) }
          compiled[key] = compiled.fetch(key, []) + pairs
        end
      end
      compiled
    end

    # Compile one blocklist source string into its two regexes: the source
    # verbatim (for the spaced text) and its `compact_source` rewrite (for the
    # space-stripped text).
    #
    # @param source [String] a regex source from a blocklist file
    # @return [Array(Regexp, Regexp)] [spaced, compact]
    def compile_pair(source)
      [Regexp.new(source), Regexp.new(compact_source(source))]
    end

    # Rewrite a blocklist source so it can match text that has had all spaces
    # removed: a `\b` at the very end of the source is dropped, `\s` / `\s*` /
    # `\s+` are dropped, and every literal space is dropped.
    #
    # NOTE(review): the compact text produced by `normalize` contains only
    # a-z and 0-9, so a `\b` left at the front of a compact pattern can only
    # match at the very start of the text — confirm that is the intended reach
    # of the spacing-evasion check.
    #
    # @param source [String] a regex source from a blocklist file
    # @return [String] the rewritten regex source
    def compact_source(source)
      without_trailing_boundary = source.sub(/\\b\z/, "")
      without_space_matchers = without_trailing_boundary.gsub(/\\s[*+]?/, "")
      without_class_space = without_space_matchers.gsub(/\[ ([^\]]*)\]/, '[\1]')
      without_class_space.delete(" ")
    end
  end

  # Classify a piece of text against the bundled blocklists plus the host's
  # configured additional/excluded words.
  #
  # @param value [Object] the content; stringified with `to_s`
  # @return [Moderate::Result] the allowed result when the text is blank or
  #   nothing matched, otherwise a flagged result with one label per matched slug
  def classify(value)
    content = value.to_s
    # Blank content short-circuits before any normalization work.
    return allowed_result if content.strip.empty?

    slugs = matched_categories(normalize(content))
    return allowed_result if slugs.empty?

    flagged_result(labels: slugs.map { |slug| label_for(slug) })
  end

  private

  # One Label for a matched slug, score 1.0. A slug of the form
  # "category/subcategory" is split on its first "/"; a slug without "/" has a
  # nil subcategory.
  def label_for(slug)
    category, subcategory = slug.split("/", 2)
    Moderate::Label.new(
      category: category, subcategory: subcategory,
      score: 1.0, flagged: true, input: :text
    )
  end

  # The `source` this adapter stamps on its results.
  def source_name
    "text_filter"
  end

  # The unique slugs that tripped: every bundled category with at least one
  # pattern pair where the spaced regex matches the spaced text or the compact
  # regex matches the compact text, followed by whatever
  # `additional_word_categories` contributes.
  def matched_categories(norm)
    tripped = self.class.patterns.select do |_slug, pairs|
      pairs.any? do |spaced_re, compact_re|
        spaced_re.match?(norm.spaced) || compact_re.match?(norm.compact)
      end
    end

    (tripped.keys + additional_word_categories(norm)).uniq
  end

  # config.additional_words support. Each configured term is normalized like
  # content; a term counts as a hit when it appears as a whole word in the
  # spaced text OR as a plain substring of the compact text. Any hit maps to
  # the single slug "harassment". Returns [] when nothing is configured or
  # nothing hit.
  def additional_word_categories(norm)
    extra_terms = Array(config.additional_words).filter_map do |raw|
      token = normalize_token(raw)
      token unless token.empty?
    end
    return [] if extra_terms.empty?

    hit = extra_terms.any? do |term|
      norm.spaced.match?(/\b#{Regexp.escape(term)}\b/) || norm.compact.include?(term)
    end
    hit ? ["harassment"] : []
  end

  # Run the full pipeline: fold the text, remove excluded words from the folded
  # (spaced) form, then derive the compact form from what is left — so an
  # excluded word is absent from both forms.
  def normalize(text)
    spaced = strip_excluded_words(fold(text))
    Normalized.new(spaced: spaced, compact: spaced.delete(" "))
  end

  # NFKD-decompose and downcase, drop combining marks, map leetspeak characters
  # to letters, turn every non-alphanumeric run into one space, then trim and
  # squeeze spaces. The step order is significant.
  def fold(text)
    decomposed = text.unicode_normalize(:nfkd).downcase
    unaccented = decomposed.gsub(COMBINING_MARKS, "")
    deleeted = unaccented.tr(LEETSPEAK.keys.join, LEETSPEAK.values.join)
    deleeted.gsub(NON_ALNUM, " ").strip.squeeze(" ")
  end

  # Fold a configured word the same way as content and remove its spaces, so it
  # lines up with the normalized text it is compared against.
  def normalize_token(word)
    fold(word.to_s).delete(" ")
  end

  # Replace each configured excluded word with a space wherever it stands as a
  # whole word (`\b…\b`) in the spaced text, then tidy the spacing. Returns the
  # input untouched when no excluded words are configured.
  def strip_excluded_words(spaced)
    tokens = Array(config.excluded_words).map { |raw| normalize_token(raw) }.reject(&:empty?)
    return spaced if tokens.empty?

    scrubbed = spaced
    tokens.each do |token|
      scrubbed = scrubbed.gsub(/\b#{Regexp.escape(token)}\b/, " ")
    end
    scrubbed.squeeze(" ").strip
  end

  # The gem-wide configuration object.
  def config
    Moderate.config
  end
end
|
|
254
|
+
end
|
|
255
|
+
end
|
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Moderate
|
|
4
|
+
# The background worker for ASYNCHRONOUS filter adapters (the :openai adapter, a
|
|
5
|
+
# host's hosted classifier, the default :image adapter) running in :flag mode.
|
|
6
|
+
#
|
|
7
|
+
# ── Why a job exists at all ──────────────────────────────────────────────────
|
|
8
|
+
# An async adapter does blocking I/O (a moderation API call), which must never run
|
|
9
|
+
# inside the request that saved the content — and CANNOT run inside a validator or
|
|
10
|
+
# an after_commit callback without stalling the write. So the :flag path works in
|
|
11
|
+
# two halves:
|
|
12
|
+
# 1. The model's filter concern (`moderates :field`) lets the write succeed, then
|
|
13
|
+
# in an after_commit hook enqueues THIS job for any field whose policy uses an
|
|
14
|
+
# async adapter. (Synchronous adapters like :wordlist skip the job — the
|
|
15
|
+
# concern classifies inline and files the Flag directly.)
|
|
16
|
+
# 2. This job re-reads the saved value, classifies it through the adapter, and —
|
|
17
|
+
# if the content is flagged — files (or updates) a Moderate::Flag for the
|
|
18
|
+
# moderation queue.
|
|
19
|
+
#
|
|
20
|
+
# ── Re-reading the value (deliberate) ────────────────────────────────────────
|
|
21
|
+
# The job is handed the RECORD (GlobalID-serialized by ActiveJob) and the FIELD
|
|
22
|
+
# NAME, not the raw text/image — so it always classifies the CURRENT persisted
|
|
23
|
+
# value. If the record was edited or deleted between enqueue and run, we classify
|
|
24
|
+
# what's actually there now (or skip a vanished record), never a stale snapshot.
|
|
25
|
+
#
|
|
26
|
+
# ── Idempotency ──────────────────────────────────────────────────────────────
|
|
27
|
+
# Flag creation goes through `Moderate::Flag.flag!`, the single builder shared by
|
|
28
|
+
# the synchronous and asynchronous paths. It's an upsert-by-(flaggable, field,
|
|
29
|
+
# source) so a retried job (ActiveJob retries on transient failures) doesn't pile
|
|
30
|
+
# up duplicate queue entries for the same content.
|
|
31
|
+
#
|
|
32
|
+
# ── Base class ───────────────────────────────────────────────────────────────
|
|
33
|
+
# We subclass `ActiveJob::Base` directly (not the host's ApplicationJob) so the gem
|
|
34
|
+
# doesn't depend on a constant that lives in the host app — the same reason the
|
|
35
|
+
# rest of the gem stays host-agnostic. The host configures the queue adapter,
|
|
36
|
+
# retries, and queue name globally as usual.
|
|
37
|
+
class ClassifyJob < ActiveJob::Base
|
|
38
|
+
# Run on a low-priority queue by default — moderation flagging is important but
|
|
39
|
+
# not latency-critical (the content is already published in :flag mode). A host
|
|
40
|
+
# can override the queue globally.
|
|
41
|
+
queue_as { Moderate.config.respond_to?(:job_queue) && Moderate.config.job_queue || :default }
|
|
42
|
+
|
|
43
|
+
# @param record [ActiveRecord::Base] the flaggable record (any Moderate::Reportable).
|
|
44
|
+
# @param field [String, Symbol] the field whose value to classify.
|
|
45
|
+
# @param adapter [String, Symbol, nil] optional explicit adapter name; when nil,
|
|
46
|
+
# the field's resolved FilterPolicy (or the global default) decides.
|
|
47
|
+
def perform(record, field, adapter: nil)
|
|
48
|
+
# The record may have been destroyed between enqueue and execution — nothing
|
|
49
|
+
# to classify, nothing to flag. (ActiveJob raises DeserializationError for a
|
|
50
|
+
# GlobalID that no longer resolves; that's rescued at the framework level, but
|
|
51
|
+
# we also guard a nil here defensively.)
|
|
52
|
+
return if record.nil?
|
|
53
|
+
|
|
54
|
+
field = field.to_s
|
|
55
|
+
policy = resolve_policy(record, field, adapter)
|
|
56
|
+
|
|
57
|
+
# If the field's policy is :off (e.g. it was reconfigured to off after the job
|
|
58
|
+
# was enqueued), there's nothing to do.
|
|
59
|
+
return if policy.respond_to?(:off?) && policy.off?
|
|
60
|
+
|
|
61
|
+
value = field_value(record, field)
|
|
62
|
+
return if blank?(value)
|
|
63
|
+
|
|
64
|
+
result = Moderate.classify(value, policy: policy)
|
|
65
|
+
return unless result.flagged?
|
|
66
|
+
|
|
67
|
+
file_flag(record, field, policy, result, value)
|
|
68
|
+
end
|
|
69
|
+
|
|
70
|
+
private
|
|
71
|
+
|
|
72
|
+
# Resolve the FilterPolicy for this record/field. If an explicit adapter name was
|
|
73
|
+
# passed (the enqueuer already knew it), prefer the field's declared policy but
|
|
74
|
+
# fall back to the global resolution — `Moderate.filter_policy_for` already walks
|
|
75
|
+
# the ancestor chain and falls back to an :off policy, so this is always defined.
|
|
76
|
+
def resolve_policy(record, field, adapter)
|
|
77
|
+
policy = Moderate.filter_policy_for(record, field)
|
|
78
|
+
return policy if adapter.nil?
|
|
79
|
+
|
|
80
|
+
# An explicit adapter override: keep the resolved policy's class/field/mode but
|
|
81
|
+
# swap in the requested adapter, so the job classifies with exactly the backend
|
|
82
|
+
# the enqueuer intended even if config changed.
|
|
83
|
+
Moderate::Configuration::FilterPolicy.new(
|
|
84
|
+
class_name: policy.class_name, field: field,
|
|
85
|
+
adapter: adapter.to_s.strip.downcase.to_sym, mode: policy.mode
|
|
86
|
+
)
|
|
87
|
+
end
|
|
88
|
+
|
|
89
|
+
# File (or upsert) the Moderate::Flag via the shared builder. We pass exactly the
|
|
90
|
+
# columns the install migration defines for moderate_flags. `Flag.flag!` owns the
|
|
91
|
+
# `content_flagged` notify event so there's a single emission site across the
|
|
92
|
+
# sync and async paths — the job doesn't fire it itself, to avoid double-notifying.
|
|
93
|
+
#
|
|
94
|
+
# `result.source` is the human-readable adapter NAME the spine stamped on the
|
|
95
|
+
# Result (e.g. "openai"); the persisted `source` COLUMN, however, is constrained
|
|
96
|
+
# to text_filter/image_filter/external_classifier/manual, which the adapter's own
|
|
97
|
+
# Result already reflects via its `source_name`. We pass `result.source` and let
|
|
98
|
+
# the model coerce/validate against the constraint.
|
|
99
|
+
def file_flag(record, field, policy, result, value)
|
|
100
|
+
Moderate::Flag.flag!(
|
|
101
|
+
flaggable: record,
|
|
102
|
+
field: field,
|
|
103
|
+
owner: content_owner(record),
|
|
104
|
+
source: result.source,
|
|
105
|
+
mode: policy.respond_to?(:mode) ? policy.mode : :flag,
|
|
106
|
+
categories: result.categories,
|
|
107
|
+
scores: result.scores,
|
|
108
|
+
excerpt: excerpt_for(value),
|
|
109
|
+
context: flag_context(policy, result)
|
|
110
|
+
)
|
|
111
|
+
end
|
|
112
|
+
|
|
113
|
+
# Who owns the flagged content. The Reportable concern defines `reported_owner`
|
|
114
|
+
# (the "who's responsible" hook); we use it when present and degrade to nil
|
|
115
|
+
# otherwise (the owner column is nullable — a flag with no resolvable owner is
|
|
116
|
+
# still a valid queue item).
|
|
117
|
+
def content_owner(record)
|
|
118
|
+
record.respond_to?(:reported_owner) ? record.reported_owner : nil
|
|
119
|
+
end
|
|
120
|
+
|
|
121
|
+
# Diagnostic context persisted on the flag (the `context` jsonb column): the raw
|
|
122
|
+
# provider payload (for audit/debugging) plus the policy that produced this flag.
|
|
123
|
+
# Never relied on by the gem's own logic — purely for the human in the queue.
|
|
124
|
+
# Builds the diagnostic hash stored alongside a flag.
#
# @param policy [#class_name, #field] the policy that produced the flag; `#mode`
#   is optional and falls back to :flag when the object does not respond to it
# @param result [#raw] the adapter result; `raw` is included only when non-nil
# @return [Hash] `{ raw: ..., policy: { class_name:, field:, mode: } }` with the
#   `:raw` key omitted when the result carries no raw payload
def flag_context(policy, result)
  # Mirror `file_flag`'s fallback: a policy object that does not expose #mode is
  # recorded as :flag here too, instead of raising NoMethodError while the
  # diagnostic context is being assembled.
  mode = policy.respond_to?(:mode) ? policy.mode : :flag

  context = {}
  raw = result.raw
  context[:raw] = raw unless raw.nil?
  context[:policy] = {
    class_name: policy.class_name, field: policy.field, mode: mode.to_s
  }
  context
end
|
|
132
|
+
|
|
133
|
+
# A short, human-readable snippet of the offending value for the queue. We keep
|
|
134
|
+
# it to 500 chars — enough context for a moderator, not
|
|
135
|
+
# a full copy of a long document. An image value (not a String) gets stringified
|
|
136
|
+
# to its reference, which is fine for the queue's "what was flagged" column.
|
|
137
|
+
def excerpt_for(value)
  # Stringify first so non-String values (nil included) are handled uniformly,
  # then keep at most the first 500 characters.
  text = value.to_s
  text.slice(0, 500)
end
|
|
140
|
+
|
|
141
|
+
# Reads `field` from `record` through its public interface.
#
# @param record [Object] the object being inspected
# @param field [Symbol, String] name of the public reader to call
# @return [Object, nil] the reader's value, or nil when `record` has no public
#   method named `field`
def field_value(record, field)
  # Ask up front instead of rescuing NoMethodError: a rescue would also swallow
  # a NoMethodError raised *inside* the reader, silently turning a bug in the
  # record into "no value". Missing or non-public readers still yield nil.
  return nil unless record.respond_to?(field)

  record.public_send(field)
end
|
|
146
|
+
|
|
147
|
+
# Blank check without forcing an ActiveSupport dependency in the job's hot path
|
|
148
|
+
# (ActiveSupport IS loaded in a Rails host, but #blank? on arbitrary values is
|
|
149
|
+
# easy to reproduce and keeps the job self-contained).
|
|
150
|
+
def blank?(value)
  if value.nil?
    true
  elsif value.is_a?(String)
    # Whitespace-only strings count as blank.
    value.strip.empty?
  elsif value.respond_to?(:empty?)
    # Any other container-like value defers to its own notion of emptiness.
    value.empty?
  else
    false
  end
end
|
|
157
|
+
end
|
|
158
|
+
end
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Moderate
|
|
4
|
+
# A single classification label produced by a filter adapter.
|
|
5
|
+
#
|
|
6
|
+
# One piece of content can produce *several* labels — e.g. a message might trip
|
|
7
|
+
# both `hate` and `hate/threatening`, or an image might trip `sexual/minors`
|
|
8
|
+
# while its caption trips `harassment`. Each Label records exactly one
|
|
9
|
+
# (category, subcategory) verdict, the adapter's confidence `score`, and which
|
|
10
|
+
# `input` (text vs image) tripped it. `Moderate::Result` holds the collection.
|
|
11
|
+
#
|
|
12
|
+
# The taxonomy is OpenAI's `omni-moderation-latest` category set, adopted as the
|
|
13
|
+
# gem's ONE canonical vocabulary so every adapter (the offline wordlist, the
|
|
14
|
+
# OpenAI moderation endpoint, or a host-registered backend) speaks the same
|
|
15
|
+
# language — which in turn lets `Moderate::Flag`, the DSA Art. 17 statement of
|
|
16
|
+
# reasons, and the Art. 24 transparency counters all aggregate over a single set.
|
|
17
|
+
# See: https://developers.openai.com/api/docs/guides/moderation
|
|
18
|
+
#
|
|
19
|
+
# Implemented with Ruby's `Data.define` (Ruby 3.2+, which the gemspec requires):
|
|
20
|
+
# an immutable, frozen-by-construction value object — exactly what a label
|
|
21
|
+
# should be (you never mutate a verdict after the fact).
|
|
22
|
+
Label = Data.define(:category, :subcategory, :score, :flagged, :input) do
  # NOTE: the TAXONOMY / CATEGORIES / INPUTS constants are deliberately NOT
  # assigned in this block. Constant assignment inside a `Data.define do...end`
  # block lands on the lexically-enclosing namespace (`Moderate`) rather than on
  # the Data class, so they are attached by reopening `Moderate::Label` after
  # this call (see below). Instance methods defined here are not affected.

  # Builds a label, normalizing loosely-typed adapter input:
  #   - category / subcategory / input: String or Symbol, stripped and downcased
  #   - score: coerced to Float; defaults to 1.0 (a deterministic adapter such as
  #     the wordlist has no probability, so a match is "certain"); nil stays nil
  #   - flagged: reduced to a strict boolean; defaults to true
  #   - input: a nil/false input falls back to :unknown
  def initialize(category:, subcategory: nil, score: 1.0, flagged: true, input: :unknown)
    super(
      category: normalize_symbol(category),
      subcategory: (normalize_symbol(subcategory) unless subcategory.nil?),
      score: score&.to_f,
      flagged: flagged ? true : false,
      input: normalize_symbol(input || :unknown)
    )
  end

  # OpenAI-style slug: "category/subcategory" ("hate/threatening",
  # "self-harm/intent"), or the bare category ("hate") when no subcategory is
  # set. This is the canonical wire/storage form used by
  # `Moderate::Result#categories` and persisted on `Moderate::Flag`.
  def slug
    return category.to_s unless subcategory

    "#{category}/#{subcategory}"
  end

  # Whether this label is part of the canonical taxonomy. Adapters MAY emit
  # off-taxonomy labels (an unmapped provider category); nothing raises — callers
  # that want strictness filter on `canonical?`.
  def canonical?
    # Look the constant up through the class: the block's lexical scope is not
    # the Label class, so a bare `TAXONOMY` would not resolve here.
    allowed = self.class::TAXONOMY[category]

    !allowed.nil? && (subcategory.nil? || allowed.include?(subcategory))
  end

  private

  # Stringify, trim surrounding whitespace, downcase, and symbolize.
  def normalize_symbol(value)
    text = value.to_s
    text.strip.downcase.to_sym
  end
end
|
|
70
|
+
|
|
71
|
+
# --- Canonical taxonomy constants (attached to Moderate::Label) -------------
|
|
72
|
+
# Defined here, by reopening the class, rather than inside the Data.define block
|
|
73
|
+
# above, because constant assignment inside that block would leak to the
|
|
74
|
+
# `Moderate` namespace instead of landing on `Moderate::Label`.
|
|
75
|
+
class Label
  # The canonical OpenAI moderation taxonomy: each top-level category mapped to
  # its allowed subcategories. Sources: the README and OpenAI's docs
  # (https://developers.openai.com/api/docs/guides/moderation):
  #   harassment → :threatening
  #   hate       → :threatening
  #   sexual     → :minors
  #   self-harm  → :intent, :instructions
  #   violence   → :graphic
  #   illicit    → :violent
  #
  # `nil` is always an implicitly-valid subcategory (the bare top-level category,
  # e.g. plain `:hate` with no qualifier).
  #
  # NOTE: `self-harm` is the hyphenated symbol `:"self-harm"` to match OpenAI's
  # wire format verbatim — `Result#categories` joins category+subcategory with "/"
  # to reproduce OpenAI's exact slug strings ("self-harm/intent" etc.), so
  # downstream consumers comparing against OpenAI labels line up byte-for-byte.
  #
  # Frozen deeply: `Hash#freeze` alone is shallow and would leave the subcategory
  # arrays mutable, letting any caller alter the shared vocabulary in place.
  TAXONOMY = {
    harassment: %i[threatening],
    hate: %i[threatening],
    sexual: %i[minors],
    "self-harm": %i[intent instructions],
    violence: %i[graphic],
    illicit: %i[violent]
  }.transform_values(&:freeze).freeze

  # Every canonical category as a flat symbol list, for validation and iteration.
  CATEGORIES = TAXONOMY.keys.freeze

  # Which inputs an adapter can attribute a label to. `:text` and `:image` mirror
  # OpenAI's multimodal `category_applied_input_types`; `:unknown` is the safe
  # default for adapters (like the offline wordlist) that only see one kind of
  # input and don't bother to say which.
  INPUTS = %i[text image unknown].freeze
end
|
|
111
|
+
end
|