moderate 0.1.0 → 1.0.0.beta1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +8 -0
  3. data/.simplecov +62 -0
  4. data/AGENTS.md +7 -0
  5. data/Appraisals +16 -0
  6. data/CHANGELOG.md +71 -1
  7. data/CLAUDE.md +7 -0
  8. data/README.md +376 -29
  9. data/Rakefile +28 -2
  10. data/app/controllers/concerns/moderate/moderation.rb +161 -0
  11. data/app/controllers/moderate/appeals_controller.rb +190 -0
  12. data/app/controllers/moderate/application_controller.rb +45 -0
  13. data/app/controllers/moderate/notices_controller.rb +382 -0
  14. data/app/controllers/moderate/transparency_reports_controller.rb +30 -0
  15. data/app/helpers/moderate/engine_helper.rb +151 -0
  16. data/app/views/moderate/appeals/new.html.erb +78 -0
  17. data/app/views/moderate/notices/new.html.erb +255 -0
  18. data/app/views/moderate/transparency_reports/_summary_card.html.erb +20 -0
  19. data/app/views/moderate/transparency_reports/show.html.erb +52 -0
  20. data/config/moderate/blocklists/en.yml +81 -0
  21. data/config/moderate/blocklists/es.yml +40 -0
  22. data/config/routes.rb +36 -0
  23. data/docs/compliance.md +178 -0
  24. data/docs/configuration.md +326 -0
  25. data/docs/dsa-notice-form.md +371 -0
  26. data/docs/madmin.md +490 -0
  27. data/docs/notifications.md +363 -0
  28. data/examples/aws_rekognition_adapter.rb +140 -0
  29. data/examples/openai_moderation_adapter.rb +111 -0
  30. data/gemfiles/rails_7.1.gemfile +36 -0
  31. data/gemfiles/rails_7.2.gemfile +36 -0
  32. data/gemfiles/rails_8.1.gemfile +36 -0
  33. data/lib/generators/moderate/install_generator.rb +56 -0
  34. data/lib/generators/moderate/templates/create_moderate_tables.rb.erb +237 -0
  35. data/lib/generators/moderate/templates/initializer.rb +198 -0
  36. data/lib/generators/moderate/views_generator.rb +63 -0
  37. data/lib/moderate/configuration.rb +341 -0
  38. data/lib/moderate/engine.rb +138 -0
  39. data/lib/moderate/errors.rb +26 -0
  40. data/lib/moderate/event.rb +75 -0
  41. data/lib/moderate/filters/base.rb +126 -0
  42. data/lib/moderate/filters/wordlist.rb +255 -0
  43. data/lib/moderate/jobs/classify_job.rb +158 -0
  44. data/lib/moderate/label.rb +111 -0
  45. data/lib/moderate/macros.rb +90 -0
  46. data/lib/moderate/models/appeal.rb +154 -0
  47. data/lib/moderate/models/application_record.rb +31 -0
  48. data/lib/moderate/models/block.rb +203 -0
  49. data/lib/moderate/models/concerns/actor.rb +174 -0
  50. data/lib/moderate/models/concerns/content_filterable.rb +155 -0
  51. data/lib/moderate/models/concerns/reportable.rb +282 -0
  52. data/lib/moderate/models/flag.rb +136 -0
  53. data/lib/moderate/models/report.rb +620 -0
  54. data/lib/moderate/result.rb +176 -0
  55. data/lib/moderate/services/intake_appeal.rb +89 -0
  56. data/lib/moderate/services/intake_notice.rb +132 -0
  57. data/lib/moderate/services/intake_report.rb +132 -0
  58. data/lib/moderate/services/resolve_appeal.rb +134 -0
  59. data/lib/moderate/services/resolve_flag.rb +101 -0
  60. data/lib/moderate/services/resolve_report.rb +291 -0
  61. data/lib/moderate/version.rb +1 -1
  62. data/lib/moderate.rb +365 -18
  63. data/log/development.log +0 -0
  64. data/log/test.log +0 -0
  65. metadata +154 -15
@@ -0,0 +1,126 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Moderate
4
+ # The built-in filter adapters live under `Moderate::Filters`. They're the
5
+ # gem's own implementations of the ONE adapter contract the whole filtering
6
+ # design hinges on:
7
+ #
8
+ # adapter.classify(value) -> Moderate::Result
9
+ #
10
+ # ...where `value` is a piece of user content (a String of text, or an image
11
+ # reference) and the returned `Moderate::Result` answers "is this allowed?" and,
12
+ # if not, "why?" (the per-`Moderate::Label` detail, mapped onto the gem's single
13
+ # canonical taxonomy — see `Moderate::Label`).
14
+ #
15
+ # ── How adapters are invoked ────────────────────────────────────────────────
16
+ # The configuration registry (`Moderate::Configuration#adapters`) stores each
17
+ # adapter as EITHER a live object the host registered, OR a class-NAME String the
18
+ # gem constantizes lazily. The one built-in is seeded as the string
19
+ # "Moderate::Filters::Wordlist", and `Configuration#resolve_adapter` returns the
20
+ # CLASS itself for a String/Class entry. That means the gem calls
21
+ # `SomeAdapterClass.classify(value)` and `SomeAdapterClass.synchronous?` — i.e. the
22
+ # built-in exposes CLASS methods, not instance methods. (A host's own adapter
23
+ # registered as an instance — including the reference adapters in `examples/` —
24
+ # exposes the same `#classify`/`#synchronous?` on that instance — same duck type,
25
+ # both work.)
26
+ #
27
+ # `Base` gives the built-in both halves of that duck type from a single source:
28
+ # subclasses implement the work as an INSTANCE method (`#classify`), and `Base`
29
+ # provides the CLASS-level `classify`/`synchronous?`/`async?` that the registry
30
+ # resolution path calls, delegating the class call to a fresh instance. So one
31
+ # implementation satisfies both call styles and there's no copy-paste. (It's the
32
+ # base for the bundled wordlist; the `examples/` reference adapters don't need it —
33
+ # any object answering `#classify` is a valid adapter.)
34
+ #
35
+ # ── Sync vs. async (why it matters for :block) ──────────────────────────────
36
+ # `Configuration#validate!` enforces the README's rule: a `:block`-mode filter
37
+ # MUST use a synchronous adapter, because you can't reject a save on a result
38
+ # that's still computing in a background job. The validator probes the adapter
39
+ # with `synchronous?` and treats anything that doesn't answer, or answers truthy,
40
+ # as synchronous (the safe default that keeps simple adapters working); only an
41
+ # adapter that explicitly returns `synchronous? == false` is rejected for :block.
42
+ #
43
+ # We model this once here as `async?` (default `false` — the built-in wordlist is
44
+ # sync) and derive `synchronous?` from it, so a subclass flips ONE flag
45
+ # (`def self.async? = true`) to declare itself background-only. The wordlist leaves
46
+ # the default; a network-backed reference adapter (see `examples/`) declares itself
47
+ # async via its own `synchronous? == false`, which the spine honors regardless of
48
+ # whether the adapter inherits from `Base`.
49
+ module Filters
50
class Base
  class << self
    # Class-level entry point, used when the adapter is resolved as a class.
    # Builds a fresh instance and delegates to its `#classify`, so a subclass may
    # keep per-classification state in instance variables without sharing it
    # between calls.
    #
    # @param value [Object] the content to classify; passed through untouched.
    # @return [Object] whatever the subclass's `#classify` returns (by contract a
    #   `Moderate::Result`).
    # @raise [NotImplementedError] if the subclass does not implement `#classify`.
    def classify(value)
      new.classify(value)
    end

    # Whether this adapter is background-only. Defaults to `false`; a subclass
    # whose `classify` does blocking I/O overrides this to return `true`.
    #
    # @return [Boolean]
    def async?
      false
    end

    # The inverse of `async?`, so the two predicates can never disagree: a
    # subclass only ever overrides `async?`.
    #
    # @return [Boolean]
    def synchronous?
      !async?
    end
  end

  # Subclasses MUST override this and return a `Moderate::Result`. The default
  # raises rather than returning nil, so a half-built adapter fails loudly instead
  # of silently "allowing" everything.
  #
  # @param _value [Object] ignored by the base implementation.
  # @raise [NotImplementedError] always.
  def classify(_value)
    raise NotImplementedError, "#{self.class} must implement #classify(value) and return a Moderate::Result"
  end

  # Instance-level mirrors of the class predicates. These are deliberately PUBLIC:
  # their purpose is to let a `Base` subclass registered as an *instance* answer
  # the same `async?` / `synchronous?` duck type as one resolved as a class. A
  # private method is invisible to `respond_to?` and raises on an external call,
  # so leaving them under `private` would make an async instance
  # indistinguishable from an adapter that does not declare the predicate at all.
  #
  # Both delegate to the class, so overriding `self.async?` is sufficient.
  #
  # @return [Boolean]
  def async?
    self.class.async?
  end

  # @return [Boolean] see {#async?}; always the class-level `synchronous?`.
  def synchronous?
    self.class.synchronous?
  end

  private

  # ── Shared helpers for subclasses ────────────────────────────────────────

  # Builds the "nothing matched" result, stamped with this adapter's
  # `source_name` so an allowed verdict is still attributable.
  #
  # @param raw [Object, nil] optional provider payload to attach.
  # @return [Object] the value of `Moderate::Result.allowed`.
  def allowed_result(raw: nil)
    Moderate::Result.allowed(source: source_name, raw: raw)
  end

  # Builds a flagged result from the given labels, filling in the
  # `allowed: false` / `source:` bookkeeping for the subclass.
  #
  # @param labels [Array] the labels that tripped.
  # @param raw [Object, nil] optional provider payload to attach.
  # @return [Moderate::Result]
  def flagged_result(labels:, raw: nil)
    Moderate::Result.new(allowed: false, labels: labels, source: source_name, raw: raw)
  end

  # The `source` string stamped on results built by the helpers above. Defaults
  # to the last segment of the class name, underscored at lower→upper case
  # transitions and downcased ("Moderate::Filters::Wordlist" -> "wordlist",
  # "SlowAdapter" -> "slow_adapter"). Subclasses override it to return the exact
  # value they need.
  #
  # An anonymous class has no name; the extra `to_s` makes that yield "" instead
  # of raising NoMethodError on nil.
  #
  # @return [String]
  def source_name
    self.class.name.to_s.split("::").last.to_s.gsub(/([a-z])([A-Z])/, '\1_\2').downcase
  end
end
125
+ end
126
+ end
@@ -0,0 +1,255 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "yaml"
4
+ require "set"
5
+
6
+ module Moderate
7
+ module Filters
8
+ # The default, built-in TEXT adapter: a fast, offline, multilingual,
9
+ # zero-dependency wordlist matcher. It is the ONE built-in adapter the gem ships
10
+ # — registered by the spine under the name :wordlist (config seed
11
+ # "Moderate::Filters::Wordlist", constantized lazily). Synchronous, so it's valid
12
+ # in :block mode.
13
+ #
14
+ # ── What it's for, and what it's NOT ─────────────────────────────────────────
15
+ # This satisfies the store bar of "a method for filtering objectionable UGC
16
+ # before it's posted" (Apple Guideline 1.2:
17
+ # https://developer.apple.com/app-store/review/guidelines/#user-generated-content;
18
+ # Google Play UGC: https://support.google.com/googleplay/android-developer/answer/9876937)
19
+ # with no network call and no external service. It is NOT a full trust-&-safety
20
+ # classifier — it can't read context. For nuance (or for images), register a
21
+ # reference adapter from `examples/` (OpenAI, AWS Rekognition, …) or your own.
22
+ # The bar this clears is "obvious slurs/threats/spam don't sail straight
23
+ # through", at zero latency and zero cost.
24
+ #
25
+ # ── Evasion resistance (the part that matters) ───────────────────────────────
26
+ # Naive substring matching is trivially defeated ("f.u.c.k", "FÜCK", "f u c k",
27
+ # "fuuuck", "fμck"). Before matching, text is NORMALIZED:
28
+ # 1. Unicode NFKD decomposition + stripping combining marks — folds accented
29
+ # and look-alike forms ("FÜCK" -> "fuck") to a plain base form. NFKD
30
+ # (compatibility decomposition) also flattens many homoglyph/fullwidth
31
+ # tricks. (Ruby String#unicode_normalize; \p{Mn} = Unicode "Mark,
32
+ # nonspacing", the combining accents.)
33
+ # 2. Leetspeak transliteration (0->o, 1->i, 3->e, 4->a, 5->s, 7->t, @->a,
34
+ # $->s) — folds the common letter/number/symbol swaps.
35
+ # 3. Lowercasing + collapsing every run of non-alphanumerics to a single space
36
+ # — kills punctuation-as-separator evasion ("f.u.c.k" -> "f u c k").
37
+ # The normalized text is then matched in TWO forms: the single-spaced form
38
+ # (so word-boundary patterns like "\bkill yourself\b" work), AND a space-removed
39
+ # "compact" form (so spacing evasion "f u c k" -> "fuck" is caught). A pattern
40
+ # hits if it matches EITHER form. This is a fast offline baseline matching
41
+ # strategy the adapter relies on.
42
+ #
43
+ # ── Output ───────────────────────────────────────────────────────────────────
44
+ # On a hit, returns a flagged Moderate::Result whose labels are the canonical
45
+ # categories that matched, each with score 1.0 — a deterministic matcher has no
46
+ # probability, so a trip is "certain". `source` is "text_filter" to match the
47
+ # `moderate_flags_source_check` migration constraint.
48
class Wordlist < Base
  # `\p{Mn}` is the Unicode general category "Mark, nonspacing" — the combining
  # accents NFKD splits off a base letter. Removing them turns "é" -> "e",
  # "ñ" -> "n", etc.
  COMBINING_MARKS = /\p{Mn}/

  # Leetspeak / symbol substitutions folded back to plain letters before
  # matching. Kept deliberately small: over-aggressive folding creates false
  # positives.
  LEETSPEAK = {
    "0" => "o", "1" => "i", "3" => "e", "4" => "a",
    "5" => "s", "7" => "t", "@" => "a", "$" => "s"
  }.freeze

  # Any run of characters other than a-z / 0-9 collapses to ONE space, so
  # "f.u.c.k" reads as "f u c k".
  NON_ALNUM = /[^a-z0-9]+/

  # Glob for the bundled blocklist YAML files; every match is loaded and merged.
  BLOCKLISTS_GLOB = File.expand_path("../../../config/moderate/blocklists/*.yml", __dir__)

  # The normalized text in both matchable forms, computed once per classify.
  Normalized = Data.define(:spaced, :compact)

  # A `\s` whitespace matcher in a pattern source, TOGETHER with any quantifier
  # that follows it: `*`, `+`, `?`, `{n}`, `{n,}`, `{,m}`, `{n,m}`, each
  # optionally lazy (`?`) or possessive (`+`). The quantifier has to go with the
  # `\s`; otherwise it re-attaches to the preceding atom and silently changes
  # the pattern ("kill\s?yourself" would become "kill?yourself").
  WHITESPACE_MATCHER = /\\s(?:(?:[*+?]|\{\d*(?:,\d*)?\})[?+]?)?/

  # The compiled bundled patterns, built on first use and memoized on the class.
  #
  # @return [Hash{String => Array<Array(Regexp, Regexp)>}] category slug =>
  #   list of [spaced_regex, compact_regex] pairs.
  def self.patterns
    @patterns ||= compile_bundled_patterns
  end

  # Drops the memoized patterns so the next `patterns` call recompiles them.
  #
  # @return [nil]
  def self.reset!
    @patterns = nil
  end

  # Loads every file matched by BLOCKLISTS_GLOB and compiles it. Each file is
  # read as a mapping of category => pattern source(s); an empty file counts as
  # `{}`. The same category appearing in several files is merged into one list.
  #
  # @return [Hash{String => Array<Array(Regexp, Regexp)>}]
  # @raise [RegexpError] if a pattern source is not a valid regular expression.
  def self.compile_bundled_patterns
    Dir.glob(BLOCKLISTS_GLOB).each_with_object({}) do |path, acc|
      loaded = YAML.safe_load_file(path) || {}
      loaded.each do |category, raw_patterns|
        list = Array(raw_patterns).map { |source| compile_pair(source) }
        (acc[category.to_s] ||= []).concat(list)
      end
    end
  end

  # Builds the [spaced, compact] regex pair for one blocklist source string:
  # the source verbatim (tested against the single-spaced text) and its
  # `compact_source` derivative (tested against the space-stripped text).
  #
  # @param source [String] a regular-expression source.
  # @return [Array(Regexp, Regexp)]
  def self.compile_pair(source)
    [Regexp.new(source), Regexp.new(compact_source(source))]
  end

  # Derives the compact-form pattern from a blocklist source. The compact text
  # has every space removed, so the pattern must not demand any:
  #   1. one trailing `\b` is dropped, so a token still matches when another
  #      word was glued onto it ("fuck" inside "fuckyou");
  #   2. `\s` matchers are dropped together with their quantifier;
  #   3. a leading literal space inside a character class is dropped;
  #   4. any remaining literal spaces are dropped.
  # A leading `\b` is left in place.
  #
  # NOTE(review): the compact text consists solely of a-z / 0-9 (everything
  # else became a space, and spaces are deleted), so inside it `\b` can only
  # match at the very start or very end. A pattern that keeps its leading `\b`
  # therefore only matches the compact form at the beginning of the content —
  # confirm that is the intended reach of the spacing-evasion defence.
  #
  # @param source [String] a regular-expression source.
  # @return [String] the source adapted for the space-stripped text.
  def self.compact_source(source)
    source
      .sub(/\\b\z/, "")                 # 1. trailing word boundary
      .gsub(WHITESPACE_MATCHER, "")     # 2. \s plus its quantifier, if any
      .gsub(/\[ ([^\]]*)\]/, '[\1]')    # 3. leading space inside a char class
      .delete(" ")                      # 4. remaining literal spaces
  end

  # Classifies a piece of text against the bundled blocklists and the host's
  # `additional_words`.
  #
  # @param value [#to_s] the content; stringified before matching.
  # @return [Object] the inherited `allowed_result` for blank or clean text,
  #   otherwise the inherited `flagged_result` carrying one label per matched
  #   category slug.
  def classify(value)
    text = value.to_s
    # Blank content cannot trip anything; skip normalization entirely.
    return allowed_result if text.strip.empty?

    hits = matched_categories(normalize(text))
    return allowed_result if hits.empty?

    # One Label per matched slug. The slug is split HERE ("hate/threatening" ->
    # category "hate", subcategory "threatening"; a bare "hate" leaves the
    # subcategory nil). Score is 1.0 because a deterministic match has no
    # probability attached.
    labels = hits.map do |slug|
      category, subcategory = slug.split("/", 2)
      Moderate::Label.new(
        category: category, subcategory: subcategory,
        score: 1.0, flagged: true, input: :text
      )
    end

    flagged_result(labels: labels)
  end

  private

  # Overrides the inherited default so results are stamped "text_filter".
  def source_name
    "text_filter"
  end

  # The category slugs that tripped, without duplicates. A bundled category
  # trips when ANY of its pairs matches: the spaced regex against the spaced
  # text, or the compact regex against the compact text. Hits from
  # `additional_words` are appended.
  #
  # @param norm [Normalized]
  # @return [Array<String>]
  def matched_categories(norm)
    slugs = self.class.patterns.filter_map do |slug, pairs|
      slug if pairs.any? { |spaced_re, compact_re| spaced_re.match?(norm.spaced) || compact_re.match?(norm.compact) }
    end

    slugs.concat(additional_word_categories(norm))
    slugs.uniq
  end

  # `config.additional_words`: extra host terms. A hit on any of them is
  # reported as plain "harassment". Returns [] when none are configured or none
  # match.
  #
  # NOTE(review): each word is compared after `normalize_token`, so it contains
  # no spaces. The whole-word test on the spaced text is therefore subsumed by
  # the substring test on the compact text, which has no boundary at all — a
  # short additional word also matches inside longer, unrelated words. Confirm
  # that this reach is intended.
  #
  # @param norm [Normalized]
  # @return [Array<String>] `["harassment"]` or `[]`.
  def additional_word_categories(norm)
    words = Array(config.additional_words).map { |w| normalize_token(w) }.reject(&:empty?)
    return [] if words.empty?

    matched = words.any? do |word|
      /\b#{Regexp.escape(word)}\b/.match?(norm.spaced) || norm.compact.include?(word)
    end
    matched ? ["harassment"] : []
  end

  # Full normalization pipeline: fold the text, excise excluded words from the
  # spaced form, then derive the compact form from what is left, so an excluded
  # word cannot contribute to either form.
  #
  # @param text [String]
  # @return [Normalized]
  def normalize(text)
    spaced = strip_excluded_words(fold(text))
    Normalized.new(spaced: spaced, compact: spaced.delete(" "))
  end

  # Folds text to the matchable alphabet: NFKD-decompose, lowercase, drop
  # combining marks, undo leetspeak, turn every non-alphanumeric run into a
  # single space, and trim.
  #
  # @param text [String]
  # @return [String] only a-z, 0-9 and single interior spaces.
  def fold(text)
    text
      .unicode_normalize(:nfkd)
      .downcase
      .gsub(COMBINING_MARKS, "")
      .tr(LEETSPEAK.keys.join, LEETSPEAK.values.join)
      .gsub(NON_ALNUM, " ")
      .strip
      .squeeze(" ") # collapse any residual double spaces (no ActiveSupport #squish dependency)
  end

  # Normalizes one configured token like content, then removes its spaces, so a
  # host-written word with caps/accents/punctuation lines up with the text.
  #
  # @param word [#to_s]
  # @return [String]
  def normalize_token(word)
    fold(word.to_s).delete(" ")
  end

  # Removes configured false-positive words from the spaced text before
  # matching. Each is matched on word boundaries, so only the standalone
  # word/phrase is excised, not every occurrence of the substring.
  #
  # Every entry is tried in two spellings (see `excluded_forms`), because the
  # spaced text keeps word separators: an entry such as "co-op" folds to
  # "co op", and its space-deleted spelling "coop" alone could never match.
  #
  # @param spaced [String] the folded, single-spaced text.
  # @return [String] the text with excluded words replaced by a separator.
  def strip_excluded_words(spaced)
    excluded = Array(config.excluded_words).flat_map { |w| excluded_forms(w) }.uniq
    return spaced if excluded.empty?

    excluded.reduce(spaced) do |acc, word|
      acc.gsub(/\b#{Regexp.escape(word)}\b/, " ")
    end.squeeze(" ").strip
  end

  # The spellings under which one excluded entry is excised: its folded phrase
  # (separators kept as single spaces) and the same phrase with the spaces
  # removed. For a plain single word the two coincide.
  #
  # @param word [#to_s]
  # @return [Array<String>] one or two non-empty spellings, or [] for a blank entry.
  def excluded_forms(word)
    phrase = fold(word.to_s)
    [phrase, phrase.delete(" ")].reject(&:empty?).uniq
  end

  # The gem configuration read for `additional_words` / `excluded_words`.
  def config
    Moderate.config
  end
end
254
+ end
255
+ end
@@ -0,0 +1,158 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Moderate
4
+ # The background worker for ASYNCHRONOUS filter adapters (the :openai adapter, a
5
+ # host's hosted classifier, the default :image adapter) running in :flag mode.
6
+ #
7
+ # ── Why a job exists at all ──────────────────────────────────────────────────
8
+ # An async adapter does blocking I/O (a moderation API call), which must never run
9
+ # inside the request that saved the content — and CANNOT run inside a validator or
10
+ # an after_commit callback without stalling the write. So the :flag path works in
11
+ # two halves:
12
+ # 1. The model's filter concern (`moderates :field`) lets the write succeed, then
13
+ # in an after_commit hook enqueues THIS job for any field whose policy uses an
14
+ # async adapter. (Synchronous adapters like :wordlist skip the job — the
15
+ # concern classifies inline and files the Flag directly.)
16
+ # 2. This job re-reads the saved value, classifies it through the adapter, and —
17
+ # if the content is flagged — files (or updates) a Moderate::Flag for the
18
+ # moderation queue.
19
+ #
20
+ # ── Re-reading the value (deliberate) ────────────────────────────────────────
21
+ # The job is handed the RECORD (GlobalID-serialized by ActiveJob) and the FIELD
22
+ # NAME, not the raw text/image — so it always classifies the CURRENT persisted
23
+ # value. If the record was edited or deleted between enqueue and run, we classify
24
+ # what's actually there now (or skip a vanished record), never a stale snapshot.
25
+ #
26
+ # ── Idempotency ──────────────────────────────────────────────────────────────
27
+ # Flag creation goes through `Moderate::Flag.flag!`, the single builder shared by
28
+ # the synchronous and asynchronous paths. It's an upsert-by-(flaggable, field,
29
+ # source) so a retried job (ActiveJob retries on transient failures) doesn't pile
30
+ # up duplicate queue entries for the same content.
31
+ #
32
+ # ── Base class ───────────────────────────────────────────────────────────────
33
+ # We subclass `ActiveJob::Base` directly (not the host's ApplicationJob) so the gem
34
+ # doesn't depend on a constant that lives in the host app — the same reason the
35
+ # rest of the gem stays host-agnostic. The host configures the queue adapter,
36
+ # retries, and queue name globally as usual.
37
class ClassifyJob < ActiveJob::Base
  # Queue selection: `Moderate.config.job_queue` when the configuration exposes
  # that setting and it is set, otherwise the `:default` queue.
  queue_as { Moderate.config.respond_to?(:job_queue) && Moderate.config.job_queue || :default }

  # Classifies the CURRENT value of `record.field` and files a flag when the
  # result is flagged. Returns early, doing nothing, when the record is nil, the
  # resolved policy reports `off?`, the value is blank, or the result is not
  # flagged.
  #
  # @param record [Object, nil] the record whose field is classified.
  # @param field [String, Symbol] the name of the field to read.
  # @param adapter [String, Symbol, nil] optional adapter name; when given it
  #   replaces the adapter of the resolved policy (see #resolve_policy).
  # @return [void]
  def perform(record, field, adapter: nil)
    # Defensive guard: nothing to classify without a record.
    return if record.nil?

    field = field.to_s
    policy = resolve_policy(record, field, adapter)

    # The policy may report :off (e.g. reconfigured after the job was enqueued).
    return if policy.respond_to?(:off?) && policy.off?

    value = field_value(record, field)
    return if blank?(value)

    result = Moderate.classify(value, policy: policy)
    return unless result.flagged?

    file_flag(record, field, policy, result, value)
  end

  private

  # Resolves the policy for this record/field via `Moderate.filter_policy_for`.
  # With an explicit adapter, a new FilterPolicy is built that keeps the
  # resolved policy's class name and mode but uses the requested adapter
  # (stringified, stripped, downcased and symbolized).
  #
  # @return [Object] the resolved policy, or the adapter-overridden FilterPolicy.
  def resolve_policy(record, field, adapter)
    policy = Moderate.filter_policy_for(record, field)
    return policy if adapter.nil?

    Moderate::Configuration::FilterPolicy.new(
      class_name: policy.class_name, field: field,
      adapter: adapter.to_s.strip.downcase.to_sym, mode: policy.mode
    )
  end

  # Files the flag through `Moderate::Flag.flag!`. `source` is passed exactly as
  # the result carries it, and `mode` falls back to :flag when the policy does
  # not expose one.
  def file_flag(record, field, policy, result, value)
    Moderate::Flag.flag!(
      flaggable: record,
      field: field,
      owner: content_owner(record),
      source: result.source,
      mode: policy.respond_to?(:mode) ? policy.mode : :flag,
      categories: result.categories,
      scores: result.scores,
      excerpt: excerpt_for(value),
      context: flag_context(policy, result)
    )
  end

  # The record's `reported_owner` when it defines one, otherwise nil.
  def content_owner(record)
    record.respond_to?(:reported_owner) ? record.reported_owner : nil
  end

  # Diagnostic context stored with the flag: the raw result payload (only when
  # present) plus the class name, field and stringified mode of the policy.
  #
  # @return [Hash]
  def flag_context(policy, result)
    context = {}
    context[:raw] = result.raw unless result.raw.nil?
    context[:policy] = {
      class_name: policy.class_name, field: policy.field, mode: policy.mode.to_s
    }
    context
  end

  # A snippet of the offending value: its string form, cut to the first 500
  # characters. A non-String value is stringified with `to_s` first.
  #
  # @return [String]
  def excerpt_for(value)
    value.to_s[0, 500]
  end

  # Reads the field from the record. Returns nil ONLY when the record does not
  # answer the field itself (the NoMethodError names that field). A
  # NoMethodError raised for any OTHER method — i.e. a bug inside the accessor —
  # is re-raised: swallowing it would make the value look blank and let the
  # content skip classification without a trace.
  #
  # @return [Object, nil]
  # @raise [NoMethodError] when the accessor itself fails on another method.
  def field_value(record, field)
    record.public_send(field)
  rescue NoMethodError => e
    raise unless e.name.to_s == field.to_s

    nil
  end

  # Blank check that does not rely on ActiveSupport's #blank?: nil, a
  # whitespace-only String, or anything answering `empty?` with true.
  #
  # @return [Boolean]
  def blank?(value)
    return true if value.nil?
    return value.strip.empty? if value.is_a?(String)
    return value.empty? if value.respond_to?(:empty?)

    false
  end
end
158
+ end
@@ -0,0 +1,111 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Moderate
4
+ # A single classification label produced by a filter adapter.
5
+ #
6
+ # One piece of content can produce *several* labels — e.g. a message might trip
7
+ # both `hate` and `hate/threatening`, or an image might trip `sexual/minors`
8
+ # while its caption trips `harassment`. Each Label records exactly one
9
+ # (category, subcategory) verdict, the adapter's confidence `score`, and which
10
+ # `input` (text vs image) tripped it. `Moderate::Result` holds the collection.
11
+ #
12
+ # The taxonomy is OpenAI's `omni-moderation-latest` category set, adopted as the
13
+ # gem's ONE canonical vocabulary so every adapter (the offline wordlist, the
14
+ # OpenAI moderation endpoint, or a host-registered backend) speaks the same
15
+ # language — which in turn lets `Moderate::Flag`, the DSA Art. 17 statement of
16
+ # reasons, and the Art. 24 transparency counters all aggregate over a single set.
17
+ # See: https://developers.openai.com/api/docs/guides/moderation
18
+ #
19
+ # Implemented with Ruby's `Data.define` (Ruby 3.2+, which the gemspec requires):
20
+ # an immutable, frozen-by-construction value object — exactly what a label
21
+ # should be (you never mutate a verdict after the fact).
22
Label = Data.define(:category, :subcategory, :score, :flagged, :input) do
  # NOTE: no constants are assigned inside this block. `canonical?` reads
  # `TAXONOMY` through `self.class::TAXONOMY`, so that constant must be attached
  # to the Label class itself.

  # Normalizes every member on the way in, so adapters can be sloppy about types.
  #
  # @param category [String, Symbol] stripped, downcased, symbolized.
  # @param subcategory [String, Symbol, nil] same treatment; nil OR blank
  #   (`""`, `"  "`) means "no subcategory" and is stored as nil, so a provider
  #   payload with an empty qualifier does not produce the symbol `:""` and a
  #   slug with a trailing "/".
  # @param score [Numeric, String, nil] coerced with `to_f`; defaults to 1.0; an
  #   explicit nil is kept as nil.
  # @param flagged [Object] coerced to strict true/false by truthiness.
  # @param input [String, Symbol, nil] normalized like category; nil or blank
  #   falls back to :unknown.
  def initialize(category:, subcategory: nil, score: 1.0, flagged: true, input: :unknown)
    super(
      category: normalize_symbol(category),
      subcategory: normalize_optional_symbol(subcategory),
      score: score.nil? ? nil : score.to_f,
      flagged: flagged ? true : false,
      input: normalize_optional_symbol(input || :unknown) || :unknown
    )
  end

  # The full slug: "category/subcategory", or just "category" when there is no
  # subcategory.
  #
  # @return [String]
  def slug
    subcategory ? "#{category}/#{subcategory}" : category.to_s
  end

  # True when the category is a key of the class's `TAXONOMY` and the
  # subcategory is either absent or listed under that key. Never raises for an
  # off-taxonomy label; it simply answers false.
  #
  # @return [Boolean]
  def canonical?
    subs = self.class::TAXONOMY[category]
    return false if subs.nil?

    subcategory.nil? || subs.include?(subcategory)
  end

  private

  # Stringify, trim, downcase, symbolize.
  def normalize_symbol(value)
    value.to_s.strip.downcase.to_sym
  end

  # Like `normalize_symbol`, but nil or blank input yields nil instead of `:""`.
  def normalize_optional_symbol(value)
    token = value.to_s.strip.downcase
    token.empty? ? nil : token.to_sym
  end
end
70
+
71
+ # --- Canonical taxonomy constants (attached to Moderate::Label) -------------
72
+ # Defined here, by reopening the class, rather than inside the Data.define block
73
+ # above, because constant assignment inside that block would leak to the
74
+ # `Moderate` namespace instead of landing on `Moderate::Label`.
75
class Label
  # The canonical taxonomy: each top-level category mapped to its allowed
  # subcategories.
  #   harassment → :threatening
  #   hate       → :threatening
  #   sexual     → :minors
  #   self-harm  → :intent, :instructions
  #   violence   → :graphic
  #   illicit    → :violent
  #
  # A nil subcategory (the bare top-level category) is always implicitly valid.
  #
  # `self-harm` is the hyphenated symbol `:"self-harm"`, so a slug built from
  # category + "/" + subcategory reads "self-harm/intent" verbatim.
  #
  # The hash AND each subcategory list are frozen. Freezing only the outer hash
  # would still let a caller append to one of the lists and silently change the
  # shared taxonomy for the whole process.
  TAXONOMY = {
    harassment: %i[threatening].freeze,
    hate: %i[threatening].freeze,
    sexual: %i[minors].freeze,
    "self-harm": %i[intent instructions].freeze,
    violence: %i[graphic].freeze,
    illicit: %i[violent].freeze
  }.freeze

  # Every canonical top-level category as a flat, frozen symbol list.
  CATEGORIES = TAXONOMY.keys.freeze

  # The inputs a label can be attributed to; `:unknown` is the fallback for an
  # adapter that does not say which.
  INPUTS = %i[text image unknown].freeze
end
111
+ end