RubyGems - profanity-filter - Versions diffs - 0.1.2 - Mend

profanity-filter 0.1.2

Files changed (29) hide show

checksums.yaml +7 -0
data/.gitignore +9 -0
data/.travis.yml +7 -0
data/CHANGELOG.md +0 -0
data/CODE_OF_CONDUCT.md +74 -0
data/Gemfile +4 -0
data/Gemfile.lock +34 -0
data/LICENSE.txt +21 -0
data/README.md +54 -0
data/Rakefile +10 -0
data/bin/console +7 -0
data/bin/setup +8 -0
data/lib/profanity_dictionaries/en.yaml +166 -0
data/lib/profanity_dictionaries/es.yaml +68 -0
data/lib/profanity_dictionaries/leet_strategy_dictionary.yaml +1716 -0
data/lib/profanity_dictionaries/partial_match.yaml +1 -0
data/lib/profanity_dictionaries/pt.yaml +74 -0
data/lib/profanity_filter.rb +160 -0
data/lib/profanity_filter/engines/allow_duplicate_characters_strategy.rb +15 -0
data/lib/profanity_filter/engines/allow_symbols_in_words_strategy.rb +16 -0
data/lib/profanity_filter/engines/component.rb +17 -0
data/lib/profanity_filter/engines/composite.rb +37 -0
data/lib/profanity_filter/engines/exact_match_strategy.rb +35 -0
data/lib/profanity_filter/engines/leet_exact_match_strategy.rb +40 -0
data/lib/profanity_filter/engines/partial_match_strategy.rb +28 -0
data/lib/profanity_filter/engines/regexp_strategy.rb +37 -0
data/lib/profanity_filter/version.rb +3 -0
data/profanity_filter.gemspec +37 -0
metadata +162 -0

data/lib/profanity_dictionaries/partial_match.yaml ADDED

	@@ -0,0 +1 @@
1	+ - 🖕

data/lib/profanity_dictionaries/pt.yaml ADDED

@@ -0,0 +1,74 @@
+- aborto
+- amador
+- ânus
+- aranha
+- ariano
+- balalao
+- bastardo
+- bicha
+- biscate
+- bissexual
+- boceta
+- boob
+- bosta
+- braulio de borracha
+- bumbum
+- burro
+- cabrao
+- cacete
+- cagar
+- camisinha
+- caralho
+- cerveja
+- chochota
+- chupar
+- clitoris
+- cocaína
+- colhoes
+- comer
+- cona
+- consolo
+- corno
+- cu
+- dar o rabo
+- dum raio
+- esporra
+- fecal
+- filho da puta
+- foda
+- foda-se
+- foder
+- frango assado
+- gozar
+- grelho
+- heroína
+- heterosexual
+- homem gay
+- homoerótico
+- homosexual
+- inferno
+- lésbica
+- lolita
+- mama
+- merda
+- paneleiro
+- passar um cheque
+- pau
+- peidar
+- pênis
+- pinto
+- porra
+- puta
+- puta que pariu
+- puta que te pariu
+- queca
+- sacanagem
+- saco
+- torneira
+- transar
+- vai-te foder
+- vai tomar no cu
+- veado
+- vibrador
+- xana
+- xochota

data/lib/profanity_filter.rb ADDED

@@ -0,0 +1,160 @@
+# frozen_string_literal: true
+require 'set'
+require 'timeout'
+require 'yaml'
+require 'web_purify'
+require 'profanity_filter/version'
+require 'profanity_filter/engines/composite'
+require 'profanity_filter/engines/partial_match_strategy'
+require 'profanity_filter/engines/allow_duplicate_characters_strategy'
+require 'profanity_filter/engines/allow_symbols_in_words_strategy'
+require 'profanity_filter/engines/leet_exact_match_strategy'
+class ProfanityFilter
+  WP_DEFAULT_LANGS = [:en, :sp, :pt].freeze
+  WP_AVAILABLE_LANGS = [
+    :en, :ar, :fr, :de, :hi, :jp, :it, :pt, :ru, :sp, :th, :tr, :zh, :kr, :pa
+  ].freeze
+  WP_LANG_CONVERSIONS = { es: :sp, ko: :kr, ja: :jp }.freeze
+  attr_reader :strict_filter, :tolerant_filter
+  def initialize(web_purifier_api_key: nil)
+    # If we are using Web Purifier
+    @wp_client = web_purifier_api_key ? WebPurify::Client.new(web_purifier_api_key) : nil
+    exact_match_dictionary = load_exact_match_dictionary
+    partial_match_dictionary = load_partial_match_dictionary
+    allow_symbol_strategy = ::ProfanityFilterEngine::AllowSymbolsInWordsStrategy.new(
+      dictionary: exact_match_dictionary,
+      ignore_case: true
+    )
+    duplicate_characters_strategy = ::ProfanityFilterEngine::AllowDuplicateCharactersStrategy.new(
+      dictionary: exact_match_dictionary,
+      ignore_case: true
+    )
+    leet_strategy = ::ProfanityFilterEngine::LeetExactMatchStrategy.new(
+      dictionary: exact_match_dictionary,
+      ignore_case: true
+    )
+    partial_match_strategy = ::ProfanityFilterEngine::PartialMatchStrategy.new(
+      dictionary: partial_match_dictionary,
+      ignore_case: true
+    )
+    # Set up strict filter.
+    @strict_filter = ::ProfanityFilterEngine::Composite.new
+    @strict_filter.add_strategies(
+      leet_strategy,
+      allow_symbol_strategy,
+      partial_match_strategy,
+      duplicate_characters_strategy
+    )
+    # Set up tolerant filter.
+    @tolerant_filter = ::ProfanityFilterEngine::Composite.new
+    @tolerant_filter.add_strategies(
+      allow_symbol_strategy,
+      partial_match_strategy
+    )
+  end
+  def profane?(phrase, lang: nil, strictness: :tolerant)
+    return false if phrase == '' || phrase.nil?
+    is_profane = pf_profane?(phrase, strictness: strictness)
+    if !is_profane && use_webpurify?
+      wp_is_profane = wp_profane?(phrase, lang: lang)
+      is_profane = wp_is_profane unless wp_is_profane.nil?
+    end
+    !!is_profane
+  end
+  def profanity_count(phrase, lang: nil, strictness: :tolerant)
+    return 0 if phrase == '' || phrase.nil?
+    banned_words_count = pf_profanity_count(phrase, strictness: strictness)
+    if banned_words_count == 0 && use_webpurify?
+      wp_banned_words_count = wp_profanity_count(phrase, lang: lang)
+      banned_words_count = wp_banned_words_count unless wp_banned_words_count.nil?
+    end
+    banned_words_count
+  end
+  private
+  def use_webpurify?
+    !!@wp_client
+  end
+  def filter(strictness: :tolerant)
+    case strictness
+    when :strict
+      @strict_filter
+    when :tolerant
+      @tolerant_filter
+    else
+      @tolerant_filter
+    end
+  end
+  def pf_profane?(phrase, strictness: :tolerant)
+    filter(strictness: strictness).profane?(phrase)
+  end
+  def pf_profanity_count(phrase, strictness: :tolerant)
+    filter(strictness: strictness).profanity_count(phrase)
+  end
+  def wp_profane?(phrase, lang: nil, timeout_duration: 5)
+    profanity_count = wp_profanity_count(phrase, lang: lang, timeout_duration: timeout_duration)
+    if profanity_count.nil? || profanity_count == 0
+      false
+    else
+      true
+    end
+  end
+  def wp_profanity_count(phrase, lang: nil, timeout_duration: 5)
+    Timeout::timeout(timeout_duration) do
+      @wp_client.check_count phrase, lang: wp_langs_list_with(lang)
+    end
+  rescue StandardError => e
+    nil
+  end
+  def wp_langs_list_with(lang)
+    langs = Set.new(WP_DEFAULT_LANGS)
+    if lang
+      lang = shorten_language(lang).to_sym
+      lang = WP_LANG_CONVERSIONS[lang] || lang
+      if WP_AVAILABLE_LANGS.include?(lang)
+        langs << lang
+      end
+    end
+    langs.to_a.join(',')
+  end
+  def load_dictionary(file_path)
+    dir = File.dirname(__FILE__)
+    YAML.load(File.read("#{dir}/profanity_dictionaries/#{file_path}.yaml"))
+  end
+  def load_exact_match_dictionary
+    en_dictionary = load_dictionary('en')
+    es_dictionary = load_dictionary('es')
+    pt_dictionary = load_dictionary('pt')
+    en_dictionary + es_dictionary + pt_dictionary
+  end
+  def load_partial_match_dictionary
+    load_dictionary('partial_match')
+  end
+  def shorten_language(lang)
+    lang && lang.to_s.downcase[0, 2]
+  end
+end

data/lib/profanity_filter/engines/allow_duplicate_characters_strategy.rb ADDED

@@ -0,0 +1,15 @@
+# frozen_string_literal: true
+require_relative 'exact_match_strategy'
+module ProfanityFilterEngine
+  class AllowDuplicateCharactersStrategy < ExactMatchStrategy
+    DEFAULT_IGNORE_CASE = true
+    private
+    def build_word_regexp(word)
+      word.chars.map { |char| Regexp.escape(char) + '+' }.join
+    end
+  end
+end

data/lib/profanity_filter/engines/allow_symbols_in_words_strategy.rb ADDED

@@ -0,0 +1,16 @@
+# frozen_string_literal: true
+require_relative 'exact_match_strategy'
+module ProfanityFilterEngine
+  class AllowSymbolsInWordsStrategy < ExactMatchStrategy
+    SYMBOLS_REGEXP = '(?:\p{Mark}|\p{Separator}|\p{Symbol}|\p{Punctuation})*'
+    DEFAULT_IGNORE_CASE = true
+    private
+    def build_word_regexp(word)
+      word.chars.map { |char| Regexp.escape(char) }.join(SYMBOLS_REGEXP)
+    end
+  end
+end

data/lib/profanity_filter/engines/component.rb ADDED

@@ -0,0 +1,17 @@
+# frozen_string_literal: true
+module ProfanityFilterEngine
+  class Component
+    def profane?(text)
+      raise NotImplementedError
+    end
+    def profane_words(text)
+      raise NotImplementedError
+    end
+    def profanity_count(text)
+      profane_words(text).size
+    end
+  end
+end

data/lib/profanity_filter/engines/composite.rb ADDED

@@ -0,0 +1,37 @@
+# frozen_string_literal: true
+require 'yaml'
+require_relative 'component'
+module ProfanityFilterEngine
+  class Composite < Component
+    attr_reader :strategies
+    def initialize
+      @strategies = []
+    end
+    def add_strategy(strategy)
+      strategies << strategy
+    end
+    def add_strategies(*new_strategies)
+      strategies.concat(new_strategies)
+    end
+    def delete_strategy(strategy)
+      strategies.delete(strategy)
+    end
+    def profane?(text)
+      strategies.any? { |strategy| strategy.profane?(text) }
+    end
+    def profane_words(text)
+      total_words = strategies.reduce([]) do |words, strategy|
+        words.concat(strategy.profane_words(text))
+      end
+      total_words.uniq
+    end
+  end
+end

data/lib/profanity_filter/engines/exact_match_strategy.rb ADDED

@@ -0,0 +1,35 @@
+# frozen_string_literal: true
+require_relative 'regexp_strategy'
+module ProfanityFilterEngine
+  class ExactMatchStrategy < RegexpStrategy
+    DELIMITER = '(?:\b|^|$|_)'
+    DEFAULT_IGNORE_CASE = false
+    attr_reader :delimiter
+    attr_reader :ignore_case
+    def initialize(dictionary:, ignore_case: DEFAULT_IGNORE_CASE)
+      @dictionary = dictionary
+      @delimiter = DELIMITER
+      @ignore_case = ignore_case
+      @profanity_regexp = build_profanity_regexp
+    end
+    private
+    def build_profanity_regexp
+      option = ignore_case ? Regexp::IGNORECASE : nil
+      regexp_list = dictionary.map do |word|
+        Regexp.new("#{delimiter}#{build_word_regexp(word)}#{delimiter}", option)
+      end
+      Regexp.union(*regexp_list)
+    end
+    def build_word_regexp(word)
+      Regexp.escape(word)
+    end
+  end
+end

data/lib/profanity_filter/engines/leet_exact_match_strategy.rb ADDED

@@ -0,0 +1,40 @@
+# frozen_string_literal: true
+require_relative 'exact_match_strategy'
+require 'pry'
+module ProfanityFilterEngine
+  class LeetExactMatchStrategy < ExactMatchStrategy
+    DEFAULT_IGNORE_CASE = true
+    private
+    def build_word_regexp(word)
+      build_leet_dictionary unless defined? LEET_DICTIONARY
+      word.chars.map do |char|
+        downcase_char = char.downcase
+        if LEET_DICTIONARY.include?(downcase_char)
+          LEET_DICTIONARY[downcase_char]
+        else
+          Regexp.escape(char)
+        end
+      end.join
+    end
+    def build_leet_dictionary
+      lib_dir  = File.expand_path('../../../', __FILE__)
+      file     = File.read("#{lib_dir}/profanity_dictionaries/leet_strategy_dictionary.yaml")
+      raw_data = YAML.safe_load(file)
+      dict     = transform_data_to_regex(raw_data)
+      ::ProfanityFilterEngine::LeetExactMatchStrategy.const_set('LEET_DICTIONARY', dict)
+    end
+    def transform_data_to_regex(dict)
+      dict.map do |char, data|
+        data_str = data.join('|')
+        dict[char] = "(?:#{data_str})"
+      end
+      dict
+    end
+  end
+end

data/lib/profanity_filter/engines/partial_match_strategy.rb ADDED

@@ -0,0 +1,28 @@
+# frozen_string_literal: true
+require_relative 'regexp_strategy'
+module ProfanityFilterEngine
+  class PartialMatchStrategy < RegexpStrategy
+    DEFAULT_IGNORE_CASE = false
+    attr_reader :ignore_case
+    def initialize(dictionary:, ignore_case: DEFAULT_IGNORE_CASE)
+      @dictionary = dictionary
+      @ignore_case = ignore_case
+      @profanity_regexp = build_profanity_regexp
+    end
+    private
+    def build_profanity_regexp
+      option = ignore_case ? Regexp::IGNORECASE : nil
+      regexp_list = dictionary.map do |word|
+        Regexp.new("#{Regexp.escape(word)}", option)
+      end
+      Regexp.union(*regexp_list)
+    end
+  end
+end

data/lib/profanity_filter/engines/regexp_strategy.rb ADDED

@@ -0,0 +1,37 @@
+# frozen_string_literal: true
+require_relative 'component'
+module ProfanityFilterEngine
+  class RegexpStrategy < Component
+    DEFAULT_DELIMITER = '(?:\b|^|$|_)'
+    attr_reader :dictionary, :profanity_regexp
+    attr_writer :profanity_regexp
+    private :profanity_regexp=
+    def initialize(dictionary:, profanity_regexp: nil)
+      @dictionary = dictionary
+      @profanity_regexp = profanity_regexp || build_profanity_regexp
+    end
+    def profane_words(text)
+      text.scan(profanity_regexp).uniq
+    end
+    def profane?(text)
+      profanity_regexp.match?(text)
+    end
+    private
+    def build_profanity_regexp
+      regexp_list = dictionary.map do |word|
+        Regexp.new("#{DEFAULT_DELIMITER}#{Regexp.escape(word)}#{DEFAULT_DELIMITER}")
+      end
+      Regexp.union(*regexp_list)
+    end
+  end
+end