wikipedia-vandalism_detection 0.1.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +19 -0
- data/.rspec +2 -0
- data/.rubocop.yml +35 -0
- data/.travis.yml +6 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +4 -0
- data/README.md +288 -0
- data/Rakefile +11 -0
- data/config/wikipedia-vandalism-detection.yml.example +103 -0
- data/lib/java/SMOTE.jar +0 -0
- data/lib/java/balancedRandomForest.jar +0 -0
- data/lib/java/diffutils-1.3.0.jar +0 -0
- data/lib/java/oneClassClassifier.jar +0 -0
- data/lib/java/realAdaBoost.jar +0 -0
- data/lib/java/swc-engine-1.1.0-jar-with-dependencies.jar +0 -0
- data/lib/java/sweble-wikitext-extractor.jar +0 -0
- data/lib/weka/classifiers/meta/one_class_classifier.rb +21 -0
- data/lib/weka/classifiers/meta/real_ada_boost.rb +15 -0
- data/lib/weka/classifiers/trees/balanced_random_forest.rb +16 -0
- data/lib/weka/filters/supervised/instance/smote.rb +15 -0
- data/lib/wikipedia/vandalism_detection/algorithms/kullback_leibler_divergence.rb +103 -0
- data/lib/wikipedia/vandalism_detection/algorithms.rb +12 -0
- data/lib/wikipedia/vandalism_detection/classifier.rb +202 -0
- data/lib/wikipedia/vandalism_detection/configuration.rb +350 -0
- data/lib/wikipedia/vandalism_detection/diff.rb +36 -0
- data/lib/wikipedia/vandalism_detection/edit.rb +81 -0
- data/lib/wikipedia/vandalism_detection/evaluator.rb +640 -0
- data/lib/wikipedia/vandalism_detection/exceptions.rb +47 -0
- data/lib/wikipedia/vandalism_detection/feature_calculator.rb +94 -0
- data/lib/wikipedia/vandalism_detection/features/all_wordlists_frequency.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/all_wordlists_impact.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/anonymity.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/anonymity_previous.rb +32 -0
- data/lib/wikipedia/vandalism_detection/features/article_size.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/bad_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/bad_impact.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/base.rb +61 -0
- data/lib/wikipedia/vandalism_detection/features/biased_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/biased_impact.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/blanking.rb +26 -0
- data/lib/wikipedia/vandalism_detection/features/character_diversity.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/character_sequence.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/comment_bad_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/comment_biased_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/comment_length.rb +16 -0
- data/lib/wikipedia/vandalism_detection/features/comment_markup_frequency.rb +30 -0
- data/lib/wikipedia/vandalism_detection/features/comment_pronoun_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/comment_sex_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/comment_vulgarism_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/compressibility.rb +31 -0
- data/lib/wikipedia/vandalism_detection/features/contains_base.rb +16 -0
- data/lib/wikipedia/vandalism_detection/features/copyedit.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/digit_ratio.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/edits_per_user.rb +72 -0
- data/lib/wikipedia/vandalism_detection/features/emoticons_frequency.rb +27 -0
- data/lib/wikipedia/vandalism_detection/features/emoticons_impact.rb +29 -0
- data/lib/wikipedia/vandalism_detection/features/frequency_base.rb +18 -0
- data/lib/wikipedia/vandalism_detection/features/impact_base.rb +25 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_character_distribution.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_external_links.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_internal_links.rb +18 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_size.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_words.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/longest_word.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/markup_frequency.rb +29 -0
- data/lib/wikipedia/vandalism_detection/features/markup_impact.rb +30 -0
- data/lib/wikipedia/vandalism_detection/features/non_alphanumeric_ratio.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/personal_life.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/pronoun_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/pronoun_impact.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/removed_all_wordlists_frequency.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/removed_bad_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_biased_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_character_distribution.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/removed_emoticons_frequency.rb +28 -0
- data/lib/wikipedia/vandalism_detection/features/removed_markup_frequency.rb +30 -0
- data/lib/wikipedia/vandalism_detection/features/removed_pronoun_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_sex_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_size.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/removed_vulgarism_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_words.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/replacement_similarity.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/reverted.rb +16 -0
- data/lib/wikipedia/vandalism_detection/features/revisions_character_distribution.rb +25 -0
- data/lib/wikipedia/vandalism_detection/features/same_editor.rb +32 -0
- data/lib/wikipedia/vandalism_detection/features/sex_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/sex_impact.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/size_increment.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/size_ratio.rb +28 -0
- data/lib/wikipedia/vandalism_detection/features/term_frequency.rb +31 -0
- data/lib/wikipedia/vandalism_detection/features/time_interval.rb +38 -0
- data/lib/wikipedia/vandalism_detection/features/time_of_day.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/upper_case_ratio.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/upper_case_words_ratio.rb +33 -0
- data/lib/wikipedia/vandalism_detection/features/upper_to_lower_case_ratio.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/vulgarism_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/vulgarism_impact.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/weekday.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/words_increment.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features.rb +66 -0
- data/lib/wikipedia/vandalism_detection/instances.rb +121 -0
- data/lib/wikipedia/vandalism_detection/page.rb +101 -0
- data/lib/wikipedia/vandalism_detection/page_parser.rb +63 -0
- data/lib/wikipedia/vandalism_detection/revision.rb +75 -0
- data/lib/wikipedia/vandalism_detection/revision_parser.rb +67 -0
- data/lib/wikipedia/vandalism_detection/test_dataset.rb +374 -0
- data/lib/wikipedia/vandalism_detection/text.rb +23 -0
- data/lib/wikipedia/vandalism_detection/training_dataset.rb +282 -0
- data/lib/wikipedia/vandalism_detection/version.rb +5 -0
- data/lib/wikipedia/vandalism_detection/wikitext_extractor.rb +80 -0
- data/lib/wikipedia/vandalism_detection/word_lists/bad.rb +11 -0
- data/lib/wikipedia/vandalism_detection/word_lists/biased.rb +20 -0
- data/lib/wikipedia/vandalism_detection/word_lists/emoticons.rb +26 -0
- data/lib/wikipedia/vandalism_detection/word_lists/markup.rb +19 -0
- data/lib/wikipedia/vandalism_detection/word_lists/pronouns.rb +11 -0
- data/lib/wikipedia/vandalism_detection/word_lists/sex.rb +10 -0
- data/lib/wikipedia/vandalism_detection/word_lists/vulgarism.rb +96 -0
- data/lib/wikipedia/vandalism_detection/word_lists.rb +17 -0
- data/lib/wikipedia/vandalism_detection.rb +29 -0
- data/lib/wikipedia.rb +41 -0
- data/spec/factories/edit.rb +19 -0
- data/spec/factories/page.rb +12 -0
- data/spec/factories/revision.rb +51 -0
- data/spec/resources/config/wikipedia-vandalism-detection.yml +35 -0
- data/spec/resources/corpora/test/edits.csv +8 -0
- data/spec/resources/corpora/test/ground-truth.txt +3 -0
- data/spec/resources/corpora/test/revisions/part-1/326471754.txt +199 -0
- data/spec/resources/corpora/test/revisions/part-1/326873205.txt +137 -0
- data/spec/resources/corpora/test/revisions/part-1/328774035.txt +162 -0
- data/spec/resources/corpora/test/revisions/part-2/307084144.txt +137 -0
- data/spec/resources/corpora/test/revisions/part-2/326978767.txt +199 -0
- data/spec/resources/corpora/test/revisions/part-2/328774110.txt +162 -0
- data/spec/resources/corpora/training/annotations.csv +7 -0
- data/spec/resources/corpora/training/edits.csv +7 -0
- data/spec/resources/corpora/training/revisions/part-1/326471754.txt +199 -0
- data/spec/resources/corpora/training/revisions/part-1/326873205.txt +137 -0
- data/spec/resources/corpora/training/revisions/part-1/328774035.txt +162 -0
- data/spec/resources/corpora/training/revisions/part-2/307084144.txt +137 -0
- data/spec/resources/corpora/training/revisions/part-2/326978767.txt +199 -0
- data/spec/resources/corpora/training/revisions/part-2/328774110.txt +162 -0
- data/spec/resources/page_with_redirects.xml +85 -0
- data/spec/resources/redirect_page.xml +59 -0
- data/spec/resources/revision_simplified.xml +13 -0
- data/spec/resources/sample_revision.txt +137 -0
- data/spec/resources/sample_revision_clean_text.txt +1 -0
- data/spec/resources/sample_revision_plain_text.txt +183 -0
- data/spec/resources/vandalism_on_wikipedia.xml +234 -0
- data/spec/resources/vandalism_on_wikipedia_simplified.xml +119 -0
- data/spec/resources/wikipedia_tokens.txt +30 -0
- data/spec/spec_helper.rb +38 -0
- data/spec/support/macros/file_reading.rb +6 -0
- data/spec/support/macros/test_configuration.rb +81 -0
- data/spec/vandalism_detection/algorithms/kullback_leibler_divergence_spec.rb +34 -0
- data/spec/vandalism_detection/classifier_spec.rb +330 -0
- data/spec/vandalism_detection/configuration_spec.rb +601 -0
- data/spec/vandalism_detection/diff_spec.rb +40 -0
- data/spec/vandalism_detection/edit_spec.rb +122 -0
- data/spec/vandalism_detection/evaluator_spec.rb +711 -0
- data/spec/vandalism_detection/feature_calculator_spec.rb +135 -0
- data/spec/vandalism_detection/features/all_wordlists_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/all_wordlists_impact_spec.rb +52 -0
- data/spec/vandalism_detection/features/anonymity_previous_spec.rb +67 -0
- data/spec/vandalism_detection/features/anonymity_spec.rb +17 -0
- data/spec/vandalism_detection/features/article_size_spec.rb +29 -0
- data/spec/vandalism_detection/features/bad_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/bad_impact_spec.rb +53 -0
- data/spec/vandalism_detection/features/base_spec.rb +41 -0
- data/spec/vandalism_detection/features/biased_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/biased_impact_spec.rb +52 -0
- data/spec/vandalism_detection/features/blanking_spec.rb +35 -0
- data/spec/vandalism_detection/features/character_diversity_spec.rb +30 -0
- data/spec/vandalism_detection/features/character_sequence_spec.rb +31 -0
- data/spec/vandalism_detection/features/comment_bad_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_biased_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_length_spec.rb +21 -0
- data/spec/vandalism_detection/features/comment_markup_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_pronoun_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_sex_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_vulgarism_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/compressibility_spec.rb +38 -0
- data/spec/vandalism_detection/features/contains_base_spec.rb +27 -0
- data/spec/vandalism_detection/features/copyedit_spec.rb +30 -0
- data/spec/vandalism_detection/features/digit_ratio_spec.rb +30 -0
- data/spec/vandalism_detection/features/edits_per_user_spec.rb +51 -0
- data/spec/vandalism_detection/features/emoticons_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/emoticons_impact_spec.rb +45 -0
- data/spec/vandalism_detection/features/frequency_base_spec.rb +20 -0
- data/spec/vandalism_detection/features/impact_base_spec.rb +36 -0
- data/spec/vandalism_detection/features/inserted_character_distribution_spec.rb +40 -0
- data/spec/vandalism_detection/features/inserted_external_links_spec.rb +29 -0
- data/spec/vandalism_detection/features/inserted_internal_links_spec.rb +29 -0
- data/spec/vandalism_detection/features/inserted_size_spec.rb +29 -0
- data/spec/vandalism_detection/features/inserted_words_spec.rb +29 -0
- data/spec/vandalism_detection/features/longest_word_spec.rb +29 -0
- data/spec/vandalism_detection/features/markup_frequency_spec.rb +31 -0
- data/spec/vandalism_detection/features/markup_impact_spec.rb +53 -0
- data/spec/vandalism_detection/features/non_alphanumeric_ratio_spec.rb +30 -0
- data/spec/vandalism_detection/features/personal_life_spec.rb +22 -0
- data/spec/vandalism_detection/features/pronoun_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/pronoun_impact_spec.rb +53 -0
- data/spec/vandalism_detection/features/removed_all_wordlists_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_bad_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_biased_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_character_distribution_spec.rb +40 -0
- data/spec/vandalism_detection/features/removed_emoticons_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_markup_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_pronoun_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_sex_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_size_spec.rb +29 -0
- data/spec/vandalism_detection/features/removed_vulgarism_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_words_spec.rb +29 -0
- data/spec/vandalism_detection/features/replacement_similarity_spec.rb +37 -0
- data/spec/vandalism_detection/features/reverted_spec.rb +24 -0
- data/spec/vandalism_detection/features/revisions_character_distribution_spec.rb +40 -0
- data/spec/vandalism_detection/features/same_editor_spec.rb +71 -0
- data/spec/vandalism_detection/features/sex_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/sex_impact_spec.rb +53 -0
- data/spec/vandalism_detection/features/size_increment_spec.rb +29 -0
- data/spec/vandalism_detection/features/size_ratio_spec.rb +48 -0
- data/spec/vandalism_detection/features/term_frequency_spec.rb +32 -0
- data/spec/vandalism_detection/features/time_interval_spec.rb +56 -0
- data/spec/vandalism_detection/features/time_of_day_spec.rb +16 -0
- data/spec/vandalism_detection/features/upper_case_ratio_spec.rb +30 -0
- data/spec/vandalism_detection/features/upper_case_words_ratio_spec.rb +33 -0
- data/spec/vandalism_detection/features/upper_to_lower_case_ratio_spec.rb +30 -0
- data/spec/vandalism_detection/features/vulgarism_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/vulgarism_impact_spec.rb +52 -0
- data/spec/vandalism_detection/features/weekday_spec.rb +16 -0
- data/spec/vandalism_detection/features/words_increment_spec.rb +29 -0
- data/spec/vandalism_detection/instances_spec.rb +146 -0
- data/spec/vandalism_detection/page_parser_spec.rb +190 -0
- data/spec/vandalism_detection/page_spec.rb +134 -0
- data/spec/vandalism_detection/revision_parser_spec.rb +53 -0
- data/spec/vandalism_detection/revision_spec.rb +148 -0
- data/spec/vandalism_detection/test_dataset_spec.rb +227 -0
- data/spec/vandalism_detection/text_spec.rb +29 -0
- data/spec/vandalism_detection/training_dataset_spec.rb +266 -0
- data/spec/vandalism_detection/wikitext_extractor_spec.rb +97 -0
- data/spec/weka/classifiers/meta/one_class_classifier_spec.rb +82 -0
- data/spec/weka/classifiers/meta/real_ada_boost_spec.rb +37 -0
- data/spec/weka/classifiers/trees/balanced_random_forest_spec.rb +37 -0
- data/spec/weka/filters/supervised/instance/smote_spec.rb +5 -0
- data/wikipedia-vandalism_detection.gemspec +37 -0
- metadata +550 -0
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/features/base'
|
|
2
|
+
|
|
3
|
+
module Wikipedia
  module VandalismDetection
    module Features
      # Feature: size of the text that the edit removed, taken from
      # +edit.removed_text+.
      class RemovedSize < Base
        # Returns the number of removed characters; the result is never
        # smaller than 0.
        #
        # The parent implementation is invoked first via +super+.
        def calculate(edit)
          super

          removed_size = edit.removed_text.size
          return 0 unless removed_size > 0

          removed_size
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/features/frequency_base'
|
|
2
|
+
require 'wikipedia/vandalism_detection/word_lists/vulgarism'
|
|
3
|
+
require 'wikipedia/vandalism_detection/text'
|
|
4
|
+
|
|
5
|
+
module Wikipedia
  module VandalismDetection
    module Features
      # Feature: frequency of vulgarism words within the text removed by the
      # edit.
      class RemovedVulgarismFrequency < FrequencyBase
        # Joins the removed words (one per line), cleans the resulting text and
        # returns the frequency of WordLists::VULGARISM terms in it.
        # Returns 0.0 if the cleaned removed text is of zero length.
        def calculate(edit)
          super

          removed = edit.removed_words.join("\n")
          cleaned = Text.new(removed).clean

          frequency(cleaned, WordLists::VULGARISM)
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/features/base'
|
|
2
|
+
|
|
3
|
+
module Wikipedia
  module VandalismDetection
    module Features
      # Feature: how many words the edit removed.
      class RemovedWords < Base
        # Returns the number of entries in +edit.removed_words+.
        def calculate(edit)
          super

          removed = edit.removed_words
          removed.count
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/features/base'
|
|
2
|
+
require 'wikipedia/vandalism_detection/diff'
|
|
3
|
+
require 'hotwater'
|
|
4
|
+
|
|
5
|
+
module Wikipedia
  module VandalismDetection
    module Features
      # Feature: similarity between the text the edit deleted and the text it
      # inserted, measured with the Jaro-Winkler distance.
      # See: http://courses.cs.washington.edu/courses/cse590q/04au/papers/Winkler99.pdf
      class ReplacementSimilarity < Base
        # Returns Hotwater's Jaro-Winkler distance of the removed text to the
        # inserted text.
        def calculate(edit)
          super

          removed = edit.removed_text
          inserted = edit.inserted_text

          ::Hotwater.jaro_winkler_distance(removed, inserted)
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/features/contains_base'
|
|
2
|
+
|
|
3
|
+
module Wikipedia
  module VandalismDetection
    module Features
      # Feature: does the comment of the new revision mention a copyedit?
      class Copyedit < ContainsBase
        # Checks the new revision's comment for the terms 'copyedit' and
        # 'copy edit' and returns the result of ContainsBase#contains.
        def calculate(edit)
          super

          comment = edit.new_revision.comment
          contains(comment, ['copyedit', 'copy edit'])
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/features/base'
|
|
2
|
+
require 'wikipedia/vandalism_detection/algorithms'
|
|
3
|
+
|
|
4
|
+
module Wikipedia
  module VandalismDetection
    module Features
      # Feature: Kullback-Leibler divergence between the character
      # distributions of the old and the new revision text.
      # A small divergence means similar distributions, a large divergence
      # means dissimilar ones.
      class RevisionsCharacterDistribution < Base
        include Algorithms

        # Returns the divergence computed by
        # Algorithms#kullback_leibler_divergence for (old text, new text).
        def calculate(edit)
          super

          old_text = edit.old_revision.text
          new_text = edit.new_revision.text

          kullback_leibler_divergence(old_text, new_text)
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
require_relative 'base'
|
|
2
|
+
|
|
3
|
+
module Wikipedia
  module VandalismDetection
    module Features
      # Feature: was the new revision written by the same contributor as the
      # old revision?
      class SameEditor < Base
        # Returns 1 if both revisions have the same contributor, 0 otherwise.
        #
        # If the old revision has no contributor, it is requested from the
        # Wikipedia API and stored on the old revision. If the API yields no
        # contributor either, Features::MISSING_VALUE is returned.
        def calculate(edit)
          super

          previous = edit.old_revision

          if previous.contributor.blank?
            fetched = request_contributor(previous.id)
            return Features::MISSING_VALUE if fetched.blank?

            previous.contributor = fetched
          end

          same = previous.contributor == edit.new_revision.contributor
          same ? 1 : 0
        end

        private

        # Requests the 'user' property of the given revision id from the
        # Wikipedia API and returns the text of the //rev/@user attribute.
        def request_contributor(revision_id)
          response = Wikipedia.api_request(
            prop: 'revisions',
            rvprop: 'user',
            revids: revision_id
          )

          response.xpath('//rev/@user').text
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/features/frequency_base'
|
|
2
|
+
require 'wikipedia/vandalism_detection/word_lists/sex'
|
|
3
|
+
require 'wikipedia/vandalism_detection/text'
|
|
4
|
+
|
|
5
|
+
module Wikipedia
  module VandalismDetection
    module Features
      # Feature: frequency of sex words within the text inserted by the edit.
      class SexFrequency < FrequencyBase
        # Joins the inserted words (one per line), cleans the resulting text
        # and returns the frequency of WordLists::SEX terms in it.
        # Returns 0.0 if the cleaned inserted text is of zero length.
        def calculate(edit)
          super

          inserted = edit.inserted_words.join("\n")
          cleaned = Text.new(inserted).clean

          frequency(cleaned, WordLists::SEX)
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/features/impact_base'
|
|
2
|
+
require 'wikipedia/vandalism_detection/word_lists/sex'
|
|
3
|
+
|
|
4
|
+
module Wikipedia
  module VandalismDetection
    module Features
      # Feature: the percentage by which the edit increases the number of sex
      # words in the text.
      class SexImpact < ImpactBase
        # Passes the cleaned old and new revision texts together with
        # WordLists::SEX to ImpactBase#impact and returns its result.
        def calculate(edit)
          super

          before = edit.old_revision.text.clean
          after = edit.new_revision.text.clean
          cleaned_texts = [before, after]

          impact(*cleaned_texts, WordLists::SEX)
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/features/base'
|
|
2
|
+
|
|
3
|
+
module Wikipedia
  module VandalismDetection
    module Features
      # Feature: change in text length between the edit's old and new
      # revision.
      class SizeIncrement < Base
        # Returns |new| - |old|; the value is negative when the new revision
        # text is shorter than the old one.
        def calculate(edit)
          super

          size_before = edit.old_revision.text.size
          size_after = edit.new_revision.text.size

          size_after - size_before
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/features/base'
|
|
2
|
+
|
|
3
|
+
module Wikipedia
  module VandalismDetection
    module Features
      # Feature: ratio of the old revision's text length to the combined text
      # length of both revisions.
      class SizeRatio < Base
        # Computation: old / (old + new)
        #
        # returns 0.0 for an empty old revision text,
        # returns 1.0 for an empty new revision text,
        # returns 0.5 if both revision texts are empty or of the same size.
        def calculate(edit)
          super

          old_size = edit.old_revision.text.size.to_f
          new_size = edit.new_revision.text.size.to_f

          # Both texts empty: avoid 0.0 / 0.0 and report the neutral value.
          return 0.5 if old_size.zero? && new_size.zero?

          old_size / (old_size + new_size)
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/features/base'
|
|
2
|
+
require 'wikipedia/vandalism_detection/text'
|
|
3
|
+
require 'wikipedia/vandalism_detection/diff'
|
|
4
|
+
|
|
5
|
+
module Wikipedia
  module VandalismDetection
    module Features
      # This feature computes the average frequency of the words inserted in
      # the new revision, measured against the cleaned text of the new
      # revision.
      #
      # NOTE(review): the file defining this class requires only
      # 'features/base' although the class inherits from FrequencyBase; it
      # appears to rely on frequency_base being loaded elsewhere first -
      # confirm and add the require if needed.
      class TermFrequency < FrequencyBase
        # Returns the mean of FrequencyBase#frequency over all unique inserted
        # terms, or 0.0 if the edit inserted no terms.
        def calculate(edit)
          super

          new_text = edit.new_revision.text

          # Unique inserted terms with everything but word characters and
          # whitespace stripped.
          inserted_terms = Text.new(edit.inserted_words.join("\n"))
                               .clean.gsub(/[^\w\s]/, '').split.uniq

          return 0.0 if inserted_terms.empty?

          # The cleaned text does not depend on the term, so compute it once
          # instead of once per inserted term.
          clean_new_text = new_text.clean

          summed_frequencies = inserted_terms.reduce(0) do |sum, term|
            sum + frequency(clean_new_text, term)
          end

          summed_frequencies / inserted_terms.count
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/features/base'
|
|
2
|
+
require 'date'
|
|
3
|
+
|
|
4
|
+
module Wikipedia
  module VandalismDetection
    module Features
      # Feature: time interval in days between the old and the new revision.
      class TimeInterval < Base
        # Returns the absolute difference of both revision timestamps as a
        # Float, or Features::MISSING_VALUE if no timestamp can be determined
        # for the old revision.
        def calculate(edit)
          super

          newer = DateTime.parse(edit.new_revision.timestamp)
          older_stamp = timestamp_for(edit.old_revision)

          return Features::MISSING_VALUE unless older_stamp

          elapsed = newer - DateTime.parse(older_stamp)
          elapsed.to_f.abs
        end

        private

        # Returns the revision's own timestamp if present; otherwise requests
        # the 'timestamp' property for the revision id from the Wikipedia API
        # and returns the //rev/@timestamp attribute text (nil if blank).
        def timestamp_for(revision)
          known = revision.timestamp
          return known if known.present?

          response = Wikipedia.api_request(
            prop: 'revisions',
            rvprop: 'timestamp',
            revids: revision.id
          )

          response.xpath('//rev/@timestamp').text.presence
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/features/base'
|
|
2
|
+
require 'date'
|
|
3
|
+
|
|
4
|
+
module Wikipedia
  module VandalismDetection
    module Features
      # This feature calculates the time of day of the new revision's edit as
      # a decimal number of hours.
      class TimeOfDay < Base
        # Parses the new revision's timestamp and returns
        # hours + minutes / 60 + seconds / 3600 as a Float,
        # e.g. 10:30:36 => 10.51.
        def calculate(edit)
          super

          time = DateTime.parse(edit.new_revision.timestamp)

          # An hour has 3600 seconds; dividing by 360 would weight the
          # seconds ten times too much.
          time.hour.to_f + time.min / 60.0 + time.sec / 3600.0
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/features/base'
|
|
2
|
+
|
|
3
|
+
module Wikipedia
  module VandalismDetection
    module Features
      # Feature: ratio of uppercase letters to all letters in the text
      # inserted by the edit.
      class UpperCaseRatio < Base
        # Returns (1 + uppercase letters) / (1 + all letters) as a Float,
        # or 0.0 if the inserted text is empty.
        def calculate(edit)
          super

          inserted = edit.inserted_text
          return 0.0 if inserted.empty?

          uppers = inserted.scan(/[[:upper:]]/)
          letters = inserted.scan(/[[:alpha:]]/)

          (1.0 + uppers.size) / (1.0 + letters.size)
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
# encoding: UTF-8
|
|
2
|
+
|
|
3
|
+
require 'wikipedia/vandalism_detection/features/base'
|
|
4
|
+
require 'wikipedia/vandalism_detection/text'
|
|
5
|
+
|
|
6
|
+
module Wikipedia
  module VandalismDetection
    module Features
      # This feature computes the ratio of uppercase words to all words in the
      # text inserted by the edit.
      class UpperCaseWordsRatio < Base
        # Returns (1 + uppercase words) / (1 + all words) as a Float,
        # or 0.0 if no words remain after cleaning.
        #
        # Only inserted words containing at least one ASCII letter are
        # considered. A word counts as uppercase if it equals its own upcased
        # form.
        def calculate(edit)
          super

          # reject instead of delete_if: delete_if would remove the words in
          # place from the array returned by edit.inserted_words.
          alpha_words = edit.inserted_words.reject do |word|
            word.gsub(/[^A-Za-z]/, '').empty?
          end

          words = Text.new(alpha_words.join("\n"))
                      .clean.gsub(/[^\w\s]/, '').split

          return 0.0 if words.empty?

          uppercase_words_count = words.count { |word| word == word.upcase }

          (1.0 + uppercase_words_count) / (1.0 + words.count)
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/features/base'
|
|
2
|
+
|
|
3
|
+
module Wikipedia
  module VandalismDetection
    module Features
      # Feature: ratio of uppercase letters to lowercase letters in the text
      # inserted by the edit.
      class UpperToLowerCaseRatio < Base
        # Returns (1 + uppercase letters) / (1 + lowercase letters) as a
        # Float, or 0.0 if the inserted text is empty.
        def calculate(edit)
          super

          inserted = edit.inserted_text
          return 0.0 if inserted.empty?

          uppers = inserted.scan(/[[:upper:]]/)
          lowers = inserted.scan(/[[:lower:]]/)

          (1.0 + uppers.size) / (1.0 + lowers.size)
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/features/frequency_base'
|
|
2
|
+
require 'wikipedia/vandalism_detection/word_lists/vulgarism'
|
|
3
|
+
require 'wikipedia/vandalism_detection/text'
|
|
4
|
+
|
|
5
|
+
module Wikipedia
  module VandalismDetection
    module Features
      # Feature: frequency of vulgarism words within the text inserted by the
      # edit.
      class VulgarismFrequency < FrequencyBase
        # Joins the inserted words (one per line), cleans the resulting text
        # and returns the frequency of WordLists::VULGARISM terms in it.
        # Returns 0.0 if the cleaned inserted text is of zero length.
        def calculate(edit)
          super

          inserted = edit.inserted_words.join("\n")
          cleaned = Text.new(inserted).clean

          frequency(cleaned, WordLists::VULGARISM)
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/features/impact_base'
|
|
2
|
+
require 'wikipedia/vandalism_detection/word_lists/vulgarism'
|
|
3
|
+
|
|
4
|
+
module Wikipedia
|
|
5
|
+
module VandalismDetection
|
|
6
|
+
module Features
|
|
7
|
+
# This feature computes the percentage by which the edit increases the
|
|
8
|
+
# number of vulgarism words in the text.
|
|
9
|
+
class VulgarismImpact < ImpactBase
|
|
10
|
+
# Returns the result of #impact for the cleaned texts of the edit's old and
# new revision, measured against WordLists::VULGARISM.
def calculate(edit)
  # Run the parent implementation first; its return value is not used here.
  super

  # Both revisions are reduced to their cleaned text the same way.
  cleaned_text = ->(revision) { revision.text.clean }

  impact(
    cleaned_text.call(edit.old_revision),
    cleaned_text.call(edit.new_revision),
    WordLists::VULGARISM
  )
end
|
|
18
|
+
end
|
|
19
|
+
end
|
|
20
|
+
end
|
|
21
|
+
end
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/features/base'
|
|
2
|
+
require 'date'
|
|
3
|
+
|
|
4
|
+
module Wikipedia
|
|
5
|
+
module VandalismDetection
|
|
6
|
+
module Features
|
|
7
|
+
# This feature calculates the weekday of new revision edit as numeric
|
|
8
|
+
# value. Sunday => 0, Monday => 1, Tuesday => 2, etc.
|
|
9
|
+
class Weekday < Base
|
|
10
|
+
# Returns the day of week (0-6, Sunday is 0) of the new revision's
# timestamp, which is parsed with DateTime.parse.
def calculate(edit)
  # Run the parent implementation first; its return value is not used here.
  super

  revision_time = DateTime.parse(edit.new_revision.timestamp)
  revision_time.wday
end
|
|
16
|
+
end
|
|
17
|
+
end
|
|
18
|
+
end
|
|
19
|
+
end
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/features/base'
|
|
2
|
+
|
|
3
|
+
module Wikipedia
|
|
4
|
+
module VandalismDetection
|
|
5
|
+
module Features
|
|
6
|
+
# This feature computes the edit's word increment (inserted minus removed words).
|
|
7
|
+
class WordsIncrement < Base
|
|
8
|
+
# Returns the number of inserted words minus the number of removed words,
# i.e. |inserted| - |removed|. The result is negative when more words were
# removed than inserted.
def calculate(edit)
  # Run the parent implementation first; its return value is not used here.
  super

  edit.inserted_words.count - edit.removed_words.count
end
|
|
17
|
+
end
|
|
18
|
+
end
|
|
19
|
+
end
|
|
20
|
+
end
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/features/all_wordlists_frequency'
|
|
2
|
+
require 'wikipedia/vandalism_detection/features/all_wordlists_impact'
|
|
3
|
+
require 'wikipedia/vandalism_detection/features/anonymity'
|
|
4
|
+
require 'wikipedia/vandalism_detection/features/anonymity_previous'
|
|
5
|
+
require 'wikipedia/vandalism_detection/features/article_size'
|
|
6
|
+
require 'wikipedia/vandalism_detection/features/bad_frequency'
|
|
7
|
+
require 'wikipedia/vandalism_detection/features/bad_impact'
|
|
8
|
+
require 'wikipedia/vandalism_detection/features/biased_frequency'
|
|
9
|
+
require 'wikipedia/vandalism_detection/features/biased_impact'
|
|
10
|
+
require 'wikipedia/vandalism_detection/features/blanking'
|
|
11
|
+
require 'wikipedia/vandalism_detection/features/character_diversity'
|
|
12
|
+
require 'wikipedia/vandalism_detection/features/character_sequence'
|
|
13
|
+
require 'wikipedia/vandalism_detection/features/comment_length'
|
|
14
|
+
require 'wikipedia/vandalism_detection/features/comment_bad_frequency'
|
|
15
|
+
require 'wikipedia/vandalism_detection/features/comment_biased_frequency'
|
|
16
|
+
require 'wikipedia/vandalism_detection/features/comment_markup_frequency'
|
|
17
|
+
require 'wikipedia/vandalism_detection/features/comment_pronoun_frequency'
|
|
18
|
+
require 'wikipedia/vandalism_detection/features/comment_sex_frequency'
|
|
19
|
+
require 'wikipedia/vandalism_detection/features/comment_vulgarism_frequency'
|
|
20
|
+
require 'wikipedia/vandalism_detection/features/compressibility'
|
|
21
|
+
require 'wikipedia/vandalism_detection/features/copyedit'
|
|
22
|
+
require 'wikipedia/vandalism_detection/features/digit_ratio'
|
|
23
|
+
require 'wikipedia/vandalism_detection/features/edits_per_user'
|
|
24
|
+
require 'wikipedia/vandalism_detection/features/emoticons_frequency'
|
|
25
|
+
require 'wikipedia/vandalism_detection/features/emoticons_impact'
|
|
26
|
+
require 'wikipedia/vandalism_detection/features/inserted_size'
|
|
27
|
+
require 'wikipedia/vandalism_detection/features/inserted_words'
|
|
28
|
+
require 'wikipedia/vandalism_detection/features/inserted_character_distribution'
|
|
29
|
+
require 'wikipedia/vandalism_detection/features/inserted_external_links'
|
|
30
|
+
require 'wikipedia/vandalism_detection/features/inserted_internal_links'
|
|
31
|
+
require 'wikipedia/vandalism_detection/features/longest_word'
|
|
32
|
+
require 'wikipedia/vandalism_detection/features/markup_frequency'
|
|
33
|
+
require 'wikipedia/vandalism_detection/features/markup_impact'
|
|
34
|
+
require 'wikipedia/vandalism_detection/features/non_alphanumeric_ratio'
|
|
35
|
+
require 'wikipedia/vandalism_detection/features/personal_life'
|
|
36
|
+
require 'wikipedia/vandalism_detection/features/pronoun_frequency'
|
|
37
|
+
require 'wikipedia/vandalism_detection/features/pronoun_impact'
|
|
38
|
+
require 'wikipedia/vandalism_detection/features/removed_all_wordlists_frequency'
|
|
39
|
+
require 'wikipedia/vandalism_detection/features/removed_bad_frequency'
|
|
40
|
+
require 'wikipedia/vandalism_detection/features/removed_biased_frequency'
|
|
41
|
+
require 'wikipedia/vandalism_detection/features/removed_character_distribution'
|
|
42
|
+
require 'wikipedia/vandalism_detection/features/removed_emoticons_frequency'
|
|
43
|
+
require 'wikipedia/vandalism_detection/features/removed_markup_frequency'
|
|
44
|
+
require 'wikipedia/vandalism_detection/features/removed_pronoun_frequency'
|
|
45
|
+
require 'wikipedia/vandalism_detection/features/removed_sex_frequency'
|
|
46
|
+
require 'wikipedia/vandalism_detection/features/removed_vulgarism_frequency'
|
|
47
|
+
require 'wikipedia/vandalism_detection/features/removed_size'
|
|
48
|
+
require 'wikipedia/vandalism_detection/features/removed_words'
|
|
49
|
+
require 'wikipedia/vandalism_detection/features/replacement_similarity'
|
|
50
|
+
require 'wikipedia/vandalism_detection/features/reverted'
|
|
51
|
+
require 'wikipedia/vandalism_detection/features/revisions_character_distribution'
|
|
52
|
+
require 'wikipedia/vandalism_detection/features/same_editor'
|
|
53
|
+
require 'wikipedia/vandalism_detection/features/sex_frequency'
|
|
54
|
+
require 'wikipedia/vandalism_detection/features/sex_impact'
|
|
55
|
+
require 'wikipedia/vandalism_detection/features/size_increment'
|
|
56
|
+
require 'wikipedia/vandalism_detection/features/size_ratio'
|
|
57
|
+
require 'wikipedia/vandalism_detection/features/term_frequency'
|
|
58
|
+
require 'wikipedia/vandalism_detection/features/time_interval'
|
|
59
|
+
require 'wikipedia/vandalism_detection/features/time_of_day'
|
|
60
|
+
require 'wikipedia/vandalism_detection/features/upper_case_ratio'
|
|
61
|
+
require 'wikipedia/vandalism_detection/features/upper_case_words_ratio'
|
|
62
|
+
require 'wikipedia/vandalism_detection/features/upper_to_lower_case_ratio'
|
|
63
|
+
require 'wikipedia/vandalism_detection/features/vulgarism_frequency'
|
|
64
|
+
require 'wikipedia/vandalism_detection/features/vulgarism_impact'
|
|
65
|
+
require 'wikipedia/vandalism_detection/features/weekday'
|
|
66
|
+
require 'wikipedia/vandalism_detection/features/words_increment'
|