wikipedia-vandalism_detection 0.1.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +19 -0
- data/.rspec +2 -0
- data/.rubocop.yml +35 -0
- data/.travis.yml +6 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +4 -0
- data/README.md +288 -0
- data/Rakefile +11 -0
- data/config/wikipedia-vandalism-detection.yml.example +103 -0
- data/lib/java/SMOTE.jar +0 -0
- data/lib/java/balancedRandomForest.jar +0 -0
- data/lib/java/diffutils-1.3.0.jar +0 -0
- data/lib/java/oneClassClassifier.jar +0 -0
- data/lib/java/realAdaBoost.jar +0 -0
- data/lib/java/swc-engine-1.1.0-jar-with-dependencies.jar +0 -0
- data/lib/java/sweble-wikitext-extractor.jar +0 -0
- data/lib/weka/classifiers/meta/one_class_classifier.rb +21 -0
- data/lib/weka/classifiers/meta/real_ada_boost.rb +15 -0
- data/lib/weka/classifiers/trees/balanced_random_forest.rb +16 -0
- data/lib/weka/filters/supervised/instance/smote.rb +15 -0
- data/lib/wikipedia/vandalism_detection/algorithms/kullback_leibler_divergence.rb +103 -0
- data/lib/wikipedia/vandalism_detection/algorithms.rb +12 -0
- data/lib/wikipedia/vandalism_detection/classifier.rb +202 -0
- data/lib/wikipedia/vandalism_detection/configuration.rb +350 -0
- data/lib/wikipedia/vandalism_detection/diff.rb +36 -0
- data/lib/wikipedia/vandalism_detection/edit.rb +81 -0
- data/lib/wikipedia/vandalism_detection/evaluator.rb +640 -0
- data/lib/wikipedia/vandalism_detection/exceptions.rb +47 -0
- data/lib/wikipedia/vandalism_detection/feature_calculator.rb +94 -0
- data/lib/wikipedia/vandalism_detection/features/all_wordlists_frequency.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/all_wordlists_impact.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/anonymity.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/anonymity_previous.rb +32 -0
- data/lib/wikipedia/vandalism_detection/features/article_size.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/bad_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/bad_impact.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/base.rb +61 -0
- data/lib/wikipedia/vandalism_detection/features/biased_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/biased_impact.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/blanking.rb +26 -0
- data/lib/wikipedia/vandalism_detection/features/character_diversity.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/character_sequence.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/comment_bad_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/comment_biased_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/comment_length.rb +16 -0
- data/lib/wikipedia/vandalism_detection/features/comment_markup_frequency.rb +30 -0
- data/lib/wikipedia/vandalism_detection/features/comment_pronoun_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/comment_sex_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/comment_vulgarism_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/compressibility.rb +31 -0
- data/lib/wikipedia/vandalism_detection/features/contains_base.rb +16 -0
- data/lib/wikipedia/vandalism_detection/features/copyedit.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/digit_ratio.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/edits_per_user.rb +72 -0
- data/lib/wikipedia/vandalism_detection/features/emoticons_frequency.rb +27 -0
- data/lib/wikipedia/vandalism_detection/features/emoticons_impact.rb +29 -0
- data/lib/wikipedia/vandalism_detection/features/frequency_base.rb +18 -0
- data/lib/wikipedia/vandalism_detection/features/impact_base.rb +25 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_character_distribution.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_external_links.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_internal_links.rb +18 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_size.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_words.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/longest_word.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/markup_frequency.rb +29 -0
- data/lib/wikipedia/vandalism_detection/features/markup_impact.rb +30 -0
- data/lib/wikipedia/vandalism_detection/features/non_alphanumeric_ratio.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/personal_life.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/pronoun_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/pronoun_impact.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/removed_all_wordlists_frequency.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/removed_bad_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_biased_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_character_distribution.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/removed_emoticons_frequency.rb +28 -0
- data/lib/wikipedia/vandalism_detection/features/removed_markup_frequency.rb +30 -0
- data/lib/wikipedia/vandalism_detection/features/removed_pronoun_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_sex_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_size.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/removed_vulgarism_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_words.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/replacement_similarity.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/reverted.rb +16 -0
- data/lib/wikipedia/vandalism_detection/features/revisions_character_distribution.rb +25 -0
- data/lib/wikipedia/vandalism_detection/features/same_editor.rb +32 -0
- data/lib/wikipedia/vandalism_detection/features/sex_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/sex_impact.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/size_increment.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/size_ratio.rb +28 -0
- data/lib/wikipedia/vandalism_detection/features/term_frequency.rb +31 -0
- data/lib/wikipedia/vandalism_detection/features/time_interval.rb +38 -0
- data/lib/wikipedia/vandalism_detection/features/time_of_day.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/upper_case_ratio.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/upper_case_words_ratio.rb +33 -0
- data/lib/wikipedia/vandalism_detection/features/upper_to_lower_case_ratio.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/vulgarism_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/vulgarism_impact.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/weekday.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/words_increment.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features.rb +66 -0
- data/lib/wikipedia/vandalism_detection/instances.rb +121 -0
- data/lib/wikipedia/vandalism_detection/page.rb +101 -0
- data/lib/wikipedia/vandalism_detection/page_parser.rb +63 -0
- data/lib/wikipedia/vandalism_detection/revision.rb +75 -0
- data/lib/wikipedia/vandalism_detection/revision_parser.rb +67 -0
- data/lib/wikipedia/vandalism_detection/test_dataset.rb +374 -0
- data/lib/wikipedia/vandalism_detection/text.rb +23 -0
- data/lib/wikipedia/vandalism_detection/training_dataset.rb +282 -0
- data/lib/wikipedia/vandalism_detection/version.rb +5 -0
- data/lib/wikipedia/vandalism_detection/wikitext_extractor.rb +80 -0
- data/lib/wikipedia/vandalism_detection/word_lists/bad.rb +11 -0
- data/lib/wikipedia/vandalism_detection/word_lists/biased.rb +20 -0
- data/lib/wikipedia/vandalism_detection/word_lists/emoticons.rb +26 -0
- data/lib/wikipedia/vandalism_detection/word_lists/markup.rb +19 -0
- data/lib/wikipedia/vandalism_detection/word_lists/pronouns.rb +11 -0
- data/lib/wikipedia/vandalism_detection/word_lists/sex.rb +10 -0
- data/lib/wikipedia/vandalism_detection/word_lists/vulgarism.rb +96 -0
- data/lib/wikipedia/vandalism_detection/word_lists.rb +17 -0
- data/lib/wikipedia/vandalism_detection.rb +29 -0
- data/lib/wikipedia.rb +41 -0
- data/spec/factories/edit.rb +19 -0
- data/spec/factories/page.rb +12 -0
- data/spec/factories/revision.rb +51 -0
- data/spec/resources/config/wikipedia-vandalism-detection.yml +35 -0
- data/spec/resources/corpora/test/edits.csv +8 -0
- data/spec/resources/corpora/test/ground-truth.txt +3 -0
- data/spec/resources/corpora/test/revisions/part-1/326471754.txt +199 -0
- data/spec/resources/corpora/test/revisions/part-1/326873205.txt +137 -0
- data/spec/resources/corpora/test/revisions/part-1/328774035.txt +162 -0
- data/spec/resources/corpora/test/revisions/part-2/307084144.txt +137 -0
- data/spec/resources/corpora/test/revisions/part-2/326978767.txt +199 -0
- data/spec/resources/corpora/test/revisions/part-2/328774110.txt +162 -0
- data/spec/resources/corpora/training/annotations.csv +7 -0
- data/spec/resources/corpora/training/edits.csv +7 -0
- data/spec/resources/corpora/training/revisions/part-1/326471754.txt +199 -0
- data/spec/resources/corpora/training/revisions/part-1/326873205.txt +137 -0
- data/spec/resources/corpora/training/revisions/part-1/328774035.txt +162 -0
- data/spec/resources/corpora/training/revisions/part-2/307084144.txt +137 -0
- data/spec/resources/corpora/training/revisions/part-2/326978767.txt +199 -0
- data/spec/resources/corpora/training/revisions/part-2/328774110.txt +162 -0
- data/spec/resources/page_with_redirects.xml +85 -0
- data/spec/resources/redirect_page.xml +59 -0
- data/spec/resources/revision_simplified.xml +13 -0
- data/spec/resources/sample_revision.txt +137 -0
- data/spec/resources/sample_revision_clean_text.txt +1 -0
- data/spec/resources/sample_revision_plain_text.txt +183 -0
- data/spec/resources/vandalism_on_wikipedia.xml +234 -0
- data/spec/resources/vandalism_on_wikipedia_simplified.xml +119 -0
- data/spec/resources/wikipedia_tokens.txt +30 -0
- data/spec/spec_helper.rb +38 -0
- data/spec/support/macros/file_reading.rb +6 -0
- data/spec/support/macros/test_configuration.rb +81 -0
- data/spec/vandalism_detection/algorithms/kullback_leibler_divergence_spec.rb +34 -0
- data/spec/vandalism_detection/classifier_spec.rb +330 -0
- data/spec/vandalism_detection/configuration_spec.rb +601 -0
- data/spec/vandalism_detection/diff_spec.rb +40 -0
- data/spec/vandalism_detection/edit_spec.rb +122 -0
- data/spec/vandalism_detection/evaluator_spec.rb +711 -0
- data/spec/vandalism_detection/feature_calculator_spec.rb +135 -0
- data/spec/vandalism_detection/features/all_wordlists_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/all_wordlists_impact_spec.rb +52 -0
- data/spec/vandalism_detection/features/anonymity_previous_spec.rb +67 -0
- data/spec/vandalism_detection/features/anonymity_spec.rb +17 -0
- data/spec/vandalism_detection/features/article_size_spec.rb +29 -0
- data/spec/vandalism_detection/features/bad_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/bad_impact_spec.rb +53 -0
- data/spec/vandalism_detection/features/base_spec.rb +41 -0
- data/spec/vandalism_detection/features/biased_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/biased_impact_spec.rb +52 -0
- data/spec/vandalism_detection/features/blanking_spec.rb +35 -0
- data/spec/vandalism_detection/features/character_diversity_spec.rb +30 -0
- data/spec/vandalism_detection/features/character_sequence_spec.rb +31 -0
- data/spec/vandalism_detection/features/comment_bad_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_biased_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_length_spec.rb +21 -0
- data/spec/vandalism_detection/features/comment_markup_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_pronoun_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_sex_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_vulgarism_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/compressibility_spec.rb +38 -0
- data/spec/vandalism_detection/features/contains_base_spec.rb +27 -0
- data/spec/vandalism_detection/features/copyedit_spec.rb +30 -0
- data/spec/vandalism_detection/features/digit_ratio_spec.rb +30 -0
- data/spec/vandalism_detection/features/edits_per_user_spec.rb +51 -0
- data/spec/vandalism_detection/features/emoticons_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/emoticons_impact_spec.rb +45 -0
- data/spec/vandalism_detection/features/frequency_base_spec.rb +20 -0
- data/spec/vandalism_detection/features/impact_base_spec.rb +36 -0
- data/spec/vandalism_detection/features/inserted_character_distribution_spec.rb +40 -0
- data/spec/vandalism_detection/features/inserted_external_links_spec.rb +29 -0
- data/spec/vandalism_detection/features/inserted_internal_links_spec.rb +29 -0
- data/spec/vandalism_detection/features/inserted_size_spec.rb +29 -0
- data/spec/vandalism_detection/features/inserted_words_spec.rb +29 -0
- data/spec/vandalism_detection/features/longest_word_spec.rb +29 -0
- data/spec/vandalism_detection/features/markup_frequency_spec.rb +31 -0
- data/spec/vandalism_detection/features/markup_impact_spec.rb +53 -0
- data/spec/vandalism_detection/features/non_alphanumeric_ratio_spec.rb +30 -0
- data/spec/vandalism_detection/features/personal_life_spec.rb +22 -0
- data/spec/vandalism_detection/features/pronoun_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/pronoun_impact_spec.rb +53 -0
- data/spec/vandalism_detection/features/removed_all_wordlists_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_bad_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_biased_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_character_distribution_spec.rb +40 -0
- data/spec/vandalism_detection/features/removed_emoticons_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_markup_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_pronoun_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_sex_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_size_spec.rb +29 -0
- data/spec/vandalism_detection/features/removed_vulgarism_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_words_spec.rb +29 -0
- data/spec/vandalism_detection/features/replacement_similarity_spec.rb +37 -0
- data/spec/vandalism_detection/features/reverted_spec.rb +24 -0
- data/spec/vandalism_detection/features/revisions_character_distribution_spec.rb +40 -0
- data/spec/vandalism_detection/features/same_editor_spec.rb +71 -0
- data/spec/vandalism_detection/features/sex_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/sex_impact_spec.rb +53 -0
- data/spec/vandalism_detection/features/size_increment_spec.rb +29 -0
- data/spec/vandalism_detection/features/size_ratio_spec.rb +48 -0
- data/spec/vandalism_detection/features/term_frequency_spec.rb +32 -0
- data/spec/vandalism_detection/features/time_interval_spec.rb +56 -0
- data/spec/vandalism_detection/features/time_of_day_spec.rb +16 -0
- data/spec/vandalism_detection/features/upper_case_ratio_spec.rb +30 -0
- data/spec/vandalism_detection/features/upper_case_words_ratio_spec.rb +33 -0
- data/spec/vandalism_detection/features/upper_to_lower_case_ratio_spec.rb +30 -0
- data/spec/vandalism_detection/features/vulgarism_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/vulgarism_impact_spec.rb +52 -0
- data/spec/vandalism_detection/features/weekday_spec.rb +16 -0
- data/spec/vandalism_detection/features/words_increment_spec.rb +29 -0
- data/spec/vandalism_detection/instances_spec.rb +146 -0
- data/spec/vandalism_detection/page_parser_spec.rb +190 -0
- data/spec/vandalism_detection/page_spec.rb +134 -0
- data/spec/vandalism_detection/revision_parser_spec.rb +53 -0
- data/spec/vandalism_detection/revision_spec.rb +148 -0
- data/spec/vandalism_detection/test_dataset_spec.rb +227 -0
- data/spec/vandalism_detection/text_spec.rb +29 -0
- data/spec/vandalism_detection/training_dataset_spec.rb +266 -0
- data/spec/vandalism_detection/wikitext_extractor_spec.rb +97 -0
- data/spec/weka/classifiers/meta/one_class_classifier_spec.rb +82 -0
- data/spec/weka/classifiers/meta/real_ada_boost_spec.rb +37 -0
- data/spec/weka/classifiers/trees/balanced_random_forest_spec.rb +37 -0
- data/spec/weka/filters/supervised/instance/smote_spec.rb +5 -0
- data/wikipedia-vandalism_detection.gemspec +37 -0
- metadata +550 -0
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
require 'active_support/core_ext/string'
|
|
2
|
+
require 'active_support/core_ext/array'
|
|
3
|
+
|
|
4
|
+
require 'wikipedia/vandalism_detection/wikitext_extractor'
|
|
5
|
+
require 'wikipedia/vandalism_detection/features'
|
|
6
|
+
require 'wikipedia/vandalism_detection/edit'
|
|
7
|
+
|
|
8
|
+
module Wikipedia
|
|
9
|
+
module VandalismDetection
|
|
10
|
+
# This class provides methods for calculating a feature set of an edit.
|
|
11
|
+
# The features that shall be used can be defined in the config/wikipedia-vandalism-detection.yml file
|
|
12
|
+
# under the 'features:' root attribute like this:
|
|
13
|
+
#
|
|
14
|
+
# features:
|
|
15
|
+
# - anonymity
|
|
16
|
+
# - character sequence
|
|
17
|
+
# - ...
|
|
18
|
+
# etc.
|
|
19
|
+
class FeatureCalculator
  # Reads the configured feature names and instantiates one feature object
  # per name.
  #
  # Raises FeaturesNotConfiguredError if no features are configured.
  def initialize
    @features = Wikipedia::VandalismDetection.config.features
    # blank? already covers both nil and empty collections.
    raise FeaturesNotConfiguredError if @features.blank?

    @feature_classes = build_feature_classes(@features)
  end

  # Calculates the configured features for the given edit and returns an
  # array of the computed values, one per configured feature.
  #
  # A feature whose calculation raises a WikitextExtractionError is reported
  # on stderr and contributes Features::MISSING_VALUE to the result.
  #
  # Raises ArgumentError unless edit is an Edit.
  def calculate_features_for(edit)
    raise ArgumentError, 'Input has to be an Edit.' unless edit.is_a?(Edit)

    @feature_classes.map do |feature|
      begin
        feature.calculate(edit)
      rescue WikitextExtractionError
        report_extraction_failure(edit)
        Features::MISSING_VALUE
      end
    end
  end

  # Returns the calculated feature value for the given edit and the feature
  # with the given name (e.g. "anonymity").
  #
  # Returns Features::MISSING_VALUE (and reports on stderr) if the
  # calculation raises a WikitextExtractionError.
  #
  # Raises ArgumentError unless edit is an Edit and feature_name is a String.
  def calculate_feature_for(edit, feature_name)
    unless edit.is_a?(Edit)
      raise ArgumentError, 'First parameter has to be an Edit.'
    end

    unless feature_name.is_a?(String)
      message = 'Second parameter has to be a feature name String ' \
                '(e.g. "anonymity").'
      raise ArgumentError, message
    end

    begin
      feature_class_from_name(feature_name).calculate(edit)
    rescue WikitextExtractionError
      report_extraction_failure(edit)
      Features::MISSING_VALUE
    end
  end

  # Returns the feature names as defined in
  # conf/wikipedia-vandalism-detection.yml under 'features:'.
  def used_features
    @features
  end

  private

  # Returns an array with one feature instance per given feature name.
  def build_feature_classes(feature_names)
    feature_names.map { |name| feature_class_from_name(name) }
  end

  # Returns a new instance of the feature class that belongs to the given
  # name, e.g. "character sequence" => Features::CharacterSequence.
  def feature_class_from_name(name)
    # capitalize (not capitalize!) is required here: the bang variant
    # returns nil for parts that are already capitalized, which silently
    # dropped them from the class name.
    camelcased_name = name.split(/[\s-]/).map(&:capitalize).join
    "Wikipedia::VandalismDetection::Features::#{camelcased_name}".constantize.new
  end

  # Writes a notice to stderr that the given edit could not be processed by
  # the WikitextExtractor.
  def report_extraction_failure(edit)
    $stderr.print "Edit (#{edit.old_revision.id}, #{edit.new_revision.id}) " \
                  'could not be parsed by the WikitextExtractor and will ' \
                  "be discarded.\n"
  end
end
|
|
93
|
+
end
|
|
94
|
+
end
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/features/frequency_base'
|
|
2
|
+
require 'wikipedia/vandalism_detection/word_lists'
|
|
3
|
+
require 'wikipedia/vandalism_detection/text'
|
|
4
|
+
|
|
5
|
+
module Wikipedia
|
|
6
|
+
module VandalismDetection
|
|
7
|
+
module Features
|
|
8
|
+
# This feature computes frequency of all wordlists words in the inserted
|
|
9
|
+
# text.
|
|
10
|
+
class AllWordlistsFrequency < FrequencyBase
  # Returns the percentage of wordlists words in the inserted text.
  # Returns 0.0 if inserted clean text is of zero length.
  #
  # The inserted words are joined with newlines, cleaned via Text#clean and
  # passed, together with WordLists.all, to the inherited frequency helper.
  def calculate(edit)
    super

    inserted = edit.inserted_words.join("\n")
    frequency(Text.new(inserted).clean, WordLists.all)
  end
end
|
|
20
|
+
end
|
|
21
|
+
end
|
|
22
|
+
end
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/features/impact_base'
|
|
2
|
+
require 'wikipedia/vandalism_detection/word_lists'
|
|
3
|
+
|
|
4
|
+
module Wikipedia
|
|
5
|
+
module VandalismDetection
|
|
6
|
+
module Features
|
|
7
|
+
# This feature computes the percentage by which the edit increases the
|
|
8
|
+
# number of all wordlists words in the text.
|
|
9
|
+
class AllWordlistsImpact < ImpactBase
  # Passes the cleaned old and new revision texts, together with
  # WordLists.all, to the inherited impact helper and returns its result.
  def calculate(edit)
    super

    cleaned_old = edit.old_revision.text.clean
    cleaned_new = edit.new_revision.text.clean
    impact(cleaned_old, cleaned_new, WordLists.all)
  end
end
|
|
19
|
+
end
|
|
20
|
+
end
|
|
21
|
+
end
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
require_relative 'base'
|
|
2
|
+
|
|
3
|
+
module Wikipedia
|
|
4
|
+
module VandalismDetection
|
|
5
|
+
module Features
|
|
6
|
+
# This feature describes whether the contributor of the new revision is
|
|
7
|
+
# an anonymous or registered Wikipedia user.
|
|
8
|
+
class Anonymity < Base
  # Returns 0 if the new revision's contributor is anonymous, 1 otherwise.
  def calculate(edit)
    super

    if edit.new_revision.anonymous_contributor?
      0
    else
      1
    end
  end
end
|
|
15
|
+
end
|
|
16
|
+
end
|
|
17
|
+
end
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
require_relative 'base'
|
|
2
|
+
|
|
3
|
+
module Wikipedia
|
|
4
|
+
module VandalismDetection
|
|
5
|
+
module Features
|
|
6
|
+
# This feature describes whether the contributor of the old revision is
|
|
7
|
+
# an anonymous or registered Wikipedia user.
|
|
8
|
+
class AnonymityPrevious < Base
  # Returns 0 if the old revision's contributor is anonymous, 1 otherwise.
  #
  # If the old revision has no contributor set, the user is requested via
  # Wikipedia.api_request and stored on the revision. If that lookup yields
  # no user either, Features::MISSING_VALUE is returned.
  def calculate(edit)
    super

    previous = edit.old_revision

    if previous.contributor.blank?
      response = Wikipedia.api_request(
        prop: 'revisions',
        rvprop: 'user',
        revids: previous.id
      )

      user = response.xpath('//rev/@user').text
      return Features::MISSING_VALUE if user.blank?

      # Store the looked-up user on the revision object.
      previous.contributor = user
    end

    previous.anonymous_contributor? ? 0 : 1
  end
end
|
|
30
|
+
end
|
|
31
|
+
end
|
|
32
|
+
end
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/features/base'
|
|
2
|
+
|
|
3
|
+
module Wikipedia
|
|
4
|
+
module VandalismDetection
|
|
5
|
+
module Features
|
|
6
|
+
# This feature computes the size of the edit's new revision text
|
|
7
|
+
# (article size).
|
|
8
|
+
class ArticleSize < Base
  # Returns the size of the new revision's text.
  def calculate(edit)
    super

    article_text = edit.new_revision.text
    article_text.size
  end
end
|
|
15
|
+
end
|
|
16
|
+
end
|
|
17
|
+
end
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/features/frequency_base'
|
|
2
|
+
require 'wikipedia/vandalism_detection/word_lists/bad'
|
|
3
|
+
require 'wikipedia/vandalism_detection/text'
|
|
4
|
+
|
|
5
|
+
module Wikipedia
|
|
6
|
+
module VandalismDetection
|
|
7
|
+
module Features
|
|
8
|
+
# This feature computes frequency of bad words in the inserted text.
|
|
9
|
+
class BadFrequency < FrequencyBase
  # Returns the percentage of bad words in the inserted text.
  # Returns 0.0 if inserted clean text is of zero length.
  #
  # The inserted words are joined with newlines, cleaned via Text#clean and
  # passed, together with WordLists::BAD, to the inherited frequency helper.
  def calculate(edit)
    super

    inserted = edit.inserted_words.join("\n")
    frequency(Text.new(inserted).clean, WordLists::BAD)
  end
end
|
|
19
|
+
end
|
|
20
|
+
end
|
|
21
|
+
end
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/features/impact_base'
|
|
2
|
+
require 'wikipedia/vandalism_detection/word_lists/bad'
|
|
3
|
+
|
|
4
|
+
module Wikipedia
|
|
5
|
+
module VandalismDetection
|
|
6
|
+
module Features
|
|
7
|
+
# This feature computes the percentage by which the edit increases the
|
|
8
|
+
# number of bad words in the text.
|
|
9
|
+
class BadImpact < ImpactBase
  # Passes the cleaned old and new revision texts, together with
  # WordLists::BAD, to the inherited impact helper and returns its result.
  def calculate(edit)
    super

    cleaned_old = edit.old_revision.text.clean
    cleaned_new = edit.new_revision.text.clean
    impact(cleaned_old, cleaned_new, WordLists::BAD)
  end
end
|
|
18
|
+
end
|
|
19
|
+
end
|
|
20
|
+
end
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
require 'wikipedia'
|
|
2
|
+
|
|
3
|
+
module Wikipedia
|
|
4
|
+
module VandalismDetection
|
|
5
|
+
module Features
|
|
6
|
+
MISSING_VALUE = '?'.freeze
|
|
7
|
+
|
|
8
|
+
# This class should be the base class for all Wikipedia::Feature classes.
|
|
9
|
+
class Base
  # Base method for feature calculation: it only validates the argument.
  # This method should be overwritten in the concrete
  # Wikipedia::Feature-classes, which call super first.
  #
  # @example
  #   def calculate(edit)
  #     super # raises ArgumentError unless edit is an Edit
  #
  #     ... concrete calculation of feature out of edit...
  #   end
  #
  # @param edit Edit
  # @return nil if edit is an Edit
  # @raise ArgumentError if edit is not an Edit
  def calculate(edit)
    return if edit.is_a?(Edit)
    raise ArgumentError, 'Passed argument has to be an Edit'
  end

  # Counts the appearance of a given single term or of multiple terms in
  # the given text. The text and the terms are compared case-insensitively.
  #
  # @param terms String or Array of Strings/Symbols
  # @param options Hash of form { in: String }
  #
  # @example
  #   feature.count "and", in: text
  #   feature.count ["and", "or"], in: text
  #   feature.count [:and, :or], in: text
  #
  # @return Integer (for an Array, the sum of the counts of all terms)
  # @raise ArgumentError if the in: option is missing or terms is neither
  #   a String nor an Array
  def count(terms, options = {})
    unless options[:in]
      raise ArgumentError, 'The options hash must include the in: key'
    end

    unless terms.is_a?(String) || terms.is_a?(Array)
      raise ArgumentError, 'The 1st arg should be an Array or String'
    end

    text = options[:in].downcase
    occurrences = Hash.new(0)

    # NOTE(review): inside a character class "{2,}" is not a quantifier, so
    # this also strips the characters "{", "2" and "}" - confirm whether
    # that is intended. The pattern is kept unchanged to preserve the
    # computed feature values.
    text.gsub(/[\.,'{2,}:\!\?\(\)]/, '').split.each do |word|
      occurrences[word] += 1
    end

    if terms.is_a?(String)
      occurrences[terms.downcase]
    else
      # Normalize every term so that String and Symbol terms both match;
      # previously String terms in an Array never matched any word.
      terms.reduce(0) { |sum, term| sum + occurrences[term.to_s.downcase] }
    end
  end
end
|
|
59
|
+
end
|
|
60
|
+
end
|
|
61
|
+
end
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/features/frequency_base'
|
|
2
|
+
require 'wikipedia/vandalism_detection/word_lists/biased'
|
|
3
|
+
require 'wikipedia/vandalism_detection/text'
|
|
4
|
+
|
|
5
|
+
module Wikipedia
|
|
6
|
+
module VandalismDetection
|
|
7
|
+
module Features
|
|
8
|
+
# This feature computes frequency of biased words in the inserted text.
|
|
9
|
+
class BiasedFrequency < FrequencyBase
  # Returns the percentage of biased words in the inserted text.
  # Returns 0.0 if inserted clean text is of zero length.
  #
  # The inserted words are joined with newlines, cleaned via Text#clean and
  # passed, together with WordLists::BIASED, to the inherited frequency
  # helper.
  def calculate(edit)
    super

    inserted = edit.inserted_words.join("\n")
    frequency(Text.new(inserted).clean, WordLists::BIASED)
  end
end
|
|
19
|
+
end
|
|
20
|
+
end
|
|
21
|
+
end
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/features/impact_base'
|
|
2
|
+
require 'wikipedia/vandalism_detection/word_lists/biased'
|
|
3
|
+
|
|
4
|
+
module Wikipedia
|
|
5
|
+
module VandalismDetection
|
|
6
|
+
module Features
|
|
7
|
+
# This feature computes the percentage by which the edit increases the
|
|
8
|
+
# number of biased words in the text.
|
|
9
|
+
class BiasedImpact < ImpactBase
  # Passes the cleaned old and new revision texts, together with
  # WordLists::BIASED, to the inherited impact helper and returns its
  # result.
  def calculate(edit)
    super

    cleaned_old = edit.old_revision.text.clean
    cleaned_new = edit.new_revision.text.clean
    impact(cleaned_old, cleaned_new, WordLists::BIASED)
  end
end
|
|
19
|
+
end
|
|
20
|
+
end
|
|
21
|
+
end
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/features/base'
|
|
2
|
+
|
|
3
|
+
module Wikipedia
|
|
4
|
+
module VandalismDetection
|
|
5
|
+
module Features
|
|
6
|
+
# This feature returns whether the edit is a blanking.
|
|
7
|
+
# size < 7, based on Mola Velasco 2010 implementation.
|
|
8
|
+
class Blanking < Base
  # New revision texts smaller than this size count as blanked.
  BLANKING_THRESHOLD = 7

  # Returns 1.0 if the new revision's text is smaller than the old
  # revision's text and smaller than BLANKING_THRESHOLD, 0.0 otherwise.
  def calculate(edit)
    super

    size_before = edit.old_revision.text.size
    size_after = edit.new_revision.text.size

    shrunk = size_before > size_after
    nearly_empty = size_after < BLANKING_THRESHOLD

    shrunk && nearly_empty ? 1.0 : 0.0
  end
end
|
|
24
|
+
end
|
|
25
|
+
end
|
|
26
|
+
end
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/features/base'
|
|
2
|
+
|
|
3
|
+
module Wikipedia
|
|
4
|
+
module VandalismDetection
|
|
5
|
+
module Features
|
|
6
|
+
# This feature computes the character diversity of the edit's new revision inserted text.
|
|
7
|
+
# I.e. how many unique characters are amongst all inserted?
|
|
8
|
+
#
|
|
9
|
+
# Random typing leads to fewer unique characters relative to the full length.
|
|
10
|
+
class CharacterDiversity < Base
  # Returns (number of inserted non-whitespace characters) raised to the
  # power of 1 / (number of distinct inserted non-whitespace characters).
  def calculate(edit)
    super

    characters = edit.inserted_text.scan(/[^\s]/)
    exponent = 1.0 / characters.uniq.size

    characters.size**exponent
  end
end
|
|
21
|
+
end
|
|
22
|
+
end
|
|
23
|
+
end
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/features/base'
|
|
2
|
+
|
|
3
|
+
module Wikipedia
|
|
4
|
+
module VandalismDetection
|
|
5
|
+
module Features
|
|
6
|
+
# This feature computes the length of the longest sequence of the same
|
|
7
|
+
# character in the inserted text.
|
|
8
|
+
class CharacterSequence < Base
  # Returns the length of the longest run of one repeated character in the
  # inserted text, or 0 if the inserted text contains no such run.
  def calculate(edit)
    super

    # Each match is [run, character]; only the run's length is of interest.
    runs = edit.inserted_text.scan(/((.)\2*)/)
    runs.map { |run, _| run.length }.max || 0
  end
end
|
|
17
|
+
end
|
|
18
|
+
end
|
|
19
|
+
end
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/features/frequency_base'
|
|
2
|
+
require 'wikipedia/vandalism_detection/word_lists/bad'
|
|
3
|
+
|
|
4
|
+
module Wikipedia
|
|
5
|
+
module VandalismDetection
|
|
6
|
+
module Features
|
|
7
|
+
# This feature computes frequency of bad words in the comment of the
|
|
8
|
+
# edit's new revision.
|
|
9
|
+
class CommentBadFrequency < FrequencyBase
|
|
10
|
+
# Returns the percentage of bad words in the new revision's comment.
|
|
11
|
+
# Returns 0.0 if text is of zero length.
|
|
12
|
+
def calculate(edit)
|
|
13
|
+
super
|
|
14
|
+
|
|
15
|
+
comment = edit.new_revision.comment.clean
|
|
16
|
+
frequency(comment, WordLists::BAD)
|
|
17
|
+
end
|
|
18
|
+
end
|
|
19
|
+
end
|
|
20
|
+
end
|
|
21
|
+
end
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/features/frequency_base'
|
|
2
|
+
require 'wikipedia/vandalism_detection/word_lists/biased'
|
|
3
|
+
|
|
4
|
+
module Wikipedia
|
|
5
|
+
module VandalismDetection
|
|
6
|
+
module Features
|
|
7
|
+
# This feature computes frequency of biased words in the comment of the
|
|
8
|
+
# edit's new revision.
|
|
9
|
+
class CommentBiasedFrequency < FrequencyBase
|
|
10
|
+
# Returns the percentage of biased words in the new revision's comment.
|
|
11
|
+
# Returns 0.0 if text is of zero length.
|
|
12
|
+
def calculate(edit)
|
|
13
|
+
super
|
|
14
|
+
|
|
15
|
+
comment = edit.new_revision.comment.clean
|
|
16
|
+
frequency(comment, WordLists::BIASED)
|
|
17
|
+
end
|
|
18
|
+
end
|
|
19
|
+
end
|
|
20
|
+
end
|
|
21
|
+
end
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/features/base'
|
|
2
|
+
|
|
3
|
+
module Wikipedia
|
|
4
|
+
module VandalismDetection
|
|
5
|
+
module Features
|
|
6
|
+
# This feature computes the byte length of the edit's new revision's
|
|
7
|
+
# comment.
|
|
8
|
+
class CommentLength < Base
|
|
9
|
+
def calculate(edit)
|
|
10
|
+
super
|
|
11
|
+
edit.new_revision.comment.clean.bytesize
|
|
12
|
+
end
|
|
13
|
+
end
|
|
14
|
+
end
|
|
15
|
+
end
|
|
16
|
+
end
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/features/base'
|
|
2
|
+
require 'wikipedia/vandalism_detection/word_lists/markup'
|
|
3
|
+
|
|
4
|
+
module Wikipedia
|
|
5
|
+
module VandalismDetection
|
|
6
|
+
module Features
|
|
7
|
+
# This feature computes frequency of markup words in the comment of the
|
|
8
|
+
# edit's new revision.
|
|
9
|
+
class CommentMarkupFrequency < Base
|
|
10
|
+
MARKUP_REGEX = /(#{WordLists::MARKUP.join('|')})/
|
|
11
|
+
|
|
12
|
+
# Returns the percentage of markup words in the new revision's comment.
|
|
13
|
+
# Returns 0.0 if text is of zero length.
|
|
14
|
+
def calculate(edit)
|
|
15
|
+
super
|
|
16
|
+
|
|
17
|
+
comment = edit.new_revision.comment
|
|
18
|
+
all_words_count = comment.split.count
|
|
19
|
+
markup_words_count = comment.scan(MARKUP_REGEX).count
|
|
20
|
+
|
|
21
|
+
if all_words_count > 0
|
|
22
|
+
markup_words_count.to_f / all_words_count.to_f
|
|
23
|
+
else
|
|
24
|
+
0.0
|
|
25
|
+
end
|
|
26
|
+
end
|
|
27
|
+
end
|
|
28
|
+
end
|
|
29
|
+
end
|
|
30
|
+
end
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/features/frequency_base'
|
|
2
|
+
require 'wikipedia/vandalism_detection/word_lists/pronouns'
|
|
3
|
+
|
|
4
|
+
module Wikipedia
|
|
5
|
+
module VandalismDetection
|
|
6
|
+
module Features
|
|
7
|
+
# This feature computes the frequency of pronouns in the comment of the
|
|
8
|
+
# new revision.
|
|
9
|
+
class CommentPronounFrequency < FrequencyBase
|
|
10
|
+
# Returns the percentage of pronoun words in the new revision's
|
|
11
|
+
# comment. Returns 0.0 if text is of zero length.
|
|
12
|
+
def calculate(edit)
|
|
13
|
+
super
|
|
14
|
+
|
|
15
|
+
comment = edit.new_revision.comment.clean
|
|
16
|
+
frequency(comment, WordLists::PRONOUNS)
|
|
17
|
+
end
|
|
18
|
+
end
|
|
19
|
+
end
|
|
20
|
+
end
|
|
21
|
+
end
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/features/frequency_base'
|
|
2
|
+
require 'wikipedia/vandalism_detection/word_lists/sex'
|
|
3
|
+
|
|
4
|
+
module Wikipedia
|
|
5
|
+
module VandalismDetection
|
|
6
|
+
module Features
|
|
7
|
+
# This feature computes frequency of sex words in the comment of the
|
|
8
|
+
# edit's new revision.
|
|
9
|
+
class CommentSexFrequency < FrequencyBase
|
|
10
|
+
# Returns the percentage of sex words in the new revision's comment.
|
|
11
|
+
# Returns 0.0 if text is of zero length.
|
|
12
|
+
def calculate(edit)
|
|
13
|
+
super
|
|
14
|
+
|
|
15
|
+
comment = edit.new_revision.comment.clean
|
|
16
|
+
frequency(comment, WordLists::SEX)
|
|
17
|
+
end
|
|
18
|
+
end
|
|
19
|
+
end
|
|
20
|
+
end
|
|
21
|
+
end
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/features/frequency_base'
|
|
2
|
+
require 'wikipedia/vandalism_detection/word_lists/vulgarism'
|
|
3
|
+
|
|
4
|
+
module Wikipedia
|
|
5
|
+
module VandalismDetection
|
|
6
|
+
module Features
|
|
7
|
+
# This feature computes frequency of vulgarism words in the comment of the
|
|
8
|
+
# edit's new revision.
|
|
9
|
+
class CommentVulgarismFrequency < FrequencyBase
|
|
10
|
+
# Returns the percentage of vulgarism words in the new revision's
|
|
11
|
+
# comment.Returns 0.0 if text is of zero length.
|
|
12
|
+
def calculate(edit)
|
|
13
|
+
super
|
|
14
|
+
|
|
15
|
+
comment = edit.new_revision.comment.clean
|
|
16
|
+
frequency(comment, WordLists::VULGARISM)
|
|
17
|
+
end
|
|
18
|
+
end
|
|
19
|
+
end
|
|
20
|
+
end
|
|
21
|
+
end
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/features/base'
|
|
2
|
+
require 'zlib'
|
|
3
|
+
require 'wikipedia/vandalism_detection/diff'
|
|
4
|
+
|
|
5
|
+
module Wikipedia
|
|
6
|
+
module VandalismDetection
|
|
7
|
+
module Features
|
|
8
|
+
# This feature describes compressibility ratio of compressed and
|
|
9
|
+
# uncompressed inserted text.
|
|
10
|
+
class Compressibility < Base
|
|
11
|
+
# Calculates the compressibility ratio of the inserted text.
|
|
12
|
+
# Values above 0.5 are higher compressed and therefor can stand for
|
|
13
|
+
# nonsense text as:
|
|
14
|
+
# 'AAAAAAAAAAAAAAAAAAAhhhhhhhhhhhhhhhh!' etc.
|
|
15
|
+
def calculate(edit)
|
|
16
|
+
super
|
|
17
|
+
|
|
18
|
+
inserted_text = edit.inserted_text
|
|
19
|
+
uncompressed_size = inserted_text.bytesize.to_f
|
|
20
|
+
compressed_size = Zlib::Deflate.deflate(inserted_text).bytesize.to_f
|
|
21
|
+
|
|
22
|
+
if inserted_text.empty?
|
|
23
|
+
0.5
|
|
24
|
+
else
|
|
25
|
+
uncompressed_size / (compressed_size + uncompressed_size)
|
|
26
|
+
end
|
|
27
|
+
end
|
|
28
|
+
end
|
|
29
|
+
end
|
|
30
|
+
end
|
|
31
|
+
end
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/features/base'
|
|
2
|
+
|
|
3
|
+
module Wikipedia
|
|
4
|
+
module VandalismDetection
|
|
5
|
+
module Features
|
|
6
|
+
class ContainsBase < Base
|
|
7
|
+
# Returns whether the comment contains the given term.
|
|
8
|
+
# It returns 0 if term is not included, else 1.
|
|
9
|
+
def contains(comment, terms)
|
|
10
|
+
terms = terms.is_a?(Array) ? terms.join('|') : terms
|
|
11
|
+
comment =~ /#{terms}/i ? 1 : 0
|
|
12
|
+
end
|
|
13
|
+
end
|
|
14
|
+
end
|
|
15
|
+
end
|
|
16
|
+
end
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/features/contains_base'
|
|
2
|
+
|
|
3
|
+
module Wikipedia
|
|
4
|
+
module VandalismDetection
|
|
5
|
+
module Features
|
|
6
|
+
# This feature returns whether the edit's comment includes reverted key
|
|
7
|
+
# words.
|
|
8
|
+
class Reverted < ContainsBase
|
|
9
|
+
def calculate(edit)
|
|
10
|
+
super
|
|
11
|
+
|
|
12
|
+
contains(edit.new_revision.comment, %w[rvt rvv revert])
|
|
13
|
+
end
|
|
14
|
+
end
|
|
15
|
+
end
|
|
16
|
+
end
|
|
17
|
+
end
|