wikipedia-vandalism_detection 0.1.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +19 -0
- data/.rspec +2 -0
- data/.rubocop.yml +35 -0
- data/.travis.yml +6 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +4 -0
- data/README.md +288 -0
- data/Rakefile +11 -0
- data/config/wikipedia-vandalism-detection.yml.example +103 -0
- data/lib/java/SMOTE.jar +0 -0
- data/lib/java/balancedRandomForest.jar +0 -0
- data/lib/java/diffutils-1.3.0.jar +0 -0
- data/lib/java/oneClassClassifier.jar +0 -0
- data/lib/java/realAdaBoost.jar +0 -0
- data/lib/java/swc-engine-1.1.0-jar-with-dependencies.jar +0 -0
- data/lib/java/sweble-wikitext-extractor.jar +0 -0
- data/lib/weka/classifiers/meta/one_class_classifier.rb +21 -0
- data/lib/weka/classifiers/meta/real_ada_boost.rb +15 -0
- data/lib/weka/classifiers/trees/balanced_random_forest.rb +16 -0
- data/lib/weka/filters/supervised/instance/smote.rb +15 -0
- data/lib/wikipedia/vandalism_detection/algorithms/kullback_leibler_divergence.rb +103 -0
- data/lib/wikipedia/vandalism_detection/algorithms.rb +12 -0
- data/lib/wikipedia/vandalism_detection/classifier.rb +202 -0
- data/lib/wikipedia/vandalism_detection/configuration.rb +350 -0
- data/lib/wikipedia/vandalism_detection/diff.rb +36 -0
- data/lib/wikipedia/vandalism_detection/edit.rb +81 -0
- data/lib/wikipedia/vandalism_detection/evaluator.rb +640 -0
- data/lib/wikipedia/vandalism_detection/exceptions.rb +47 -0
- data/lib/wikipedia/vandalism_detection/feature_calculator.rb +94 -0
- data/lib/wikipedia/vandalism_detection/features/all_wordlists_frequency.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/all_wordlists_impact.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/anonymity.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/anonymity_previous.rb +32 -0
- data/lib/wikipedia/vandalism_detection/features/article_size.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/bad_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/bad_impact.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/base.rb +61 -0
- data/lib/wikipedia/vandalism_detection/features/biased_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/biased_impact.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/blanking.rb +26 -0
- data/lib/wikipedia/vandalism_detection/features/character_diversity.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/character_sequence.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/comment_bad_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/comment_biased_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/comment_length.rb +16 -0
- data/lib/wikipedia/vandalism_detection/features/comment_markup_frequency.rb +30 -0
- data/lib/wikipedia/vandalism_detection/features/comment_pronoun_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/comment_sex_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/comment_vulgarism_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/compressibility.rb +31 -0
- data/lib/wikipedia/vandalism_detection/features/contains_base.rb +16 -0
- data/lib/wikipedia/vandalism_detection/features/copyedit.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/digit_ratio.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/edits_per_user.rb +72 -0
- data/lib/wikipedia/vandalism_detection/features/emoticons_frequency.rb +27 -0
- data/lib/wikipedia/vandalism_detection/features/emoticons_impact.rb +29 -0
- data/lib/wikipedia/vandalism_detection/features/frequency_base.rb +18 -0
- data/lib/wikipedia/vandalism_detection/features/impact_base.rb +25 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_character_distribution.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_external_links.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_internal_links.rb +18 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_size.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_words.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/longest_word.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/markup_frequency.rb +29 -0
- data/lib/wikipedia/vandalism_detection/features/markup_impact.rb +30 -0
- data/lib/wikipedia/vandalism_detection/features/non_alphanumeric_ratio.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/personal_life.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/pronoun_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/pronoun_impact.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/removed_all_wordlists_frequency.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/removed_bad_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_biased_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_character_distribution.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/removed_emoticons_frequency.rb +28 -0
- data/lib/wikipedia/vandalism_detection/features/removed_markup_frequency.rb +30 -0
- data/lib/wikipedia/vandalism_detection/features/removed_pronoun_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_sex_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_size.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/removed_vulgarism_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_words.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/replacement_similarity.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/reverted.rb +16 -0
- data/lib/wikipedia/vandalism_detection/features/revisions_character_distribution.rb +25 -0
- data/lib/wikipedia/vandalism_detection/features/same_editor.rb +32 -0
- data/lib/wikipedia/vandalism_detection/features/sex_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/sex_impact.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/size_increment.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/size_ratio.rb +28 -0
- data/lib/wikipedia/vandalism_detection/features/term_frequency.rb +31 -0
- data/lib/wikipedia/vandalism_detection/features/time_interval.rb +38 -0
- data/lib/wikipedia/vandalism_detection/features/time_of_day.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/upper_case_ratio.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/upper_case_words_ratio.rb +33 -0
- data/lib/wikipedia/vandalism_detection/features/upper_to_lower_case_ratio.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/vulgarism_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/vulgarism_impact.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/weekday.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/words_increment.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features.rb +66 -0
- data/lib/wikipedia/vandalism_detection/instances.rb +121 -0
- data/lib/wikipedia/vandalism_detection/page.rb +101 -0
- data/lib/wikipedia/vandalism_detection/page_parser.rb +63 -0
- data/lib/wikipedia/vandalism_detection/revision.rb +75 -0
- data/lib/wikipedia/vandalism_detection/revision_parser.rb +67 -0
- data/lib/wikipedia/vandalism_detection/test_dataset.rb +374 -0
- data/lib/wikipedia/vandalism_detection/text.rb +23 -0
- data/lib/wikipedia/vandalism_detection/training_dataset.rb +282 -0
- data/lib/wikipedia/vandalism_detection/version.rb +5 -0
- data/lib/wikipedia/vandalism_detection/wikitext_extractor.rb +80 -0
- data/lib/wikipedia/vandalism_detection/word_lists/bad.rb +11 -0
- data/lib/wikipedia/vandalism_detection/word_lists/biased.rb +20 -0
- data/lib/wikipedia/vandalism_detection/word_lists/emoticons.rb +26 -0
- data/lib/wikipedia/vandalism_detection/word_lists/markup.rb +19 -0
- data/lib/wikipedia/vandalism_detection/word_lists/pronouns.rb +11 -0
- data/lib/wikipedia/vandalism_detection/word_lists/sex.rb +10 -0
- data/lib/wikipedia/vandalism_detection/word_lists/vulgarism.rb +96 -0
- data/lib/wikipedia/vandalism_detection/word_lists.rb +17 -0
- data/lib/wikipedia/vandalism_detection.rb +29 -0
- data/lib/wikipedia.rb +41 -0
- data/spec/factories/edit.rb +19 -0
- data/spec/factories/page.rb +12 -0
- data/spec/factories/revision.rb +51 -0
- data/spec/resources/config/wikipedia-vandalism-detection.yml +35 -0
- data/spec/resources/corpora/test/edits.csv +8 -0
- data/spec/resources/corpora/test/ground-truth.txt +3 -0
- data/spec/resources/corpora/test/revisions/part-1/326471754.txt +199 -0
- data/spec/resources/corpora/test/revisions/part-1/326873205.txt +137 -0
- data/spec/resources/corpora/test/revisions/part-1/328774035.txt +162 -0
- data/spec/resources/corpora/test/revisions/part-2/307084144.txt +137 -0
- data/spec/resources/corpora/test/revisions/part-2/326978767.txt +199 -0
- data/spec/resources/corpora/test/revisions/part-2/328774110.txt +162 -0
- data/spec/resources/corpora/training/annotations.csv +7 -0
- data/spec/resources/corpora/training/edits.csv +7 -0
- data/spec/resources/corpora/training/revisions/part-1/326471754.txt +199 -0
- data/spec/resources/corpora/training/revisions/part-1/326873205.txt +137 -0
- data/spec/resources/corpora/training/revisions/part-1/328774035.txt +162 -0
- data/spec/resources/corpora/training/revisions/part-2/307084144.txt +137 -0
- data/spec/resources/corpora/training/revisions/part-2/326978767.txt +199 -0
- data/spec/resources/corpora/training/revisions/part-2/328774110.txt +162 -0
- data/spec/resources/page_with_redirects.xml +85 -0
- data/spec/resources/redirect_page.xml +59 -0
- data/spec/resources/revision_simplified.xml +13 -0
- data/spec/resources/sample_revision.txt +137 -0
- data/spec/resources/sample_revision_clean_text.txt +1 -0
- data/spec/resources/sample_revision_plain_text.txt +183 -0
- data/spec/resources/vandalism_on_wikipedia.xml +234 -0
- data/spec/resources/vandalism_on_wikipedia_simplified.xml +119 -0
- data/spec/resources/wikipedia_tokens.txt +30 -0
- data/spec/spec_helper.rb +38 -0
- data/spec/support/macros/file_reading.rb +6 -0
- data/spec/support/macros/test_configuration.rb +81 -0
- data/spec/vandalism_detection/algorithms/kullback_leibler_divergence_spec.rb +34 -0
- data/spec/vandalism_detection/classifier_spec.rb +330 -0
- data/spec/vandalism_detection/configuration_spec.rb +601 -0
- data/spec/vandalism_detection/diff_spec.rb +40 -0
- data/spec/vandalism_detection/edit_spec.rb +122 -0
- data/spec/vandalism_detection/evaluator_spec.rb +711 -0
- data/spec/vandalism_detection/feature_calculator_spec.rb +135 -0
- data/spec/vandalism_detection/features/all_wordlists_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/all_wordlists_impact_spec.rb +52 -0
- data/spec/vandalism_detection/features/anonymity_previous_spec.rb +67 -0
- data/spec/vandalism_detection/features/anonymity_spec.rb +17 -0
- data/spec/vandalism_detection/features/article_size_spec.rb +29 -0
- data/spec/vandalism_detection/features/bad_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/bad_impact_spec.rb +53 -0
- data/spec/vandalism_detection/features/base_spec.rb +41 -0
- data/spec/vandalism_detection/features/biased_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/biased_impact_spec.rb +52 -0
- data/spec/vandalism_detection/features/blanking_spec.rb +35 -0
- data/spec/vandalism_detection/features/character_diversity_spec.rb +30 -0
- data/spec/vandalism_detection/features/character_sequence_spec.rb +31 -0
- data/spec/vandalism_detection/features/comment_bad_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_biased_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_length_spec.rb +21 -0
- data/spec/vandalism_detection/features/comment_markup_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_pronoun_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_sex_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_vulgarism_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/compressibility_spec.rb +38 -0
- data/spec/vandalism_detection/features/contains_base_spec.rb +27 -0
- data/spec/vandalism_detection/features/copyedit_spec.rb +30 -0
- data/spec/vandalism_detection/features/digit_ratio_spec.rb +30 -0
- data/spec/vandalism_detection/features/edits_per_user_spec.rb +51 -0
- data/spec/vandalism_detection/features/emoticons_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/emoticons_impact_spec.rb +45 -0
- data/spec/vandalism_detection/features/frequency_base_spec.rb +20 -0
- data/spec/vandalism_detection/features/impact_base_spec.rb +36 -0
- data/spec/vandalism_detection/features/inserted_character_distribution_spec.rb +40 -0
- data/spec/vandalism_detection/features/inserted_external_links_spec.rb +29 -0
- data/spec/vandalism_detection/features/inserted_internal_links_spec.rb +29 -0
- data/spec/vandalism_detection/features/inserted_size_spec.rb +29 -0
- data/spec/vandalism_detection/features/inserted_words_spec.rb +29 -0
- data/spec/vandalism_detection/features/longest_word_spec.rb +29 -0
- data/spec/vandalism_detection/features/markup_frequency_spec.rb +31 -0
- data/spec/vandalism_detection/features/markup_impact_spec.rb +53 -0
- data/spec/vandalism_detection/features/non_alphanumeric_ratio_spec.rb +30 -0
- data/spec/vandalism_detection/features/personal_life_spec.rb +22 -0
- data/spec/vandalism_detection/features/pronoun_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/pronoun_impact_spec.rb +53 -0
- data/spec/vandalism_detection/features/removed_all_wordlists_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_bad_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_biased_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_character_distribution_spec.rb +40 -0
- data/spec/vandalism_detection/features/removed_emoticons_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_markup_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_pronoun_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_sex_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_size_spec.rb +29 -0
- data/spec/vandalism_detection/features/removed_vulgarism_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_words_spec.rb +29 -0
- data/spec/vandalism_detection/features/replacement_similarity_spec.rb +37 -0
- data/spec/vandalism_detection/features/reverted_spec.rb +24 -0
- data/spec/vandalism_detection/features/revisions_character_distribution_spec.rb +40 -0
- data/spec/vandalism_detection/features/same_editor_spec.rb +71 -0
- data/spec/vandalism_detection/features/sex_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/sex_impact_spec.rb +53 -0
- data/spec/vandalism_detection/features/size_increment_spec.rb +29 -0
- data/spec/vandalism_detection/features/size_ratio_spec.rb +48 -0
- data/spec/vandalism_detection/features/term_frequency_spec.rb +32 -0
- data/spec/vandalism_detection/features/time_interval_spec.rb +56 -0
- data/spec/vandalism_detection/features/time_of_day_spec.rb +16 -0
- data/spec/vandalism_detection/features/upper_case_ratio_spec.rb +30 -0
- data/spec/vandalism_detection/features/upper_case_words_ratio_spec.rb +33 -0
- data/spec/vandalism_detection/features/upper_to_lower_case_ratio_spec.rb +30 -0
- data/spec/vandalism_detection/features/vulgarism_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/vulgarism_impact_spec.rb +52 -0
- data/spec/vandalism_detection/features/weekday_spec.rb +16 -0
- data/spec/vandalism_detection/features/words_increment_spec.rb +29 -0
- data/spec/vandalism_detection/instances_spec.rb +146 -0
- data/spec/vandalism_detection/page_parser_spec.rb +190 -0
- data/spec/vandalism_detection/page_spec.rb +134 -0
- data/spec/vandalism_detection/revision_parser_spec.rb +53 -0
- data/spec/vandalism_detection/revision_spec.rb +148 -0
- data/spec/vandalism_detection/test_dataset_spec.rb +227 -0
- data/spec/vandalism_detection/text_spec.rb +29 -0
- data/spec/vandalism_detection/training_dataset_spec.rb +266 -0
- data/spec/vandalism_detection/wikitext_extractor_spec.rb +97 -0
- data/spec/weka/classifiers/meta/one_class_classifier_spec.rb +82 -0
- data/spec/weka/classifiers/meta/real_ada_boost_spec.rb +37 -0
- data/spec/weka/classifiers/trees/balanced_random_forest_spec.rb +37 -0
- data/spec/weka/filters/supervised/instance/smote_spec.rb +5 -0
- data/wikipedia-vandalism_detection.gemspec +37 -0
- metadata +550 -0
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/algorithms/kullback_leibler_divergence'
|
|
2
|
+
|
|
3
|
+
module Wikipedia
  module VandalismDetection
    # Mixin that exposes the text-comparison algorithms of this gem as plain
    # instance methods.
    module Algorithms
      # Returns the result of KullbackLeiblerDivergence#of for the two texts.
      #
      # A fresh KullbackLeiblerDivergence object is created for every call.
      #
      # @param text_a [Object] first argument handed to KullbackLeiblerDivergence#of
      # @param text_b [Object] second argument handed to KullbackLeiblerDivergence#of
      # @return [Object] whatever KullbackLeiblerDivergence#of returns
      def kullback_leibler_divergence(text_a, text_b)
        KullbackLeiblerDivergence.new.of(text_a, text_b)
      end
    end
  end
end
|
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
require 'weka'
|
|
2
|
+
require 'active_support/core_ext/string'
|
|
3
|
+
require 'fileutils'
|
|
4
|
+
|
|
5
|
+
require 'wikipedia/vandalism_detection/configuration'
|
|
6
|
+
require 'wikipedia/vandalism_detection/edit'
|
|
7
|
+
require 'wikipedia/vandalism_detection/feature_calculator'
|
|
8
|
+
require 'wikipedia/vandalism_detection/instances'
|
|
9
|
+
require 'wikipedia/vandalism_detection/evaluator'
|
|
10
|
+
|
|
11
|
+
module Wikipedia
|
|
12
|
+
module VandalismDetection
|
|
13
|
+
# Trains the Weka classifier named in the configuration and uses it to score
# edits (or already computed feature vectors) for vandalism.
class Classifier
  attr_reader :evaluator, :dataset

  # Loads and trains the classifier configured in the config file.
  #
  # @param dataset [Object, nil] training instances to use. When nil, the
  #   dataset is chosen by the configured training data option.
  # @raise [ClassifierNotConfiguredError] if no classifier type is configured
  # @raise [FeaturesNotConfiguredError] if no features are configured
  # @raise [ClassifierUnknownError] if the classifier type cannot be resolved
  def initialize(dataset = nil)
    @config = Wikipedia::VandalismDetection.config
    @feature_calculator = FeatureCalculator.new
    @classifier = load_classifier(dataset)
    @evaluator = Evaluator.new(self)
  end

  # Returns the concrete classifier instance configured in the config file.
  # When you configured a Trees::RandomForest classifier you will get a
  # Weka::Classifiers::Trees::RandomForest instance.
  # This instance can be used to call the classifier class' native methods.
  def classifier_instance
    @classifier
  end

  # Classifies an edit or a set of features and returns the vandalism
  # confidence by default.
  # If option 'return_all_params: true' is set, it returns a Hash of form
  # { confidence: ..., class_index: ... }
  #
  # Returns -1.0 (regardless of the options) when no feature values are
  # available for the input.
  #
  # @example
  #   # suppose you have a dataset with 2 features, or 'edit' as an instance
  #   # of Wikipedia::VandalismDetection::Edit
  #   classifier = Wikipedia::VandalismDetection::Classifier.new
  #   features = [0.45, 0.67]
  #
  #   confidence = classifier.classify(features)
  #   confidence = classifier.classify(edit)
  #
  # @param edit_or_features [Edit, Array] an Edit, or an Array holding exactly
  #   one value per configured feature
  # @param options [Hash] set :return_all_params to also get the class index
  # @return [Float, Hash] the confidence, or the Hash described above
  # @raise [ArgumentError] if the input is neither an Edit nor an Array of
  #   the expected size
  def classify(edit_or_features, options = {})
    feature_values = feature_values_for(edit_or_features)
    return -1.0 if feature_values.empty?

    instance = unlabeled_instance(feature_values)
    index = confidence_class_index
    confidence = @classifier.distribution_for_instance(instance).to_a[index]
    return confidence unless options[:return_all_params]

    class_index = @classifier.classify_instance(instance)
    class_index = class_index.nan? ? Instances::NOT_KNOWN_INDEX : class_index.to_i
    { confidence: confidence, class_index: class_index }
  end

  # Cross validates the classifier.
  # Fold is used as defined in configuration (default is 10).
  #
  # @example
  #   classifier = Wikipedia::VandalismDetection::Classifier.new
  #   evaluation = classifier.cross_validate
  #   evaluation = classifier.cross_validate(equally_distributed: true)
  #
  # @param options [Hash] passed through to Evaluator#cross_validate
  def cross_validate(options = {})
    @evaluator.cross_validate(options)
  end

  private

  # Returns the feature values for the given input: computed for an Edit,
  # taken as is for an Array with one value per configured feature.
  def feature_values_for(edit_or_features)
    if edit_or_features.is_a?(Edit)
      @feature_calculator.calculate_features_for(edit_or_features)
    elsif edit_or_features.is_a?(Array) && edit_or_features.size == @config.features.count
      edit_or_features
    else
      message = 'Input has to be an Edit or an Array of feature values.'
      raise ArgumentError, message
    end
  end

  # Wraps the feature values into a single instance whose class is missing.
  def unlabeled_instance(feature_values)
    values = feature_values.map do |value|
      value == Features::MISSING_VALUE ? nil : value
    end

    dataset = Instances.empty
    dataset.set_class_index(values.count)
    # The class value added here is only a placeholder, it is marked as
    # missing right below.
    dataset.add_instance([*values, Instances::VANDALISM])

    instance = dataset.instance(0)
    instance.set_class_missing
    instance
  end

  # Returns the index of the distribution entry reported as confidence.
  # For one class classification it depends on whether the classifier
  # options mention the vandalism class.
  def confidence_class_index
    return Instances::VANDALISM_CLASS_INDEX unless @config.use_occ?

    if @config.classifier_options =~ /#{Instances::VANDALISM}/
      Instances::VANDALISM_CLASS_INDEX
    else
      Instances::REGULAR_CLASS_INDEX
    end
  end

  # Loads the (Weka-) Classifier set in the Configuration and trains it.
  # The dataset used for training is remembered in @dataset.
  def load_classifier(dataset)
    classifier_name = @config.classifier_type

    unless classifier_name
      message = 'Classifier type is not defined in wikipedia-vandalism-detection.yml'
      raise ClassifierNotConfiguredError, message
    end

    if @config.features.blank?
      message = 'No features configured in wikipedia-vandalism-detection.yml'
      raise FeaturesNotConfiguredError, message
    end

    classifier_class = classifier_class_for(classifier_name)
    options = @config.classifier_options

    puts "Loading classifier #{classifier_name} with options '#{options}'…"

    dataset = configured_training_dataset if dataset.nil?

    if @config.use_occ?
      dataset.rename_attribute_value(
        dataset.class_index,
        one_class_index,
        Instances::OUTLIER
      )
    end

    @dataset = dataset

    begin
      # `options` and `dataset` have to stay local variables here: they are
      # referenced from inside the build block.
      classifier_class.build do
        use_options options if options
        train_with_instances dataset
      end
    rescue => error
      raise "Error while loading classifier: #{error}"
    end
  end

  # Resolves the configured classifier name to its Weka::Classifiers class.
  # The constant is looked up exactly once.
  def classifier_class_for(classifier_name)
    "Weka::Classifiers::#{classifier_name}".constantize
  rescue StandardError
    message = "The configured classifier type '#{classifier_name}' is unknown."
    raise ClassifierUnknownError, message
  end

  # Returns the training instances selected by the configured training data
  # option (nil if none of the options applies).
  def configured_training_dataset
    if @config.balanced_training_data?
      puts 'using BALANCED training dataset'
      TrainingDataset.balanced_instances
    elsif @config.unbalanced_training_data?
      puts 'using FULL (unbalanced) training dataset'
      TrainingDataset.instances
    elsif @config.oversampled_training_data?
      puts 'using OVERSAMPLED training dataset'
      TrainingDataset.oversampled_instances
    end
  end

  # Returns the index of the class value that load_classifier renames to
  # Instances::OUTLIER for one class classification.
  def one_class_index
    if @config.classifier_options =~ /#{Instances::VANDALISM}/
      Instances::REGULAR_CLASS_INDEX
    else
      Instances::VANDALISM_CLASS_INDEX
    end
  end

  # Returns a normalized copy of the given dataset that contains the
  # vandalism instances only.
  # NOTE(review): nothing in this class calls this method.
  def remove_regular_instances(dataset)
    features = @config.features

    vandalism_dataset = Weka::Core::Instances.new.with_attributes do
      features.each { |name| numeric :"#{name.tr(' ', '_')}" }
      nominal :class, values: [Instances::VANDALISM], class_attribute: true
    end

    dataset.to_a.map(&:values).each_with_index do |attributes, index|
      class_value = Instances::CLASSES[dataset.instance(index).value(dataset.class_index).to_i]
      next unless class_value == Instances::VANDALISM

      # The last attribute is the class, it is re-added explicitly.
      vandalism_dataset.add_instance([*attributes[0..-2], class_value])
    end

    filter = Weka::Filters::Unsupervised::Attribute::Normalize.new
    filter.filter(vandalism_dataset)
  end
end
|
|
201
|
+
end
|
|
202
|
+
end
|
|
@@ -0,0 +1,350 @@
|
|
|
1
|
+
require 'weka/classifiers/meta/one_class_classifier'
|
|
2
|
+
require 'singleton'
|
|
3
|
+
|
|
4
|
+
module Wikipedia
|
|
5
|
+
module VandalismDetection
|
|
6
|
+
require 'yaml'
|
|
7
|
+
|
|
8
|
+
# Returns the configuration object shared by the whole library, i.e. the
# result of Configuration.instance.
def self.config
  Configuration.instance
end
|
|
11
|
+
|
|
12
|
+
# Singleton that holds the merged configuration (defaults plus the values
# loaded from the config file) and derives file paths from it.
class Configuration
  include Singleton

  TRAINING_DATA_BALANCED = 'balanced'.freeze
  TRAINING_DATA_UNBALANCED = 'unbalanced'.freeze
  TRAINING_DATA_OVERSAMPLED = 'oversampled'.freeze
  CONFIG_FILE = 'wikipedia-vandalism-detection.yml'.freeze

  attr_reader :data,
              :features,
              :classifier_options,
              :classifier_type,
              :cross_validation_fold,
              :output_base_directory,
              :training_data_options

  # Merges the default configuration with the one loaded from the config
  # file (if any) and caches the most used values in instance variables.
  def initialize
    defaults = DefaultConfiguration[DefaultConfiguration::DEFAULTS]
    @config_from_file = defaults.load_config_file(defaults.config_file)
    @data = @config_from_file ? defaults.deep_merge(@config_from_file) : defaults

    classifier = @data['classifier']
    @classifier_type = classifier['type']
    @classifier_options = classifier['options']
    @cross_validation_fold = classifier['cross-validation-fold']
    @training_data_options = classifier['training-data-options']
    @replace_missing_values = classifier['replace-missing-values'].to_s

    output = @data['output']
    @features = @data['features']
    # Relative directories are expanded against the path of this source file.
    @output_base_directory = File.expand_path(output['base_directory'], __FILE__)
    @training_arff_file_name = output['training']['arff_file']
    @test_arff_file_name = output['test']['arff_file']
  end

  # Returns whether the classifier uses one class classification
  def use_occ?
    @classifier_type == Weka::Classifiers::Meta::OneClassClassifier.type
  end

  # Returns whether the 'replace-missing-values' option is switched on.
  # Only the whole values true, t, yes and y (in any letter case) count as
  # on; a value that merely contains one of them (e.g. 'not') does not.
  def replace_training_data_missing_values?
    !!(@replace_missing_values =~ /\A\s*(true|t|yes|y)\s*\z/i)
  end

  # Returns a boolean value whether a balanced data set is used for
  # classifier training.
  # (balanced means: same number of vandalism and regular samples)
  def balanced_training_data?
    @training_data_options == TRAINING_DATA_BALANCED
  end

  # Returns a boolean value whether an unbalanced data set is used for
  # classifier training.
  # (unbalanced means: vandalism and regular samples are used as given in
  # arff file)
  #
  # This is the fallback: it holds whenever the option is neither balanced
  # nor oversampled, which includes a missing option.
  def unbalanced_training_data?
    !balanced_training_data? && !oversampled_training_data?
  end

  # Returns a boolean value whether an oversampled data set is used for
  # classifier training.
  # (oversampled means: a balanced dataset is enriched through vandalism
  # instances if vandalism number is less than regular number)
  def oversampled_training_data?
    !@training_data_options.nil? &&
      @training_data_options.include?(TRAINING_DATA_OVERSAMPLED)
  end

  # Returns a hash of the oversampled training data options, or an empty
  # hash if the training data is not oversampled.
  # Allowed options are -p (-percentage) and -u (-undersampling), e.g.
  # 'oversampled -p 200 -u true 70' results in
  # { percentage: 200.0, undersampling: 70.0 }.
  # Both values default to 100.0 when their option is not given.
  #
  # NOTE(review): '-u true' without a trailing number results in an
  # undersampling of 0.0, because 'true'.to_f is 0.0 - confirm this is
  # intended.
  def oversampling_options
    return {} unless oversampled_training_data?

    default_value = 100.0
    params = @training_data_options.gsub(TRAINING_DATA_OVERSAMPLED, '').split('-')

    percent_option = params.find { |param| param =~ /(p\s|percentage\s)\d+/i }
    undersampling_option = params.find { |param| param =~ /(u\s|undersampling\s)/i }

    percent = percent_option ? percent_option.split.last.to_f : default_value

    undersampling =
      if undersampling_option.nil?
        default_value
      elsif undersampling_option =~ /(true|t|yes|y)/i
        last_token = undersampling_option.split.last
        last_token.nil? ? default_value : last_token.to_f
      else
        0.0
      end

    { percentage: percent, undersampling: undersampling }
  end

  # Returns the path to the classification file.
  # Sub directories for the classifier and the training data options are
  # added automatically. Thus it results in
  # <output base dir>/<classifier name>/<training data options>/<file name>
  #
  # A missing training data option is treated as 'unbalanced', in line with
  # unbalanced_training_data?.
  def test_output_classification_file
    classification_file_name = @data['output']['test']['classification_file']
    classifier_name = @classifier_type.split('::').last.downcase
    training_options = @training_data_options || TRAINING_DATA_UNBALANCED

    File.join(
      @output_base_directory,
      classifier_name,
      training_options.gsub(/\s+/, '_'),
      classification_file_name
    )
  end

  # Returns the training arff file name.
  # The path is expanded by used classifier & options and is in the same
  # directory as the classification file.
  def training_output_arff_file
    File.join(File.dirname(test_output_classification_file), @training_arff_file_name)
  end

  # Returns the test arff file name.
  # The path is expanded by used classifier & options and is in the same
  # directory as the classification file.
  def test_output_arff_file
    File.join(File.dirname(test_output_classification_file), @test_arff_file_name)
  end

  # Returns file/path string for corpora files/directories and output files
  # after following schema: <corpus type>_<progress stage>_<file name>.
  #
  # Instead of 'corpora' the word 'corpus' is used for grammatical reasons.
  #
  # example:
  #   training_corpus_edits_file()
  #   test_output_index_file()
  #
  # Resolved paths are memoized in an instance variable named after the
  # method, and any existing instance variable can be read through its name.
  # Names that cannot be instance variable names (e.g. ending in '?' or '=')
  # are passed on to super and thus raise a NoMethodError.
  def method_missing(method_name, *args)
    name = method_name.to_s
    return super unless name =~ /\A[a-zA-Z_]\w*\z/

    ivar = "@#{name}"
    return instance_variable_get(ivar) if instance_variable_defined?(ivar)

    corpus_type, progress_stage, *file_parts = name.split('_')
    return super if file_parts.size < 2

    case progress_stage
    when 'corpus'
      progress_stage = 'corpora'
      path = File.join(
        @data[progress_stage]['base_directory'],
        @data[progress_stage][corpus_type]['base_directory']
      )
    when 'output'
      path = @output_base_directory
    else
      return super
    end

    file_key = file_parts.join('_')
    relative_path = File.join(path, @data[progress_stage][corpus_type][file_key])
    absolute_path = File.expand_path(relative_path, __FILE__)
    instance_variable_set(ivar, absolute_path)
  end

  # Companion of method_missing: reports the names readable through it, i.e.
  # existing instance variables and <corpus type>_<progress stage>_<file name>
  # names whose file name is a key of the respective config section.
  def respond_to_missing?(method_name, include_private = false)
    name = method_name.to_s
    return super unless name =~ /\A[a-zA-Z_]\w*\z/
    return true if instance_variable_defined?("@#{name}")

    corpus_type, progress_stage, *file_parts = name.split('_')
    section_key = { 'corpus' => 'corpora', 'output' => 'output' }[progress_stage]
    return super if section_key.nil? || file_parts.size < 2

    section = @data.is_a?(Hash) ? @data[section_key] : nil
    entry = section.is_a?(Hash) ? section[corpus_type] : nil
    (entry.is_a?(Hash) && entry.key?(file_parts.join('_'))) || super
  end
end
|
|
184
|
+
|
|
185
|
+
# This class represents the default config which is merged with the
# customized config from the config YAML file.
class DefaultConfiguration < Hash
  # Default values for all configuration keys.
  # Only the outer hash is frozen; nested hashes and arrays stay mutable.
  DEFAULTS = {
    'source' => Dir.pwd,
    'features' => [
      'anonymity',
      'anonymity previous',
      'all wordlists frequency',
      'all wordlists impact',
      'article size',
      'bad frequency',
      'bad impact',
      'biased frequency',
      'biased impact',
      'blanking',
      'character sequence',
      'character diversity',
      'comment length',
      'comment biased frequency',
      'comment pronoun frequency',
      'comment vulgarism frequency',
      'compressibility',
      'copyedit',
      'digit ratio',
      'edits per user',
      'emoticons frequency',
      'emoticons impact',
      'inserted size',
      'inserted words',
      'inserted character distribution',
      'inserted external links',
      'inserted internal links',
      'longest word',
      'markup frequency',
      'markup impact',
      'non-alphanumeric ratio',
      'personal life',
      'pronoun frequency',
      'pronoun impact',
      'removed size',
      'removed words',
      'removed all wordlists frequency',
      'removed bad frequency',
      'removed biased frequency',
      'removed character distribution',
      'removed emoticons frequency',
      'removed markup frequency',
      'removed pronoun frequency',
      'removed sex frequency',
      'removed vulgarism frequency',
      'replacement similarity',
      'reverted',
      'revisions character distribution',
      'sex frequency',
      'sex impact',
      'same editor',
      'size increment',
      'size ratio',
      'term frequency',
      'time interval',
      'time of day',
      'upper case ratio',
      'upper case words ratio',
      'upper to lower case ratio',
      'vulgarism frequency',
      'vulgarism impact',
      'weekday',
      'words increment'
    ],
    'corpora' => {
      'base_directory' => nil,
      'training' => {
        'base_directory' => nil,
        'edits_file' => nil,
        'annotations_file' => nil,
        'revisions_directory' => nil
      },
      'test' => {
        'base_directory' => nil,
        'edits_file' => nil,
        'revisions_directory' => nil,
        'ground_truth_file' => nil
      }
    },
    'output' => {
      'base_directory' => File.join(Dir.pwd, 'build'),
      'training' => {
        'arff_file' => 'training.arff',
        'index_file' => 'training_index.yml'
      },
      'test' => {
        'arff_file' => 'test.arff',
        'index_file' => 'test_index.yml',
        'classification_file' => 'classification.txt'
      }
    },
    'classifier' => {
      'type' => nil,
      'options' => nil,
      'cross-validation-fold' => 10,
      'training-data-options' => 'unbalanced',
      'replace-missing-values' => nil
    }
  }.freeze

  # Returns the default source directory, i.e. the working directory at the
  # time this class was loaded.
  def source
    DEFAULTS['source']
  end

  # Returns the path of the custom config file. Candidates, in this order:
  #   1. <source>/config/<config file>
  #   2. <source>/lib/config/<config file>
  #   3. config/<config file> below this file's directory or below the
  #      nearest of its parent directories that contains it
  # If no candidate exists, the file system root directory is returned
  # (see #find_first_parent_path_for).
  def config_file
    config_file_path = "config/#{Configuration::CONFIG_FILE}"
    candidates = [
      File.join(source, config_file_path),
      File.join(source, "lib/#{config_file_path}")
    ]

    # The parent directory walk hits the file system for every level, so it
    # is only done when neither candidate below the source directory exists.
    candidates.find { |candidate| File.exist?(candidate) } ||
      find_first_parent_path_for(
        File.expand_path(File.dirname(__FILE__)),
        config_file_path
      )
  end

  # Loads and returns the content of the given YAML config file.
  # If the file does not exist or its path does not contain the config file
  # name, a warning is printed and nil is returned.
  #
  # NOTE(review): YAML.load_file is used, so the file must be trusted.
  def load_config_file(file)
    config_file = Configuration::CONFIG_FILE

    # Plain substring test: interpolating the name into a regexp would let
    # its '.' match any character.
    if File.exist?(file) && file.include?(config_file)
      YAML.load_file(file)
    else
      warn %(

        Configuration file not found in
        #{source}/config,
        #{source}/lib/config directory
        or any other parent path.

        To customize the system, create a '#{config_file}' file.

      )
    end
  end

  private

  # Looks for `file` relative to start_path and then relative to each of its
  # parent directories. Returns the first existing path. If there is none,
  # the topmost directory reached (the file system root) is returned.
  def find_first_parent_path_for(start_path, file)
    directory = start_path

    loop do
      candidate = File.join(directory, file)
      return candidate if File.exist?(candidate)

      parent = File.dirname(directory)
      # dirname of the root is the root itself: nothing left to search.
      return directory if parent == directory

      directory = parent
    end
  end
end
|
|
349
|
+
end
|
|
350
|
+
end
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
module Wikipedia
  module VandalismDetection
    require 'java'
    require 'java/diffutils-1.3.0.jar'

    java_import 'difflib.DiffUtils'

    # Word based diff of two texts, computed with the DiffUtils Java library.
    class Diff
      # Both texts are cleaned and split at whitespace before the patch
      # between the resulting word lists is computed.
      def initialize(original, current)
        @original = clean_text(original)
        @current = clean_text(current)
        @patch = DiffUtils.diff(@original.split, @current.split)
      end

      # Returns the words of the revised side of all deltas of the patch.
      def inserted_words
        words_of_deltas { |delta| delta.revised }
      end

      # Returns the words of the original side of all deltas of the patch.
      def removed_words
        words_of_deltas { |delta| delta.original }
      end

      private

      # Collects the lines of the chunk the block selects from every delta
      # into one flat list.
      def words_of_deltas
        chunk_lines = @patch.deltas.map { |delta| yield(delta).lines }
        chunk_lines.flatten
      end

      # Re-encodes the text from binary to UTF-8 and drops every byte that
      # is invalid or has no UTF-8 counterpart.
      def clean_text(text)
        replacement_options = { invalid: :replace, undef: :replace, replace: '' }
        text.encode('UTF-8', 'binary', **replacement_options)
      end
    end
  end
end
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/diff'
|
|
2
|
+
require 'wikipedia/vandalism_detection/text'
|
|
3
|
+
require 'wikipedia/vandalism_detection/page'
|
|
4
|
+
|
|
5
|
+
module Wikipedia
  module VandalismDetection
    # An edit is a pair of two sequent revisions (old and new) together with
    # the page it belongs to.
    class Edit
      attr_reader :old_revision, :new_revision
      attr_accessor :page

      # Creates an edit from the given revisions.
      # attributes may contain a :page; a new Page is used otherwise.
      # Raises an ArgumentError if the old revision's id is not the parent id
      # of the new revision.
      def initialize(old_revision, new_revision, attributes = {})
        unless sequent?(old_revision, new_revision)
          # Built only on failure and kept on one line, so that the exception
          # text contains no line break or source indentation.
          message = "old revision: #{old_revision.id} | " \
                    "parent: #{old_revision.parent_id}, " \
                    "new revision: #{new_revision.id} | " \
                    "parent: #{new_revision.parent_id}"

          raise ArgumentError, "Revisions are not sequent: #{message}."
        end

        @old_revision = old_revision
        @new_revision = new_revision
        @page = attributes[:page] || Page.new
      end

      # Returns a string of the given attributes' values of both revisions.
      # The values of one revision are joined by ',' and the old and the new
      # revision are separated by a tab. Attributes a revision does not
      # respond to are skipped.
      def serialize(*attributes)
        [@old_revision, @new_revision]
          .map { |revision| serialize_revision(revision, attributes) }
          .join("\t")
      end

      # Returns an array of the words inserted in the new revision compared
      # with the old one.
      def inserted_words
        @inserted_words ||= diff.inserted_words
      end

      # Returns a Text of the words inserted in the new revision compared with
      # the old one.
      def inserted_text
        @inserted_text ||= Text.new(inserted_words.join(' '))
      end

      # Returns an array of the words removed in the new revision compared
      # with the old one.
      def removed_words
        @removed_words ||= diff.removed_words
      end

      # Returns a Text of the words removed in the new revision compared with
      # the old one.
      def removed_text
        @removed_text ||= Text.new(removed_words.join(' '))
      end

      protected

      # Returns whether the given revisions are sequent, i.e. the old
      # revision's id is the new revision's parent id.
      def sequent?(old_revision, new_revision)
        new_revision.parent_id == old_revision.id
      end

      private

      # Returns the memoized diff between the old and the new revision text.
      def diff
        @diff ||= Diff.new(@old_revision.text, @new_revision.text)
      end

      # Joins the values of all attributes the revision responds to by ','.
      def serialize_revision(revision, attributes)
        values = attributes.each_with_object([]) do |attribute, parts|
          next unless revision.respond_to?(attribute)

          parts << revision.public_send(attribute)
        end

        values.join(',')
      end
    end
  end
end
|