wikipedia-vandalism_detection 0.1.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +19 -0
- data/.rspec +2 -0
- data/.rubocop.yml +35 -0
- data/.travis.yml +6 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +4 -0
- data/README.md +288 -0
- data/Rakefile +11 -0
- data/config/wikipedia-vandalism-detection.yml.example +103 -0
- data/lib/java/SMOTE.jar +0 -0
- data/lib/java/balancedRandomForest.jar +0 -0
- data/lib/java/diffutils-1.3.0.jar +0 -0
- data/lib/java/oneClassClassifier.jar +0 -0
- data/lib/java/realAdaBoost.jar +0 -0
- data/lib/java/swc-engine-1.1.0-jar-with-dependencies.jar +0 -0
- data/lib/java/sweble-wikitext-extractor.jar +0 -0
- data/lib/weka/classifiers/meta/one_class_classifier.rb +21 -0
- data/lib/weka/classifiers/meta/real_ada_boost.rb +15 -0
- data/lib/weka/classifiers/trees/balanced_random_forest.rb +16 -0
- data/lib/weka/filters/supervised/instance/smote.rb +15 -0
- data/lib/wikipedia/vandalism_detection/algorithms/kullback_leibler_divergence.rb +103 -0
- data/lib/wikipedia/vandalism_detection/algorithms.rb +12 -0
- data/lib/wikipedia/vandalism_detection/classifier.rb +202 -0
- data/lib/wikipedia/vandalism_detection/configuration.rb +350 -0
- data/lib/wikipedia/vandalism_detection/diff.rb +36 -0
- data/lib/wikipedia/vandalism_detection/edit.rb +81 -0
- data/lib/wikipedia/vandalism_detection/evaluator.rb +640 -0
- data/lib/wikipedia/vandalism_detection/exceptions.rb +47 -0
- data/lib/wikipedia/vandalism_detection/feature_calculator.rb +94 -0
- data/lib/wikipedia/vandalism_detection/features/all_wordlists_frequency.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/all_wordlists_impact.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/anonymity.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/anonymity_previous.rb +32 -0
- data/lib/wikipedia/vandalism_detection/features/article_size.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/bad_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/bad_impact.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/base.rb +61 -0
- data/lib/wikipedia/vandalism_detection/features/biased_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/biased_impact.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/blanking.rb +26 -0
- data/lib/wikipedia/vandalism_detection/features/character_diversity.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/character_sequence.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/comment_bad_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/comment_biased_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/comment_length.rb +16 -0
- data/lib/wikipedia/vandalism_detection/features/comment_markup_frequency.rb +30 -0
- data/lib/wikipedia/vandalism_detection/features/comment_pronoun_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/comment_sex_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/comment_vulgarism_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/compressibility.rb +31 -0
- data/lib/wikipedia/vandalism_detection/features/contains_base.rb +16 -0
- data/lib/wikipedia/vandalism_detection/features/copyedit.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/digit_ratio.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/edits_per_user.rb +72 -0
- data/lib/wikipedia/vandalism_detection/features/emoticons_frequency.rb +27 -0
- data/lib/wikipedia/vandalism_detection/features/emoticons_impact.rb +29 -0
- data/lib/wikipedia/vandalism_detection/features/frequency_base.rb +18 -0
- data/lib/wikipedia/vandalism_detection/features/impact_base.rb +25 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_character_distribution.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_external_links.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_internal_links.rb +18 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_size.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_words.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/longest_word.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/markup_frequency.rb +29 -0
- data/lib/wikipedia/vandalism_detection/features/markup_impact.rb +30 -0
- data/lib/wikipedia/vandalism_detection/features/non_alphanumeric_ratio.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/personal_life.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/pronoun_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/pronoun_impact.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/removed_all_wordlists_frequency.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/removed_bad_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_biased_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_character_distribution.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/removed_emoticons_frequency.rb +28 -0
- data/lib/wikipedia/vandalism_detection/features/removed_markup_frequency.rb +30 -0
- data/lib/wikipedia/vandalism_detection/features/removed_pronoun_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_sex_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_size.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/removed_vulgarism_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_words.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/replacement_similarity.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/reverted.rb +16 -0
- data/lib/wikipedia/vandalism_detection/features/revisions_character_distribution.rb +25 -0
- data/lib/wikipedia/vandalism_detection/features/same_editor.rb +32 -0
- data/lib/wikipedia/vandalism_detection/features/sex_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/sex_impact.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/size_increment.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/size_ratio.rb +28 -0
- data/lib/wikipedia/vandalism_detection/features/term_frequency.rb +31 -0
- data/lib/wikipedia/vandalism_detection/features/time_interval.rb +38 -0
- data/lib/wikipedia/vandalism_detection/features/time_of_day.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/upper_case_ratio.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/upper_case_words_ratio.rb +33 -0
- data/lib/wikipedia/vandalism_detection/features/upper_to_lower_case_ratio.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/vulgarism_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/vulgarism_impact.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/weekday.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/words_increment.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features.rb +66 -0
- data/lib/wikipedia/vandalism_detection/instances.rb +121 -0
- data/lib/wikipedia/vandalism_detection/page.rb +101 -0
- data/lib/wikipedia/vandalism_detection/page_parser.rb +63 -0
- data/lib/wikipedia/vandalism_detection/revision.rb +75 -0
- data/lib/wikipedia/vandalism_detection/revision_parser.rb +67 -0
- data/lib/wikipedia/vandalism_detection/test_dataset.rb +374 -0
- data/lib/wikipedia/vandalism_detection/text.rb +23 -0
- data/lib/wikipedia/vandalism_detection/training_dataset.rb +282 -0
- data/lib/wikipedia/vandalism_detection/version.rb +5 -0
- data/lib/wikipedia/vandalism_detection/wikitext_extractor.rb +80 -0
- data/lib/wikipedia/vandalism_detection/word_lists/bad.rb +11 -0
- data/lib/wikipedia/vandalism_detection/word_lists/biased.rb +20 -0
- data/lib/wikipedia/vandalism_detection/word_lists/emoticons.rb +26 -0
- data/lib/wikipedia/vandalism_detection/word_lists/markup.rb +19 -0
- data/lib/wikipedia/vandalism_detection/word_lists/pronouns.rb +11 -0
- data/lib/wikipedia/vandalism_detection/word_lists/sex.rb +10 -0
- data/lib/wikipedia/vandalism_detection/word_lists/vulgarism.rb +96 -0
- data/lib/wikipedia/vandalism_detection/word_lists.rb +17 -0
- data/lib/wikipedia/vandalism_detection.rb +29 -0
- data/lib/wikipedia.rb +41 -0
- data/spec/factories/edit.rb +19 -0
- data/spec/factories/page.rb +12 -0
- data/spec/factories/revision.rb +51 -0
- data/spec/resources/config/wikipedia-vandalism-detection.yml +35 -0
- data/spec/resources/corpora/test/edits.csv +8 -0
- data/spec/resources/corpora/test/ground-truth.txt +3 -0
- data/spec/resources/corpora/test/revisions/part-1/326471754.txt +199 -0
- data/spec/resources/corpora/test/revisions/part-1/326873205.txt +137 -0
- data/spec/resources/corpora/test/revisions/part-1/328774035.txt +162 -0
- data/spec/resources/corpora/test/revisions/part-2/307084144.txt +137 -0
- data/spec/resources/corpora/test/revisions/part-2/326978767.txt +199 -0
- data/spec/resources/corpora/test/revisions/part-2/328774110.txt +162 -0
- data/spec/resources/corpora/training/annotations.csv +7 -0
- data/spec/resources/corpora/training/edits.csv +7 -0
- data/spec/resources/corpora/training/revisions/part-1/326471754.txt +199 -0
- data/spec/resources/corpora/training/revisions/part-1/326873205.txt +137 -0
- data/spec/resources/corpora/training/revisions/part-1/328774035.txt +162 -0
- data/spec/resources/corpora/training/revisions/part-2/307084144.txt +137 -0
- data/spec/resources/corpora/training/revisions/part-2/326978767.txt +199 -0
- data/spec/resources/corpora/training/revisions/part-2/328774110.txt +162 -0
- data/spec/resources/page_with_redirects.xml +85 -0
- data/spec/resources/redirect_page.xml +59 -0
- data/spec/resources/revision_simplified.xml +13 -0
- data/spec/resources/sample_revision.txt +137 -0
- data/spec/resources/sample_revision_clean_text.txt +1 -0
- data/spec/resources/sample_revision_plain_text.txt +183 -0
- data/spec/resources/vandalism_on_wikipedia.xml +234 -0
- data/spec/resources/vandalism_on_wikipedia_simplified.xml +119 -0
- data/spec/resources/wikipedia_tokens.txt +30 -0
- data/spec/spec_helper.rb +38 -0
- data/spec/support/macros/file_reading.rb +6 -0
- data/spec/support/macros/test_configuration.rb +81 -0
- data/spec/vandalism_detection/algorithms/kullback_leibler_divergence_spec.rb +34 -0
- data/spec/vandalism_detection/classifier_spec.rb +330 -0
- data/spec/vandalism_detection/configuration_spec.rb +601 -0
- data/spec/vandalism_detection/diff_spec.rb +40 -0
- data/spec/vandalism_detection/edit_spec.rb +122 -0
- data/spec/vandalism_detection/evaluator_spec.rb +711 -0
- data/spec/vandalism_detection/feature_calculator_spec.rb +135 -0
- data/spec/vandalism_detection/features/all_wordlists_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/all_wordlists_impact_spec.rb +52 -0
- data/spec/vandalism_detection/features/anonymity_previous_spec.rb +67 -0
- data/spec/vandalism_detection/features/anonymity_spec.rb +17 -0
- data/spec/vandalism_detection/features/article_size_spec.rb +29 -0
- data/spec/vandalism_detection/features/bad_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/bad_impact_spec.rb +53 -0
- data/spec/vandalism_detection/features/base_spec.rb +41 -0
- data/spec/vandalism_detection/features/biased_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/biased_impact_spec.rb +52 -0
- data/spec/vandalism_detection/features/blanking_spec.rb +35 -0
- data/spec/vandalism_detection/features/character_diversity_spec.rb +30 -0
- data/spec/vandalism_detection/features/character_sequence_spec.rb +31 -0
- data/spec/vandalism_detection/features/comment_bad_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_biased_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_length_spec.rb +21 -0
- data/spec/vandalism_detection/features/comment_markup_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_pronoun_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_sex_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_vulgarism_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/compressibility_spec.rb +38 -0
- data/spec/vandalism_detection/features/contains_base_spec.rb +27 -0
- data/spec/vandalism_detection/features/copyedit_spec.rb +30 -0
- data/spec/vandalism_detection/features/digit_ratio_spec.rb +30 -0
- data/spec/vandalism_detection/features/edits_per_user_spec.rb +51 -0
- data/spec/vandalism_detection/features/emoticons_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/emoticons_impact_spec.rb +45 -0
- data/spec/vandalism_detection/features/frequency_base_spec.rb +20 -0
- data/spec/vandalism_detection/features/impact_base_spec.rb +36 -0
- data/spec/vandalism_detection/features/inserted_character_distribution_spec.rb +40 -0
- data/spec/vandalism_detection/features/inserted_external_links_spec.rb +29 -0
- data/spec/vandalism_detection/features/inserted_internal_links_spec.rb +29 -0
- data/spec/vandalism_detection/features/inserted_size_spec.rb +29 -0
- data/spec/vandalism_detection/features/inserted_words_spec.rb +29 -0
- data/spec/vandalism_detection/features/longest_word_spec.rb +29 -0
- data/spec/vandalism_detection/features/markup_frequency_spec.rb +31 -0
- data/spec/vandalism_detection/features/markup_impact_spec.rb +53 -0
- data/spec/vandalism_detection/features/non_alphanumeric_ratio_spec.rb +30 -0
- data/spec/vandalism_detection/features/personal_life_spec.rb +22 -0
- data/spec/vandalism_detection/features/pronoun_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/pronoun_impact_spec.rb +53 -0
- data/spec/vandalism_detection/features/removed_all_wordlists_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_bad_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_biased_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_character_distribution_spec.rb +40 -0
- data/spec/vandalism_detection/features/removed_emoticons_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_markup_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_pronoun_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_sex_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_size_spec.rb +29 -0
- data/spec/vandalism_detection/features/removed_vulgarism_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_words_spec.rb +29 -0
- data/spec/vandalism_detection/features/replacement_similarity_spec.rb +37 -0
- data/spec/vandalism_detection/features/reverted_spec.rb +24 -0
- data/spec/vandalism_detection/features/revisions_character_distribution_spec.rb +40 -0
- data/spec/vandalism_detection/features/same_editor_spec.rb +71 -0
- data/spec/vandalism_detection/features/sex_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/sex_impact_spec.rb +53 -0
- data/spec/vandalism_detection/features/size_increment_spec.rb +29 -0
- data/spec/vandalism_detection/features/size_ratio_spec.rb +48 -0
- data/spec/vandalism_detection/features/term_frequency_spec.rb +32 -0
- data/spec/vandalism_detection/features/time_interval_spec.rb +56 -0
- data/spec/vandalism_detection/features/time_of_day_spec.rb +16 -0
- data/spec/vandalism_detection/features/upper_case_ratio_spec.rb +30 -0
- data/spec/vandalism_detection/features/upper_case_words_ratio_spec.rb +33 -0
- data/spec/vandalism_detection/features/upper_to_lower_case_ratio_spec.rb +30 -0
- data/spec/vandalism_detection/features/vulgarism_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/vulgarism_impact_spec.rb +52 -0
- data/spec/vandalism_detection/features/weekday_spec.rb +16 -0
- data/spec/vandalism_detection/features/words_increment_spec.rb +29 -0
- data/spec/vandalism_detection/instances_spec.rb +146 -0
- data/spec/vandalism_detection/page_parser_spec.rb +190 -0
- data/spec/vandalism_detection/page_spec.rb +134 -0
- data/spec/vandalism_detection/revision_parser_spec.rb +53 -0
- data/spec/vandalism_detection/revision_spec.rb +148 -0
- data/spec/vandalism_detection/test_dataset_spec.rb +227 -0
- data/spec/vandalism_detection/text_spec.rb +29 -0
- data/spec/vandalism_detection/training_dataset_spec.rb +266 -0
- data/spec/vandalism_detection/wikitext_extractor_spec.rb +97 -0
- data/spec/weka/classifiers/meta/one_class_classifier_spec.rb +82 -0
- data/spec/weka/classifiers/meta/real_ada_boost_spec.rb +37 -0
- data/spec/weka/classifiers/trees/balanced_random_forest_spec.rb +37 -0
- data/spec/weka/filters/supervised/instance/smote_spec.rb +5 -0
- data/wikipedia-vandalism_detection.gemspec +37 -0
- metadata +550 -0
|
@@ -0,0 +1,374 @@
|
|
|
1
|
+
require 'find'
|
|
2
|
+
require 'yaml'
|
|
3
|
+
require 'fileutils'
|
|
4
|
+
require 'csv'
|
|
5
|
+
require 'weka'
|
|
6
|
+
|
|
7
|
+
require 'wikipedia/vandalism_detection/configuration'
|
|
8
|
+
require 'wikipedia/vandalism_detection/text'
|
|
9
|
+
require 'wikipedia/vandalism_detection/revision'
|
|
10
|
+
require 'wikipedia/vandalism_detection/edit'
|
|
11
|
+
require 'wikipedia/vandalism_detection/feature_calculator'
|
|
12
|
+
require 'wikipedia/vandalism_detection/instances'
|
|
13
|
+
require 'wikipedia/vandalism_detection/wikitext_extractor'
|
|
14
|
+
|
|
15
|
+
module Wikipedia
  module VandalismDetection
    # This class provides methods for getting and creating a test ARFF file
    # from a configured test corpus.
    class TestDataset
      class << self
        # Builds and returns an instance dataset from the configured gold
        # annotation file using the configured features.
        #
        # Feature values are computed edit by edit and appended to one ARFF
        # file per feature; afterwards those per-feature files are merged
        # into a single dataset carrying the ground truth class attribute.
        #
        # Raises EditsFileNotConfiguredError when no edits file is configured.
        def build
          @config = Wikipedia::VandalismDetection.config
          print "\ncreating test dataset…"

          edits_file = @config.test_corpus_edits_file
          raise EditsFileNotConfiguredError unless edits_file

          edits = CSV.parse(File.read(edits_file), headers: true)

          output_directory = File.join(@config.output_base_directory, 'test')
          # mkdir_p creates missing parents, so this also ensures the base
          # output directory exists (no separate check needed).
          FileUtils.mkdir_p(output_directory) unless Dir.exist?(output_directory)

          # Create a hash of feature name => open IO handle. A feature whose
          # ARFF file already exists is skipped (treated as already computed).
          feature_files = @config.features.each_with_object({}) do |feature_name, hash|
            file_name = "#{feature_name.tr(' ', '_').downcase}.arff"
            arff_file = File.join(output_directory, file_name)

            next if File.exist?(arff_file)

            dataset = Instances.empty_for_test_feature(feature_name)
            dataset.to_arff(arff_file)
            hash[feature_name] = File.open(arff_file, 'a')
          end

          feature_calculator = FeatureCalculator.new

          unless feature_files.empty?
            processed_edits = 0
            edits_count = edits.count

            edits.each do |edit_data|
              old_revision_id = edit_data['oldrevisionid']
              new_revision_id = edit_data['newrevisionid']

              processed_edits += 1
              print_progress(processed_edits, edits_count, 'computing test features')

              # Only edits whose revisions both appear in the gold annotation
              # file belong to the test set.
              next unless annotated_revision?(old_revision_id) &&
                          annotated_revision?(new_revision_id)

              edit = create_edit_from(edit_data)

              feature_files.each do |feature_name, file|
                value = feature_calculator.calculate_feature_for(edit, feature_name)
                file.puts [value, old_revision_id, new_revision_id].join(',')
              end
            end

            # close all io objects
            feature_files.each_value(&:close)
          end

          merge_feature_arffs(@config.features, output_directory)
        end

        alias instances build

        # Saves and returns the dataset as ARFF file.
        # As test data the configured data corpus from
        # /config/wikipedia-vandalism-detection.yml is used.
        def build!
          @config = Wikipedia::VandalismDetection.config

          dataset = instances
          output_file = @config.test_output_arff_file

          dataset.to_arff(output_file)
          puts "\n'#{File.basename(output_file)}' saved to #{File.dirname(output_file)}"

          dataset
        end

        # Loads arff files of given features and merges them into one arff
        # file. Returns the merged dataset (including the class attribute).
        def merge_feature_arffs(features, output_directory)
          # '-R last' removes the last attribute; it is applied twice per
          # merge step to drop the two revision-id columns of the already
          # merged part (the freshly appended feature brings its own ids).
          filter = Weka::Filters::Unsupervised::Attribute::Remove.new
          filter.use_options('-R last')

          merged_dataset = nil

          features.each do |feature_name|
            file_name = "#{feature_name.tr(' ', '_').downcase}.arff"
            arff_file = File.join(output_directory, file_name)

            feature_dataset = Weka::Core::Instances.from_arff(arff_file)
            print '.'

            if merged_dataset
              merged_dataset = merged_dataset.apply_filters(filter, filter)
              merged_dataset = Weka::Core::Instances.merge_instances(merged_dataset, feature_dataset)
            else
              merged_dataset = feature_dataset
            end
          end

          add_ground_truth_class_to(merged_dataset)
        end

        # Adds the ground truth class attribute and values to the given
        # dataset and returns the merged dataset.
        #
        # The class column is cached in test/class.arff; when that file
        # exists it is reused instead of re-reading the ground truth file.
        def add_ground_truth_class_to(dataset)
          config = Wikipedia::VandalismDetection.config

          arff_file = File.join(config.output_base_directory, 'test', 'class.arff')
          class_dataset = Instances.empty_for_test_class

          if File.exist?(arff_file)
            class_dataset = Weka::Core::Instances.from_arff(arff_file)
          else
            ground_truth_file_path = config.test_corpus_ground_truth_file
            ground_truth = ground_truth_hash(ground_truth_file_path)

            dataset.each do |instance|
              # The last two attributes hold the edit's revision ids.
              old_revision_id = instance.values[-2].to_i
              new_revision_id = instance.values[-1].to_i
              key = :"#{old_revision_id}-#{new_revision_id}"

              if ground_truth.key?(key)
                class_value = Instances::CLASSES[Instances::CLASSES_SHORT.key(ground_truth[key][:class])]
                class_dataset.add_instance([class_value || '?'])
              else
                class_dataset.add_instance(['?']) # missing
              end
            end

            class_dataset.to_arff(arff_file)
            puts "saved #{File.basename(arff_file)} to #{File.dirname(arff_file)}"
          end

          # StandardError instead of the bare Exception class, so a default
          # rescue clause can catch it.
          if dataset.size != class_dataset.size
            raise StandardError, "Different size: #{dataset.size} vs. #{class_dataset.size}"
          end

          dataset.merge(class_dataset)
        end

        # Returns a hash for classification data from given ground truth
        # file. Keys are :"<old_revision_id>-<new_revision_id>" symbols.
        def ground_truth_hash(ground_truth_file)
          ground_truth = {}

          File.read(ground_truth_file).each_line do |line|
            line_parts = line.split(' ')

            old_revision_id = line_parts[0].to_i
            new_revision_id = line_parts[1].to_i
            class_short = line_parts[2]

            ground_truth[:"#{old_revision_id}-#{new_revision_id}"] = {
              old_revision_id: old_revision_id,
              new_revision_id: new_revision_id,
              class: class_short
            }
          end

          ground_truth
        end

        # Saves and returns a file index hash of structure
        # [file_name => full_path] for the given directory.
        #
        # Raises RevisionsDirectoryNotConfiguredError when no revisions
        # directory is configured.
        def create_corpus_file_index!
          @config = Wikipedia::VandalismDetection.config
          revisions_directory = @config.test_corpus_revisions_directory

          raise RevisionsDirectoryNotConfiguredError unless revisions_directory

          print "\nCreating test corpus index file…"
          file_index = {}

          Dir.open(revisions_directory) do |part_directories|
            part_directories.each do |part_directory|
              Dir.open "#{revisions_directory}/#{part_directory}" do |contents|
                contents.each do |file|
                  path = "#{revisions_directory}/#{part_directory}/#{file}"

                  # Anchored and escaped: only files named exactly
                  # "<digits>.txt" (the former /\d+.txt/ also matched names
                  # like "123xtxt.bak").
                  if File.file?(path) && file.match?(/\A\d+\.txt\z/) && annotated_revision?(file)
                    file_index[file] = path
                    print "\r processed #{file_index.count} files"
                  end
                end
              end
            end
          end

          file = @config.test_output_index_file
          dirname = File.dirname(file)
          # mkdir_p also creates missing parent directories.
          FileUtils.mkdir_p(dirname) unless Dir.exist?(dirname)

          written = File.open(file, 'w') { |f| f.write(file_index.to_yaml) }
          print "\nSaved test corpus index file to #{file}.\n" if written > 0

          file_index
        end

        # Returns the Edit with the given revision ids, or nil if the edit
        # is not part of the annotated test corpus.
        # Test corpus is searched for the revisions' data.
        def edit(old_revision_id, new_revision_id)
          @config = Wikipedia::VandalismDetection.config
          edits_file = @config.test_corpus_edits_file
          raise EditsFileNotConfiguredError unless edits_file

          @edits_csv ||= CSV.parse(File.read(edits_file), headers: true)

          edit_data = @edits_csv.find do |row|
            row['oldrevisionid'] == old_revision_id &&
              row['newrevisionid'] == new_revision_id
          end

          return unless edit_data
          return unless annotated_revision?(old_revision_id)
          return unless annotated_revision?(new_revision_id)

          create_edit_from(edit_data)
        end

        private

        # Returns whether the given revision (file name or id) is annotated
        # in the configured gold annotation file.
        def annotated_revision?(revision_file_or_id)
          @annotated_revisions ||= annotated_revisions

          revision_id = revision_file_or_id.to_s.gsub('.txt', '')
          @annotated_revisions[revision_id.to_sym]
        end

        # Returns a Hash mapping every revision id that occurs in the ground
        # truth file (as symbol) to true. Memoization is the caller's
        # concern (see #annotated_revision?).
        def annotated_revisions
          annotations_file = @config.test_corpus_ground_truth_file
          revision_ids = {}

          File.read(annotations_file).each_line do |annotation|
            data = annotation.split(' ')

            revision_ids[data[0].to_sym] = true
            revision_ids[data[1].to_sym] = true
          end

          revision_ids
        end

        # Removes all instances with missing attributes and returns the
        # dataset.
        def remove_missing(dataset)
          dataset.each_attribute do |attribute|
            dataset.delete_with_missing(attribute)
          end

          dataset
        end

        # Returns the normalized dataset (important for lib svm one class
        # classification). Only the leading numeric feature columns are
        # normalized; the remaining (id) columns are merged back afterwards.
        def normalize(dataset)
          remove = Weka::Filters::Unsupervised::Attribute::Remove.new
          remove.use_options("-V -R 1-#{@config.features.count}")
          numerics_dataset = remove.filter(dataset)

          remove.use_options("-R 1-#{@config.features.count}")
          non_numerics_dataset = remove.filter(dataset)

          normalize = Weka::Filters::Unsupervised::Attribute::Normalize.new
          normalized_dataset = normalize.filter(numerics_dataset)

          normalized_dataset.merge(non_numerics_dataset)
        end

        # Creates a Wikipedia::Edit out of an edit's data from edit_file
        # configured in wikipedia-vandalism-detection.yml.
        #
        # Raises RevisionFileNotFound when a revision file is missing from
        # the index or from disk (instead of a TypeError on a nil path).
        def create_edit_from(edit_data)
          @file_index ||= load_corpus_file_index

          old_revision_id = edit_data['oldrevisionid'].to_i
          new_revision_id = edit_data['newrevisionid'].to_i

          editor = edit_data['editor']
          comment = edit_data['editcomment']
          new_timestamp = edit_data['edittime']
          page_id = edit_data['articleid']
          page_title = edit_data['articletitle']

          old_revision_file = @file_index["#{old_revision_id}.txt"]
          new_revision_file = @file_index["#{new_revision_id}.txt"]

          unless old_revision_file && File.exist?(old_revision_file)
            message = "Old revision file #{old_revision_file} not found"
            raise RevisionFileNotFound, message
          end

          unless new_revision_file && File.exist?(new_revision_file)
            message = "New revision file #{new_revision_file} not found"
            raise RevisionFileNotFound, message
          end

          old_revision_text = File.read(old_revision_file)
          new_revision_text = File.read(new_revision_file)

          old_revision = Revision.new
          old_revision.id = old_revision_id
          old_revision.text = Text.new(old_revision_text)

          new_revision = Revision.new
          new_revision.id = new_revision_id
          new_revision.text = Text.new(new_revision_text)
          new_revision.parent_id = old_revision_id
          new_revision.comment = Text.new(comment)
          new_revision.contributor = editor
          new_revision.timestamp = new_timestamp

          page = Page.new
          page.id = page_id
          page.title = page_title

          Edit.new(old_revision, new_revision, page: page)
        end

        # Gets or creates the corpus index file, which holds a hash of
        # revision files name and their path in the article revisions
        # directory.
        def load_corpus_file_index
          index_file = @config.test_output_index_file

          if File.exist?(index_file)
            puts " (Using #{index_file}) \n"
            YAML.load_file(index_file)
          else
            create_corpus_file_index!
          end
        end

        # Prints the progress to the $stdout.
        def print_progress(processed_count, total_count, message)
          processed_absolute = "#{processed_count}/#{total_count}"
          processed_percentage = ((processed_count * 100.00) / total_count).round(2)
          print "\r#{message}… #{processed_absolute} | #{format('%.2f', processed_percentage)}%"
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/wikitext_extractor'
|
|
2
|
+
|
|
3
|
+
module Wikipedia
  module VandalismDetection
    # A String subclass whose content is sanitized to valid UTF-8 on
    # construction: the input's bytes are re-encoded and any byte that
    # cannot be represented in UTF-8 is silently dropped.
    class Text < String
      def initialize(text = '')
        utf8_text = text.encode(
          'UTF-8',
          'binary',
          invalid: :replace,
          undef: :replace,
          replace: ''
        )
        super(utf8_text)
      end

      # Extracts the plaintext from the mediawiki markup and removes all
      # line breaks & multiple spaces. Memoized per instance; returns the
      # cleaned plaintext.
      def clean
        @clean ||= WikitextExtractor.extract_clean(self)
      end
    end
  end
end
|
|
@@ -0,0 +1,282 @@
|
|
|
1
|
+
require 'find'
|
|
2
|
+
require 'yaml'
|
|
3
|
+
require 'fileutils'
|
|
4
|
+
require 'active_support/core_ext/string'
|
|
5
|
+
require 'weka'
|
|
6
|
+
require 'parallel'
|
|
7
|
+
|
|
8
|
+
require 'wikipedia/vandalism_detection/configuration'
|
|
9
|
+
require 'wikipedia/vandalism_detection/text'
|
|
10
|
+
require 'wikipedia/vandalism_detection/revision'
|
|
11
|
+
require 'wikipedia/vandalism_detection/edit'
|
|
12
|
+
require 'wikipedia/vandalism_detection/feature_calculator'
|
|
13
|
+
require 'wikipedia/vandalism_detection/instances'
|
|
14
|
+
require 'wikipedia/vandalism_detection/wikitext_extractor'
|
|
15
|
+
require 'weka/filters/supervised/instance/smote'
|
|
16
|
+
|
|
17
|
+
module Wikipedia
  module VandalismDetection
    # This class provides methods for getting and creating a training ARFF
    # file from a configured training corpus. One ARFF file per feature is
    # computed (in parallel) and cached on disk; the per-feature files are
    # then merged into a single Weka dataset.
    class TrainingDataset
      # Returns an instance dataset from the configured gold annotation
      # file using the configured features.
      #
      # Cached per-feature ARFF files are reused on subsequent runs.
      #
      # @raise [AnnotationsFileNotConfiguredError] if no annotations file
      #   is configured.
      # @return [Weka::Core::Instances] the merged training dataset.
      def self.build
        @config = Wikipedia::VandalismDetection.config

        print "\ncreating training dataset…"

        annotations_file = @config.training_corpus_annotations_file
        raise AnnotationsFileNotConfiguredError unless annotations_file

        annotations = CSV.parse(File.read(annotations_file), headers: true)

        annotation_data = annotations.map do |row|
          { edit_id: row['editid'], class: row['class'] }
        end

        # mkdir_p also creates the configured base directory when it is
        # missing, so no separate check of the base directory is needed.
        output_directory = File.join(@config.output_base_directory, 'training')
        FileUtils.mkdir_p(output_directory)

        feature_calculator = FeatureCalculator.new

        @config.features.each do |feature|
          file_name = "#{feature.tr(' ', '_').downcase}.arff"
          arff_file = File.join(output_directory, file_name)

          # Reuse the cached feature ARFF file if it already exists.
          next if File.exist?(arff_file)

          dataset = Instances.empty_for_feature(feature)

          values = Parallel.map(annotation_data, progress: feature) do |row|
            edit = create_edit_from(row[:edit_id])
            value = feature_calculator.calculate_feature_for(edit, feature)
            [value, row[:class]]
          end

          dataset.add_instances(values)
          dataset.to_arff(arff_file)
          puts "'#{File.basename(arff_file)}' saved to #{File.dirname(arff_file)}"
        end

        dataset = merge_feature_arffs(@config.features, output_directory)
        dataset.class_index = @config.features.count

        if @config.replace_training_data_missing_values?
          dataset = replace_missing_values(dataset)
        end

        dataset
      end

      class << self
        alias instances build
      end

      # Returns the balanced training dataset (same number of vandalism &
      # regular instances, uniform distribution => removes majority
      # instances).
      def self.balanced_instances
        filter = Weka::Filters::Supervised::Instance::SpreadSubsample.new
        filter.use_options('-M 1')
        filter.filter(build)
      end

      # Returns an oversampled training dataset.
      # Oversampling options can be set by using e.g:
      #   percentage: 200
      #   undersampling: false
      #
      # For oversampling the Weka SMOTE package is used.
      # For the SMOTE method see paper: http://arxiv.org/pdf/1106.1813.pdf
      # Doc: http://weka.sourceforge.net/doc.packages/SMOTE/weka/filters/supervised/instance/SMOTE.html
      #
      # @param options [Hash] :percentage (SMOTE -P value) and
      #   :undersampling (percentage used to subsample majority instances
      #   afterwards; false/nil/0 disables undersampling).
      def self.oversampled_instances(options = {})
        config = Wikipedia::VandalismDetection.config
        default_options = config.oversampling_options

        options[:percentage] ||= default_options[:percentage]

        # Use key? instead of ||= so an explicit `undersampling: false`
        # is honored rather than silently replaced by the default.
        unless options.key?(:undersampling)
          options[:undersampling] = default_options[:undersampling]
        end

        percentage = options[:percentage]
        smote_options = "-P #{percentage.to_i}" if percentage

        smote = Weka::Filters::Supervised::Instance::SMOTE.new
        smote.use_options(smote_options) if smote_options
        smote_dataset = smote.filter(build)

        # A nil/false option means "no undersampling"; previously this
        # crashed with NoMethodError on `nil / 100.0`.
        undersampling =
          options[:undersampling] ? options[:undersampling] / 100.0 : 0.0

        return smote_dataset unless undersampling > 0.0

        # balance (remove majority instances)
        subsample = Weka::Filters::Supervised::Instance::SpreadSubsample.new
        subsample.use_options("-M #{undersampling}")
        smote_dataset.apply_filter(subsample)
      end

      # Replaces missing attribute values in the given dataset using
      # Weka's ReplaceMissingValues filter; returns the filtered dataset.
      def self.replace_missing_values(dataset)
        puts 'replacing missing values…'
        filter = Weka::Filters::Unsupervised::Attribute::ReplaceMissingValues.new
        dataset.apply_filter(filter)
      end

      # Saves and returns a file index hash of structure
      # [file_name => full_path] for the configured revisions directory.
      #
      # @raise [RevisionsDirectoryNotConfiguredError] if no revisions
      #   directory is configured.
      def self.create_corpus_file_index!
        @config = Wikipedia::VandalismDetection.config
        revisions_directory = @config.training_corpus_revisions_directory

        raise RevisionsDirectoryNotConfiguredError unless revisions_directory

        print "\ncreating file index…"
        file_index = {}

        Dir.open revisions_directory do |part_directories|
          part_directories.each do |part_directory|
            Dir.open "#{revisions_directory}/#{part_directory}" do |contents|
              contents.each do |file|
                path = "#{revisions_directory}/#{part_directory}/#{file}"

                # Only index revision text files (e.g. "12345.txt").
                if File.file?(path) && (file =~ /\d+.txt/)
                  file_index[file] = path
                  print "\r processed #{file_index.count} files"
                end
              end
            end
          end
        end

        file = @config.training_output_index_file
        dirname = File.dirname(file)

        # mkdir_p: plain mkdir fails when intermediate directories of the
        # index file's path do not exist yet.
        FileUtils.mkdir_p(dirname) unless Dir.exist?(dirname)

        written = File.open(file, 'w') { |f| f.write(file_index.to_yaml) }
        print "Index file saved to #{file}.\n" if written > 0

        file_index
      end

      # Loads arff files of given features and merges them into one
      # dataset. The class attribute (last) is removed from the running
      # merge result before each merge so it appears only once.
      # Returns the merged dataset.
      def self.merge_feature_arffs(features, output_directory)
        filter = Weka::Filters::Unsupervised::Attribute::Remove.new
        filter.use_options('-R last')
        merged_dataset = nil

        features.each do |feature|
          file_name = "#{feature.tr(' ', '_').downcase}.arff"
          arff_file = File.join(output_directory, file_name)

          feature_dataset = Weka::Core::Instances.from_arff(arff_file)
          puts "using #{File.basename(arff_file)}"

          merged_dataset =
            if merged_dataset
              merged_dataset.apply_filter(filter).merge(feature_dataset)
            else
              feature_dataset
            end
        end

        merged_dataset
      end

      # Creates a Wikipedia::Edit out of an annotation's edit id using
      # files from wikipedia-vandalism-detection.yml.
      #
      # @raise [RevisionFileNotFound] if a revision text file is missing
      #   from the corpus file index or from disk.
      def self.create_edit_from(edit_id)
        @file_index ||= load_corpus_file_index
        edit_data = find_edits_data_for(edit_id)

        old_revision_id = edit_data['oldrevisionid'].to_i
        new_revision_id = edit_data['newrevisionid'].to_i

        editor = edit_data['editor']
        comment = edit_data['editcomment']
        new_timestamp = edit_data['edittime']
        page_id = edit_data['articleid']
        page_title = edit_data['articletitle']

        old_revision_file = @file_index["#{old_revision_id}.txt"]
        new_revision_file = @file_index["#{new_revision_id}.txt"]

        # Guard against a nil index lookup (id missing from the index):
        # File.exist?(nil) would raise a TypeError instead of the
        # intended RevisionFileNotFound below.
        unless old_revision_file && File.exist?(old_revision_file)
          message = "Old revision file #{old_revision_file} not found"
          raise RevisionFileNotFound, message
        end

        unless new_revision_file && File.exist?(new_revision_file)
          message = "New revision file #{new_revision_file} not found"
          raise RevisionFileNotFound, message
        end

        old_revision_text = File.read(old_revision_file)
        new_revision_text = File.read(new_revision_file)

        old_revision = Revision.new
        old_revision.id = old_revision_id
        old_revision.text = Text.new(old_revision_text)

        new_revision = Revision.new
        new_revision.id = new_revision_id
        new_revision.text = Text.new(new_revision_text)
        new_revision.parent_id = old_revision_id
        new_revision.comment = Text.new(comment)
        new_revision.contributor = editor
        new_revision.timestamp = new_timestamp

        page = Page.new
        page.id = page_id
        page.title = page_title

        Edit.new(old_revision, new_revision, page: page)
      end

      # Gets or creates the corpus index file, which holds a hash of
      # revision file names and their paths in the article revisions
      # directory.
      def self.load_corpus_file_index
        index_file = @config.training_output_index_file

        if File.exist? index_file
          puts "\n(Using #{index_file})\n"
          YAML.load_file index_file
        else
          create_corpus_file_index!
        end
      end

      # Returns the line (CSV::Row) of the edits.csv file with the given
      # edit id. The file content and parsed CSV are memoized.
      #
      # @raise [EditsFileNotConfiguredError] if no edits file is
      #   configured.
      # @raise [RuntimeError] if no row matches the given edit id.
      def self.find_edits_data_for(edit_id)
        edits_file = Wikipedia::VandalismDetection.config.training_corpus_edits_file
        raise EditsFileNotConfiguredError unless edits_file

        @edits_file_content ||= File.read(edits_file)
        @edits_csv ||= CSV.parse(@edits_file_content, headers: true)

        edit_data = @edits_csv.find { |row| row['editid'] == edit_id }

        unless edit_data
          directory = File.basename(edits_file)
          raise "Edit data for edit id #{edit_id} not found in #{directory}."
        end

        edit_data
      end

      private_class_method :create_edit_from,
                           :merge_feature_arffs,
                           :find_edits_data_for,
                           :load_corpus_file_index,
                           :replace_missing_values
    end
  end
end
|