wikipedia-vandalism_detection 0.1.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (245) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +19 -0
  3. data/.rspec +2 -0
  4. data/.rubocop.yml +35 -0
  5. data/.travis.yml +6 -0
  6. data/Gemfile +4 -0
  7. data/LICENSE.txt +4 -0
  8. data/README.md +288 -0
  9. data/Rakefile +11 -0
  10. data/config/wikipedia-vandalism-detection.yml.example +103 -0
  11. data/lib/java/SMOTE.jar +0 -0
  12. data/lib/java/balancedRandomForest.jar +0 -0
  13. data/lib/java/diffutils-1.3.0.jar +0 -0
  14. data/lib/java/oneClassClassifier.jar +0 -0
  15. data/lib/java/realAdaBoost.jar +0 -0
  16. data/lib/java/swc-engine-1.1.0-jar-with-dependencies.jar +0 -0
  17. data/lib/java/sweble-wikitext-extractor.jar +0 -0
  18. data/lib/weka/classifiers/meta/one_class_classifier.rb +21 -0
  19. data/lib/weka/classifiers/meta/real_ada_boost.rb +15 -0
  20. data/lib/weka/classifiers/trees/balanced_random_forest.rb +16 -0
  21. data/lib/weka/filters/supervised/instance/smote.rb +15 -0
  22. data/lib/wikipedia/vandalism_detection/algorithms/kullback_leibler_divergence.rb +103 -0
  23. data/lib/wikipedia/vandalism_detection/algorithms.rb +12 -0
  24. data/lib/wikipedia/vandalism_detection/classifier.rb +202 -0
  25. data/lib/wikipedia/vandalism_detection/configuration.rb +350 -0
  26. data/lib/wikipedia/vandalism_detection/diff.rb +36 -0
  27. data/lib/wikipedia/vandalism_detection/edit.rb +81 -0
  28. data/lib/wikipedia/vandalism_detection/evaluator.rb +640 -0
  29. data/lib/wikipedia/vandalism_detection/exceptions.rb +47 -0
  30. data/lib/wikipedia/vandalism_detection/feature_calculator.rb +94 -0
  31. data/lib/wikipedia/vandalism_detection/features/all_wordlists_frequency.rb +22 -0
  32. data/lib/wikipedia/vandalism_detection/features/all_wordlists_impact.rb +21 -0
  33. data/lib/wikipedia/vandalism_detection/features/anonymity.rb +17 -0
  34. data/lib/wikipedia/vandalism_detection/features/anonymity_previous.rb +32 -0
  35. data/lib/wikipedia/vandalism_detection/features/article_size.rb +17 -0
  36. data/lib/wikipedia/vandalism_detection/features/bad_frequency.rb +21 -0
  37. data/lib/wikipedia/vandalism_detection/features/bad_impact.rb +20 -0
  38. data/lib/wikipedia/vandalism_detection/features/base.rb +61 -0
  39. data/lib/wikipedia/vandalism_detection/features/biased_frequency.rb +21 -0
  40. data/lib/wikipedia/vandalism_detection/features/biased_impact.rb +21 -0
  41. data/lib/wikipedia/vandalism_detection/features/blanking.rb +26 -0
  42. data/lib/wikipedia/vandalism_detection/features/character_diversity.rb +23 -0
  43. data/lib/wikipedia/vandalism_detection/features/character_sequence.rb +19 -0
  44. data/lib/wikipedia/vandalism_detection/features/comment_bad_frequency.rb +21 -0
  45. data/lib/wikipedia/vandalism_detection/features/comment_biased_frequency.rb +21 -0
  46. data/lib/wikipedia/vandalism_detection/features/comment_length.rb +16 -0
  47. data/lib/wikipedia/vandalism_detection/features/comment_markup_frequency.rb +30 -0
  48. data/lib/wikipedia/vandalism_detection/features/comment_pronoun_frequency.rb +21 -0
  49. data/lib/wikipedia/vandalism_detection/features/comment_sex_frequency.rb +21 -0
  50. data/lib/wikipedia/vandalism_detection/features/comment_vulgarism_frequency.rb +21 -0
  51. data/lib/wikipedia/vandalism_detection/features/compressibility.rb +31 -0
  52. data/lib/wikipedia/vandalism_detection/features/contains_base.rb +16 -0
  53. data/lib/wikipedia/vandalism_detection/features/copyedit.rb +17 -0
  54. data/lib/wikipedia/vandalism_detection/features/digit_ratio.rb +23 -0
  55. data/lib/wikipedia/vandalism_detection/features/edits_per_user.rb +72 -0
  56. data/lib/wikipedia/vandalism_detection/features/emoticons_frequency.rb +27 -0
  57. data/lib/wikipedia/vandalism_detection/features/emoticons_impact.rb +29 -0
  58. data/lib/wikipedia/vandalism_detection/features/frequency_base.rb +18 -0
  59. data/lib/wikipedia/vandalism_detection/features/impact_base.rb +25 -0
  60. data/lib/wikipedia/vandalism_detection/features/inserted_character_distribution.rb +23 -0
  61. data/lib/wikipedia/vandalism_detection/features/inserted_external_links.rb +19 -0
  62. data/lib/wikipedia/vandalism_detection/features/inserted_internal_links.rb +18 -0
  63. data/lib/wikipedia/vandalism_detection/features/inserted_size.rb +19 -0
  64. data/lib/wikipedia/vandalism_detection/features/inserted_words.rb +17 -0
  65. data/lib/wikipedia/vandalism_detection/features/longest_word.rb +21 -0
  66. data/lib/wikipedia/vandalism_detection/features/markup_frequency.rb +29 -0
  67. data/lib/wikipedia/vandalism_detection/features/markup_impact.rb +30 -0
  68. data/lib/wikipedia/vandalism_detection/features/non_alphanumeric_ratio.rb +23 -0
  69. data/lib/wikipedia/vandalism_detection/features/personal_life.rb +17 -0
  70. data/lib/wikipedia/vandalism_detection/features/pronoun_frequency.rb +21 -0
  71. data/lib/wikipedia/vandalism_detection/features/pronoun_impact.rb +20 -0
  72. data/lib/wikipedia/vandalism_detection/features/removed_all_wordlists_frequency.rb +22 -0
  73. data/lib/wikipedia/vandalism_detection/features/removed_bad_frequency.rb +21 -0
  74. data/lib/wikipedia/vandalism_detection/features/removed_biased_frequency.rb +21 -0
  75. data/lib/wikipedia/vandalism_detection/features/removed_character_distribution.rb +23 -0
  76. data/lib/wikipedia/vandalism_detection/features/removed_emoticons_frequency.rb +28 -0
  77. data/lib/wikipedia/vandalism_detection/features/removed_markup_frequency.rb +30 -0
  78. data/lib/wikipedia/vandalism_detection/features/removed_pronoun_frequency.rb +21 -0
  79. data/lib/wikipedia/vandalism_detection/features/removed_sex_frequency.rb +21 -0
  80. data/lib/wikipedia/vandalism_detection/features/removed_size.rb +19 -0
  81. data/lib/wikipedia/vandalism_detection/features/removed_vulgarism_frequency.rb +21 -0
  82. data/lib/wikipedia/vandalism_detection/features/removed_words.rb +17 -0
  83. data/lib/wikipedia/vandalism_detection/features/replacement_similarity.rb +20 -0
  84. data/lib/wikipedia/vandalism_detection/features/reverted.rb +16 -0
  85. data/lib/wikipedia/vandalism_detection/features/revisions_character_distribution.rb +25 -0
  86. data/lib/wikipedia/vandalism_detection/features/same_editor.rb +32 -0
  87. data/lib/wikipedia/vandalism_detection/features/sex_frequency.rb +21 -0
  88. data/lib/wikipedia/vandalism_detection/features/sex_impact.rb +20 -0
  89. data/lib/wikipedia/vandalism_detection/features/size_increment.rb +20 -0
  90. data/lib/wikipedia/vandalism_detection/features/size_ratio.rb +28 -0
  91. data/lib/wikipedia/vandalism_detection/features/term_frequency.rb +31 -0
  92. data/lib/wikipedia/vandalism_detection/features/time_interval.rb +38 -0
  93. data/lib/wikipedia/vandalism_detection/features/time_of_day.rb +21 -0
  94. data/lib/wikipedia/vandalism_detection/features/upper_case_ratio.rb +23 -0
  95. data/lib/wikipedia/vandalism_detection/features/upper_case_words_ratio.rb +33 -0
  96. data/lib/wikipedia/vandalism_detection/features/upper_to_lower_case_ratio.rb +23 -0
  97. data/lib/wikipedia/vandalism_detection/features/vulgarism_frequency.rb +21 -0
  98. data/lib/wikipedia/vandalism_detection/features/vulgarism_impact.rb +21 -0
  99. data/lib/wikipedia/vandalism_detection/features/weekday.rb +19 -0
  100. data/lib/wikipedia/vandalism_detection/features/words_increment.rb +20 -0
  101. data/lib/wikipedia/vandalism_detection/features.rb +66 -0
  102. data/lib/wikipedia/vandalism_detection/instances.rb +121 -0
  103. data/lib/wikipedia/vandalism_detection/page.rb +101 -0
  104. data/lib/wikipedia/vandalism_detection/page_parser.rb +63 -0
  105. data/lib/wikipedia/vandalism_detection/revision.rb +75 -0
  106. data/lib/wikipedia/vandalism_detection/revision_parser.rb +67 -0
  107. data/lib/wikipedia/vandalism_detection/test_dataset.rb +374 -0
  108. data/lib/wikipedia/vandalism_detection/text.rb +23 -0
  109. data/lib/wikipedia/vandalism_detection/training_dataset.rb +282 -0
  110. data/lib/wikipedia/vandalism_detection/version.rb +5 -0
  111. data/lib/wikipedia/vandalism_detection/wikitext_extractor.rb +80 -0
  112. data/lib/wikipedia/vandalism_detection/word_lists/bad.rb +11 -0
  113. data/lib/wikipedia/vandalism_detection/word_lists/biased.rb +20 -0
  114. data/lib/wikipedia/vandalism_detection/word_lists/emoticons.rb +26 -0
  115. data/lib/wikipedia/vandalism_detection/word_lists/markup.rb +19 -0
  116. data/lib/wikipedia/vandalism_detection/word_lists/pronouns.rb +11 -0
  117. data/lib/wikipedia/vandalism_detection/word_lists/sex.rb +10 -0
  118. data/lib/wikipedia/vandalism_detection/word_lists/vulgarism.rb +96 -0
  119. data/lib/wikipedia/vandalism_detection/word_lists.rb +17 -0
  120. data/lib/wikipedia/vandalism_detection.rb +29 -0
  121. data/lib/wikipedia.rb +41 -0
  122. data/spec/factories/edit.rb +19 -0
  123. data/spec/factories/page.rb +12 -0
  124. data/spec/factories/revision.rb +51 -0
  125. data/spec/resources/config/wikipedia-vandalism-detection.yml +35 -0
  126. data/spec/resources/corpora/test/edits.csv +8 -0
  127. data/spec/resources/corpora/test/ground-truth.txt +3 -0
  128. data/spec/resources/corpora/test/revisions/part-1/326471754.txt +199 -0
  129. data/spec/resources/corpora/test/revisions/part-1/326873205.txt +137 -0
  130. data/spec/resources/corpora/test/revisions/part-1/328774035.txt +162 -0
  131. data/spec/resources/corpora/test/revisions/part-2/307084144.txt +137 -0
  132. data/spec/resources/corpora/test/revisions/part-2/326978767.txt +199 -0
  133. data/spec/resources/corpora/test/revisions/part-2/328774110.txt +162 -0
  134. data/spec/resources/corpora/training/annotations.csv +7 -0
  135. data/spec/resources/corpora/training/edits.csv +7 -0
  136. data/spec/resources/corpora/training/revisions/part-1/326471754.txt +199 -0
  137. data/spec/resources/corpora/training/revisions/part-1/326873205.txt +137 -0
  138. data/spec/resources/corpora/training/revisions/part-1/328774035.txt +162 -0
  139. data/spec/resources/corpora/training/revisions/part-2/307084144.txt +137 -0
  140. data/spec/resources/corpora/training/revisions/part-2/326978767.txt +199 -0
  141. data/spec/resources/corpora/training/revisions/part-2/328774110.txt +162 -0
  142. data/spec/resources/page_with_redirects.xml +85 -0
  143. data/spec/resources/redirect_page.xml +59 -0
  144. data/spec/resources/revision_simplified.xml +13 -0
  145. data/spec/resources/sample_revision.txt +137 -0
  146. data/spec/resources/sample_revision_clean_text.txt +1 -0
  147. data/spec/resources/sample_revision_plain_text.txt +183 -0
  148. data/spec/resources/vandalism_on_wikipedia.xml +234 -0
  149. data/spec/resources/vandalism_on_wikipedia_simplified.xml +119 -0
  150. data/spec/resources/wikipedia_tokens.txt +30 -0
  151. data/spec/spec_helper.rb +38 -0
  152. data/spec/support/macros/file_reading.rb +6 -0
  153. data/spec/support/macros/test_configuration.rb +81 -0
  154. data/spec/vandalism_detection/algorithms/kullback_leibler_divergence_spec.rb +34 -0
  155. data/spec/vandalism_detection/classifier_spec.rb +330 -0
  156. data/spec/vandalism_detection/configuration_spec.rb +601 -0
  157. data/spec/vandalism_detection/diff_spec.rb +40 -0
  158. data/spec/vandalism_detection/edit_spec.rb +122 -0
  159. data/spec/vandalism_detection/evaluator_spec.rb +711 -0
  160. data/spec/vandalism_detection/feature_calculator_spec.rb +135 -0
  161. data/spec/vandalism_detection/features/all_wordlists_frequency_spec.rb +30 -0
  162. data/spec/vandalism_detection/features/all_wordlists_impact_spec.rb +52 -0
  163. data/spec/vandalism_detection/features/anonymity_previous_spec.rb +67 -0
  164. data/spec/vandalism_detection/features/anonymity_spec.rb +17 -0
  165. data/spec/vandalism_detection/features/article_size_spec.rb +29 -0
  166. data/spec/vandalism_detection/features/bad_frequency_spec.rb +30 -0
  167. data/spec/vandalism_detection/features/bad_impact_spec.rb +53 -0
  168. data/spec/vandalism_detection/features/base_spec.rb +41 -0
  169. data/spec/vandalism_detection/features/biased_frequency_spec.rb +30 -0
  170. data/spec/vandalism_detection/features/biased_impact_spec.rb +52 -0
  171. data/spec/vandalism_detection/features/blanking_spec.rb +35 -0
  172. data/spec/vandalism_detection/features/character_diversity_spec.rb +30 -0
  173. data/spec/vandalism_detection/features/character_sequence_spec.rb +31 -0
  174. data/spec/vandalism_detection/features/comment_bad_frequency_spec.rb +28 -0
  175. data/spec/vandalism_detection/features/comment_biased_frequency_spec.rb +28 -0
  176. data/spec/vandalism_detection/features/comment_length_spec.rb +21 -0
  177. data/spec/vandalism_detection/features/comment_markup_frequency_spec.rb +28 -0
  178. data/spec/vandalism_detection/features/comment_pronoun_frequency_spec.rb +28 -0
  179. data/spec/vandalism_detection/features/comment_sex_frequency_spec.rb +28 -0
  180. data/spec/vandalism_detection/features/comment_vulgarism_frequency_spec.rb +28 -0
  181. data/spec/vandalism_detection/features/compressibility_spec.rb +38 -0
  182. data/spec/vandalism_detection/features/contains_base_spec.rb +27 -0
  183. data/spec/vandalism_detection/features/copyedit_spec.rb +30 -0
  184. data/spec/vandalism_detection/features/digit_ratio_spec.rb +30 -0
  185. data/spec/vandalism_detection/features/edits_per_user_spec.rb +51 -0
  186. data/spec/vandalism_detection/features/emoticons_frequency_spec.rb +30 -0
  187. data/spec/vandalism_detection/features/emoticons_impact_spec.rb +45 -0
  188. data/spec/vandalism_detection/features/frequency_base_spec.rb +20 -0
  189. data/spec/vandalism_detection/features/impact_base_spec.rb +36 -0
  190. data/spec/vandalism_detection/features/inserted_character_distribution_spec.rb +40 -0
  191. data/spec/vandalism_detection/features/inserted_external_links_spec.rb +29 -0
  192. data/spec/vandalism_detection/features/inserted_internal_links_spec.rb +29 -0
  193. data/spec/vandalism_detection/features/inserted_size_spec.rb +29 -0
  194. data/spec/vandalism_detection/features/inserted_words_spec.rb +29 -0
  195. data/spec/vandalism_detection/features/longest_word_spec.rb +29 -0
  196. data/spec/vandalism_detection/features/markup_frequency_spec.rb +31 -0
  197. data/spec/vandalism_detection/features/markup_impact_spec.rb +53 -0
  198. data/spec/vandalism_detection/features/non_alphanumeric_ratio_spec.rb +30 -0
  199. data/spec/vandalism_detection/features/personal_life_spec.rb +22 -0
  200. data/spec/vandalism_detection/features/pronoun_frequency_spec.rb +30 -0
  201. data/spec/vandalism_detection/features/pronoun_impact_spec.rb +53 -0
  202. data/spec/vandalism_detection/features/removed_all_wordlists_frequency_spec.rb +30 -0
  203. data/spec/vandalism_detection/features/removed_bad_frequency_spec.rb +30 -0
  204. data/spec/vandalism_detection/features/removed_biased_frequency_spec.rb +30 -0
  205. data/spec/vandalism_detection/features/removed_character_distribution_spec.rb +40 -0
  206. data/spec/vandalism_detection/features/removed_emoticons_frequency_spec.rb +30 -0
  207. data/spec/vandalism_detection/features/removed_markup_frequency_spec.rb +30 -0
  208. data/spec/vandalism_detection/features/removed_pronoun_frequency_spec.rb +30 -0
  209. data/spec/vandalism_detection/features/removed_sex_frequency_spec.rb +30 -0
  210. data/spec/vandalism_detection/features/removed_size_spec.rb +29 -0
  211. data/spec/vandalism_detection/features/removed_vulgarism_frequency_spec.rb +30 -0
  212. data/spec/vandalism_detection/features/removed_words_spec.rb +29 -0
  213. data/spec/vandalism_detection/features/replacement_similarity_spec.rb +37 -0
  214. data/spec/vandalism_detection/features/reverted_spec.rb +24 -0
  215. data/spec/vandalism_detection/features/revisions_character_distribution_spec.rb +40 -0
  216. data/spec/vandalism_detection/features/same_editor_spec.rb +71 -0
  217. data/spec/vandalism_detection/features/sex_frequency_spec.rb +30 -0
  218. data/spec/vandalism_detection/features/sex_impact_spec.rb +53 -0
  219. data/spec/vandalism_detection/features/size_increment_spec.rb +29 -0
  220. data/spec/vandalism_detection/features/size_ratio_spec.rb +48 -0
  221. data/spec/vandalism_detection/features/term_frequency_spec.rb +32 -0
  222. data/spec/vandalism_detection/features/time_interval_spec.rb +56 -0
  223. data/spec/vandalism_detection/features/time_of_day_spec.rb +16 -0
  224. data/spec/vandalism_detection/features/upper_case_ratio_spec.rb +30 -0
  225. data/spec/vandalism_detection/features/upper_case_words_ratio_spec.rb +33 -0
  226. data/spec/vandalism_detection/features/upper_to_lower_case_ratio_spec.rb +30 -0
  227. data/spec/vandalism_detection/features/vulgarism_frequency_spec.rb +30 -0
  228. data/spec/vandalism_detection/features/vulgarism_impact_spec.rb +52 -0
  229. data/spec/vandalism_detection/features/weekday_spec.rb +16 -0
  230. data/spec/vandalism_detection/features/words_increment_spec.rb +29 -0
  231. data/spec/vandalism_detection/instances_spec.rb +146 -0
  232. data/spec/vandalism_detection/page_parser_spec.rb +190 -0
  233. data/spec/vandalism_detection/page_spec.rb +134 -0
  234. data/spec/vandalism_detection/revision_parser_spec.rb +53 -0
  235. data/spec/vandalism_detection/revision_spec.rb +148 -0
  236. data/spec/vandalism_detection/test_dataset_spec.rb +227 -0
  237. data/spec/vandalism_detection/text_spec.rb +29 -0
  238. data/spec/vandalism_detection/training_dataset_spec.rb +266 -0
  239. data/spec/vandalism_detection/wikitext_extractor_spec.rb +97 -0
  240. data/spec/weka/classifiers/meta/one_class_classifier_spec.rb +82 -0
  241. data/spec/weka/classifiers/meta/real_ada_boost_spec.rb +37 -0
  242. data/spec/weka/classifiers/trees/balanced_random_forest_spec.rb +37 -0
  243. data/spec/weka/filters/supervised/instance/smote_spec.rb +5 -0
  244. data/wikipedia-vandalism_detection.gemspec +37 -0
  245. metadata +550 -0
@@ -0,0 +1,374 @@
1
+ require 'find'
2
+ require 'yaml'
3
+ require 'fileutils'
4
+ require 'csv'
5
+ require 'weka'
6
+
7
+ require 'wikipedia/vandalism_detection/configuration'
8
+ require 'wikipedia/vandalism_detection/text'
9
+ require 'wikipedia/vandalism_detection/revision'
10
+ require 'wikipedia/vandalism_detection/edit'
11
+ require 'wikipedia/vandalism_detection/feature_calculator'
12
+ require 'wikipedia/vandalism_detection/instances'
13
+ require 'wikipedia/vandalism_detection/wikitext_extractor'
14
+
15
+ module Wikipedia
16
+ module VandalismDetection
17
+ # This class provides methods for getting and creating a test ARFF file from
18
+ # a configured test corpus.
19
+ class TestDataset
20
+ class << self
21
+ # Returns an instance dataset from the configured gold annotation file
22
+ # using the configured features from feature_calculator parameter.
23
+ def build
24
+ @config = Wikipedia::VandalismDetection.config
25
+ print "\ncreating test dataset…"
26
+
27
+ edits_file = @config.test_corpus_edits_file
28
+ raise EditsFileNotConfiguredError unless edits_file
29
+
30
+ edits = CSV.parse(File.read(edits_file), headers: true)
31
+
32
+ output_directory = File.join(@config.output_base_directory, 'test')
33
+
34
+ unless Dir.exist?(output_directory)
35
+ FileUtils.mkdir_p(output_directory)
36
+ end
37
+
38
+ unless Dir.exist?(@config.output_base_directory)
39
+ FileUtils.mkdir_p(@config.output_base_directory)
40
+ end
41
+
42
+ # create feature file hash with io objects
43
+ feature_files = @config.features.each_with_object({}) do |feature_name, hash|
44
+ file_name = "#{feature_name.tr(' ', '_').downcase}.arff"
45
+ arff_file = File.join(output_directory, file_name)
46
+
47
+ next if File.exist?(arff_file)
48
+
49
+ dataset = Instances.empty_for_test_feature(feature_name)
50
+ dataset.to_arff(arff_file)
51
+ hash[feature_name] = File.open(arff_file, 'a')
52
+ end
53
+
54
+ feature_calculator = FeatureCalculator.new
55
+
56
+ unless feature_files.empty?
57
+ processed_edits = 0
58
+ edits_count = edits.count
59
+
60
+ edits.each do |edit_data|
61
+ old_revision_id = edit_data['oldrevisionid']
62
+ new_revision_id = edit_data['newrevisionid']
63
+
64
+ processed_edits += 1
65
+ print_progress(processed_edits, edits_count, 'computing test features')
66
+
67
+ annotated_old_revision = annotated_revision?(old_revision_id)
68
+ annotated_new_revision = annotated_revision?(new_revision_id)
69
+ next unless annotated_old_revision && annotated_new_revision
70
+
71
+ edit = create_edit_from(edit_data)
72
+
73
+ feature_files.each do |feature_name, file|
74
+ value = feature_calculator.calculate_feature_for(edit, feature_name)
75
+ file.puts [value, old_revision_id, new_revision_id].join(',')
76
+ end
77
+ end
78
+
79
+ # close all io objects
80
+ feature_files.each_value(&:close)
81
+ end
82
+
83
+ merge_feature_arffs(@config.features, output_directory)
84
+ end
85
+
86
+ alias instances build
87
+
88
+ # Saves and returns the dataset as ARFF file.
89
+ # As test data the configured data corpus from /config/wikipedia-vandalism-detection.yml is used.
90
+ def build!
91
+ @config = Wikipedia::VandalismDetection.config
92
+
93
+ dataset = instances
94
+ output_file = @config.test_output_arff_file
95
+
96
+ dataset.to_arff(output_file)
97
+ puts "\n'#{File.basename(output_file)}' saved to #{File.dirname(output_file)}"
98
+
99
+ dataset
100
+ end
101
+
102
+ # Loads arff files of given features and merge them into one arff file.
103
+ # Returns the merged arff file.
104
+ def merge_feature_arffs(features, output_directory)
105
+ filter = Weka::Filters::Unsupervised::Attribute::Remove.new
106
+ filter.use_options('-R last')
107
+
108
+ merged_dataset = nil
109
+
110
+ features.each do |feature_name|
111
+ file_name = "#{feature_name.tr(' ', '_').downcase}.arff"
112
+ arff_file = File.join(output_directory, file_name)
113
+
114
+ feature_dataset = Weka::Core::Instances.from_arff(arff_file)
115
+ print '.'
116
+
117
+ if merged_dataset
118
+ merged_dataset = merged_dataset.apply_filters(filter, filter)
119
+ merged_dataset = Weka::Core::Instances.merge_instances(merged_dataset, feature_dataset)
120
+ else
121
+ merged_dataset = feature_dataset
122
+ end
123
+ end
124
+
125
+ add_ground_truth_class_to(merged_dataset)
126
+ end
127
+
128
+ # Adds the ground truth class attribute and values to the given dataset
129
+ # and returns the merged
130
+ def add_ground_truth_class_to(dataset)
131
+ config = Wikipedia::VandalismDetection.config
132
+
133
+ arff_file = File.join(config.output_base_directory, 'test', 'class.arff')
134
+ class_dataset = Instances.empty_for_test_class
135
+
136
+ if File.exist?(arff_file)
137
+ class_dataset = Weka::Core::Instances.from_arff(arff_file)
138
+ else
139
+ ground_truth_file_path = config.test_corpus_ground_truth_file
140
+ ground_truth = ground_truth_hash(ground_truth_file_path)
141
+
142
+ dataset.each do |instance|
143
+ old_revision_id = instance.values[-2].to_i
144
+ new_revision_id = instance.values[-1].to_i
145
+ key = :"#{old_revision_id}-#{new_revision_id}"
146
+
147
+ if ground_truth.key?(key)
148
+ class_value = Instances::CLASSES[Instances::CLASSES_SHORT.key(ground_truth[key][:class])]
149
+ class_dataset.add_instance([class_value || '?'])
150
+ else
151
+ class_dataset.add_instance(['?']) # missing
152
+ end
153
+ end
154
+
155
+ class_dataset.to_arff(arff_file)
156
+ puts "saved #{File.basename(arff_file)} to #{File.dirname(arff_file)}"
157
+ end
158
+
159
+ if dataset.size != class_dataset.size
160
+ raise Exception, "Different size: #{dataset.size} vs. #{class_dataset.size}"
161
+ end
162
+
163
+ dataset.merge(class_dataset)
164
+ end
165
+
166
+ # Returns a hash for classification data from given ground truth file
167
+ def ground_truth_hash(ground_truth_file)
168
+ file = File.read(ground_truth_file)
169
+ ground_truth_samples = file.lines.to_a
170
+
171
+ ground_truth = {}
172
+
173
+ ground_truth_samples.each do |line|
174
+ line_parts = line.split(' ')
175
+
176
+ old_revision_id = line_parts[0].to_i
177
+ new_revision_id = line_parts[1].to_i
178
+ class_short = line_parts[2]
179
+
180
+ ground_truth[:"#{old_revision_id}-#{new_revision_id}"] = {
181
+ old_revision_id: old_revision_id,
182
+ new_revision_id: new_revision_id,
183
+ class: class_short
184
+ }
185
+ end
186
+
187
+ ground_truth
188
+ end
189
+
190
+ # Saves and returns a file index hash of structure
191
+ # [file_name => full_path] for the given directory.
192
+ def create_corpus_file_index!
193
+ @config = Wikipedia::VandalismDetection.config
194
+ revisions_directory = @config.test_corpus_revisions_directory
195
+
196
+ raise RevisionsDirectoryNotConfiguredError unless revisions_directory
197
+
198
+ print "\nCreating test corpus index file…"
199
+ file_index = {}
200
+
201
+ Dir.open(revisions_directory) do |part_directories|
202
+ part_directories.each do |part_directory|
203
+ Dir.open "#{revisions_directory}/#{part_directory}" do |contents|
204
+ contents.each do |file|
205
+ path = "#{revisions_directory}/#{part_directory}/#{file}"
206
+
207
+ if File.file?(path) && (file =~ /\d+.txt/) && annotated_revision?(file)
208
+ file_index[file] = path
209
+ print "\r processed #{file_index.count} files"
210
+ end
211
+ end
212
+ end
213
+ end
214
+ end
215
+
216
+ file = @config.test_output_index_file
217
+ dirname = File.dirname(file)
218
+ FileUtils.mkdir(dirname) unless Dir.exist?(dirname)
219
+
220
+ written = File.open(file, 'w') { |f| f.write(file_index.to_yaml) }
221
+
222
+ if written > 0
223
+ print "\nSaved test corpus index file to #{file}.\n"
224
+ end
225
+
226
+ file_index
227
+ end
228
+
229
+ # Returns the Edit with the given revision ids.
230
+ # Test corpus is searched for the revisions' data.
231
+ def edit(old_revision_id, new_revision_id)
232
+ @config = Wikipedia::VandalismDetection.config
233
+ edits_file = @config.test_corpus_edits_file
234
+ raise EditsFileNotConfiguredError unless edits_file
235
+
236
+ @edits_csv ||= CSV.parse(File.read(edits_file), headers: true)
237
+
238
+ edit_data = @edits_csv.find do |row|
239
+ row['oldrevisionid'] == old_revision_id &&
240
+ row['newrevisionid'] == new_revision_id
241
+ end
242
+
243
+ return unless edit_data
244
+ return unless annotated_revision?(old_revision_id)
245
+ return unless annotated_revision?(new_revision_id)
246
+
247
+ create_edit_from(edit_data)
248
+ end
249
+
250
+ private
251
+
252
+ # Returns whether the given revision is annotated in the configured gold
253
+ # annotation file.
254
+ def annotated_revision?(revision_file_or_id)
255
+ @annotated_revisions ||= annotated_revisions
256
+
257
+ revision_id = revision_file_or_id.to_s.gsub('.txt', '')
258
+ @annotated_revisions[revision_id.to_sym]
259
+ end
260
+
261
+ # Returns a Hash with the used revision ids from edits_file.
262
+ def annotated_revisions
263
+ annotations_file = @config.test_corpus_ground_truth_file
264
+ annotations = File.read(annotations_file).lines
265
+
266
+ annotated_revisions = {}
267
+
268
+ annotations.each do |annotation|
269
+ data = annotation.split(' ')
270
+
271
+ annotated_revisions[data[0].to_sym] = true
272
+ annotated_revisions[data[1].to_sym] = true
273
+ end
274
+
275
+ @annotated_revisions ||= annotated_revisions
276
+ end
277
+
278
+ # Removes all instances with missing attributes
279
+ def remove_missing(dataset)
280
+ dataset.each_attribute do |attribute|
281
+ dataset.delete_with_missing(attribute)
282
+ end
283
+
284
+ dataset
285
+ end
286
+
287
+ # Returns the normalized dataset (important for lib svm one class
288
+ # classification)
289
+ def normalize(dataset)
290
+ remove = Weka::Filters::Unsupervised::Attribute::Remove.new
291
+ remove.use_options("-V -R 1-#{@config.features.count}")
292
+ numerics_dataset = remove.filter(dataset)
293
+
294
+ remove.use_options("-R 1-#{@config.features.count}")
295
+ non_numerics_dataset = remove.filter(dataset)
296
+
297
+ normalize = Weka::Filters::Unsupervised::Attribute::Normalize.new
298
+ normalized_dataset = normalize.filter(numerics_dataset)
299
+
300
+ normalized_dataset.merge(non_numerics_dataset)
301
+ end
302
+
303
+ # Creates a Wikipedia::Edit out of an edit's data from edit_file
304
+ # configured in wikipedia-vandalism-detection.yml
305
+ def create_edit_from(edit_data)
306
+ @file_index ||= load_corpus_file_index
307
+
308
+ old_revision_id = edit_data['oldrevisionid'].to_i
309
+ new_revision_id = edit_data['newrevisionid'].to_i
310
+
311
+ editor = edit_data['editor']
312
+ comment = edit_data['editcomment']
313
+ new_timestamp = edit_data['edittime']
314
+ page_id = edit_data['articleid']
315
+ page_title = edit_data['articletitle']
316
+
317
+ old_revision_file = @file_index["#{old_revision_id}.txt"]
318
+ new_revision_file = @file_index["#{new_revision_id}.txt"]
319
+
320
+ unless File.exist?(old_revision_file)
321
+ message = "Old revision file #{old_revision_file} not found"
322
+ raise RevisionFileNotFound, message
323
+ end
324
+
325
+ unless File.exist?(new_revision_file)
326
+ message = "New revision file #{new_revision_file} not found"
327
+ raise RevisionFileNotFound, message
328
+ end
329
+
330
+ old_revision_text = File.read(old_revision_file)
331
+ new_revision_text = File.read(new_revision_file)
332
+
333
+ old_revision = Revision.new
334
+ old_revision.id = old_revision_id
335
+ old_revision.text = Text.new(old_revision_text)
336
+
337
+ new_revision = Revision.new
338
+ new_revision.id = new_revision_id
339
+ new_revision.text = Text.new(new_revision_text)
340
+ new_revision.parent_id = old_revision_id
341
+ new_revision.comment = Text.new(comment)
342
+ new_revision.contributor = editor
343
+ new_revision.timestamp = new_timestamp
344
+
345
+ page = Page.new
346
+ page.id = page_id
347
+ page.title = page_title
348
+
349
+ Edit.new(old_revision, new_revision, page: page)
350
+ end
351
+
352
+ # Gets or creates the corpus index file, which holds a hash of revision
353
+ # files name and their path in the article revisions directory.
354
+ def load_corpus_file_index
355
+ index_file = @config.test_output_index_file
356
+
357
+ if File.exist?(index_file)
358
+ puts " (Using #{index_file}) \n"
359
+ YAML.load_file(index_file)
360
+ else
361
+ create_corpus_file_index!
362
+ end
363
+ end
364
+
365
+ # Prints the progress to the $stdout
366
+ def print_progress(processed_count, total_count, message)
367
+ processed_absolute = "#{processed_count}/#{total_count}"
368
+ processed_percentage = ((processed_count * 100.00) / total_count).round(2)
369
+ print "\r#{message}… #{processed_absolute} | #{'%.2f' % processed_percentage}%"
370
+ end
371
+ end
372
+ end
373
+ end
374
+ end
@@ -0,0 +1,23 @@
1
+ require 'wikipedia/vandalism_detection/wikitext_extractor'
2
+
3
+ module Wikipedia
4
+ module VandalismDetection
5
+ class Text < String
6
+ def initialize(text = '')
7
+ super text.encode(
8
+ 'UTF-8',
9
+ 'binary',
10
+ invalid: :replace,
11
+ undef: :replace,
12
+ replace: ''
13
+ )
14
+ end
15
+
16
+ # Extracts the plaintext from mediawiki markup and removes all line breaks
17
+ # & multiple spaces Return the cleaned plaintext.
18
+ def clean
19
+ @clean ||= WikitextExtractor.extract_clean self
20
+ end
21
+ end
22
+ end
23
+ end
@@ -0,0 +1,282 @@
1
+ require 'find'
2
+ require 'yaml'
3
+ require 'fileutils'
4
+ require 'active_support/core_ext/string'
5
+ require 'weka'
6
+ require 'parallel'
7
+
8
+ require 'wikipedia/vandalism_detection/configuration'
9
+ require 'wikipedia/vandalism_detection/text'
10
+ require 'wikipedia/vandalism_detection/revision'
11
+ require 'wikipedia/vandalism_detection/edit'
12
+ require 'wikipedia/vandalism_detection/feature_calculator'
13
+ require 'wikipedia/vandalism_detection/instances'
14
+ require 'wikipedia/vandalism_detection/wikitext_extractor'
15
+ require 'weka/filters/supervised/instance/smote'
16
+
17
module Wikipedia
  module VandalismDetection
    # This class provides methods for getting and creating a training ARFF file
    # from a configured training corpus.
    class TrainingDataset
      # Returns an instance dataset from the configured gold annotation file
      # using the configured features from feature_calculator parameter.
      # Builds (or reuses) one ARFF file per feature, then merges them into a
      # single dataset whose last attribute is the class.
      def self.build
        @config = Wikipedia::VandalismDetection.config

        print "\ncreating training dataset…"

        annotations_file = @config.training_corpus_annotations_file
        raise AnnotationsFileNotConfiguredError unless annotations_file

        annotations = CSV.parse(File.read(annotations_file), headers: true)

        annotation_data = annotations.map do |row|
          { edit_id: row['editid'], class: row['class'] }
        end

        # mkdir_p creates missing parents, so this also ensures the
        # configured base output directory exists.
        output_directory = File.join(@config.output_base_directory, 'training')
        FileUtils.mkdir_p(output_directory) unless Dir.exist?(output_directory)

        feature_calculator = FeatureCalculator.new

        @config.features.each do |feature|
          file_name = "#{feature.tr(' ', '_').downcase}.arff"
          arff_file = File.join(output_directory, file_name)

          # Skip features whose ARFF file was already computed in a prior run.
          next if File.exist?(arff_file)

          dataset = Instances.empty_for_feature(feature)

          # Feature values are computed in parallel; each result is a
          # [feature_value, class_label] pair.
          values = Parallel.map(annotation_data, progress: feature) do |row|
            edit_id = row[:edit_id]
            vandalism = row[:class]
            edit = create_edit_from(edit_id)

            value = feature_calculator.calculate_feature_for(edit, feature)
            [value, vandalism]
          end

          dataset.add_instances(values)
          dataset.to_arff(arff_file)
          puts "'#{File.basename(arff_file)}' saved to #{File.dirname(arff_file)}"
        end

        dataset = merge_feature_arffs(@config.features, output_directory)
        dataset.class_index = @config.features.count

        if @config.replace_training_data_missing_values?
          dataset = replace_missing_values(dataset)
        end

        dataset
      end

      class << self
        alias instances build
      end

      # Returns the balanced training dataset (same number of vandalism &
      # regular instances, Uniform distribution => removes majority instances)
      def self.balanced_instances
        filter = Weka::Filters::Supervised::Instance::SpreadSubsample.new
        filter.use_options('-M 1')
        filter.filter(build)
      end

      # Returns an oversampled training dataset.
      # Oversampling options can be set by using e.g:
      #   percentage: 200
      #   undersampling: false
      #
      # For oversampling Weka SMOTE package is used.
      # For SMOTE method see paper: http://arxiv.org/pdf/1106.1813.pdf
      # Doc: http://weka.sourceforge.net/doc.packages/SMOTE/weka/filters/supervised/instance/SMOTE.html
      def self.oversampled_instances(options = {})
        config = Wikipedia::VandalismDetection.config
        default_options = config.oversampling_options

        options[:percentage] ||= default_options[:percentage]
        options[:undersampling] ||= default_options[:undersampling]

        percentage = options[:percentage]
        smote_options = "-P #{percentage.to_i}" if percentage

        smote = Weka::Filters::Supervised::Instance::SMOTE.new
        smote.use_options(smote_options) if smote_options
        smote_dataset = smote.filter(build)

        # fix: tolerate a nil/false undersampling option (treated as "off")
        # instead of raising NoMethodError on nil / 100.0
        undersampling = (options[:undersampling] || 0) / 100.0

        return smote_dataset unless undersampling > 0.0

        # balance (remove majority instances)
        subsample = Weka::Filters::Supervised::Instance::SpreadSubsample.new
        subsample.use_options("-M #{undersampling}")
        smote_dataset.apply_filter(subsample)
      end

      # Replaces missing attribute values by the modes/means from the
      # training data. Returns the filtered dataset.
      def self.replace_missing_values(dataset)
        puts 'replacing missing values…'
        filter = Weka::Filters::Unsupervised::Attribute::ReplaceMissingValues.new
        dataset.apply_filter(filter)
      end

      # Saves and returns a file index hash of structure
      # [file_name => full_path] for the given directory.
      def self.create_corpus_file_index!
        @config = Wikipedia::VandalismDetection.config
        revisions_directory = @config.training_corpus_revisions_directory

        raise RevisionsDirectoryNotConfiguredError unless revisions_directory

        print "\ncreating file index…"
        file_index = {}

        Dir.foreach(revisions_directory) do |part_directory|
          # fix: skip the '.' and '..' entries; the original iteration
          # re-scanned the same directory via "dir/." and even walked the
          # parent directory via "dir/.."
          next if %w[. ..].include?(part_directory)

          Dir.foreach("#{revisions_directory}/#{part_directory}") do |file|
            next if %w[. ..].include?(file)

            path = "#{revisions_directory}/#{part_directory}/#{file}"

            # fix: anchor the pattern and escape the dot so that only files
            # named exactly "<digits>.txt" are indexed (the old /\d+.txt/
            # also matched e.g. "123atxt.bak")
            if File.file?(path) && file =~ /\A\d+\.txt\z/
              file_index[file] = path
              print "\r processed #{file_index.count} files"
            end
          end
        end

        file = @config.training_output_index_file
        dirname = File.dirname(file)

        # fix: mkdir_p instead of mkdir so a missing parent directory
        # does not raise Errno::ENOENT (consistent with the rest of the class)
        FileUtils.mkdir_p(dirname) unless Dir.exist?(dirname)

        written = File.open(file, 'w') { |f| f.write(file_index.to_yaml) }
        print "Index file saved to #{file}.\n" if written > 0

        file_index
      end

      # Loads arff files of given features and merge them into one arff file.
      # The class attribute (last attribute) of all but the final merged
      # dataset is stripped before each merge so it appears exactly once.
      # Returns the merged arff file.
      def self.merge_feature_arffs(features, output_directory)
        filter = Weka::Filters::Unsupervised::Attribute::Remove.new
        filter.use_options('-R last')
        merged_dataset = nil

        features.each do |feature|
          file_name = "#{feature.tr(' ', '_').downcase}.arff"
          arff_file = File.join(output_directory, file_name)

          feature_dataset = Weka::Core::Instances.from_arff(arff_file)
          puts "using #{File.basename(arff_file)}"

          merged_dataset =
            if merged_dataset
              merged_dataset
                .apply_filter(filter)
                .merge(feature_dataset)
            else
              feature_dataset
            end
        end

        merged_dataset
      end

      # Creates a Wikipedia::Edit out of an annotation's edit id using files
      # form wikipedia-vandalism-detection.yml
      # Raises RevisionFileNotFound when either revision text file is missing.
      def self.create_edit_from(edit_id)
        @file_index ||= load_corpus_file_index
        edit_data = find_edits_data_for(edit_id)

        old_revision_id = edit_data['oldrevisionid'].to_i
        new_revision_id = edit_data['newrevisionid'].to_i

        editor = edit_data['editor']
        comment = edit_data['editcomment']
        new_timestamp = edit_data['edittime']
        page_id = edit_data['articleid']
        page_title = edit_data['articletitle']

        old_revision_file = @file_index["#{old_revision_id}.txt"]
        new_revision_file = @file_index["#{new_revision_id}.txt"]

        # fix: guard against a nil index lookup so a missing revision raises
        # RevisionFileNotFound instead of TypeError from File.exist?(nil)
        unless old_revision_file && File.exist?(old_revision_file)
          message = "Old revision file #{old_revision_file} not found"
          raise RevisionFileNotFound, message
        end

        unless new_revision_file && File.exist?(new_revision_file)
          message = "New revision file #{new_revision_file} not found"
          raise RevisionFileNotFound, message
        end

        old_revision_text = File.read(old_revision_file)
        new_revision_text = File.read(new_revision_file)

        old_revision = Revision.new
        old_revision.id = old_revision_id
        old_revision.text = Text.new(old_revision_text)

        new_revision = Revision.new
        new_revision.id = new_revision_id
        new_revision.text = Text.new(new_revision_text)
        new_revision.parent_id = old_revision_id
        new_revision.comment = Text.new(comment)
        new_revision.contributor = editor
        new_revision.timestamp = new_timestamp

        page = Page.new
        page.id = page_id
        page.title = page_title

        Edit.new(old_revision, new_revision, page: page)
      end

      # Gets or creates the corpus index file, which holds a hash of revision
      # files name and their path in the article revisions directory.
      def self.load_corpus_file_index
        index_file = @config.training_output_index_file

        if File.exist? index_file
          puts "\n(Using #{index_file})\n"
          YAML.load_file index_file
        else
          create_corpus_file_index!
        end
      end

      # Returns the line array of the edits.csv file with given edit id.
      # The CSV content is memoized so repeated lookups don't re-read the file.
      def self.find_edits_data_for(edit_id)
        edits_file = Wikipedia::VandalismDetection.config.training_corpus_edits_file
        raise EditsFileNotConfiguredError unless edits_file

        @edits_file_content ||= File.read(edits_file)
        @edits_csv ||= CSV.parse(@edits_file_content, headers: true)

        edit_data = @edits_csv.find { |row| row['editid'] == edit_id }

        unless edit_data
          directory = File.basename(edits_file)
          raise "Edit data for edit id #{edit_id} not found in #{directory}."
        end

        edit_data
      end

      private_class_method :create_edit_from,
                           :merge_feature_arffs,
                           :find_edits_data_for,
                           :load_corpus_file_index,
                           :replace_missing_values
    end
  end
end
@@ -0,0 +1,5 @@
1
module Wikipedia
  module VandalismDetection
    # Gem version string, frozen to prevent accidental mutation.
    VERSION = "0.1.0".freeze
  end
end