wikipedia-vandalism_detection 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (247) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +17 -0
  3. data/Gemfile +6 -0
  4. data/LICENSE.txt +4 -0
  5. data/README.md +265 -0
  6. data/Rakefile +12 -0
  7. data/lib/java/LibSVM.jar +0 -0
  8. data/lib/java/SMOTE.jar +0 -0
  9. data/lib/java/balancedRandomForest.jar +0 -0
  10. data/lib/java/diffutils-1.3.0.jar +0 -0
  11. data/lib/java/libsvm.jar +0 -0
  12. data/lib/java/oneClassClassifier.jar +0 -0
  13. data/lib/java/realAdaBoost.jar +0 -0
  14. data/lib/java/swc-engine-1.1.0-jar-with-dependencies.jar +0 -0
  15. data/lib/java/sweble-wikitext-extractor.jar +0 -0
  16. data/lib/weka/classifiers/functions/lib_svm.rb +15 -0
  17. data/lib/weka/classifiers/meta/one_class_classifier.rb +25 -0
  18. data/lib/weka/classifiers/meta/real_ada_boost.rb +17 -0
  19. data/lib/weka/classifiers/trees/balanced_random_forest.rb +18 -0
  20. data/lib/weka/filters/supervised/instance/smote.rb +22 -0
  21. data/lib/wikipedia.rb +51 -0
  22. data/lib/wikipedia/vandalism_detection.rb +30 -0
  23. data/lib/wikipedia/vandalism_detection/algorithms.rb +18 -0
  24. data/lib/wikipedia/vandalism_detection/algorithms/kullback_leibler_divergence.rb +69 -0
  25. data/lib/wikipedia/vandalism_detection/classifier.rb +186 -0
  26. data/lib/wikipedia/vandalism_detection/configuration.rb +321 -0
  27. data/lib/wikipedia/vandalism_detection/diff.rb +27 -0
  28. data/lib/wikipedia/vandalism_detection/edit.rb +75 -0
  29. data/lib/wikipedia/vandalism_detection/evaluator.rb +606 -0
  30. data/lib/wikipedia/vandalism_detection/exceptions.rb +40 -0
  31. data/lib/wikipedia/vandalism_detection/feature_calculator.rb +89 -0
  32. data/lib/wikipedia/vandalism_detection/features.rb +67 -0
  33. data/lib/wikipedia/vandalism_detection/features/all_wordlists_frequency.rb +23 -0
  34. data/lib/wikipedia/vandalism_detection/features/all_wordlists_impact.rb +22 -0
  35. data/lib/wikipedia/vandalism_detection/features/anonymity.rb +19 -0
  36. data/lib/wikipedia/vandalism_detection/features/anonymity_previous.rb +29 -0
  37. data/lib/wikipedia/vandalism_detection/features/article_size.rb +18 -0
  38. data/lib/wikipedia/vandalism_detection/features/bad_frequency.rb +23 -0
  39. data/lib/wikipedia/vandalism_detection/features/bad_impact.rb +21 -0
  40. data/lib/wikipedia/vandalism_detection/features/base.rb +54 -0
  41. data/lib/wikipedia/vandalism_detection/features/biased_frequency.rb +23 -0
  42. data/lib/wikipedia/vandalism_detection/features/biased_impact.rb +22 -0
  43. data/lib/wikipedia/vandalism_detection/features/blanking.rb +25 -0
  44. data/lib/wikipedia/vandalism_detection/features/character_diversity.rb +25 -0
  45. data/lib/wikipedia/vandalism_detection/features/character_sequence.rb +19 -0
  46. data/lib/wikipedia/vandalism_detection/features/comment_bad_frequency.rb +22 -0
  47. data/lib/wikipedia/vandalism_detection/features/comment_biased_frequency.rb +22 -0
  48. data/lib/wikipedia/vandalism_detection/features/comment_length.rb +17 -0
  49. data/lib/wikipedia/vandalism_detection/features/comment_markup_frequency.rb +27 -0
  50. data/lib/wikipedia/vandalism_detection/features/comment_pronoun_frequency.rb +22 -0
  51. data/lib/wikipedia/vandalism_detection/features/comment_sex_frequency.rb +22 -0
  52. data/lib/wikipedia/vandalism_detection/features/comment_vulgarism_frequency.rb +22 -0
  53. data/lib/wikipedia/vandalism_detection/features/compressibility.rb +27 -0
  54. data/lib/wikipedia/vandalism_detection/features/contains_base.rb +18 -0
  55. data/lib/wikipedia/vandalism_detection/features/copyedit.rb +18 -0
  56. data/lib/wikipedia/vandalism_detection/features/digit_ratio.rb +24 -0
  57. data/lib/wikipedia/vandalism_detection/features/edits_per_user.rb +65 -0
  58. data/lib/wikipedia/vandalism_detection/features/emoticons_frequency.rb +27 -0
  59. data/lib/wikipedia/vandalism_detection/features/emoticons_impact.rb +29 -0
  60. data/lib/wikipedia/vandalism_detection/features/frequency_base.rb +20 -0
  61. data/lib/wikipedia/vandalism_detection/features/impact_base.rb +22 -0
  62. data/lib/wikipedia/vandalism_detection/features/inserted_character_distribution.rb +22 -0
  63. data/lib/wikipedia/vandalism_detection/features/inserted_external_links.rb +18 -0
  64. data/lib/wikipedia/vandalism_detection/features/inserted_internal_links.rb +18 -0
  65. data/lib/wikipedia/vandalism_detection/features/inserted_size.rb +20 -0
  66. data/lib/wikipedia/vandalism_detection/features/inserted_words.rb +18 -0
  67. data/lib/wikipedia/vandalism_detection/features/longest_word.rb +20 -0
  68. data/lib/wikipedia/vandalism_detection/features/markup_frequency.rb +27 -0
  69. data/lib/wikipedia/vandalism_detection/features/markup_impact.rb +27 -0
  70. data/lib/wikipedia/vandalism_detection/features/non_alphanumeric_ratio.rb +24 -0
  71. data/lib/wikipedia/vandalism_detection/features/personal_life.rb +18 -0
  72. data/lib/wikipedia/vandalism_detection/features/pronoun_frequency.rb +23 -0
  73. data/lib/wikipedia/vandalism_detection/features/pronoun_impact.rb +21 -0
  74. data/lib/wikipedia/vandalism_detection/features/removed_all_wordlists_frequency.rb +23 -0
  75. data/lib/wikipedia/vandalism_detection/features/removed_bad_frequency.rb +23 -0
  76. data/lib/wikipedia/vandalism_detection/features/removed_biased_frequency.rb +23 -0
  77. data/lib/wikipedia/vandalism_detection/features/removed_character_distribution.rb +22 -0
  78. data/lib/wikipedia/vandalism_detection/features/removed_emoticons_frequency.rb +27 -0
  79. data/lib/wikipedia/vandalism_detection/features/removed_markup_frequency.rb +28 -0
  80. data/lib/wikipedia/vandalism_detection/features/removed_pronoun_frequency.rb +23 -0
  81. data/lib/wikipedia/vandalism_detection/features/removed_sex_frequency.rb +23 -0
  82. data/lib/wikipedia/vandalism_detection/features/removed_size.rb +20 -0
  83. data/lib/wikipedia/vandalism_detection/features/removed_vulgarism_frequency.rb +23 -0
  84. data/lib/wikipedia/vandalism_detection/features/removed_words.rb +18 -0
  85. data/lib/wikipedia/vandalism_detection/features/replacement_similarity.rb +22 -0
  86. data/lib/wikipedia/vandalism_detection/features/reverted.rb +18 -0
  87. data/lib/wikipedia/vandalism_detection/features/revisions_character_distribution.rb +21 -0
  88. data/lib/wikipedia/vandalism_detection/features/same_editor.rb +29 -0
  89. data/lib/wikipedia/vandalism_detection/features/sex_frequency.rb +23 -0
  90. data/lib/wikipedia/vandalism_detection/features/sex_impact.rb +21 -0
  91. data/lib/wikipedia/vandalism_detection/features/size_increment.rb +22 -0
  92. data/lib/wikipedia/vandalism_detection/features/size_ratio.rb +26 -0
  93. data/lib/wikipedia/vandalism_detection/features/term_frequency.rb +25 -0
  94. data/lib/wikipedia/vandalism_detection/features/time_interval.rb +31 -0
  95. data/lib/wikipedia/vandalism_detection/features/time_of_day.rb +22 -0
  96. data/lib/wikipedia/vandalism_detection/features/upper_case_ratio.rb +24 -0
  97. data/lib/wikipedia/vandalism_detection/features/upper_case_words_ratio.rb +31 -0
  98. data/lib/wikipedia/vandalism_detection/features/upper_to_lower_case_ratio.rb +24 -0
  99. data/lib/wikipedia/vandalism_detection/features/user_reputation.rb +38 -0
  100. data/lib/wikipedia/vandalism_detection/features/vulgarism_frequency.rb +23 -0
  101. data/lib/wikipedia/vandalism_detection/features/vulgarism_impact.rb +22 -0
  102. data/lib/wikipedia/vandalism_detection/features/weekday.rb +21 -0
  103. data/lib/wikipedia/vandalism_detection/features/words_increment.rb +22 -0
  104. data/lib/wikipedia/vandalism_detection/instances.rb +130 -0
  105. data/lib/wikipedia/vandalism_detection/page.rb +88 -0
  106. data/lib/wikipedia/vandalism_detection/page_parser.rb +52 -0
  107. data/lib/wikipedia/vandalism_detection/revision.rb +69 -0
  108. data/lib/wikipedia/vandalism_detection/revision_parser.rb +43 -0
  109. data/lib/wikipedia/vandalism_detection/test_dataset.rb +367 -0
  110. data/lib/wikipedia/vandalism_detection/text.rb +18 -0
  111. data/lib/wikipedia/vandalism_detection/training_dataset.rb +303 -0
  112. data/lib/wikipedia/vandalism_detection/version.rb +5 -0
  113. data/lib/wikipedia/vandalism_detection/wikitext_extractor.rb +80 -0
  114. data/lib/wikipedia/vandalism_detection/word_lists.rb +19 -0
  115. data/lib/wikipedia/vandalism_detection/word_lists/bad.rb +12 -0
  116. data/lib/wikipedia/vandalism_detection/word_lists/biased.rb +21 -0
  117. data/lib/wikipedia/vandalism_detection/word_lists/emoticons.rb +22 -0
  118. data/lib/wikipedia/vandalism_detection/word_lists/markup.rb +12 -0
  119. data/lib/wikipedia/vandalism_detection/word_lists/pronouns.rb +15 -0
  120. data/lib/wikipedia/vandalism_detection/word_lists/sex.rb +12 -0
  121. data/lib/wikipedia/vandalism_detection/word_lists/vulgarism.rb +97 -0
  122. data/spec/factories/edit.rb +20 -0
  123. data/spec/factories/page.rb +13 -0
  124. data/spec/factories/revision.rb +51 -0
  125. data/spec/resources/config/config.yml +35 -0
  126. data/spec/resources/corpora/test/edits.csv +8 -0
  127. data/spec/resources/corpora/test/ground-truth.txt +3 -0
  128. data/spec/resources/corpora/test/revisions/part-1/326471754.txt +199 -0
  129. data/spec/resources/corpora/test/revisions/part-1/326873205.txt +137 -0
  130. data/spec/resources/corpora/test/revisions/part-1/328774035.txt +162 -0
  131. data/spec/resources/corpora/test/revisions/part-2/307084144.txt +137 -0
  132. data/spec/resources/corpora/test/revisions/part-2/326978767.txt +199 -0
  133. data/spec/resources/corpora/test/revisions/part-2/328774110.txt +162 -0
  134. data/spec/resources/corpora/training/annotations.csv +7 -0
  135. data/spec/resources/corpora/training/edits.csv +7 -0
  136. data/spec/resources/corpora/training/revisions/part-1/326471754.txt +199 -0
  137. data/spec/resources/corpora/training/revisions/part-1/326873205.txt +137 -0
  138. data/spec/resources/corpora/training/revisions/part-1/328774035.txt +162 -0
  139. data/spec/resources/corpora/training/revisions/part-2/307084144.txt +137 -0
  140. data/spec/resources/corpora/training/revisions/part-2/326978767.txt +199 -0
  141. data/spec/resources/corpora/training/revisions/part-2/328774110.txt +162 -0
  142. data/spec/resources/page_with_redirects.xml +85 -0
  143. data/spec/resources/redirect_page.xml +59 -0
  144. data/spec/resources/revision_simplified.xml +13 -0
  145. data/spec/resources/sample_revision.txt +137 -0
  146. data/spec/resources/sample_revision_clean_text.txt +1 -0
  147. data/spec/resources/sample_revision_plain_text.txt +183 -0
  148. data/spec/resources/vandalism_on_wikipedia.xml +234 -0
  149. data/spec/resources/vandalism_on_wikipedia_simplified.xml +119 -0
  150. data/spec/resources/wikipedia_tokens.txt +30 -0
  151. data/spec/spec_helper.rb +22 -0
  152. data/spec/support/macros/file_reading.rb +7 -0
  153. data/spec/support/macros/test_configuration.rb +71 -0
  154. data/spec/vandalism_detection/algorithms/kullback_leibler_divergence_spec.rb +36 -0
  155. data/spec/vandalism_detection/classifier_spec.rb +317 -0
  156. data/spec/vandalism_detection/configuration_spec.rb +517 -0
  157. data/spec/vandalism_detection/diff_spec.rb +40 -0
  158. data/spec/vandalism_detection/edit_spec.rb +137 -0
  159. data/spec/vandalism_detection/evaluator_spec.rb +671 -0
  160. data/spec/vandalism_detection/feature_calculator_spec.rb +128 -0
  161. data/spec/vandalism_detection/features/all_wordlists_frequency_spec.rb +36 -0
  162. data/spec/vandalism_detection/features/all_wordlists_impact_spec.rb +58 -0
  163. data/spec/vandalism_detection/features/anonymity_previous_spec.rb +61 -0
  164. data/spec/vandalism_detection/features/anonymity_spec.rb +23 -0
  165. data/spec/vandalism_detection/features/article_size_spec.rb +35 -0
  166. data/spec/vandalism_detection/features/bad_frequency_spec.rb +36 -0
  167. data/spec/vandalism_detection/features/bad_impact_spec.rb +59 -0
  168. data/spec/vandalism_detection/features/base_spec.rb +49 -0
  169. data/spec/vandalism_detection/features/biased_frequency_spec.rb +36 -0
  170. data/spec/vandalism_detection/features/biased_impact_spec.rb +58 -0
  171. data/spec/vandalism_detection/features/blanking_spec.rb +38 -0
  172. data/spec/vandalism_detection/features/character_diversity_spec.rb +35 -0
  173. data/spec/vandalism_detection/features/character_sequence_spec.rb +37 -0
  174. data/spec/vandalism_detection/features/comment_bad_frequency_spec.rb +34 -0
  175. data/spec/vandalism_detection/features/comment_biased_frequency_spec.rb +34 -0
  176. data/spec/vandalism_detection/features/comment_length_spec.rb +27 -0
  177. data/spec/vandalism_detection/features/comment_markup_frequency_spec.rb +34 -0
  178. data/spec/vandalism_detection/features/comment_pronoun_frequency_spec.rb +34 -0
  179. data/spec/vandalism_detection/features/comment_sex_frequency_spec.rb +34 -0
  180. data/spec/vandalism_detection/features/comment_vulgarism_frequency_spec.rb +34 -0
  181. data/spec/vandalism_detection/features/compressibility_spec.rb +42 -0
  182. data/spec/vandalism_detection/features/contains_base_spec.rb +33 -0
  183. data/spec/vandalism_detection/features/copyedit_spec.rb +33 -0
  184. data/spec/vandalism_detection/features/digit_ratio_spec.rb +35 -0
  185. data/spec/vandalism_detection/features/edits_per_user_spec.rb +49 -0
  186. data/spec/vandalism_detection/features/emoticons_frequency_spec.rb +36 -0
  187. data/spec/vandalism_detection/features/emoticons_impact_spec.rb +51 -0
  188. data/spec/vandalism_detection/features/frequency_base_spec.rb +26 -0
  189. data/spec/vandalism_detection/features/impact_base_spec.rb +41 -0
  190. data/spec/vandalism_detection/features/inserted_character_distribution_spec.rb +46 -0
  191. data/spec/vandalism_detection/features/inserted_external_links_spec.rb +35 -0
  192. data/spec/vandalism_detection/features/inserted_internal_links_spec.rb +35 -0
  193. data/spec/vandalism_detection/features/inserted_size_spec.rb +35 -0
  194. data/spec/vandalism_detection/features/inserted_words_spec.rb +35 -0
  195. data/spec/vandalism_detection/features/longest_word_spec.rb +35 -0
  196. data/spec/vandalism_detection/features/markup_frequency_spec.rb +36 -0
  197. data/spec/vandalism_detection/features/markup_impact_spec.rb +59 -0
  198. data/spec/vandalism_detection/features/non_alphanumeric_ratio_spec.rb +35 -0
  199. data/spec/vandalism_detection/features/personal_life_spec.rb +26 -0
  200. data/spec/vandalism_detection/features/pronoun_frequency_spec.rb +36 -0
  201. data/spec/vandalism_detection/features/pronoun_impact_spec.rb +59 -0
  202. data/spec/vandalism_detection/features/removed_all_wordlists_frequency_spec.rb +36 -0
  203. data/spec/vandalism_detection/features/removed_bad_frequency_spec.rb +36 -0
  204. data/spec/vandalism_detection/features/removed_biased_frequency_spec.rb +36 -0
  205. data/spec/vandalism_detection/features/removed_character_distribution_spec.rb +46 -0
  206. data/spec/vandalism_detection/features/removed_emoticons_frequency_spec.rb +36 -0
  207. data/spec/vandalism_detection/features/removed_markup_frequency_spec.rb +36 -0
  208. data/spec/vandalism_detection/features/removed_pronoun_frequency_spec.rb +36 -0
  209. data/spec/vandalism_detection/features/removed_sex_frequency_spec.rb +36 -0
  210. data/spec/vandalism_detection/features/removed_size_spec.rb +35 -0
  211. data/spec/vandalism_detection/features/removed_vulgarism_frequency_spec.rb +36 -0
  212. data/spec/vandalism_detection/features/removed_words_spec.rb +35 -0
  213. data/spec/vandalism_detection/features/replacement_similarity_spec.rb +44 -0
  214. data/spec/vandalism_detection/features/reverted_spec.rb +28 -0
  215. data/spec/vandalism_detection/features/revisions_character_distribution_spec.rb +46 -0
  216. data/spec/vandalism_detection/features/same_editor_spec.rb +60 -0
  217. data/spec/vandalism_detection/features/sex_frequency_spec.rb +36 -0
  218. data/spec/vandalism_detection/features/sex_impact_spec.rb +59 -0
  219. data/spec/vandalism_detection/features/size_increment_spec.rb +35 -0
  220. data/spec/vandalism_detection/features/size_ratio_spec.rb +57 -0
  221. data/spec/vandalism_detection/features/term_frequency_spec.rb +38 -0
  222. data/spec/vandalism_detection/features/time_interval_spec.rb +50 -0
  223. data/spec/vandalism_detection/features/time_of_day_spec.rb +22 -0
  224. data/spec/vandalism_detection/features/upper_case_ratio_spec.rb +35 -0
  225. data/spec/vandalism_detection/features/upper_case_words_ratio_spec.rb +37 -0
  226. data/spec/vandalism_detection/features/upper_to_lower_case_ratio_spec.rb +35 -0
  227. data/spec/vandalism_detection/features/user_reputation_spec.rb +52 -0
  228. data/spec/vandalism_detection/features/vulgarism_frequency_spec.rb +36 -0
  229. data/spec/vandalism_detection/features/vulgarism_impact_spec.rb +58 -0
  230. data/spec/vandalism_detection/features/weekday_spec.rb +22 -0
  231. data/spec/vandalism_detection/features/words_increment_spec.rb +35 -0
  232. data/spec/vandalism_detection/instances_spec.rb +156 -0
  233. data/spec/vandalism_detection/page_parser_spec.rb +184 -0
  234. data/spec/vandalism_detection/page_spec.rb +135 -0
  235. data/spec/vandalism_detection/revision_parser_spec.rb +53 -0
  236. data/spec/vandalism_detection/revision_spec.rb +115 -0
  237. data/spec/vandalism_detection/test_dataset_spec.rb +231 -0
  238. data/spec/vandalism_detection/text_spec.rb +29 -0
  239. data/spec/vandalism_detection/training_dataset_spec.rb +264 -0
  240. data/spec/vandalism_detection/wikitext_extractor_spec.rb +72 -0
  241. data/spec/weka/classifiers/functions/lib_svm_spec.rb +38 -0
  242. data/spec/weka/classifiers/meta/one_class_classifier_spec.rb +76 -0
  243. data/spec/weka/classifiers/meta/real_ada_boost_spec.rb +40 -0
  244. data/spec/weka/classifiers/trees/balanced_random_forest_spec.rb +40 -0
  245. data/spec/weka/filters/supervised/instance/smote_spec.rb +6 -0
  246. data/wikipedia-vandalism_detection.gemspec +30 -0
  247. metadata +512 -0
@@ -0,0 +1,135 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Page do
4
+
5
+ describe "constants" do
6
+ it "has a START_TAG constant" do
7
+ expect(Wikipedia::VandalismDetection::Page::START_TAG).to eq '<page>'
8
+ end
9
+
10
+ it "has an END_TAG constant" do
11
+ expect(Wikipedia::VandalismDetection::Page::END_TAG).to eq '</page>'
12
+ end
13
+ end
14
+
15
+ before do
16
+ @page = Wikipedia::VandalismDetection::Page.new
17
+ end
18
+
19
+ it "has a title" do
20
+ expect(@page).to respond_to :title
21
+ end
22
+
23
+ it "has an id" do
24
+ expect(@page).to respond_to :id
25
+ end
26
+
27
+ it "has revisions" do
28
+ expect(@page.revisions).to be_a Hash
29
+ end
30
+
31
+ it "has revisions with default {}" do
32
+ expect(@page.revisions).to be_empty
33
+ end
34
+
35
+ describe "#edits" do
36
+
37
+ it {should respond_to :edits }
38
+
39
+ it "returns an empty array if no revision is available" do
40
+ expect(@page.revisions).to be_empty
41
+ expect(@page.edits).to be_an(Array)
42
+ expect(@page.edits).to be_empty
43
+ end
44
+
45
+ it "resets the @update_edits flag to false" do
46
+ @page.add_revision build(:empty_revision, id: '1')
47
+ @page.edits
48
+ expect(@page.instance_variable_get(:@update_edits)).to be false
49
+ end
50
+
51
+ it "computes edits from the page's revisions" do
52
+ @page.add_revision build(:empty_revision, id: '1')
53
+ @page.add_revision build(:empty_revision, id: '3', parent_id: "2")
54
+ @page.add_revision build(:empty_revision, id: '2', parent_id: "1")
55
+
56
+ expect(@page.edits.count).to eq 2
57
+ end
58
+
59
+ it "computes edits of which each holds the parent page as reference" do
60
+ @page.id = '1234'
61
+ @page.title = 'Article'
62
+
63
+ @page.add_revision build(:empty_revision, id: '1')
64
+ @page.add_revision build(:empty_revision, id: '3', parent_id: "2")
65
+ @page.add_revision build(:empty_revision, id: '2', parent_id: "1")
66
+
67
+ @page.edits.each do |edit|
68
+ expect(edit.page).to eq @page
69
+ end
70
+ end
71
+ end
72
+
73
+ describe "#add_revision" do
74
+
75
+ it { should respond_to :add_revision }
76
+
77
+ it "takes a revision and adds it to revisions" do
78
+ revision = build :empty_revision
79
+ expect { @page.add_revision(revision) }.to change(@page.revisions, :count).by(1)
80
+ end
81
+
82
+ it "sets the @update_edits flag to true after adding a revision" do
83
+ revision = build :empty_revision
84
+ @page.add_revision(revision)
85
+ expect(@page.instance_variable_get(:@update_edits)).to be true
86
+ end
87
+
88
+ it "sets the @update_reverted_edits flag to true after adding a revision" do
89
+ revision = build :empty_revision
90
+ @page.add_revision(revision)
91
+ expect(@page.instance_variable_get(:@update_reverted_edits)).to be true
92
+ end
93
+ end
94
+
95
+ describe "#reverted_edits" do
96
+
97
+ it {should respond_to :reverted_edits }
98
+
99
+ it "returns reverted edits by comparing the sha1 values" do
100
+ # principle:
101
+ # in edit wars, the middle revision of the first revert triple that is preceded by a different hash
102
+ # can be seen as vandalism (here the revision with id 3)
103
+
104
+ revision_1 = build(:empty_revision, id: 1, parent_id: nil, sha1: 'hash0')
105
+ revision_2 = build(:empty_revision, id: 2, parent_id: 1, sha1: 'hash1')
106
+ revision_3 = build(:empty_revision, id: 3, parent_id: 2, sha1: 'hash2')
107
+ revision_4 = build(:empty_revision, id: 4, parent_id: 3, sha1: 'hash1')
108
+ revision_5 = build(:empty_revision, id: 5, parent_id: 4, sha1: 'hash2')
109
+ revision_6 = build(:empty_revision, id: 6, parent_id: 5, sha1: 'hash3')
110
+
111
+ @page.add_revision(revision_3)
112
+ @page.add_revision(revision_6)
113
+ @page.add_revision(revision_1)
114
+ @page.add_revision(revision_5)
115
+ @page.add_revision(revision_4)
116
+ @page.add_revision(revision_2)
117
+
118
+ expect(@page.reverted_edits.map { |edit| edit.new_revision.id }).to eq [3]
119
+ end
120
+
121
+ it "returns reverted edit if no previous revision is available" do
122
+ revision_1 = build(:empty_revision, id: 1, parent_id: nil, sha1: 'hash1')
123
+ revision_2 = build(:empty_revision, id: 2, parent_id: 1, sha1: 'hash2')
124
+ revision_3 = build(:empty_revision, id: 3, parent_id: 2, sha1: 'hash1')
125
+ revision_4 = build(:empty_revision, id: 4, parent_id: 3, sha1: 'hash2')
126
+
127
+ @page.add_revision(revision_3)
128
+ @page.add_revision(revision_1)
129
+ @page.add_revision(revision_4)
130
+ @page.add_revision(revision_2)
131
+
132
+ expect(@page.reverted_edits.map { |edit| edit.new_revision.id }).to eq [2]
133
+ end
134
+ end
135
+ end
@@ -0,0 +1,53 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::RevisionParser do
4
+
5
+ before do
6
+ @parser = Wikipedia::VandalismDetection::RevisionParser.new
7
+ @xml = load_file('revision_simplified.xml')
8
+
9
+ @revision = @parser.parse @xml
10
+ @expected_revision = build(:empty_revision,
11
+ id: 'id1',
12
+ parent_id: 'parentid1',
13
+ timestamp: 'time1',
14
+ contributor: 'ip1',
15
+ comment: 'comment 1',
16
+ text: "text 1",
17
+ sha1: 'hash1')
18
+ end
19
+
20
+ describe "#parse" do
21
+ it "returns a Wikipedia::VandalismDetection::Revision object" do
22
+ expect(@revision).to be_a Wikipedia::VandalismDetection::Revision
23
+ end
24
+
25
+ it "returns a revision with only the configured properties" do
26
+ @revision = @parser.parse(@xml, only: [:id, :parent_id])
27
+
28
+ [:id, :parent_id].each do |attr|
29
+ expect(@revision.send(attr)).not_to be_nil
30
+ end
31
+
32
+ [:timestamp, :contributor, :sha1].each do |attr|
33
+ expect(@revision.send(attr)).to be_nil
34
+ end
35
+
36
+ [:comment, :text].each do |attr|
37
+ expect(@revision.send(attr)).to eq ""
38
+ end
39
+ end
40
+ end
41
+
42
+ describe "a single revision content parsing" do
43
+ it "can read a single revision dump text input" do
44
+ expect(@revision).to_not be_nil
45
+ end
46
+
47
+ [:id, :timestamp, :contributor, :comment, :text, :sha1].each do |attr|
48
+ it "has the expected #{attr}" do
49
+ expect(@revision.send(attr)).to eq @expected_revision.send(attr)
50
+ end
51
+ end
52
+ end
53
+ end
@@ -0,0 +1,115 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Revision do
4
+
5
+ describe "constants" do
6
+
7
+ it "has a START_TAG constant" do
8
+ expect(Wikipedia::VandalismDetection::Revision::START_TAG).to eq '<revision>'
9
+ end
10
+
11
+ it "has an END_TAG constant" do
12
+ expect(Wikipedia::VandalismDetection::Revision::END_TAG).to eq '</revision>'
13
+ end
14
+ end
15
+
16
+ before do
17
+ @revision = Wikipedia::VandalismDetection::Revision.new
18
+ @instance_variables = [:id, :parent_id, :timestamp, :comment, :text, :contributor_id, :contributor_ip, :sha1]
19
+ @nil_instance_variables = [:id, :parent_id, :timestamp, :contributor_id, :contributor_ip, :contributor_username, :sha1]
20
+ @read_only_attributes = [:contributor_id, :contributor_ip]
21
+ end
22
+
23
+ describe "#anonymous_contributor?" do
24
+ it { should respond_to :anonymous_contributor? }
25
+
26
+ it "returns true in case of an anonymous user" do
27
+ @anonymous_revision = build :anonymous_revision
28
+ expect(@anonymous_revision.anonymous_contributor?).to be true
29
+ end
30
+ end
31
+
32
+ describe "#contributor=" do
33
+ it { should respond_to :contributor= }
34
+
35
+ it "sets the @contributor_id if contributor is no IPv4" do
36
+ id = "12345"
37
+ @revision.contributor = id
38
+
39
+ expect(@revision.instance_variable_get(:@contributor_id)).to eq id
40
+ expect(@revision.instance_variable_get(:@contributor_ip)).to be_nil
41
+ end
42
+
43
+ it "sets the @contributor_ip if contributor is an IPv4" do
44
+ ip = "127.0.0.1"
45
+ @revision.contributor = ip
46
+
47
+ expect(@revision.instance_variable_get(:@contributor_ip)).to eq ip
48
+ expect(@revision.instance_variable_get(:@contributor_id)).to be_nil
49
+ end
50
+ end
51
+
52
+ describe "#contributor" do
53
+ it { should respond_to :contributor }
54
+
55
+ it "returns the contributor_id if set" do
56
+ id = "12345"
57
+ @revision.contributor = id
58
+
59
+ expect(@revision.contributor).to eq @revision.instance_variable_get(:@contributor_id)
60
+ end
61
+
62
+ it "returns the contributor_ip if set" do
63
+ ip = "127.0.0.1"
64
+ @revision.contributor = ip
65
+
66
+ expect(@revision.contributor).to eq @revision.instance_variable_get(:@contributor_ip)
67
+ end
68
+ end
69
+
70
+ it "has the revision attributes" do
71
+ @instance_variables.each do |name|
72
+ expect(@revision).to respond_to name
73
+ end
74
+ end
75
+
76
+ it "defaults its attributes to nil" do
77
+ @nil_instance_variables.each do |name|
78
+ expect(@revision.send(name)).to be_nil
79
+ end
80
+ end
81
+
82
+ it "raises a NoMethodError when assigning read-only attributes" do
83
+ @read_only_attributes.each do |name|
84
+ expect { @revision.send("#{name}=", "") }.to raise_error NoMethodError
85
+ end
86
+ end
87
+
88
+ it "has an empty default text" do
89
+ expect(@revision.text).to be_empty
90
+ end
91
+
92
+ it "has a text of type Wikipedia::Text" do
93
+ expect(@revision.text).to be_a Wikipedia::VandalismDetection::Text
94
+ end
95
+
96
+ it "has an empty default comment" do
97
+ expect(@revision.comment).to be_empty
98
+ end
99
+
100
+ it "has a comment of type Wikipedia::Text" do
101
+ expect(@revision.comment).to be_a Wikipedia::VandalismDetection::Text
102
+ end
103
+
104
+ it { should respond_to :redirect? }
105
+
106
+ it "is marked as redirect if #REDIRECT appears in its text" do
107
+ @revision.text = "#REDIRECT [[Redirect Page Name]]\n"
108
+ expect(@revision.redirect?).to be true
109
+ end
110
+
111
+ it "is not marked as redirect if #REDIRECT does not appear in its text" do
112
+ @revision.text = "''text''"
113
+ expect(@revision.redirect?).to be false
114
+ end
115
+ end
@@ -0,0 +1,231 @@
1
+ require 'spec_helper'
2
+ require 'fileutils'
3
+ require 'ruby-band'
4
+
5
+ describe Wikipedia::VandalismDetection::TestDataset do
6
+
7
+ before do
8
+ use_test_configuration
9
+ @config = test_config
10
+
11
+ @arff_file = @config.test_output_arff_file
12
+ @index_file = @config.test_output_index_file
13
+ @features = @config.features
14
+
15
+ @arff_files_dir = File.join(@config.output_base_directory, 'test')
16
+ end
17
+
18
+ after do
19
+ if File.exists?(@arff_file)
20
+ File.delete(@arff_file)
21
+ FileUtils.rm_r(File.dirname @arff_file)
22
+ end
23
+
24
+ File.delete(@index_file) if File.exists?(@index_file)
25
+
26
+ # remove feature arff files
27
+ @config.features.each do |name|
28
+ file = File.join(@arff_files_dir, name.gsub(' ', '_') + '.arff')
29
+
30
+ if File.exists?(file)
31
+ File.delete(file)
32
+ FileUtils.rm_r(File.dirname file)
33
+ end
34
+ end
35
+ end
36
+
37
+ describe "#build" do
38
+ describe "exceptions" do
39
+ it "raises an EditsFileNotConfiguredError if no edits file is configured" do
40
+ config = test_config
41
+ config.instance_variable_set :@test_corpus_edits_file, nil
42
+ use_configuration(config)
43
+
44
+ expect { Wikipedia::VandalismDetection::TestDataset.build }.to raise_error \
45
+ Wikipedia::VandalismDetection::EditsFileNotConfiguredError
46
+ end
47
+ end
48
+
49
+ it "returns a weka instances" do
50
+ dataset = Wikipedia::VandalismDetection::TestDataset.build
51
+ expect(dataset.class).to be Java::WekaCore::Instances
52
+ end
53
+
54
+ Wikipedia::VandalismDetection::DefaultConfiguration::DEFAULTS['features'].each do |name|
55
+ it "creates an arff file for the feature '#{name}'" do
56
+ config = test_config
57
+ config.instance_variable_set :@features, [name]
58
+ use_configuration(config)
59
+
60
+ file = File.join(@arff_files_dir, name.gsub(' ', '_') + '.arff')
61
+
62
+ expect(File.exist?(file)).to be false
63
+ Wikipedia::VandalismDetection::TestDataset.build
64
+ expect(File.exist?(file)).to be true
65
+ end
66
+ end
67
+
68
+ it "creates only feature files that are not available yet" do
69
+ config = test_config
70
+ config.instance_variable_set :@features, ['anonymity', 'comment length']
71
+ use_configuration(config)
72
+
73
+ anonymity_file = File.join(config.output_base_directory, 'test', 'anonymity.arff')
74
+
75
+ # create file manually, so it is existent when building dataset
76
+ data = [10000, 123456, 234567]
77
+ anonymity = Wikipedia::VandalismDetection::Instances.empty_for_test_feature('anonymity')
78
+ 6.times { anonymity.add_instance(data) }
79
+ anonymity.to_ARFF(anonymity_file)
80
+
81
+ Wikipedia::VandalismDetection::TestDataset.build
82
+
83
+ # anonymity should not be overwritten
84
+ expect(Core::Parser.parse_ARFF(anonymity_file).to_a2d.first).to eq data
85
+ end
86
+
87
+ describe "internal algorithm" do
88
+ it "builds the right number of data lines" do
89
+ edits_count = File.open(@config.training_corpus_edits_file, 'r').lines.count - 1
90
+ additional_header_lines = 4 # without class
91
+ revision_id_lines = 2 # old and new revision id attributes
92
+ class_line = 1
93
+
94
+ lines_count = additional_header_lines + edits_count + @features.count + revision_id_lines + class_line
95
+ dataset = Wikipedia::VandalismDetection::TestDataset.build
96
+
97
+ expect(dataset.to_s.lines.count).to eq lines_count
98
+ end
99
+
100
+ it "builds the right number of data columns" do
101
+ old_and_new_edit_attr_count = 2
102
+ class_value = 1
103
+ dataset = Wikipedia::VandalismDetection::TestDataset.build
104
+
105
+ expect(dataset.n_col).to eq @features.count + class_value + old_and_new_edit_attr_count
106
+ end
107
+
108
+ it "builds a class attribute" do
109
+ dataset = Wikipedia::VandalismDetection::TestDataset.build
110
+ expect(dataset.enumerate_attributes.to_a[-1].name).to eq 'class'
111
+ end
112
+ end
113
+
114
# With 'Functions::LibSVM' configured as classifier type, every feature value
# of the built dataset must lie within [0.0, 1.0], while the two revision id
# columns must still hold values greater than 1.
it "normalizes the numeric features if LibSVM is used as classifier" do
  config = test_config
  config.instance_variable_set :@classifier_type, 'Functions::LibSVM'
  use_configuration(config)

  dataset = Wikipedia::VandalismDetection::TestDataset.build

  # No debug printing here: dumping the dataset and each row only clutters
  # the spec output and asserts nothing.
  dataset.to_a2d.each do |instance|
    numerics = instance[0...-3] # feature values (everything before the last three columns)
    edit_ids = instance[-3..-2] # revision ids (the two columns before the last one)

    numerics.each { |value| expect(value).to be_between(0.0, 1.0) }
    edit_ids.each { |value| expect(value).to be > 1 }
  end
end
131
+ end
132
+
133
# TestDataset.instances must yield the same dataset as TestDataset.build;
# both results are compared via their string representation.
describe "#instances" do
  it "is an alias method for #build" do
    built_text = Wikipedia::VandalismDetection::TestDataset.build.to_s
    aliased_text = Wikipedia::VandalismDetection::TestDataset.instances.to_s

    expect(built_text).to eq(aliased_text)
  end
end
141
+
142
# Specs for TestDataset.create_corpus_file_index!. The group label now matches
# the actual method name (it previously read "#create_corpus_index_file!").
describe "#create_corpus_file_index!" do
  it "responds to #create_corpus_file_index!" do
    expect(Wikipedia::VandalismDetection::TestDataset).to respond_to :create_corpus_file_index!
  end

  describe "exceptions" do
    it "raises a RevisionsDirectoryNotConfiguredError if no revisions directory is configured" do
      # Blank out the revisions directory on a fresh test configuration.
      config = test_config
      config.instance_variable_set :@test_corpus_revisions_directory, nil
      use_configuration(config)

      expect { Wikipedia::VandalismDetection::TestDataset.create_corpus_file_index! }.to raise_error(
        Wikipedia::VandalismDetection::RevisionsDirectoryNotConfiguredError
      )
    end
  end

  it "creates a corpus_index.yml file in the build directory" do
    # @index_file is set up outside this block; it must not exist beforehand.
    expect(File.exist?(@index_file)).to be false
    Wikipedia::VandalismDetection::TestDataset.create_corpus_file_index!
    expect(File.exist?(@index_file)).to be true
  end
end
164
+
165
# Specs for TestDataset.build!, which is expected to write the dataset to the
# .arff file referenced by @arff_file (set up outside this block).
describe "#build!" do
  it "should respond to #build!" do
    expect(Wikipedia::VandalismDetection::TestDataset).to respond_to(:build!)
  end

  it "creates an .arff file in the directory defined in config.yml" do
    expect(File.exist?(@arff_file)).to be(false)
    Wikipedia::VandalismDetection::TestDataset.build!
    expect(File.exist?(@arff_file)).to be(true)
  end

  it "overwrites existing test arff file" do
    use_test_configuration

    # First run with the test configuration:
    # 3 features + 2 edit id columns + 1 class value = 6 columns.
    Wikipedia::VandalismDetection::TestDataset.build!
    expect(Core::Parser.parse_ARFF(@arff_file).n_col).to eq(6)

    # Second run restricted to a single feature.
    single_feature_config = test_config
    single_feature_config.instance_variable_set(:@features, ['anonymity'])
    use_configuration(single_feature_config)

    # 1 feature + 2 edit id columns + 1 class value = 4 columns.
    Wikipedia::VandalismDetection::TestDataset.build!
    expect(Core::Parser.parse_ARFF(@arff_file).n_col).to eq(4)
  end
end
195
+
196
# Specs for TestDataset.edit(old_revision_id, new_revision_id): error on a
# missing edits file configuration, nil for unknown or not annotated
# revision pairs, and an Edit with page data for a known pair.
describe "#edit" do
  it "raises an EditsFileNotConfiguredError if no edits file is configured" do
    broken_config = test_config
    broken_config.instance_variable_set(:@test_corpus_edits_file, nil)
    use_configuration(broken_config)

    lookup = -> { Wikipedia::VandalismDetection::TestDataset.edit('1', '2') }
    expect(&lookup).to raise_error(Wikipedia::VandalismDetection::EditsFileNotConfiguredError)
  end

  it "returns nil if Edit could not be found" do
    expect(Wikipedia::VandalismDetection::TestDataset.edit('1', '2')).to be_nil
  end

  it "returns an Edit" do
    found = Wikipedia::VandalismDetection::TestDataset.edit('307084144', '326873205')
    expect(found).to be_a(Wikipedia::VandalismDetection::Edit)
  end

  it "returns an edit whose parent page title is not nil" do
    found = Wikipedia::VandalismDetection::TestDataset.edit('307084144', '326873205')
    expect(found.page.title).not_to be_nil
  end

  it "returns an edit whose parent page id is not nil" do
    found = Wikipedia::VandalismDetection::TestDataset.edit('307084144', '326873205')
    expect(found.page.id).not_to be_nil
  end

  it "returns nil for a not annotated edit with given revision ids" do
    expect(Wikipedia::VandalismDetection::TestDataset.edit('328774088', '328774188')).to be_nil
  end
end
231
+ end