wikipedia-vandalism_detection 0.1.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (245)
  1. checksums.yaml +7 -0
  2. data/.gitignore +19 -0
  3. data/.rspec +2 -0
  4. data/.rubocop.yml +35 -0
  5. data/.travis.yml +6 -0
  6. data/Gemfile +4 -0
  7. data/LICENSE.txt +4 -0
  8. data/README.md +288 -0
  9. data/Rakefile +11 -0
  10. data/config/wikipedia-vandalism-detection.yml.example +103 -0
  11. data/lib/java/SMOTE.jar +0 -0
  12. data/lib/java/balancedRandomForest.jar +0 -0
  13. data/lib/java/diffutils-1.3.0.jar +0 -0
  14. data/lib/java/oneClassClassifier.jar +0 -0
  15. data/lib/java/realAdaBoost.jar +0 -0
  16. data/lib/java/swc-engine-1.1.0-jar-with-dependencies.jar +0 -0
  17. data/lib/java/sweble-wikitext-extractor.jar +0 -0
  18. data/lib/weka/classifiers/meta/one_class_classifier.rb +21 -0
  19. data/lib/weka/classifiers/meta/real_ada_boost.rb +15 -0
  20. data/lib/weka/classifiers/trees/balanced_random_forest.rb +16 -0
  21. data/lib/weka/filters/supervised/instance/smote.rb +15 -0
  22. data/lib/wikipedia/vandalism_detection/algorithms/kullback_leibler_divergence.rb +103 -0
  23. data/lib/wikipedia/vandalism_detection/algorithms.rb +12 -0
  24. data/lib/wikipedia/vandalism_detection/classifier.rb +202 -0
  25. data/lib/wikipedia/vandalism_detection/configuration.rb +350 -0
  26. data/lib/wikipedia/vandalism_detection/diff.rb +36 -0
  27. data/lib/wikipedia/vandalism_detection/edit.rb +81 -0
  28. data/lib/wikipedia/vandalism_detection/evaluator.rb +640 -0
  29. data/lib/wikipedia/vandalism_detection/exceptions.rb +47 -0
  30. data/lib/wikipedia/vandalism_detection/feature_calculator.rb +94 -0
  31. data/lib/wikipedia/vandalism_detection/features/all_wordlists_frequency.rb +22 -0
  32. data/lib/wikipedia/vandalism_detection/features/all_wordlists_impact.rb +21 -0
  33. data/lib/wikipedia/vandalism_detection/features/anonymity.rb +17 -0
  34. data/lib/wikipedia/vandalism_detection/features/anonymity_previous.rb +32 -0
  35. data/lib/wikipedia/vandalism_detection/features/article_size.rb +17 -0
  36. data/lib/wikipedia/vandalism_detection/features/bad_frequency.rb +21 -0
  37. data/lib/wikipedia/vandalism_detection/features/bad_impact.rb +20 -0
  38. data/lib/wikipedia/vandalism_detection/features/base.rb +61 -0
  39. data/lib/wikipedia/vandalism_detection/features/biased_frequency.rb +21 -0
  40. data/lib/wikipedia/vandalism_detection/features/biased_impact.rb +21 -0
  41. data/lib/wikipedia/vandalism_detection/features/blanking.rb +26 -0
  42. data/lib/wikipedia/vandalism_detection/features/character_diversity.rb +23 -0
  43. data/lib/wikipedia/vandalism_detection/features/character_sequence.rb +19 -0
  44. data/lib/wikipedia/vandalism_detection/features/comment_bad_frequency.rb +21 -0
  45. data/lib/wikipedia/vandalism_detection/features/comment_biased_frequency.rb +21 -0
  46. data/lib/wikipedia/vandalism_detection/features/comment_length.rb +16 -0
  47. data/lib/wikipedia/vandalism_detection/features/comment_markup_frequency.rb +30 -0
  48. data/lib/wikipedia/vandalism_detection/features/comment_pronoun_frequency.rb +21 -0
  49. data/lib/wikipedia/vandalism_detection/features/comment_sex_frequency.rb +21 -0
  50. data/lib/wikipedia/vandalism_detection/features/comment_vulgarism_frequency.rb +21 -0
  51. data/lib/wikipedia/vandalism_detection/features/compressibility.rb +31 -0
  52. data/lib/wikipedia/vandalism_detection/features/contains_base.rb +16 -0
  53. data/lib/wikipedia/vandalism_detection/features/copyedit.rb +17 -0
  54. data/lib/wikipedia/vandalism_detection/features/digit_ratio.rb +23 -0
  55. data/lib/wikipedia/vandalism_detection/features/edits_per_user.rb +72 -0
  56. data/lib/wikipedia/vandalism_detection/features/emoticons_frequency.rb +27 -0
  57. data/lib/wikipedia/vandalism_detection/features/emoticons_impact.rb +29 -0
  58. data/lib/wikipedia/vandalism_detection/features/frequency_base.rb +18 -0
  59. data/lib/wikipedia/vandalism_detection/features/impact_base.rb +25 -0
  60. data/lib/wikipedia/vandalism_detection/features/inserted_character_distribution.rb +23 -0
  61. data/lib/wikipedia/vandalism_detection/features/inserted_external_links.rb +19 -0
  62. data/lib/wikipedia/vandalism_detection/features/inserted_internal_links.rb +18 -0
  63. data/lib/wikipedia/vandalism_detection/features/inserted_size.rb +19 -0
  64. data/lib/wikipedia/vandalism_detection/features/inserted_words.rb +17 -0
  65. data/lib/wikipedia/vandalism_detection/features/longest_word.rb +21 -0
  66. data/lib/wikipedia/vandalism_detection/features/markup_frequency.rb +29 -0
  67. data/lib/wikipedia/vandalism_detection/features/markup_impact.rb +30 -0
  68. data/lib/wikipedia/vandalism_detection/features/non_alphanumeric_ratio.rb +23 -0
  69. data/lib/wikipedia/vandalism_detection/features/personal_life.rb +17 -0
  70. data/lib/wikipedia/vandalism_detection/features/pronoun_frequency.rb +21 -0
  71. data/lib/wikipedia/vandalism_detection/features/pronoun_impact.rb +20 -0
  72. data/lib/wikipedia/vandalism_detection/features/removed_all_wordlists_frequency.rb +22 -0
  73. data/lib/wikipedia/vandalism_detection/features/removed_bad_frequency.rb +21 -0
  74. data/lib/wikipedia/vandalism_detection/features/removed_biased_frequency.rb +21 -0
  75. data/lib/wikipedia/vandalism_detection/features/removed_character_distribution.rb +23 -0
  76. data/lib/wikipedia/vandalism_detection/features/removed_emoticons_frequency.rb +28 -0
  77. data/lib/wikipedia/vandalism_detection/features/removed_markup_frequency.rb +30 -0
  78. data/lib/wikipedia/vandalism_detection/features/removed_pronoun_frequency.rb +21 -0
  79. data/lib/wikipedia/vandalism_detection/features/removed_sex_frequency.rb +21 -0
  80. data/lib/wikipedia/vandalism_detection/features/removed_size.rb +19 -0
  81. data/lib/wikipedia/vandalism_detection/features/removed_vulgarism_frequency.rb +21 -0
  82. data/lib/wikipedia/vandalism_detection/features/removed_words.rb +17 -0
  83. data/lib/wikipedia/vandalism_detection/features/replacement_similarity.rb +20 -0
  84. data/lib/wikipedia/vandalism_detection/features/reverted.rb +16 -0
  85. data/lib/wikipedia/vandalism_detection/features/revisions_character_distribution.rb +25 -0
  86. data/lib/wikipedia/vandalism_detection/features/same_editor.rb +32 -0
  87. data/lib/wikipedia/vandalism_detection/features/sex_frequency.rb +21 -0
  88. data/lib/wikipedia/vandalism_detection/features/sex_impact.rb +20 -0
  89. data/lib/wikipedia/vandalism_detection/features/size_increment.rb +20 -0
  90. data/lib/wikipedia/vandalism_detection/features/size_ratio.rb +28 -0
  91. data/lib/wikipedia/vandalism_detection/features/term_frequency.rb +31 -0
  92. data/lib/wikipedia/vandalism_detection/features/time_interval.rb +38 -0
  93. data/lib/wikipedia/vandalism_detection/features/time_of_day.rb +21 -0
  94. data/lib/wikipedia/vandalism_detection/features/upper_case_ratio.rb +23 -0
  95. data/lib/wikipedia/vandalism_detection/features/upper_case_words_ratio.rb +33 -0
  96. data/lib/wikipedia/vandalism_detection/features/upper_to_lower_case_ratio.rb +23 -0
  97. data/lib/wikipedia/vandalism_detection/features/vulgarism_frequency.rb +21 -0
  98. data/lib/wikipedia/vandalism_detection/features/vulgarism_impact.rb +21 -0
  99. data/lib/wikipedia/vandalism_detection/features/weekday.rb +19 -0
  100. data/lib/wikipedia/vandalism_detection/features/words_increment.rb +20 -0
  101. data/lib/wikipedia/vandalism_detection/features.rb +66 -0
  102. data/lib/wikipedia/vandalism_detection/instances.rb +121 -0
  103. data/lib/wikipedia/vandalism_detection/page.rb +101 -0
  104. data/lib/wikipedia/vandalism_detection/page_parser.rb +63 -0
  105. data/lib/wikipedia/vandalism_detection/revision.rb +75 -0
  106. data/lib/wikipedia/vandalism_detection/revision_parser.rb +67 -0
  107. data/lib/wikipedia/vandalism_detection/test_dataset.rb +374 -0
  108. data/lib/wikipedia/vandalism_detection/text.rb +23 -0
  109. data/lib/wikipedia/vandalism_detection/training_dataset.rb +282 -0
  110. data/lib/wikipedia/vandalism_detection/version.rb +5 -0
  111. data/lib/wikipedia/vandalism_detection/wikitext_extractor.rb +80 -0
  112. data/lib/wikipedia/vandalism_detection/word_lists/bad.rb +11 -0
  113. data/lib/wikipedia/vandalism_detection/word_lists/biased.rb +20 -0
  114. data/lib/wikipedia/vandalism_detection/word_lists/emoticons.rb +26 -0
  115. data/lib/wikipedia/vandalism_detection/word_lists/markup.rb +19 -0
  116. data/lib/wikipedia/vandalism_detection/word_lists/pronouns.rb +11 -0
  117. data/lib/wikipedia/vandalism_detection/word_lists/sex.rb +10 -0
  118. data/lib/wikipedia/vandalism_detection/word_lists/vulgarism.rb +96 -0
  119. data/lib/wikipedia/vandalism_detection/word_lists.rb +17 -0
  120. data/lib/wikipedia/vandalism_detection.rb +29 -0
  121. data/lib/wikipedia.rb +41 -0
  122. data/spec/factories/edit.rb +19 -0
  123. data/spec/factories/page.rb +12 -0
  124. data/spec/factories/revision.rb +51 -0
  125. data/spec/resources/config/wikipedia-vandalism-detection.yml +35 -0
  126. data/spec/resources/corpora/test/edits.csv +8 -0
  127. data/spec/resources/corpora/test/ground-truth.txt +3 -0
  128. data/spec/resources/corpora/test/revisions/part-1/326471754.txt +199 -0
  129. data/spec/resources/corpora/test/revisions/part-1/326873205.txt +137 -0
  130. data/spec/resources/corpora/test/revisions/part-1/328774035.txt +162 -0
  131. data/spec/resources/corpora/test/revisions/part-2/307084144.txt +137 -0
  132. data/spec/resources/corpora/test/revisions/part-2/326978767.txt +199 -0
  133. data/spec/resources/corpora/test/revisions/part-2/328774110.txt +162 -0
  134. data/spec/resources/corpora/training/annotations.csv +7 -0
  135. data/spec/resources/corpora/training/edits.csv +7 -0
  136. data/spec/resources/corpora/training/revisions/part-1/326471754.txt +199 -0
  137. data/spec/resources/corpora/training/revisions/part-1/326873205.txt +137 -0
  138. data/spec/resources/corpora/training/revisions/part-1/328774035.txt +162 -0
  139. data/spec/resources/corpora/training/revisions/part-2/307084144.txt +137 -0
  140. data/spec/resources/corpora/training/revisions/part-2/326978767.txt +199 -0
  141. data/spec/resources/corpora/training/revisions/part-2/328774110.txt +162 -0
  142. data/spec/resources/page_with_redirects.xml +85 -0
  143. data/spec/resources/redirect_page.xml +59 -0
  144. data/spec/resources/revision_simplified.xml +13 -0
  145. data/spec/resources/sample_revision.txt +137 -0
  146. data/spec/resources/sample_revision_clean_text.txt +1 -0
  147. data/spec/resources/sample_revision_plain_text.txt +183 -0
  148. data/spec/resources/vandalism_on_wikipedia.xml +234 -0
  149. data/spec/resources/vandalism_on_wikipedia_simplified.xml +119 -0
  150. data/spec/resources/wikipedia_tokens.txt +30 -0
  151. data/spec/spec_helper.rb +38 -0
  152. data/spec/support/macros/file_reading.rb +6 -0
  153. data/spec/support/macros/test_configuration.rb +81 -0
  154. data/spec/vandalism_detection/algorithms/kullback_leibler_divergence_spec.rb +34 -0
  155. data/spec/vandalism_detection/classifier_spec.rb +330 -0
  156. data/spec/vandalism_detection/configuration_spec.rb +601 -0
  157. data/spec/vandalism_detection/diff_spec.rb +40 -0
  158. data/spec/vandalism_detection/edit_spec.rb +122 -0
  159. data/spec/vandalism_detection/evaluator_spec.rb +711 -0
  160. data/spec/vandalism_detection/feature_calculator_spec.rb +135 -0
  161. data/spec/vandalism_detection/features/all_wordlists_frequency_spec.rb +30 -0
  162. data/spec/vandalism_detection/features/all_wordlists_impact_spec.rb +52 -0
  163. data/spec/vandalism_detection/features/anonymity_previous_spec.rb +67 -0
  164. data/spec/vandalism_detection/features/anonymity_spec.rb +17 -0
  165. data/spec/vandalism_detection/features/article_size_spec.rb +29 -0
  166. data/spec/vandalism_detection/features/bad_frequency_spec.rb +30 -0
  167. data/spec/vandalism_detection/features/bad_impact_spec.rb +53 -0
  168. data/spec/vandalism_detection/features/base_spec.rb +41 -0
  169. data/spec/vandalism_detection/features/biased_frequency_spec.rb +30 -0
  170. data/spec/vandalism_detection/features/biased_impact_spec.rb +52 -0
  171. data/spec/vandalism_detection/features/blanking_spec.rb +35 -0
  172. data/spec/vandalism_detection/features/character_diversity_spec.rb +30 -0
  173. data/spec/vandalism_detection/features/character_sequence_spec.rb +31 -0
  174. data/spec/vandalism_detection/features/comment_bad_frequency_spec.rb +28 -0
  175. data/spec/vandalism_detection/features/comment_biased_frequency_spec.rb +28 -0
  176. data/spec/vandalism_detection/features/comment_length_spec.rb +21 -0
  177. data/spec/vandalism_detection/features/comment_markup_frequency_spec.rb +28 -0
  178. data/spec/vandalism_detection/features/comment_pronoun_frequency_spec.rb +28 -0
  179. data/spec/vandalism_detection/features/comment_sex_frequency_spec.rb +28 -0
  180. data/spec/vandalism_detection/features/comment_vulgarism_frequency_spec.rb +28 -0
  181. data/spec/vandalism_detection/features/compressibility_spec.rb +38 -0
  182. data/spec/vandalism_detection/features/contains_base_spec.rb +27 -0
  183. data/spec/vandalism_detection/features/copyedit_spec.rb +30 -0
  184. data/spec/vandalism_detection/features/digit_ratio_spec.rb +30 -0
  185. data/spec/vandalism_detection/features/edits_per_user_spec.rb +51 -0
  186. data/spec/vandalism_detection/features/emoticons_frequency_spec.rb +30 -0
  187. data/spec/vandalism_detection/features/emoticons_impact_spec.rb +45 -0
  188. data/spec/vandalism_detection/features/frequency_base_spec.rb +20 -0
  189. data/spec/vandalism_detection/features/impact_base_spec.rb +36 -0
  190. data/spec/vandalism_detection/features/inserted_character_distribution_spec.rb +40 -0
  191. data/spec/vandalism_detection/features/inserted_external_links_spec.rb +29 -0
  192. data/spec/vandalism_detection/features/inserted_internal_links_spec.rb +29 -0
  193. data/spec/vandalism_detection/features/inserted_size_spec.rb +29 -0
  194. data/spec/vandalism_detection/features/inserted_words_spec.rb +29 -0
  195. data/spec/vandalism_detection/features/longest_word_spec.rb +29 -0
  196. data/spec/vandalism_detection/features/markup_frequency_spec.rb +31 -0
  197. data/spec/vandalism_detection/features/markup_impact_spec.rb +53 -0
  198. data/spec/vandalism_detection/features/non_alphanumeric_ratio_spec.rb +30 -0
  199. data/spec/vandalism_detection/features/personal_life_spec.rb +22 -0
  200. data/spec/vandalism_detection/features/pronoun_frequency_spec.rb +30 -0
  201. data/spec/vandalism_detection/features/pronoun_impact_spec.rb +53 -0
  202. data/spec/vandalism_detection/features/removed_all_wordlists_frequency_spec.rb +30 -0
  203. data/spec/vandalism_detection/features/removed_bad_frequency_spec.rb +30 -0
  204. data/spec/vandalism_detection/features/removed_biased_frequency_spec.rb +30 -0
  205. data/spec/vandalism_detection/features/removed_character_distribution_spec.rb +40 -0
  206. data/spec/vandalism_detection/features/removed_emoticons_frequency_spec.rb +30 -0
  207. data/spec/vandalism_detection/features/removed_markup_frequency_spec.rb +30 -0
  208. data/spec/vandalism_detection/features/removed_pronoun_frequency_spec.rb +30 -0
  209. data/spec/vandalism_detection/features/removed_sex_frequency_spec.rb +30 -0
  210. data/spec/vandalism_detection/features/removed_size_spec.rb +29 -0
  211. data/spec/vandalism_detection/features/removed_vulgarism_frequency_spec.rb +30 -0
  212. data/spec/vandalism_detection/features/removed_words_spec.rb +29 -0
  213. data/spec/vandalism_detection/features/replacement_similarity_spec.rb +37 -0
  214. data/spec/vandalism_detection/features/reverted_spec.rb +24 -0
  215. data/spec/vandalism_detection/features/revisions_character_distribution_spec.rb +40 -0
  216. data/spec/vandalism_detection/features/same_editor_spec.rb +71 -0
  217. data/spec/vandalism_detection/features/sex_frequency_spec.rb +30 -0
  218. data/spec/vandalism_detection/features/sex_impact_spec.rb +53 -0
  219. data/spec/vandalism_detection/features/size_increment_spec.rb +29 -0
  220. data/spec/vandalism_detection/features/size_ratio_spec.rb +48 -0
  221. data/spec/vandalism_detection/features/term_frequency_spec.rb +32 -0
  222. data/spec/vandalism_detection/features/time_interval_spec.rb +56 -0
  223. data/spec/vandalism_detection/features/time_of_day_spec.rb +16 -0
  224. data/spec/vandalism_detection/features/upper_case_ratio_spec.rb +30 -0
  225. data/spec/vandalism_detection/features/upper_case_words_ratio_spec.rb +33 -0
  226. data/spec/vandalism_detection/features/upper_to_lower_case_ratio_spec.rb +30 -0
  227. data/spec/vandalism_detection/features/vulgarism_frequency_spec.rb +30 -0
  228. data/spec/vandalism_detection/features/vulgarism_impact_spec.rb +52 -0
  229. data/spec/vandalism_detection/features/weekday_spec.rb +16 -0
  230. data/spec/vandalism_detection/features/words_increment_spec.rb +29 -0
  231. data/spec/vandalism_detection/instances_spec.rb +146 -0
  232. data/spec/vandalism_detection/page_parser_spec.rb +190 -0
  233. data/spec/vandalism_detection/page_spec.rb +134 -0
  234. data/spec/vandalism_detection/revision_parser_spec.rb +53 -0
  235. data/spec/vandalism_detection/revision_spec.rb +148 -0
  236. data/spec/vandalism_detection/test_dataset_spec.rb +227 -0
  237. data/spec/vandalism_detection/text_spec.rb +29 -0
  238. data/spec/vandalism_detection/training_dataset_spec.rb +266 -0
  239. data/spec/vandalism_detection/wikitext_extractor_spec.rb +97 -0
  240. data/spec/weka/classifiers/meta/one_class_classifier_spec.rb +82 -0
  241. data/spec/weka/classifiers/meta/real_ada_boost_spec.rb +37 -0
  242. data/spec/weka/classifiers/trees/balanced_random_forest_spec.rb +37 -0
  243. data/spec/weka/filters/supervised/instance/smote_spec.rb +5 -0
  244. data/wikipedia-vandalism_detection.gemspec +37 -0
  245. metadata +550 -0
@@ -0,0 +1,146 @@
1
+ require 'spec_helper'
2
+ require 'wikipedia/vandalism_detection/instances'
3
+
4
+ describe Wikipedia::VandalismDetection::Instances do
5
+ Instances = Wikipedia::VandalismDetection::Instances
6
+
7
+ it 'responds to #empty' do
8
+ expect(Instances).to respond_to :empty
9
+ end
10
+
11
+ describe '#empty' do
12
+ before do
13
+ use_test_configuration
14
+
15
+ @dataset = Instances.empty
16
+ @attributes = @dataset.attributes
17
+ @class_attribute = @dataset.class_attribute
18
+ end
19
+
20
+ it 'returns a weka dataset' do
21
+ expect(@dataset.class).to eq Java::WekaCore::Instances
22
+ end
23
+
24
+ it 'returns an empty dataset' do
25
+ expect(@dataset.size).to eq 0
26
+ end
27
+
28
+ it 'has all configured features and class as attributes' do
29
+ names = test_config.features.map { |name| name.tr(' ', '_') }
30
+ expect(@dataset.attribute_names).to eq names
31
+ end
32
+
33
+ it 'has feature attributes of type "numeric"' do
34
+ all_features_numeric = @attributes.reduce do |result, attribute|
35
+ result && attribute.numeric?
36
+ end
37
+
38
+ expect(all_features_numeric).to be true
39
+ end
40
+
41
+ it 'has a nominal class attribute' do
42
+ expect(@class_attribute).to be_nominal
43
+ end
44
+
45
+ it 'has a class attribute with values "vandalism" and "regular"' do
46
+ expect(@class_attribute.values).to eq %w[regular vandalism]
47
+ end
48
+ end
49
+
50
+ describe '#empty_for_feature' do
51
+ before do
52
+ @dataset = Instances.empty_for_feature('comment length')
53
+ @attributes = @dataset.attributes
54
+ @class_attribute = @dataset.class_attribute
55
+ end
56
+
57
+ it 'returns a weka dataset' do
58
+ expect(@dataset).to be_a Java::WekaCore::Instances
59
+ end
60
+
61
+ it 'returns an empty dataset' do
62
+ expect(@dataset).to be_empty
63
+ end
64
+
65
+ it 'has only given feature and class as attributes' do
66
+ expect(@dataset.attribute_names).to eq %w[comment_length]
67
+ end
68
+
69
+ it 'has numeric feature attributes' do
70
+ expect(@attributes.first).to be_numeric
71
+ end
72
+
73
+ it 'has a nominal class attribute' do
74
+ expect(@class_attribute).to be_nominal
75
+ end
76
+
77
+ it 'has a class attribute with values "vandalism" and "regular"' do
78
+ expect(@class_attribute.values).to eq %w[regular vandalism]
79
+ end
80
+ end
81
+
82
+ describe '#empty_for_test_feature' do
83
+ before do
84
+ @dataset = Instances.empty_for_test_feature('comment length')
85
+
86
+ @feature_attribute = @dataset.attributes.first
87
+ @old_revision_id_attribute = @dataset.attributes[-2]
88
+ @new_revision_id_attribute = @dataset.attributes.last
89
+ end
90
+
91
+ it 'returns a weka dataset' do
92
+ expect(@dataset).to be_a Java::WekaCore::Instances
93
+ end
94
+
95
+ it 'returns an empty dataset' do
96
+ expect(@dataset).to be_empty
97
+ end
98
+
99
+ it 'has one given feature as attributes' do
100
+ expect(@feature_attribute.name).to eq 'comment_length'
101
+ end
102
+
103
+ it 'has numeric feature attributes' do
104
+ expect(@feature_attribute).to be_numeric
105
+ end
106
+
107
+ it 'has an attribute with name "oldrevisionid"' do
108
+ expect(@old_revision_id_attribute.name).to eq 'oldrevisionid'
109
+ end
110
+
111
+ it 'has a numeric oldrevisionid attribute' do
112
+ expect(@old_revision_id_attribute).to be_numeric
113
+ end
114
+
115
+ it 'has an attribute with name "newrevisionid"' do
116
+ expect(@new_revision_id_attribute.name).to eq 'newrevisionid'
117
+ end
118
+
119
+ it 'has a numeric newrevisionid attribute' do
120
+ expect(@new_revision_id_attribute).to be_numeric
121
+ end
122
+ end
123
+
124
+ describe '#empty_for_test_class' do
125
+ before do
126
+ @dataset = Instances.empty_for_test_class
127
+ @class = @dataset.attributes.first
128
+ end
129
+
130
+ it 'returns a weka dataset' do
131
+ expect(@dataset).to be_a Java::WekaCore::Instances
132
+ end
133
+
134
+ it 'returns an empty dataset' do
135
+ expect(@dataset).to be_empty
136
+ end
137
+
138
+ it 'has one given feature as attributes' do
139
+ expect(@class.name).to eq 'class'
140
+ end
141
+
142
+ it 'has nominal feature attributes' do
143
+ expect(@class).to be_nominal
144
+ end
145
+ end
146
+ end
@@ -0,0 +1,190 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::PageParser do
4
+ let(:xml) { load_file('vandalism_on_wikipedia.xml') }
5
+ let(:page) { subject.parse(xml) }
6
+
7
+ let(:simplified_xml) { load_file('vandalism_on_wikipedia_simplified.xml') }
8
+ let(:simplified_page) { subject.parse(simplified_xml) }
9
+
10
+ describe 'parser structure' do
11
+ describe '#parse' do
12
+ it 'returns a Wikipedia::Page object' do
13
+ expect(page).to be_a Wikipedia::VandalismDetection::Page
14
+ end
15
+
16
+ it 'returns a Wikipedia::Page with the right number of revisions' do
17
+ expect(page.revisions.count).to eq 5
18
+ end
19
+ end
20
+ end
21
+
22
+ describe 'a single page content parsing' do
23
+ let(:revision_a) { build(:empty_revision, id: '398880281') }
24
+ let(:revision_b) { build(:empty_revision, id: '398880502', parent_id: '398880281') }
25
+ let(:revision_c) { build(:empty_revision, id: '398883278', parent_id: '398880502') }
26
+ let(:revision_d) { build(:empty_revision, id: '398883675', parent_id: '398883278') }
27
+ let(:revision_e) { build(:empty_revision, id: '398885233', parent_id: '398883675') }
28
+
29
+ let(:revisions) do
30
+ {
31
+ revision_a.id => revision_a,
32
+ revision_b.id => revision_b,
33
+ revision_c.id => revision_c,
34
+ revision_d.id => revision_d,
35
+ revision_e.id => revision_e
36
+ }
37
+ end
38
+
39
+ it 'can read a single page dump text input' do
40
+ expect(page).to_not be_nil
41
+ end
42
+
43
+ it 'has a title' do
44
+ expect(page.title).to eq 'Vandalism on Wikipedia'
45
+ end
46
+
47
+ it 'has an id' do
48
+ expect(page.id).to eq '29753790'
49
+ end
50
+
51
+ describe 'page’s revisions' do
52
+ it 'has the right number of revisions' do
53
+ expect(page.revisions.count).to eq 5
54
+ end
55
+
56
+ it 'has revisions each with the right id' do
57
+ page_revisions = page.revisions
58
+
59
+ page_revisions.each do |key, value|
60
+ expect(value.id).to eq revisions[key].id
61
+ end
62
+ end
63
+
64
+ it 'has revisions each with the right parent_id' do
65
+ page_revisions = page.revisions
66
+
67
+ page_revisions.each do |key, value|
68
+ expect(value.parent_id).to eq revisions[key].parent_id
69
+ end
70
+ end
71
+
72
+ it 'has revisions each with the right text' do
73
+ revision_a = build(:empty_revision, id: '1', text: "text\n\n\n 1\n\n ")
74
+ revision_b = build(:empty_revision, id: '2', text: 'text 2')
75
+ revision_c = build(:empty_revision, id: '3', text: 'text 3')
76
+ revision_d = build(:empty_revision, id: '4', text: 'text 4')
77
+ revision_e = build(:empty_revision, id: '5', text: 'text 5')
78
+
79
+ revisions = {
80
+ revision_a.id => revision_a,
81
+ revision_b.id => revision_b,
82
+ revision_c.id => revision_c,
83
+ revision_d.id => revision_d,
84
+ revision_e.id => revision_e
85
+ }
86
+
87
+ simplified_page.revisions.each do |key, value|
88
+ expect(value.text).to eq revisions[key].text
89
+ end
90
+ end
91
+
92
+ it 'has revisions each with the right comment' do
93
+ revision_a = build(:empty_revision, id: '1', comment: "comment\n\n 1\n\n ")
94
+ revision_b = build(:empty_revision, id: '2', comment: 'comment 2')
95
+ revision_c = build(:empty_revision, id: '3')
96
+ revision_d = build(:empty_revision, id: '4')
97
+ revision_e = build(:empty_revision, id: '5', comment: 'comment 3')
98
+
99
+ revisions = {
100
+ revision_a.id => revision_a,
101
+ revision_b.id => revision_b,
102
+ revision_c.id => revision_c,
103
+ revision_d.id => revision_d,
104
+ revision_e.id => revision_e
105
+ }
106
+
107
+ simplified_page.revisions.each do |key, value|
108
+ expect(value.comment).to eq revisions[key].comment
109
+ end
110
+ end
111
+
112
+ describe 'contributor properties' do
113
+ let(:revision_a) { build(:empty_revision, id: '1', contributor: '1') }
114
+ let(:revision_b) { build(:empty_revision, id: '2', contributor: '10', contributor_username: 'user') }
115
+ let(:revision_c) { build(:empty_revision, id: '3', contributor: '11', contributor_username: 'user') }
116
+ let(:revision_d) { build(:empty_revision, id: '4', contributor: '12', contributor_username: 'user') }
117
+ let(:revision_e) { build(:empty_revision, id: '5', contributor: '2') }
118
+
119
+ let(:revisions) do
120
+ {
121
+ revision_a.id => revision_a,
122
+ revision_b.id => revision_b,
123
+ revision_c.id => revision_c,
124
+ revision_d.id => revision_d,
125
+ revision_e.id => revision_e
126
+ }
127
+ end
128
+
129
+ it 'has revisions each with the right contributor id' do
130
+ simplified_page.revisions.each do |key, value|
131
+ expect(value.contributor_id).to eq revisions[key].contributor_id
132
+ end
133
+ end
134
+
135
+ it 'has revisions each with the right contributor ip' do
136
+ simplified_page.revisions.each do |key, value|
137
+ expect(value.contributor_ip).to eq revisions[key].contributor_ip
138
+ end
139
+ end
140
+
141
+ it 'has revisions each with the right contributor username' do
142
+ simplified_page.revisions.each do |key, value|
143
+ username = revisions[key].contributor_username
144
+ expect(value.contributor_username).to eq username
145
+ end
146
+ end
147
+ end
148
+
149
+ it 'has revisions each with the right timestamp' do
150
+ revision_a = build :empty_revision, id: '1', timestamp: 'time 1'
151
+ revision_b = build :empty_revision, id: '2', timestamp: 'time 2'
152
+ revision_c = build :empty_revision, id: '3', timestamp: 'time 3'
153
+ revision_d = build :empty_revision, id: '4', timestamp: 'time 4'
154
+ revision_e = build :empty_revision, id: '5', timestamp: 'time 5'
155
+
156
+ revisions = {
157
+ revision_a.id => revision_a,
158
+ revision_b.id => revision_b,
159
+ revision_c.id => revision_c,
160
+ revision_d.id => revision_d,
161
+ revision_e.id => revision_e
162
+ }
163
+
164
+ simplified_page.revisions.each do |key, value|
165
+ expect(value.timestamp).to eq revisions[key].timestamp
166
+ end
167
+ end
168
+
169
+ it 'has revisions each with the right sha1 hash' do
170
+ revision_a = build :empty_revision, id: '1', sha1: 'hash1'
171
+ revision_b = build :empty_revision, id: '2', sha1: 'hash2'
172
+ revision_c = build :empty_revision, id: '3', sha1: 'hash3'
173
+ revision_d = build :empty_revision, id: '4', sha1: 'hash4'
174
+ revision_e = build :empty_revision, id: '5', sha1: 'hash5'
175
+
176
+ revisions = {
177
+ revision_a.id => revision_a,
178
+ revision_b.id => revision_b,
179
+ revision_c.id => revision_c,
180
+ revision_d.id => revision_d,
181
+ revision_e.id => revision_e
182
+ }
183
+
184
+ simplified_page.revisions.each do |key, value|
185
+ expect(value.sha1).to eq revisions[key].sha1
186
+ end
187
+ end
188
+ end
189
+ end
190
+ end
@@ -0,0 +1,134 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Page do
4
+ describe 'constants' do
5
+ it 'has a START_TAG constant' do
6
+ expect(Page::START_TAG).to eq '<page>'
7
+ end
8
+
9
+ it 'has an END_TAG constant' do
10
+ expect(Page::END_TAG).to eq '</page>'
11
+ end
12
+ end
13
+
14
+ it 'has a title' do
15
+ expect(subject).to respond_to :title
16
+ end
17
+
18
+ it 'has an id' do
19
+ expect(subject).to respond_to :id
20
+ end
21
+
22
+ it 'has revisions defaulting to an empty Hash' do
23
+ expect(subject).to respond_to :revisions
24
+ expect(subject.revisions).to be_a Hash
25
+ expect(subject.revisions).to be_empty
26
+ end
27
+
28
+ describe '#edits' do
29
+ it { is_expected.to respond_to :edits }
30
+
31
+ it 'returns an empty array if no revision is available' do
32
+ expect(subject.revisions).to be_empty
33
+ expect(subject.edits).to be_an Array
34
+ expect(subject.edits).to be_empty
35
+ end
36
+
37
+ it 'resets the @update_edits flag to false' do
38
+ subject.add_revision build(:empty_revision, id: '1')
39
+ subject.edits
40
+ expect(subject.instance_variable_get(:@update_edits)).to be false
41
+ end
42
+
43
+ it 'computes edits from the page’s revisions' do
44
+ subject.add_revision build(:empty_revision, id: '1')
45
+ subject.add_revision build(:empty_revision, id: '3', parent_id: '2')
46
+ subject.add_revision build(:empty_revision, id: '2', parent_id: '1')
47
+
48
+ expect(subject.edits.count).to eq 2
49
+ end
50
+
51
+ it 'computes edits of which each holds the parent page as reference' do
52
+ subject.id = '1234'
53
+ subject.title = 'Article'
54
+
55
+ subject.add_revision build(:empty_revision, id: '1')
56
+ subject.add_revision build(:empty_revision, id: '3', parent_id: '2')
57
+ subject.add_revision build(:empty_revision, id: '2', parent_id: '1')
58
+
59
+ subject.edits.each { |edit| expect(edit.page).to eq subject }
60
+ end
61
+ end
62
+
63
+ describe '#add_revision' do
64
+ it { is_expected.to respond_to :add_revision }
65
+
66
+ it 'takes a revision and adds it to revisions' do
67
+ revision = build(:empty_revision)
68
+
69
+ expect { subject.add_revision(revision) }
70
+ .to change(subject.revisions, :count)
71
+ .by(1)
72
+ end
73
+
74
+ it 'sets the @update_edits flag to true after adding a revision' do
75
+ revision = build :empty_revision
76
+ subject.add_revision(revision)
77
+ expect(subject.instance_variable_get(:@update_edits)).to be true
78
+ end
79
+
80
+ it 'sets the @update_reverted_edits flag to true after adding a revision' do
81
+ revision = build :empty_revision
82
+ subject.add_revision(revision)
83
+ expect(subject.instance_variable_get(:@update_reverted_edits)).to be true
84
+ end
85
+ end
86
+
87
+ describe '#reverted_edits' do
88
+ it { is_expected.to respond_to :reverted_edits }
89
+
90
+ it 'returns reverted edits by comparing the sha1 values' do
91
+ # principle:
92
+ # in edit wars the in-between of the first revert triple which has another
93
+ # hash before can be seen as vandalism (here revision with id 3)
94
+
95
+ revision_a = build(:empty_revision, id: 1, parent_id: nil, sha1: 'hash0')
96
+ revision_b = build(:empty_revision, id: 2, parent_id: 1, sha1: 'hash1')
97
+ revision_c = build(:empty_revision, id: 3, parent_id: 2, sha1: 'hash2')
98
+ revision_d = build(:empty_revision, id: 4, parent_id: 3, sha1: 'hash1')
99
+ revision_e = build(:empty_revision, id: 5, parent_id: 4, sha1: 'hash2')
100
+ revision_f = build(:empty_revision, id: 6, parent_id: 5, sha1: 'hash3')
101
+
102
+ subject.add_revision(revision_c)
103
+ subject.add_revision(revision_f)
104
+ subject.add_revision(revision_a)
105
+ subject.add_revision(revision_e)
106
+ subject.add_revision(revision_d)
107
+ subject.add_revision(revision_b)
108
+
109
+ reverted_ids = subject.reverted_edits.map do |edit|
110
+ edit.new_revision.id
111
+ end
112
+
113
+ expect(reverted_ids).to eq [revision_c.id]
114
+ end
115
+
116
+ it 'returns reverted edit if no previous revision is available' do
117
+ revision_a = build(:empty_revision, id: 1, parent_id: nil, sha1: 'hash1')
118
+ revision_b = build(:empty_revision, id: 2, parent_id: 1, sha1: 'hash2')
119
+ revision_c = build(:empty_revision, id: 3, parent_id: 2, sha1: 'hash1')
120
+ revision_d = build(:empty_revision, id: 4, parent_id: 3, sha1: 'hash2')
121
+
122
+ subject.add_revision(revision_c)
123
+ subject.add_revision(revision_a)
124
+ subject.add_revision(revision_d)
125
+ subject.add_revision(revision_b)
126
+
127
+ reverted_ids = subject.reverted_edits.map do |edit|
128
+ edit.new_revision.id
129
+ end
130
+
131
+ expect(reverted_ids).to eq [revision_b.id]
132
+ end
133
+ end
134
+ end
@@ -0,0 +1,53 @@
1
+ require 'spec_helper'
2
+
3
# Specs for Wikipedia::VandalismDetection::RevisionParser#parse, using the
# input returned by load_file('revision_simplified.xml').
describe Wikipedia::VandalismDetection::RevisionParser do
  let(:xml) { load_file('revision_simplified.xml') }
  let(:revision) { subject.parse(xml) }

  # Revision the parsed input is compared against, attribute by attribute.
  # NOTE(review): parent_id is set here but is not in the attribute list
  # compared in the last example group - confirm whether that is intentional.
  let(:expected_revision) do
    build(
      :empty_revision,
      id: 'id1',
      parent_id: 'parentid1',
      timestamp: 'time1',
      contributor: 'ip1',
      comment: 'comment 1',
      text: 'text 1',
      sha1: 'hash1'
    )
  end

  describe '#parse' do
    it 'returns a Wikipedia::VandalismDetection::Revision object' do
      expect(revision).to be_a Wikipedia::VandalismDetection::Revision
    end

    it 'returns a revision with only the configured properties' do
      revision = subject.parse(xml, only: %i[id parent_id])

      # Requested attributes are populated.
      %i[id parent_id].each do |attribute|
        expect(revision.send(attribute)).not_to be_nil
      end

      # Attributes that were not requested are expected to be nil ...
      %i[timestamp contributor sha1].each do |attribute|
        expect(revision.send(attribute)).to be_nil
      end

      # ... except comment and text, which are expected to equal ''.
      %i[comment text].each do |attribute|
        expect(revision.send(attribute)).to eq ''
      end
    end
  end

  describe 'a single revision content parsing' do
    it 'can read a single revision dump text input' do
      expect(revision).not_to be_nil
    end

    # One example per attribute, so a failure names the mismatching field.
    %i[id timestamp contributor comment text sha1].each do |attribute|
      it "has the expected #{attribute}" do
        expect(revision.send(attribute)).to eq expected_revision.send(attribute)
      end
    end
  end
end
@@ -0,0 +1,148 @@
1
+ require 'spec_helper'
2
+
3
# Specs for Wikipedia::VandalismDetection::Revision: tag constants,
# contributor assignment, attribute defaults, text/comment handling and
# redirect detection.
describe Wikipedia::VandalismDetection::Revision do
  # NOTE(review): a constant assigned inside a block is defined in the
  # enclosing lexical scope, i.e. at the top level here. It is kept because
  # other spec files may reference the bare `Revision` name - confirm before
  # removing. The examples below use `described_class` instead.
  Revision = Wikipedia::VandalismDetection::Revision

  let(:revision) { described_class.new }

  describe 'constants' do
    it 'has a START_TAG constant' do
      expect(described_class::START_TAG).to eq '<revision>'
    end

    it 'has an END_TAG constant' do
      expect(described_class::END_TAG).to eq '</revision>'
    end
  end

  describe '#anonymous_contributor?' do
    it { is_expected.to respond_to :anonymous_contributor? }

    it 'returns true in case of an anonymous user' do
      anonymous_revision = build(:anonymous_revision)
      expect(anonymous_revision.anonymous_contributor?).to be true
    end
  end

  describe '#contributor=' do
    it { is_expected.to respond_to :contributor= }

    it 'sets the @contributor_id if contributor is no IPv4' do
      id = '12345'
      revision.contributor = id

      expect(revision.instance_variable_get(:@contributor_id)).to eq id
      expect(revision.instance_variable_get(:@contributor_ip)).to be_nil
    end

    it 'sets the @contributor_ip if contributor is an IPv4' do
      ip = '127.0.0.1'
      revision.contributor = ip

      expect(revision.instance_variable_get(:@contributor_ip)).to eq ip
      expect(revision.instance_variable_get(:@contributor_id)).to be_nil
    end
  end

  describe '#contributor' do
    it { is_expected.to respond_to :contributor }

    it 'returns the contributor_id if set' do
      id = '12345'
      revision.contributor = id
      contributor_id = revision.instance_variable_get(:@contributor_id)

      expect(revision.contributor).to eq contributor_id
    end

    it 'returns the contributor_ip if set' do
      ip = '127.0.0.1'
      revision.contributor = ip
      contributor_ip = revision.instance_variable_get(:@contributor_ip)

      expect(revision.contributor).to eq contributor_ip
    end
  end

  it 'has the revision attributes' do
    readers = %i[
      id
      parent_id
      timestamp
      comment
      text
      contributor_id
      contributor_ip
      sha1
    ]

    readers.each do |name|
      expect(revision).to respond_to name
    end
  end

  it 'defaults its attributes to nil' do
    nil_by_default = %i[
      id
      parent_id
      timestamp
      contributor_id
      contributor_ip
      contributor_username
      sha1
    ]

    nil_by_default.each do |name|
      expect(revision.send(name)).to be_nil
    end
  end

  it 'does not allow setting read only attributes' do
    attributes = %i[contributor_id contributor_ip]
    attributes.each { |name| expect(revision).not_to respond_to "#{name}=" }
  end

  it 'has an empty default text' do
    expect(revision.text).to be_empty
  end

  it 'sets an empty text if no String is assigned' do
    revision.text = nil
    expect(revision.text).to eq ''

    revision.text = []
    expect(revision.text).to eq ''
  end

  # The class is fully qualified so this example does not depend on a bare
  # top-level `Text` constant being defined somewhere else first.
  it 'has a text of type Wikipedia::VandalismDetection::Text' do
    expect(revision.text).to be_a Wikipedia::VandalismDetection::Text
  end

  it 'has an empty default comment' do
    expect(revision.comment).to be_empty
  end

  it 'has a comment of type Wikipedia::VandalismDetection::Text' do
    expect(revision.comment).to be_a Wikipedia::VandalismDetection::Text
  end

  it 'sets an empty comment if no String is assigned' do
    revision.comment = nil
    expect(revision.comment).to eq ''

    revision.comment = []
    expect(revision.comment).to eq ''
  end

  it { is_expected.to respond_to :redirect? }

  it 'is marked as redirect if #REDIRECT appears in its text' do
    revision.text = "#REDIRECT [[Redirect Page Name]]\n"
    expect(revision.redirect?).to be true
  end

  it 'is not marked as redirect if #REDIRECT does not appear in its text' do
    revision.text = "''text''"
    expect(revision.redirect?).to be false
  end
end