wikipedia-vandalism_detection 0.1.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (245) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +19 -0
  3. data/.rspec +2 -0
  4. data/.rubocop.yml +35 -0
  5. data/.travis.yml +6 -0
  6. data/Gemfile +4 -0
  7. data/LICENSE.txt +4 -0
  8. data/README.md +288 -0
  9. data/Rakefile +11 -0
  10. data/config/wikipedia-vandalism-detection.yml.example +103 -0
  11. data/lib/java/SMOTE.jar +0 -0
  12. data/lib/java/balancedRandomForest.jar +0 -0
  13. data/lib/java/diffutils-1.3.0.jar +0 -0
  14. data/lib/java/oneClassClassifier.jar +0 -0
  15. data/lib/java/realAdaBoost.jar +0 -0
  16. data/lib/java/swc-engine-1.1.0-jar-with-dependencies.jar +0 -0
  17. data/lib/java/sweble-wikitext-extractor.jar +0 -0
  18. data/lib/weka/classifiers/meta/one_class_classifier.rb +21 -0
  19. data/lib/weka/classifiers/meta/real_ada_boost.rb +15 -0
  20. data/lib/weka/classifiers/trees/balanced_random_forest.rb +16 -0
  21. data/lib/weka/filters/supervised/instance/smote.rb +15 -0
  22. data/lib/wikipedia/vandalism_detection/algorithms/kullback_leibler_divergence.rb +103 -0
  23. data/lib/wikipedia/vandalism_detection/algorithms.rb +12 -0
  24. data/lib/wikipedia/vandalism_detection/classifier.rb +202 -0
  25. data/lib/wikipedia/vandalism_detection/configuration.rb +350 -0
  26. data/lib/wikipedia/vandalism_detection/diff.rb +36 -0
  27. data/lib/wikipedia/vandalism_detection/edit.rb +81 -0
  28. data/lib/wikipedia/vandalism_detection/evaluator.rb +640 -0
  29. data/lib/wikipedia/vandalism_detection/exceptions.rb +47 -0
  30. data/lib/wikipedia/vandalism_detection/feature_calculator.rb +94 -0
  31. data/lib/wikipedia/vandalism_detection/features/all_wordlists_frequency.rb +22 -0
  32. data/lib/wikipedia/vandalism_detection/features/all_wordlists_impact.rb +21 -0
  33. data/lib/wikipedia/vandalism_detection/features/anonymity.rb +17 -0
  34. data/lib/wikipedia/vandalism_detection/features/anonymity_previous.rb +32 -0
  35. data/lib/wikipedia/vandalism_detection/features/article_size.rb +17 -0
  36. data/lib/wikipedia/vandalism_detection/features/bad_frequency.rb +21 -0
  37. data/lib/wikipedia/vandalism_detection/features/bad_impact.rb +20 -0
  38. data/lib/wikipedia/vandalism_detection/features/base.rb +61 -0
  39. data/lib/wikipedia/vandalism_detection/features/biased_frequency.rb +21 -0
  40. data/lib/wikipedia/vandalism_detection/features/biased_impact.rb +21 -0
  41. data/lib/wikipedia/vandalism_detection/features/blanking.rb +26 -0
  42. data/lib/wikipedia/vandalism_detection/features/character_diversity.rb +23 -0
  43. data/lib/wikipedia/vandalism_detection/features/character_sequence.rb +19 -0
  44. data/lib/wikipedia/vandalism_detection/features/comment_bad_frequency.rb +21 -0
  45. data/lib/wikipedia/vandalism_detection/features/comment_biased_frequency.rb +21 -0
  46. data/lib/wikipedia/vandalism_detection/features/comment_length.rb +16 -0
  47. data/lib/wikipedia/vandalism_detection/features/comment_markup_frequency.rb +30 -0
  48. data/lib/wikipedia/vandalism_detection/features/comment_pronoun_frequency.rb +21 -0
  49. data/lib/wikipedia/vandalism_detection/features/comment_sex_frequency.rb +21 -0
  50. data/lib/wikipedia/vandalism_detection/features/comment_vulgarism_frequency.rb +21 -0
  51. data/lib/wikipedia/vandalism_detection/features/compressibility.rb +31 -0
  52. data/lib/wikipedia/vandalism_detection/features/contains_base.rb +16 -0
  53. data/lib/wikipedia/vandalism_detection/features/copyedit.rb +17 -0
  54. data/lib/wikipedia/vandalism_detection/features/digit_ratio.rb +23 -0
  55. data/lib/wikipedia/vandalism_detection/features/edits_per_user.rb +72 -0
  56. data/lib/wikipedia/vandalism_detection/features/emoticons_frequency.rb +27 -0
  57. data/lib/wikipedia/vandalism_detection/features/emoticons_impact.rb +29 -0
  58. data/lib/wikipedia/vandalism_detection/features/frequency_base.rb +18 -0
  59. data/lib/wikipedia/vandalism_detection/features/impact_base.rb +25 -0
  60. data/lib/wikipedia/vandalism_detection/features/inserted_character_distribution.rb +23 -0
  61. data/lib/wikipedia/vandalism_detection/features/inserted_external_links.rb +19 -0
  62. data/lib/wikipedia/vandalism_detection/features/inserted_internal_links.rb +18 -0
  63. data/lib/wikipedia/vandalism_detection/features/inserted_size.rb +19 -0
  64. data/lib/wikipedia/vandalism_detection/features/inserted_words.rb +17 -0
  65. data/lib/wikipedia/vandalism_detection/features/longest_word.rb +21 -0
  66. data/lib/wikipedia/vandalism_detection/features/markup_frequency.rb +29 -0
  67. data/lib/wikipedia/vandalism_detection/features/markup_impact.rb +30 -0
  68. data/lib/wikipedia/vandalism_detection/features/non_alphanumeric_ratio.rb +23 -0
  69. data/lib/wikipedia/vandalism_detection/features/personal_life.rb +17 -0
  70. data/lib/wikipedia/vandalism_detection/features/pronoun_frequency.rb +21 -0
  71. data/lib/wikipedia/vandalism_detection/features/pronoun_impact.rb +20 -0
  72. data/lib/wikipedia/vandalism_detection/features/removed_all_wordlists_frequency.rb +22 -0
  73. data/lib/wikipedia/vandalism_detection/features/removed_bad_frequency.rb +21 -0
  74. data/lib/wikipedia/vandalism_detection/features/removed_biased_frequency.rb +21 -0
  75. data/lib/wikipedia/vandalism_detection/features/removed_character_distribution.rb +23 -0
  76. data/lib/wikipedia/vandalism_detection/features/removed_emoticons_frequency.rb +28 -0
  77. data/lib/wikipedia/vandalism_detection/features/removed_markup_frequency.rb +30 -0
  78. data/lib/wikipedia/vandalism_detection/features/removed_pronoun_frequency.rb +21 -0
  79. data/lib/wikipedia/vandalism_detection/features/removed_sex_frequency.rb +21 -0
  80. data/lib/wikipedia/vandalism_detection/features/removed_size.rb +19 -0
  81. data/lib/wikipedia/vandalism_detection/features/removed_vulgarism_frequency.rb +21 -0
  82. data/lib/wikipedia/vandalism_detection/features/removed_words.rb +17 -0
  83. data/lib/wikipedia/vandalism_detection/features/replacement_similarity.rb +20 -0
  84. data/lib/wikipedia/vandalism_detection/features/reverted.rb +16 -0
  85. data/lib/wikipedia/vandalism_detection/features/revisions_character_distribution.rb +25 -0
  86. data/lib/wikipedia/vandalism_detection/features/same_editor.rb +32 -0
  87. data/lib/wikipedia/vandalism_detection/features/sex_frequency.rb +21 -0
  88. data/lib/wikipedia/vandalism_detection/features/sex_impact.rb +20 -0
  89. data/lib/wikipedia/vandalism_detection/features/size_increment.rb +20 -0
  90. data/lib/wikipedia/vandalism_detection/features/size_ratio.rb +28 -0
  91. data/lib/wikipedia/vandalism_detection/features/term_frequency.rb +31 -0
  92. data/lib/wikipedia/vandalism_detection/features/time_interval.rb +38 -0
  93. data/lib/wikipedia/vandalism_detection/features/time_of_day.rb +21 -0
  94. data/lib/wikipedia/vandalism_detection/features/upper_case_ratio.rb +23 -0
  95. data/lib/wikipedia/vandalism_detection/features/upper_case_words_ratio.rb +33 -0
  96. data/lib/wikipedia/vandalism_detection/features/upper_to_lower_case_ratio.rb +23 -0
  97. data/lib/wikipedia/vandalism_detection/features/vulgarism_frequency.rb +21 -0
  98. data/lib/wikipedia/vandalism_detection/features/vulgarism_impact.rb +21 -0
  99. data/lib/wikipedia/vandalism_detection/features/weekday.rb +19 -0
  100. data/lib/wikipedia/vandalism_detection/features/words_increment.rb +20 -0
  101. data/lib/wikipedia/vandalism_detection/features.rb +66 -0
  102. data/lib/wikipedia/vandalism_detection/instances.rb +121 -0
  103. data/lib/wikipedia/vandalism_detection/page.rb +101 -0
  104. data/lib/wikipedia/vandalism_detection/page_parser.rb +63 -0
  105. data/lib/wikipedia/vandalism_detection/revision.rb +75 -0
  106. data/lib/wikipedia/vandalism_detection/revision_parser.rb +67 -0
  107. data/lib/wikipedia/vandalism_detection/test_dataset.rb +374 -0
  108. data/lib/wikipedia/vandalism_detection/text.rb +23 -0
  109. data/lib/wikipedia/vandalism_detection/training_dataset.rb +282 -0
  110. data/lib/wikipedia/vandalism_detection/version.rb +5 -0
  111. data/lib/wikipedia/vandalism_detection/wikitext_extractor.rb +80 -0
  112. data/lib/wikipedia/vandalism_detection/word_lists/bad.rb +11 -0
  113. data/lib/wikipedia/vandalism_detection/word_lists/biased.rb +20 -0
  114. data/lib/wikipedia/vandalism_detection/word_lists/emoticons.rb +26 -0
  115. data/lib/wikipedia/vandalism_detection/word_lists/markup.rb +19 -0
  116. data/lib/wikipedia/vandalism_detection/word_lists/pronouns.rb +11 -0
  117. data/lib/wikipedia/vandalism_detection/word_lists/sex.rb +10 -0
  118. data/lib/wikipedia/vandalism_detection/word_lists/vulgarism.rb +96 -0
  119. data/lib/wikipedia/vandalism_detection/word_lists.rb +17 -0
  120. data/lib/wikipedia/vandalism_detection.rb +29 -0
  121. data/lib/wikipedia.rb +41 -0
  122. data/spec/factories/edit.rb +19 -0
  123. data/spec/factories/page.rb +12 -0
  124. data/spec/factories/revision.rb +51 -0
  125. data/spec/resources/config/wikipedia-vandalism-detection.yml +35 -0
  126. data/spec/resources/corpora/test/edits.csv +8 -0
  127. data/spec/resources/corpora/test/ground-truth.txt +3 -0
  128. data/spec/resources/corpora/test/revisions/part-1/326471754.txt +199 -0
  129. data/spec/resources/corpora/test/revisions/part-1/326873205.txt +137 -0
  130. data/spec/resources/corpora/test/revisions/part-1/328774035.txt +162 -0
  131. data/spec/resources/corpora/test/revisions/part-2/307084144.txt +137 -0
  132. data/spec/resources/corpora/test/revisions/part-2/326978767.txt +199 -0
  133. data/spec/resources/corpora/test/revisions/part-2/328774110.txt +162 -0
  134. data/spec/resources/corpora/training/annotations.csv +7 -0
  135. data/spec/resources/corpora/training/edits.csv +7 -0
  136. data/spec/resources/corpora/training/revisions/part-1/326471754.txt +199 -0
  137. data/spec/resources/corpora/training/revisions/part-1/326873205.txt +137 -0
  138. data/spec/resources/corpora/training/revisions/part-1/328774035.txt +162 -0
  139. data/spec/resources/corpora/training/revisions/part-2/307084144.txt +137 -0
  140. data/spec/resources/corpora/training/revisions/part-2/326978767.txt +199 -0
  141. data/spec/resources/corpora/training/revisions/part-2/328774110.txt +162 -0
  142. data/spec/resources/page_with_redirects.xml +85 -0
  143. data/spec/resources/redirect_page.xml +59 -0
  144. data/spec/resources/revision_simplified.xml +13 -0
  145. data/spec/resources/sample_revision.txt +137 -0
  146. data/spec/resources/sample_revision_clean_text.txt +1 -0
  147. data/spec/resources/sample_revision_plain_text.txt +183 -0
  148. data/spec/resources/vandalism_on_wikipedia.xml +234 -0
  149. data/spec/resources/vandalism_on_wikipedia_simplified.xml +119 -0
  150. data/spec/resources/wikipedia_tokens.txt +30 -0
  151. data/spec/spec_helper.rb +38 -0
  152. data/spec/support/macros/file_reading.rb +6 -0
  153. data/spec/support/macros/test_configuration.rb +81 -0
  154. data/spec/vandalism_detection/algorithms/kullback_leibler_divergence_spec.rb +34 -0
  155. data/spec/vandalism_detection/classifier_spec.rb +330 -0
  156. data/spec/vandalism_detection/configuration_spec.rb +601 -0
  157. data/spec/vandalism_detection/diff_spec.rb +40 -0
  158. data/spec/vandalism_detection/edit_spec.rb +122 -0
  159. data/spec/vandalism_detection/evaluator_spec.rb +711 -0
  160. data/spec/vandalism_detection/feature_calculator_spec.rb +135 -0
  161. data/spec/vandalism_detection/features/all_wordlists_frequency_spec.rb +30 -0
  162. data/spec/vandalism_detection/features/all_wordlists_impact_spec.rb +52 -0
  163. data/spec/vandalism_detection/features/anonymity_previous_spec.rb +67 -0
  164. data/spec/vandalism_detection/features/anonymity_spec.rb +17 -0
  165. data/spec/vandalism_detection/features/article_size_spec.rb +29 -0
  166. data/spec/vandalism_detection/features/bad_frequency_spec.rb +30 -0
  167. data/spec/vandalism_detection/features/bad_impact_spec.rb +53 -0
  168. data/spec/vandalism_detection/features/base_spec.rb +41 -0
  169. data/spec/vandalism_detection/features/biased_frequency_spec.rb +30 -0
  170. data/spec/vandalism_detection/features/biased_impact_spec.rb +52 -0
  171. data/spec/vandalism_detection/features/blanking_spec.rb +35 -0
  172. data/spec/vandalism_detection/features/character_diversity_spec.rb +30 -0
  173. data/spec/vandalism_detection/features/character_sequence_spec.rb +31 -0
  174. data/spec/vandalism_detection/features/comment_bad_frequency_spec.rb +28 -0
  175. data/spec/vandalism_detection/features/comment_biased_frequency_spec.rb +28 -0
  176. data/spec/vandalism_detection/features/comment_length_spec.rb +21 -0
  177. data/spec/vandalism_detection/features/comment_markup_frequency_spec.rb +28 -0
  178. data/spec/vandalism_detection/features/comment_pronoun_frequency_spec.rb +28 -0
  179. data/spec/vandalism_detection/features/comment_sex_frequency_spec.rb +28 -0
  180. data/spec/vandalism_detection/features/comment_vulgarism_frequency_spec.rb +28 -0
  181. data/spec/vandalism_detection/features/compressibility_spec.rb +38 -0
  182. data/spec/vandalism_detection/features/contains_base_spec.rb +27 -0
  183. data/spec/vandalism_detection/features/copyedit_spec.rb +30 -0
  184. data/spec/vandalism_detection/features/digit_ratio_spec.rb +30 -0
  185. data/spec/vandalism_detection/features/edits_per_user_spec.rb +51 -0
  186. data/spec/vandalism_detection/features/emoticons_frequency_spec.rb +30 -0
  187. data/spec/vandalism_detection/features/emoticons_impact_spec.rb +45 -0
  188. data/spec/vandalism_detection/features/frequency_base_spec.rb +20 -0
  189. data/spec/vandalism_detection/features/impact_base_spec.rb +36 -0
  190. data/spec/vandalism_detection/features/inserted_character_distribution_spec.rb +40 -0
  191. data/spec/vandalism_detection/features/inserted_external_links_spec.rb +29 -0
  192. data/spec/vandalism_detection/features/inserted_internal_links_spec.rb +29 -0
  193. data/spec/vandalism_detection/features/inserted_size_spec.rb +29 -0
  194. data/spec/vandalism_detection/features/inserted_words_spec.rb +29 -0
  195. data/spec/vandalism_detection/features/longest_word_spec.rb +29 -0
  196. data/spec/vandalism_detection/features/markup_frequency_spec.rb +31 -0
  197. data/spec/vandalism_detection/features/markup_impact_spec.rb +53 -0
  198. data/spec/vandalism_detection/features/non_alphanumeric_ratio_spec.rb +30 -0
  199. data/spec/vandalism_detection/features/personal_life_spec.rb +22 -0
  200. data/spec/vandalism_detection/features/pronoun_frequency_spec.rb +30 -0
  201. data/spec/vandalism_detection/features/pronoun_impact_spec.rb +53 -0
  202. data/spec/vandalism_detection/features/removed_all_wordlists_frequency_spec.rb +30 -0
  203. data/spec/vandalism_detection/features/removed_bad_frequency_spec.rb +30 -0
  204. data/spec/vandalism_detection/features/removed_biased_frequency_spec.rb +30 -0
  205. data/spec/vandalism_detection/features/removed_character_distribution_spec.rb +40 -0
  206. data/spec/vandalism_detection/features/removed_emoticons_frequency_spec.rb +30 -0
  207. data/spec/vandalism_detection/features/removed_markup_frequency_spec.rb +30 -0
  208. data/spec/vandalism_detection/features/removed_pronoun_frequency_spec.rb +30 -0
  209. data/spec/vandalism_detection/features/removed_sex_frequency_spec.rb +30 -0
  210. data/spec/vandalism_detection/features/removed_size_spec.rb +29 -0
  211. data/spec/vandalism_detection/features/removed_vulgarism_frequency_spec.rb +30 -0
  212. data/spec/vandalism_detection/features/removed_words_spec.rb +29 -0
  213. data/spec/vandalism_detection/features/replacement_similarity_spec.rb +37 -0
  214. data/spec/vandalism_detection/features/reverted_spec.rb +24 -0
  215. data/spec/vandalism_detection/features/revisions_character_distribution_spec.rb +40 -0
  216. data/spec/vandalism_detection/features/same_editor_spec.rb +71 -0
  217. data/spec/vandalism_detection/features/sex_frequency_spec.rb +30 -0
  218. data/spec/vandalism_detection/features/sex_impact_spec.rb +53 -0
  219. data/spec/vandalism_detection/features/size_increment_spec.rb +29 -0
  220. data/spec/vandalism_detection/features/size_ratio_spec.rb +48 -0
  221. data/spec/vandalism_detection/features/term_frequency_spec.rb +32 -0
  222. data/spec/vandalism_detection/features/time_interval_spec.rb +56 -0
  223. data/spec/vandalism_detection/features/time_of_day_spec.rb +16 -0
  224. data/spec/vandalism_detection/features/upper_case_ratio_spec.rb +30 -0
  225. data/spec/vandalism_detection/features/upper_case_words_ratio_spec.rb +33 -0
  226. data/spec/vandalism_detection/features/upper_to_lower_case_ratio_spec.rb +30 -0
  227. data/spec/vandalism_detection/features/vulgarism_frequency_spec.rb +30 -0
  228. data/spec/vandalism_detection/features/vulgarism_impact_spec.rb +52 -0
  229. data/spec/vandalism_detection/features/weekday_spec.rb +16 -0
  230. data/spec/vandalism_detection/features/words_increment_spec.rb +29 -0
  231. data/spec/vandalism_detection/instances_spec.rb +146 -0
  232. data/spec/vandalism_detection/page_parser_spec.rb +190 -0
  233. data/spec/vandalism_detection/page_spec.rb +134 -0
  234. data/spec/vandalism_detection/revision_parser_spec.rb +53 -0
  235. data/spec/vandalism_detection/revision_spec.rb +148 -0
  236. data/spec/vandalism_detection/test_dataset_spec.rb +227 -0
  237. data/spec/vandalism_detection/text_spec.rb +29 -0
  238. data/spec/vandalism_detection/training_dataset_spec.rb +266 -0
  239. data/spec/vandalism_detection/wikitext_extractor_spec.rb +97 -0
  240. data/spec/weka/classifiers/meta/one_class_classifier_spec.rb +82 -0
  241. data/spec/weka/classifiers/meta/real_ada_boost_spec.rb +37 -0
  242. data/spec/weka/classifiers/trees/balanced_random_forest_spec.rb +37 -0
  243. data/spec/weka/filters/supervised/instance/smote_spec.rb +5 -0
  244. data/wikipedia-vandalism_detection.gemspec +37 -0
  245. metadata +550 -0
@@ -0,0 +1,234 @@
1
+ <mediawiki>
2
+ <siteinfo>
3
+ <sitename>Wikipedia</sitename>
4
+ <base>http://en.wikipedia.org/wiki/Main_Page</base>
5
+ <generator>MediaWiki 1.23wmf12</generator>
6
+ <case>first-letter</case>
7
+ <namespaces>
8
+ <namespace key="-2" case="first-letter">Media</namespace>
9
+ <namespace key="-1" case="first-letter">Special</namespace>
10
+ <namespace key="0" case="first-letter"/>
11
+ <namespace key="1" case="first-letter">Talk</namespace>
12
+ <namespace key="2" case="first-letter">User</namespace>
13
+ <namespace key="3" case="first-letter">User talk</namespace>
14
+ <namespace key="4" case="first-letter">Wikipedia</namespace>
15
+ <namespace key="5" case="first-letter">Wikipedia talk</namespace>
16
+ <namespace key="6" case="first-letter">File</namespace>
17
+ <namespace key="7" case="first-letter">File talk</namespace>
18
+ <namespace key="8" case="first-letter">MediaWiki</namespace>
19
+ <namespace key="9" case="first-letter">MediaWiki talk</namespace>
20
+ <namespace key="10" case="first-letter">Template</namespace>
21
+ <namespace key="11" case="first-letter">Template talk</namespace>
22
+ <namespace key="12" case="first-letter">Help</namespace>
23
+ <namespace key="13" case="first-letter">Help talk</namespace>
24
+ <namespace key="14" case="first-letter">Category</namespace>
25
+ <namespace key="15" case="first-letter">Category talk</namespace>
26
+ <namespace key="100" case="first-letter">Portal</namespace>
27
+ <namespace key="101" case="first-letter">Portal talk</namespace>
28
+ <namespace key="108" case="first-letter">Book</namespace>
29
+ <namespace key="109" case="first-letter">Book talk</namespace>
30
+ <namespace key="118" case="first-letter">Draft</namespace>
31
+ <namespace key="119" case="first-letter">Draft talk</namespace>
32
+ <namespace key="446" case="first-letter">Education Program</namespace>
33
+ <namespace key="447" case="first-letter">Education Program talk</namespace>
34
+ <namespace key="710" case="first-letter">TimedText</namespace>
35
+ <namespace key="711" case="first-letter">TimedText talk</namespace>
36
+ <namespace key="828" case="first-letter">Module</namespace>
37
+ <namespace key="829" case="first-letter">Module talk</namespace>
38
+ </namespaces>
39
+ </siteinfo>
40
+ <page>
41
+ <title>Vandalism on Wikipedia</title>
42
+ <ns>0</ns>
43
+ <id>29753790</id>
44
+ <revision>
45
+ <id>398880281</id>
46
+ <timestamp>2010-11-25T23:50:21Z</timestamp>
47
+ <contributor>
48
+ <username>Hellno2</username>
49
+ <id>3020504</id>
50
+ </contributor>
51
+ <comment>[[WP:AES|←]]Created page with '{{newpage}} On [[Wikipedia]], '''Vandalism''' is the act of editing the project in a manner that is intentionally disruptive. Vandalism includes the addition, remov...'</comment>
52
+ <text xml:space="preserve" bytes="1880">{{newpage}}
53
+ On [[Wikipedia]], '''Vandalism''' is the act of editing the project in a manner that is intentionally disruptive. Vandalism includes the addition, removal, or other modification of the text or other material in a manner that consists either of humor, nonsense, [[hoax]]es, [[spam]] or promotion of a subject, or is of an offensive or degrading nature.
54
+
55
+ Frequent targets of vandalism include articles on hot and controversial topics and current events&lt;ref name=Newzealand/&gt;.
56
+
57
+ ==Fighting vandalism==
58
+ The are various measures taken by Wikipedia to prevent or reduce the amount of vandalism. These include:
59
+ *Reverting the vandalism by restoring the article to the last version before the vandalism occurred&lt;ref name=Newzealand/&gt;
60
+ *Locking articles so only established users, or in some cases, only administrators can edit them&lt;ref name=Newzealand/&gt;
61
+ *Blocking and banning those who have repeatedly committed acts of vandalism from editing for a period of time or in some cases, indefinitely&lt;ref name=Newzealand/&gt;
62
+
63
+ ==Notable acts of vandalism==
64
+ *In 2006, [[Rolling Stone Magazine]] printed a false story from which they learned the information from an act of Wikipedia vandalism. Their article's title stated that [[Halle Berry]] was set to ruin her reputation&lt;ref&gt;http://en.wikinews.org/wiki/Rolling_Stone_prints_story_based_on_Wikipedia_vandalism&lt;/ref&gt;.
65
+ *Professional golfer [[Fuzzy Zoeller]] sued a Miami company whose IP-based edits to the Wikipedia site included negative information about him&lt;ref&gt;http://www.tomshardware.com/news/golfer-sues-wikipedia-vandalism,4377.html&lt;/ref&gt;.
66
+ *Soon after the death of [[Steve Irwin]] in 2007, the [[stingray]] article was vandalized, stating that stingrays &quot;hate Australian people.&quot;&lt;ref name=Newzealand&gt;http://www.nzherald.co.nz/technology/news/article.cfm?c_id=5&amp;objectid=10432042&lt;/ref&gt;
67
+
68
+ ==References==
69
+ {{reflist}}
70
+
71
+ {{Wikipedia}}</text>
72
+ <sha1>eju7ojn2omej7atr11ll7k64hzhpkaq</sha1>
73
+ <model>wikitext</model>
74
+ <format>text/x-wiki</format>
75
+ </revision>
76
+ <revision>
77
+ <id>398880502</id>
78
+ <parentid>398880281</parentid>
79
+ <timestamp>2010-11-25T23:52:13Z</timestamp>
80
+ <contributor>
81
+ <username>Hellno2</username>
82
+ <id>3020504</id>
83
+ </contributor>
84
+ <comment>inuse</comment>
85
+ <text xml:space="preserve" bytes="1914">{{inuse}}
86
+ {{newpage}}
87
+ On [[Wikipedia]], '''Vandalism''' is the act of editing the project in a manner that is intentionally disruptive. Vandalism includes the addition, removal, or other modification of the text or other material in a manner that consists either of humor, nonsense, [[hoax]]es, [[spam]] or promotion of a subject, or is of an offensive or degrading nature.
88
+
89
+ Frequent targets of vandalism include articles on hot and controversial topics and current events&lt;ref name=Newzealand/&gt;.
90
+
91
+ ==Fighting vandalism==
92
+ The are various measures taken by Wikipedia to prevent or reduce the amount of vandalism. These include:
93
+ *Reverting the vandalism by restoring the article to the last version before the vandalism occurred&lt;ref name=Newzealand/&gt;
94
+ *Locking articles so only established users, or in some cases, only administrators can edit them&lt;ref name=Newzealand/&gt;
95
+ *Blocking and banning those who have repeatedly committed acts of vandalism from editing for a period of time or in some cases, indefinitely&lt;ref name=Newzealand/&gt;
96
+
97
+ ==Notable acts of vandalism==
98
+ *In 2006, [[Rolling Stone Magazine]] printed a false story from which they learned the information from an act of Wikipedia vandalism. Their article's title stated that [[Halle Berry]] was set to ruin her reputation&lt;ref&gt;http://en.wikinews.org/wiki/Rolling_Stone_prints_story_based_on_Wikipedia_vandalism&lt;/ref&gt;.
99
+ *Professional golfer [[Fuzzy Zoeller]] sued a Miami company whose IP-based edits to the Wikipedia site included negative information about him&lt;ref&gt;http://www.tomshardware.com/news/golfer-sues-wikipedia-vandalism,4377.html&lt;/ref&gt;.
100
+ *Soon after the death of [[Steve Irwin]] in 2007, the [[stingray]] article was vandalized, stating that stingrays &quot;hate Australian people.&quot;&lt;ref name=Newzealand&gt;http://www.nzherald.co.nz/technology/news/article.cfm?c_id=5&amp;objectid=10432042&lt;/ref&gt;
101
+
102
+ ==References==
103
+ {{reflist}}
104
+
105
+ {{Wikipedia}}
106
+
107
+ [[Category:Wikipedia]]</text>
108
+ <sha1>rwmi3pu1ormoc1mqgs7mej6r63u9uxk</sha1>
109
+ <model>wikitext</model>
110
+ <format>text/x-wiki</format>
111
+ </revision>
112
+ <revision>
113
+ <id>398883278</id>
114
+ <parentid>398880502</parentid>
115
+ <timestamp>2010-11-26T00:13:45Z</timestamp>
116
+ <contributor>
117
+ <username>Hellno2</username>
118
+ <id>3020504</id>
119
+ </contributor>
120
+ <text xml:space="preserve" bytes="3177">On [[Wikipedia]], '''Vandalism''' is the act of editing the project in a manner that is intentionally disruptive. Vandalism includes the addition, removal, or other modification of the text or other material in a manner that consists either of humor, nonsense, [[hoax]]es, [[spam]] or promotion of a subject, or is of an offensive or degrading nature.
121
+
122
+ Vandalism is easy to commit on Wikipedia due to the fact that anyone can edit the site&lt;ref name=newscientist/&gt;. Founder [[Jimmy Wales]] is very much aware of the fact that the open editing policy allows the addition of false information&lt;ref name=BBC&gt;http://news.bbc.co.uk/2/hi/4502846.stm&lt;/ref&gt;.
123
+
124
+ Most vandalism is committed on impulse&lt;ref name=BBC/&gt;. Frequent targets of vandalism include articles on hot and controversial topics and current events&lt;ref name=Newzealand/&gt;&lt;ref&gt;http://www.guardian.co.uk/technology/2006/jun/18/wikipedia.news&lt;/ref&gt;.
125
+
126
+ ==Fighting vandalism==
127
+ The are various measures taken by Wikipedia to prevent or reduce the amount of vandalism. These include:
128
+ *Reverting the vandalism by restoring the article to the last version before the vandalism occurred&lt;ref name=Newzealand/&gt;
129
+ *Locking articles so only established users, or in some cases, only administrators can edit them&lt;ref name=Newzealand/&gt;
130
+ *Blocking and banning those who have repeatedly committed acts of vandalism from editing for a period of time or in some cases, indefinitely&lt;ref name=Newzealand/&gt;
131
+
132
+ In 2005, Wikipedia started to require those who create new articles to have a registered account in an effort to fight some vandalism. This occurred after inaccurate information was added to Wikipedia in which a journalist was accused of taking part in Kennedy's assassination&lt;ref name=newscientist&gt;http://www.newscientist.com/article/dn8425-wikipedia-tightens-editorial-rules-after-complaint.html&lt;/ref&gt;.
133
+
134
+ In 2009, Wikipedia instituted a new policy in which the posting of edits to articles on living people would be delayed until they could be reviewed for inclusion of a source to verify accuracy. This was in an effort to prevent inaccurate and potentially damaging information about living people from appearing on the site&lt;ref&gt;http://news.ebrandz.com/miscellaneous/2009/2824-wikipedia-plans-to-enforce-new-editing-policy-to-thwart-vandals-.html&lt;/ref&gt;.
135
+
136
+ ==Notable acts of vandalism==
137
+ *In 2006, [[Rolling Stone Magazine]] printed a false story from which they learned the information from an act of Wikipedia vandalism. Their article's title stated that [[Halle Berry]] was set to ruin her reputation&lt;ref&gt;http://en.wikinews.org/wiki/Rolling_Stone_prints_story_based_on_Wikipedia_vandalism&lt;/ref&gt;.
138
+ *Professional golfer [[Fuzzy Zoeller]] sued a Miami company whose IP-based edits to the Wikipedia site included negative information about him&lt;ref&gt;http://www.tomshardware.com/news/golfer-sues-wikipedia-vandalism,4377.html&lt;/ref&gt;.
139
+ *Soon after the death of [[Steve Irwin]] in 2007, the [[stingray]] article was vandalized, stating that stingrays &quot;hate Australian people.&quot;&lt;ref name=Newzealand&gt;http://www.nzherald.co.nz/technology/news/article.cfm?c_id=5&amp;objectid=10432042&lt;/ref&gt;
140
+
141
+ ==References==
142
+ {{reflist}}
143
+
144
+ {{Wikipedia}}
145
+
146
+ [[Category:Wikipedia]]</text>
147
+ <sha1>hya1xftsfkq6wml6uigb6j480p4x2nt</sha1>
148
+ <model>wikitext</model>
149
+ <format>text/x-wiki</format>
150
+ </revision>
151
+ <revision>
152
+ <id>398883675</id>
153
+ <parentid>398883278</parentid>
154
+ <timestamp>2010-11-26T00:17:04Z</timestamp>
155
+ <contributor>
156
+ <username>Hellno2</username>
157
+ <id>3020504</id>
158
+ </contributor>
159
+ <text xml:space="preserve" bytes="3261">[[Image:Wikipedia vandalism.svg|thumb|300 px|[[Vandalism]] of a Wikipedia article]]
160
+ On [[Wikipedia]], '''Vandalism''' is the act of editing the project in a manner that is intentionally disruptive. Vandalism includes the addition, removal, or other modification of the text or other material in a manner that consists either of humor, nonsense, [[hoax]]es, [[spam]] or promotion of a subject, or is of an offensive or degrading nature.
161
+
162
+ Vandalism is easy to commit on Wikipedia due to the fact that anyone can edit the site&lt;ref name=newscientist/&gt;. Founder [[Jimmy Wales]] is very much aware of the fact that the open editing policy allows the addition of false information&lt;ref name=BBC&gt;http://news.bbc.co.uk/2/hi/4502846.stm&lt;/ref&gt;.
163
+
164
+ Most vandalism is committed on impulse&lt;ref name=BBC/&gt;. Frequent targets of vandalism include articles on hot and controversial topics and current events&lt;ref name=Newzealand/&gt;&lt;ref&gt;http://www.guardian.co.uk/technology/2006/jun/18/wikipedia.news&lt;/ref&gt;.
165
+
166
+ ==Fighting vandalism==
167
+ The are various measures taken by Wikipedia to prevent or reduce the amount of vandalism. These include:
168
+ *Reverting the vandalism by restoring the article to the last version before the vandalism occurred&lt;ref name=Newzealand/&gt;
169
+ *Locking articles so only established users, or in some cases, only administrators can edit them&lt;ref name=Newzealand/&gt;
170
+ *Blocking and banning those who have repeatedly committed acts of vandalism from editing for a period of time or in some cases, indefinitely&lt;ref name=Newzealand/&gt;
171
+
172
+ In 2005, Wikipedia started to require those who create new articles to have a registered account in an effort to fight some vandalism. This occurred after inaccurate information was added to Wikipedia in which a journalist was accused of taking part in Kennedy's assassination&lt;ref name=newscientist&gt;http://www.newscientist.com/article/dn8425-wikipedia-tightens-editorial-rules-after-complaint.html&lt;/ref&gt;.
173
+
174
+ In 2009, Wikipedia instituted a new policy in which the posting of edits to articles on living people would be delayed until they could be reviewed for inclusion of a source to verify accuracy. This was in an effort to prevent inaccurate and potentially damaging information about living people from appearing on the site&lt;ref&gt;http://news.ebrandz.com/miscellaneous/2009/2824-wikipedia-plans-to-enforce-new-editing-policy-to-thwart-vandals-.html&lt;/ref&gt;.
175
+
176
+ ==Notable acts of vandalism==
177
+ *In 2006, [[Rolling Stone Magazine]] printed a false story from which they learned the information from an act of Wikipedia vandalism. Their article's title stated that [[Halle Berry]] was set to ruin her reputation&lt;ref&gt;http://en.wikinews.org/wiki/Rolling_Stone_prints_story_based_on_Wikipedia_vandalism&lt;/ref&gt;.
178
+ *Professional golfer [[Fuzzy Zoeller]] sued a Miami company whose IP-based edits to the Wikipedia site included negative information about him&lt;ref&gt;http://www.tomshardware.com/news/golfer-sues-wikipedia-vandalism,4377.html&lt;/ref&gt;.
179
+ *Soon after the death of [[Steve Irwin]] in 2007, the [[stingray]] article was vandalized, stating that stingrays &quot;hate Australian people.&quot;&lt;ref name=Newzealand&gt;http://www.nzherald.co.nz/technology/news/article.cfm?c_id=5&amp;objectid=10432042&lt;/ref&gt;
180
+
181
+ ==References==
182
+ {{reflist}}
183
+
184
+ {{Wikipedia}}
185
+
186
+ [[Category:Wikipedia]]</text>
187
+ <sha1>ebb1e4tgy49mqdwtyk0rafzdcokp4lh</sha1>
188
+ <model>wikitext</model>
189
+ <format>text/x-wiki</format>
190
+ </revision>
191
+ <revision>
192
+ <id>398885233</id>
193
+ <parentid>398883675</parentid>
194
+ <timestamp>2010-11-26T00:29:53Z</timestamp>
195
+ <contributor>
196
+ <username>Hellno2</username>
197
+ <id>3020504</id>
198
+ </contributor>
199
+ <comment>/* Notable acts of vandalism */</comment>
200
+ <text xml:space="preserve" bytes="3541">[[Image:Wikipedia vandalism.svg|thumb|300 px|[[Vandalism]] of a Wikipedia article]]
201
+ On [[Wikipedia]], '''Vandalism''' is the act of editing the project in a manner that is intentionally disruptive. Vandalism includes the addition, removal, or other modification of the text or other material in a manner that consists either of humor, nonsense, [[hoax]]es, [[spam]] or promotion of a subject, or is of an offensive or degrading nature.
202
+
203
+ Vandalism is easy to commit on Wikipedia due to the fact that anyone can edit the site&lt;ref name=newscientist/&gt;. Founder [[Jimmy Wales]] is very much aware of the fact that the open editing policy allows the addition of false information&lt;ref name=BBC&gt;http://news.bbc.co.uk/2/hi/4502846.stm&lt;/ref&gt;.
204
+
205
+ Most vandalism is committed on impulse&lt;ref name=BBC/&gt;. Frequent targets of vandalism include articles on hot and controversial topics and current events&lt;ref name=Newzealand/&gt;&lt;ref&gt;http://www.guardian.co.uk/technology/2006/jun/18/wikipedia.news&lt;/ref&gt;.
206
+
207
+ ==Fighting vandalism==
208
+ The are various measures taken by Wikipedia to prevent or reduce the amount of vandalism. These include:
209
+ *Reverting the vandalism by restoring the article to the last version before the vandalism occurred&lt;ref name=Newzealand/&gt;
210
+ *Locking articles so only established users, or in some cases, only administrators can edit them&lt;ref name=Newzealand/&gt;
211
+ *Blocking and banning those who have repeatedly committed acts of vandalism from editing for a period of time or in some cases, indefinitely&lt;ref name=Newzealand/&gt;
212
+
213
+ In 2005, Wikipedia started to require those who create new articles to have a registered account in an effort to fight some vandalism. This occurred after inaccurate information was added to Wikipedia in which a journalist was accused of taking part in Kennedy's assassination&lt;ref name=newscientist&gt;http://www.newscientist.com/article/dn8425-wikipedia-tightens-editorial-rules-after-complaint.html&lt;/ref&gt;.
214
+
215
+ In 2009, Wikipedia instituted a new policy in which the posting of edits to articles on living people would be delayed until they could be reviewed for inclusion of a source to verify accuracy. This was in an effort to prevent inaccurate and potentially damaging information about living people from appearing on the site&lt;ref&gt;http://news.ebrandz.com/miscellaneous/2009/2824-wikipedia-plans-to-enforce-new-editing-policy-to-thwart-vandals-.html&lt;/ref&gt;.
216
+
217
+ ==Notable acts of vandalism==
218
+ *In 2006, comedian [[Steve Colbert]] vandalized the article [[elephant]] publicly on the air. This resulted in Colbert being blocked from editing, and a lot of elephant-related articles being protected&lt;ref&gt;http://www.tvsquad.com/2006/08/01/did-colbert-hack-wikipedia-video/&lt;/ref&gt;.
219
+ *In 2006, [[Rolling Stone Magazine]] printed a false story from which they learned the information from an act of Wikipedia vandalism. Their article's title stated that [[Halle Berry]] was set to ruin her reputation&lt;ref&gt;http://en.wikinews.org/wiki/Rolling_Stone_prints_story_based_on_Wikipedia_vandalism&lt;/ref&gt;.
220
+ *Professional golfer [[Fuzzy Zoeller]] sued a Miami company whose IP-based edits to the Wikipedia site included negative information about him&lt;ref&gt;http://www.tomshardware.com/news/golfer-sues-wikipedia-vandalism,4377.html&lt;/ref&gt;.
221
+ *Soon after the death of [[Steve Irwin]] in 2007, the [[stingray]] article was vandalized, stating that stingrays &quot;hate Australian people.&quot;&lt;ref name=Newzealand&gt;http://www.nzherald.co.nz/technology/news/article.cfm?c_id=5&amp;objectid=10432042&lt;/ref&gt;
222
+
223
+ ==References==
224
+ {{reflist}}
225
+
226
+ {{Wikipedia}}
227
+
228
+ [[Category:Wikipedia]]</text>
229
+ <sha1>t8s84rnkje13fkdkw0exui4hrs3fx8x</sha1>
230
+ <model>wikitext</model>
231
+ <format>text/x-wiki</format>
232
+ </revision>
233
+ </page>
234
+ </mediawiki>
@@ -0,0 +1,119 @@
1
+ <mediawiki>
2
+ <siteinfo>
3
+ <sitename>Wikipedia</sitename>
4
+ <base>http://en.wikipedia.org/wiki/Main_Page</base>
5
+ <generator>MediaWiki 1.23wmf12</generator>
6
+ <case>first-letter</case>
7
+ <namespaces>
8
+ <namespace key="-2" case="first-letter">Media</namespace>
9
+ <namespace key="-1" case="first-letter">Special</namespace>
10
+ <namespace key="0" case="first-letter"/>
11
+ <namespace key="1" case="first-letter">Talk</namespace>
12
+ <namespace key="2" case="first-letter">User</namespace>
13
+ <namespace key="3" case="first-letter">User talk</namespace>
14
+ <namespace key="4" case="first-letter">Wikipedia</namespace>
15
+ <namespace key="5" case="first-letter">Wikipedia talk</namespace>
16
+ <namespace key="6" case="first-letter">File</namespace>
17
+ <namespace key="7" case="first-letter">File talk</namespace>
18
+ <namespace key="8" case="first-letter">MediaWiki</namespace>
19
+ <namespace key="9" case="first-letter">MediaWiki talk</namespace>
20
+ <namespace key="10" case="first-letter">Template</namespace>
21
+ <namespace key="11" case="first-letter">Template talk</namespace>
22
+ <namespace key="12" case="first-letter">Help</namespace>
23
+ <namespace key="13" case="first-letter">Help talk</namespace>
24
+ <namespace key="14" case="first-letter">Category</namespace>
25
+ <namespace key="15" case="first-letter">Category talk</namespace>
26
+ <namespace key="100" case="first-letter">Portal</namespace>
27
+ <namespace key="101" case="first-letter">Portal talk</namespace>
28
+ <namespace key="108" case="first-letter">Book</namespace>
29
+ <namespace key="109" case="first-letter">Book talk</namespace>
30
+ <namespace key="118" case="first-letter">Draft</namespace>
31
+ <namespace key="119" case="first-letter">Draft talk</namespace>
32
+ <namespace key="446" case="first-letter">Education Program</namespace>
33
+ <namespace key="447" case="first-letter">Education Program talk</namespace>
34
+ <namespace key="710" case="first-letter">TimedText</namespace>
35
+ <namespace key="711" case="first-letter">TimedText talk</namespace>
36
+ <namespace key="828" case="first-letter">Module</namespace>
37
+ <namespace key="829" case="first-letter">Module talk</namespace>
38
+ </namespaces>
39
+ </siteinfo>
40
+ <page>
41
+ <title>Vandalism on Wikipedia</title>
42
+ <ns>0</ns>
43
+ <id>100</id>
44
+ <revision>
45
+ <id>1</id>
46
+ <timestamp>time 1</timestamp>
47
+ <contributor>
48
+ <ip>1</ip>
49
+ </contributor>
50
+ <comment>comment
51
+
52
+ 1
53
+
54
+ </comment>
55
+ <text xml:space="preserve" bytes="1880">text
56
+
57
+
58
+ 1
59
+
60
+ </text>
61
+ <sha1>hash1</sha1>
62
+ <model>wikitext</model>
63
+ <format>text/x-wiki</format>
64
+ </revision>
65
+ <revision>
66
+ <id>2</id>
67
+ <parentid>1</parentid>
68
+ <timestamp>time 2</timestamp>
69
+ <contributor>
70
+ <username>user</username>
71
+ <id>10</id>
72
+ </contributor>
73
+ <comment>comment 2</comment>
74
+ <text xml:space="preserve" bytes="1914">text 2</text>
75
+ <sha1>hash2</sha1>
76
+ <model>wikitext</model>
77
+ <format>text/x-wiki</format>
78
+ </revision>
79
+ <revision>
80
+ <id>3</id>
81
+ <parentid>2</parentid>
82
+ <timestamp>time 3</timestamp>
83
+ <contributor>
84
+ <username>user</username>
85
+ <id>11</id>
86
+ </contributor>
87
+ <text xml:space="preserve" bytes="3177">text 3</text>
88
+ <sha1>hash3</sha1>
89
+ <model>wikitext</model>
90
+ <format>text/x-wiki</format>
91
+ </revision>
92
+ <revision>
93
+ <id>4</id>
94
+ <parentid>3</parentid>
95
+ <timestamp>time 4</timestamp>
96
+ <contributor>
97
+ <username>user</username>
98
+ <id>12</id>
99
+ </contributor>
100
+ <text xml:space="preserve" bytes="3261">text 4</text>
101
+ <sha1>hash4</sha1>
102
+ <model>wikitext</model>
103
+ <format>text/x-wiki</format>
104
+ </revision>
105
+ <revision>
106
+ <id>5</id>
107
+ <parentid>4</parentid>
108
+ <timestamp>time 5</timestamp>
109
+ <contributor>
110
+ <ip>2</ip>
111
+ </contributor>
112
+ <comment>comment 3</comment>
113
+ <text xml:space="preserve" bytes="3541">text 5</text>
114
+ <sha1>hash5</sha1>
115
+ <model>wikitext</model>
116
+ <format>text/x-wiki</format>
117
+ </revision>
118
+ </page>
119
+ </mediawiki>
@@ -0,0 +1,30 @@
1
+ {{text}}
2
+ [[text]]
3
+ [[text:text]]
4
+ [[text|text]]
5
+ [http://domain.com]
6
+ =text=
7
+ ==text==
8
+ ===text===
9
+ ====text====
10
+ =====text=====
11
+ ======text======
12
+ ----
13
+ <text>
14
+ :text
15
+ ::text
16
+ :::text
17
+ ::::text
18
+ '''text'''
19
+ ''text''
20
+ * text
21
+ ** text
22
+ *** text
23
+ **** text
24
+ # text
25
+ ## text
26
+ ### text
27
+ #### text
28
+ &lsaquo;
29
+ &quot;
30
+ [http://www.wikipedia.com/images/uploads/beaver.jpg text text text]
@@ -0,0 +1,38 @@
1
+ require 'rspec'
2
+ require 'factory_bot'
3
+ require 'fileutils'
4
+
5
+ def require_files_from(paths = []) # Requires every file matching "<path>*.rb" for each entry of +paths+.
6
+ paths.each do |path| # each path is a glob prefix; it is expanded against __FILE__ below
7
+ Dir[File.join(File.expand_path("#{path}*.rb", __FILE__))].each do |file|
8
+ require file
9
+ end
10
+ end
11
+ end
12
+
13
+ RSpec.configure do |config| # Suite setup: loads the library and support files, mixes in helpers, registers cleanup, defines aliases.
14
+ base_path = '../../lib/wikipedia/vandalism_detection'
15
+ lib_file = File.expand_path(base_path, __FILE__) # base_path is resolved against this file's path
16
+ require lib_file
17
+
18
+ dirs = %w[../factories/**/ ../support/**/] # glob prefixes; require_files_from appends "*.rb" to each
19
+ require_files_from dirs
20
+
21
+ config.include FileReading # makes load_file available in examples
22
+ config.include TestConfiguration # makes the use_*_configuration and paths helpers available in examples
23
+ config.include FactoryBot::Syntax::Methods
24
+
25
+ config.after(:suite) do # runs once after the whole suite
26
+ test_build_dir = File.expand_path('../resources/build', __FILE__)
27
+ FileUtils.remove_dir(test_build_dir) if Dir.exist?(test_build_dir) # delete the build directory only if it exists
28
+ end
29
+
30
+ Classifier = Wikipedia::VandalismDetection::Classifier # short aliases for the namespaced library constants (this and the lines below)
31
+ Edit = Wikipedia::VandalismDetection::Edit
32
+ Evaluator = Wikipedia::VandalismDetection::Evaluator
33
+ Features = Wikipedia::VandalismDetection::Features
34
+ Instances = Wikipedia::VandalismDetection::Instances
35
+ Page = Wikipedia::VandalismDetection::Page
36
+ Text = Wikipedia::VandalismDetection::Text
37
+ TrainingDataset = Wikipedia::VandalismDetection::TrainingDataset
38
+ end
@@ -0,0 +1,6 @@
1
+ module FileReading # Helper mixin for reading files from the resources directory.
2
+ def load_file(file_name) # Returns the content of "<resources>/<file_name>" as read by File.read.
3
+ ressources_path = File.expand_path('../../../resources', __FILE__) # resolved against this file's path
4
+ File.read("#{ressources_path}/#{file_name}")
5
+ end
6
+ end
@@ -0,0 +1,81 @@
1
+ module TestConfiguration # Helper mixin that stubs Wikipedia::VandalismDetection.config and exposes test path settings.
2
+ require 'yaml'
3
+
4
+ SOURCE_DIR = File.expand_path('../../../../spec/resources/', __FILE__) # directory that holds the test config/ folder read below
5
+ CONFIG_DEFAULTS = Wikipedia::VandalismDetection::DefaultConfiguration::DEFAULTS
6
+
7
+ def source_dir # Returns SOURCE_DIR.
8
+ SOURCE_DIR
9
+ end
10
+
11
+ def use_configuration(override) # Stubs Wikipedia::VandalismDetection.config so that it returns +override+.
12
+ allow(Wikipedia::VandalismDetection)
13
+ .to receive(:config)
14
+ .and_return(override)
15
+ end
16
+
17
+ def use_test_configuration # Stubs config with the object returned by test_config.
18
+ use_configuration(test_config)
19
+ end
20
+
21
+ def use_default_configuration # Stubs config with the CONFIG_DEFAULTS constant itself.
22
+ use_configuration(CONFIG_DEFAULTS)
23
+ end
24
+
25
+ def test_configuration_content # Returns the parsed YAML test configuration located under source_dir.
26
+ config_file = 'config/wikipedia-vandalism-detection.yml'
27
+ config_path = File.join(source_dir, config_file)
28
+ YAML.load_file(config_path)
29
+ end
30
+
31
+ def merged_configuration(override = test_configuration_content) # Returns the defaults with 'source' set to source_dir, deep-merged with +override+.
32
+ default_config = CONFIG_DEFAULTS.merge('source' => source_dir) # merge returns a new object; CONFIG_DEFAULTS is not modified here
33
+ default_config.deep_merge(override) # NOTE(review): deep_merge is not core Ruby; presumably supplied by the library or a dependency - confirm
34
+ end
35
+
36
+ def test_config # Returns a new Configuration while DefaultConfiguration#source is stubbed to source_dir.
37
+ allow_any_instance_of(Wikipedia::VandalismDetection::DefaultConfiguration)
38
+ .to receive(:source)
39
+ .and_return(source_dir)
40
+
41
+ Wikipedia::VandalismDetection::Configuration.send(:new) # NOTE(review): send(:new) suggests .new is not public - confirm
42
+ end
43
+
44
+ def paths # Returns a hash with :corpora and :output path settings built from the YAML test configuration.
45
+ config = test_configuration_content
46
+ corpus_config = config['corpora']
47
+
48
+ base_directory = File.expand_path(corpus_config['base_directory'], __FILE__) # resolved against this file's path
49
+ training = corpus_config['training']
50
+ test = corpus_config['test']
51
+
52
+ {
53
+ corpora: {
54
+ 'base_directory' => base_directory,
55
+ 'training' => {
56
+ 'base_directory' => 'training', # literal value, not read from the YAML
57
+ 'edits_file' => training['edits_file'],
58
+ 'annotations_file' => training['annotations_file'],
59
+ 'revisions_directory' => training['revisions_directory']
60
+ },
61
+ 'test' => {
62
+ 'base_directory' => 'test', # literal value, not read from the YAML
63
+ 'edits_file' => test['edits_file'],
64
+ 'revisions_directory' => test['revisions_directory']
65
+ }
66
+
67
+ },
68
+ output: {
69
+ 'base_directory' => base_directory, # same expanded directory as corpora['base_directory']
70
+ 'training' => {
71
+ 'index_file' => training['index_file'], # index/arff names are taken from the 'corpora' section of the YAML
72
+ 'arff_file' => training['arff_file']
73
+ },
74
+ 'test' => {
75
+ 'index_file' => test['index_file'],
76
+ 'arff_file' => test['arff_file']
77
+ }
78
+ }
79
+ }
80
+ end
81
+ end
@@ -0,0 +1,34 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wikipedia::VandalismDetection::Algorithms::KullbackLeiblerDivergence do # Examples for the #of method of the divergence algorithm.
4
+ it { is_expected.to respond_to :of }
5
+
6
+ describe '#of' do
7
+ it 'returns missing value if no character in either of the texts' do
8
+ expect(subject.of('&', '?')).to eq Features::MISSING_VALUE # NOTE(review): presumably punctuation does not count as a character here - confirm against the implementation
9
+ end
10
+
11
+ it 'returns zero for equal texts' do
12
+ text = 'Text sample'
13
+ expect(subject.of(text, text)).to eq 0.0
14
+ end
15
+
16
+ it 'returns a value bigger than zero for different texts' do
17
+ expect(subject.of('Text 1', 'Text 2')).to be > 0.0
18
+ end
19
+
20
+ it 'returns a higher value for a more different text' do
21
+ lower_divergence = subject.of('text a', 'text b') # texts differing in one character
22
+ higher_divergence = subject.of('text a', 'bla bla bla') # texts differing in most characters
23
+
24
+ expect(lower_divergence).to be < higher_divergence
25
+ end
26
+
27
+ it 'can handle invalid byte sequences' do
28
+ invalid_byte_sequence = "text \255".force_encoding('UTF-8') # "\255" is an octal escape for byte 0xAD, which is not valid UTF-8 on its own
29
+ result = subject.of(invalid_byte_sequence, invalid_byte_sequence)
30
+
31
+ expect(result).to eq 0.0 # identical inputs, so the expected divergence is zero
32
+ end
33
+ end
34
+ end