wikipedia-vandalism_detection 0.1.0-java

Sign up to get free protection for your applications and to get access to all the features.
Files changed (245) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +19 -0
  3. data/.rspec +2 -0
  4. data/.rubocop.yml +35 -0
  5. data/.travis.yml +6 -0
  6. data/Gemfile +4 -0
  7. data/LICENSE.txt +4 -0
  8. data/README.md +288 -0
  9. data/Rakefile +11 -0
  10. data/config/wikipedia-vandalism-detection.yml.example +103 -0
  11. data/lib/java/SMOTE.jar +0 -0
  12. data/lib/java/balancedRandomForest.jar +0 -0
  13. data/lib/java/diffutils-1.3.0.jar +0 -0
  14. data/lib/java/oneClassClassifier.jar +0 -0
  15. data/lib/java/realAdaBoost.jar +0 -0
  16. data/lib/java/swc-engine-1.1.0-jar-with-dependencies.jar +0 -0
  17. data/lib/java/sweble-wikitext-extractor.jar +0 -0
  18. data/lib/weka/classifiers/meta/one_class_classifier.rb +21 -0
  19. data/lib/weka/classifiers/meta/real_ada_boost.rb +15 -0
  20. data/lib/weka/classifiers/trees/balanced_random_forest.rb +16 -0
  21. data/lib/weka/filters/supervised/instance/smote.rb +15 -0
  22. data/lib/wikipedia/vandalism_detection/algorithms/kullback_leibler_divergence.rb +103 -0
  23. data/lib/wikipedia/vandalism_detection/algorithms.rb +12 -0
  24. data/lib/wikipedia/vandalism_detection/classifier.rb +202 -0
  25. data/lib/wikipedia/vandalism_detection/configuration.rb +350 -0
  26. data/lib/wikipedia/vandalism_detection/diff.rb +36 -0
  27. data/lib/wikipedia/vandalism_detection/edit.rb +81 -0
  28. data/lib/wikipedia/vandalism_detection/evaluator.rb +640 -0
  29. data/lib/wikipedia/vandalism_detection/exceptions.rb +47 -0
  30. data/lib/wikipedia/vandalism_detection/feature_calculator.rb +94 -0
  31. data/lib/wikipedia/vandalism_detection/features/all_wordlists_frequency.rb +22 -0
  32. data/lib/wikipedia/vandalism_detection/features/all_wordlists_impact.rb +21 -0
  33. data/lib/wikipedia/vandalism_detection/features/anonymity.rb +17 -0
  34. data/lib/wikipedia/vandalism_detection/features/anonymity_previous.rb +32 -0
  35. data/lib/wikipedia/vandalism_detection/features/article_size.rb +17 -0
  36. data/lib/wikipedia/vandalism_detection/features/bad_frequency.rb +21 -0
  37. data/lib/wikipedia/vandalism_detection/features/bad_impact.rb +20 -0
  38. data/lib/wikipedia/vandalism_detection/features/base.rb +61 -0
  39. data/lib/wikipedia/vandalism_detection/features/biased_frequency.rb +21 -0
  40. data/lib/wikipedia/vandalism_detection/features/biased_impact.rb +21 -0
  41. data/lib/wikipedia/vandalism_detection/features/blanking.rb +26 -0
  42. data/lib/wikipedia/vandalism_detection/features/character_diversity.rb +23 -0
  43. data/lib/wikipedia/vandalism_detection/features/character_sequence.rb +19 -0
  44. data/lib/wikipedia/vandalism_detection/features/comment_bad_frequency.rb +21 -0
  45. data/lib/wikipedia/vandalism_detection/features/comment_biased_frequency.rb +21 -0
  46. data/lib/wikipedia/vandalism_detection/features/comment_length.rb +16 -0
  47. data/lib/wikipedia/vandalism_detection/features/comment_markup_frequency.rb +30 -0
  48. data/lib/wikipedia/vandalism_detection/features/comment_pronoun_frequency.rb +21 -0
  49. data/lib/wikipedia/vandalism_detection/features/comment_sex_frequency.rb +21 -0
  50. data/lib/wikipedia/vandalism_detection/features/comment_vulgarism_frequency.rb +21 -0
  51. data/lib/wikipedia/vandalism_detection/features/compressibility.rb +31 -0
  52. data/lib/wikipedia/vandalism_detection/features/contains_base.rb +16 -0
  53. data/lib/wikipedia/vandalism_detection/features/copyedit.rb +17 -0
  54. data/lib/wikipedia/vandalism_detection/features/digit_ratio.rb +23 -0
  55. data/lib/wikipedia/vandalism_detection/features/edits_per_user.rb +72 -0
  56. data/lib/wikipedia/vandalism_detection/features/emoticons_frequency.rb +27 -0
  57. data/lib/wikipedia/vandalism_detection/features/emoticons_impact.rb +29 -0
  58. data/lib/wikipedia/vandalism_detection/features/frequency_base.rb +18 -0
  59. data/lib/wikipedia/vandalism_detection/features/impact_base.rb +25 -0
  60. data/lib/wikipedia/vandalism_detection/features/inserted_character_distribution.rb +23 -0
  61. data/lib/wikipedia/vandalism_detection/features/inserted_external_links.rb +19 -0
  62. data/lib/wikipedia/vandalism_detection/features/inserted_internal_links.rb +18 -0
  63. data/lib/wikipedia/vandalism_detection/features/inserted_size.rb +19 -0
  64. data/lib/wikipedia/vandalism_detection/features/inserted_words.rb +17 -0
  65. data/lib/wikipedia/vandalism_detection/features/longest_word.rb +21 -0
  66. data/lib/wikipedia/vandalism_detection/features/markup_frequency.rb +29 -0
  67. data/lib/wikipedia/vandalism_detection/features/markup_impact.rb +30 -0
  68. data/lib/wikipedia/vandalism_detection/features/non_alphanumeric_ratio.rb +23 -0
  69. data/lib/wikipedia/vandalism_detection/features/personal_life.rb +17 -0
  70. data/lib/wikipedia/vandalism_detection/features/pronoun_frequency.rb +21 -0
  71. data/lib/wikipedia/vandalism_detection/features/pronoun_impact.rb +20 -0
  72. data/lib/wikipedia/vandalism_detection/features/removed_all_wordlists_frequency.rb +22 -0
  73. data/lib/wikipedia/vandalism_detection/features/removed_bad_frequency.rb +21 -0
  74. data/lib/wikipedia/vandalism_detection/features/removed_biased_frequency.rb +21 -0
  75. data/lib/wikipedia/vandalism_detection/features/removed_character_distribution.rb +23 -0
  76. data/lib/wikipedia/vandalism_detection/features/removed_emoticons_frequency.rb +28 -0
  77. data/lib/wikipedia/vandalism_detection/features/removed_markup_frequency.rb +30 -0
  78. data/lib/wikipedia/vandalism_detection/features/removed_pronoun_frequency.rb +21 -0
  79. data/lib/wikipedia/vandalism_detection/features/removed_sex_frequency.rb +21 -0
  80. data/lib/wikipedia/vandalism_detection/features/removed_size.rb +19 -0
  81. data/lib/wikipedia/vandalism_detection/features/removed_vulgarism_frequency.rb +21 -0
  82. data/lib/wikipedia/vandalism_detection/features/removed_words.rb +17 -0
  83. data/lib/wikipedia/vandalism_detection/features/replacement_similarity.rb +20 -0
  84. data/lib/wikipedia/vandalism_detection/features/reverted.rb +16 -0
  85. data/lib/wikipedia/vandalism_detection/features/revisions_character_distribution.rb +25 -0
  86. data/lib/wikipedia/vandalism_detection/features/same_editor.rb +32 -0
  87. data/lib/wikipedia/vandalism_detection/features/sex_frequency.rb +21 -0
  88. data/lib/wikipedia/vandalism_detection/features/sex_impact.rb +20 -0
  89. data/lib/wikipedia/vandalism_detection/features/size_increment.rb +20 -0
  90. data/lib/wikipedia/vandalism_detection/features/size_ratio.rb +28 -0
  91. data/lib/wikipedia/vandalism_detection/features/term_frequency.rb +31 -0
  92. data/lib/wikipedia/vandalism_detection/features/time_interval.rb +38 -0
  93. data/lib/wikipedia/vandalism_detection/features/time_of_day.rb +21 -0
  94. data/lib/wikipedia/vandalism_detection/features/upper_case_ratio.rb +23 -0
  95. data/lib/wikipedia/vandalism_detection/features/upper_case_words_ratio.rb +33 -0
  96. data/lib/wikipedia/vandalism_detection/features/upper_to_lower_case_ratio.rb +23 -0
  97. data/lib/wikipedia/vandalism_detection/features/vulgarism_frequency.rb +21 -0
  98. data/lib/wikipedia/vandalism_detection/features/vulgarism_impact.rb +21 -0
  99. data/lib/wikipedia/vandalism_detection/features/weekday.rb +19 -0
  100. data/lib/wikipedia/vandalism_detection/features/words_increment.rb +20 -0
  101. data/lib/wikipedia/vandalism_detection/features.rb +66 -0
  102. data/lib/wikipedia/vandalism_detection/instances.rb +121 -0
  103. data/lib/wikipedia/vandalism_detection/page.rb +101 -0
  104. data/lib/wikipedia/vandalism_detection/page_parser.rb +63 -0
  105. data/lib/wikipedia/vandalism_detection/revision.rb +75 -0
  106. data/lib/wikipedia/vandalism_detection/revision_parser.rb +67 -0
  107. data/lib/wikipedia/vandalism_detection/test_dataset.rb +374 -0
  108. data/lib/wikipedia/vandalism_detection/text.rb +23 -0
  109. data/lib/wikipedia/vandalism_detection/training_dataset.rb +282 -0
  110. data/lib/wikipedia/vandalism_detection/version.rb +5 -0
  111. data/lib/wikipedia/vandalism_detection/wikitext_extractor.rb +80 -0
  112. data/lib/wikipedia/vandalism_detection/word_lists/bad.rb +11 -0
  113. data/lib/wikipedia/vandalism_detection/word_lists/biased.rb +20 -0
  114. data/lib/wikipedia/vandalism_detection/word_lists/emoticons.rb +26 -0
  115. data/lib/wikipedia/vandalism_detection/word_lists/markup.rb +19 -0
  116. data/lib/wikipedia/vandalism_detection/word_lists/pronouns.rb +11 -0
  117. data/lib/wikipedia/vandalism_detection/word_lists/sex.rb +10 -0
  118. data/lib/wikipedia/vandalism_detection/word_lists/vulgarism.rb +96 -0
  119. data/lib/wikipedia/vandalism_detection/word_lists.rb +17 -0
  120. data/lib/wikipedia/vandalism_detection.rb +29 -0
  121. data/lib/wikipedia.rb +41 -0
  122. data/spec/factories/edit.rb +19 -0
  123. data/spec/factories/page.rb +12 -0
  124. data/spec/factories/revision.rb +51 -0
  125. data/spec/resources/config/wikipedia-vandalism-detection.yml +35 -0
  126. data/spec/resources/corpora/test/edits.csv +8 -0
  127. data/spec/resources/corpora/test/ground-truth.txt +3 -0
  128. data/spec/resources/corpora/test/revisions/part-1/326471754.txt +199 -0
  129. data/spec/resources/corpora/test/revisions/part-1/326873205.txt +137 -0
  130. data/spec/resources/corpora/test/revisions/part-1/328774035.txt +162 -0
  131. data/spec/resources/corpora/test/revisions/part-2/307084144.txt +137 -0
  132. data/spec/resources/corpora/test/revisions/part-2/326978767.txt +199 -0
  133. data/spec/resources/corpora/test/revisions/part-2/328774110.txt +162 -0
  134. data/spec/resources/corpora/training/annotations.csv +7 -0
  135. data/spec/resources/corpora/training/edits.csv +7 -0
  136. data/spec/resources/corpora/training/revisions/part-1/326471754.txt +199 -0
  137. data/spec/resources/corpora/training/revisions/part-1/326873205.txt +137 -0
  138. data/spec/resources/corpora/training/revisions/part-1/328774035.txt +162 -0
  139. data/spec/resources/corpora/training/revisions/part-2/307084144.txt +137 -0
  140. data/spec/resources/corpora/training/revisions/part-2/326978767.txt +199 -0
  141. data/spec/resources/corpora/training/revisions/part-2/328774110.txt +162 -0
  142. data/spec/resources/page_with_redirects.xml +85 -0
  143. data/spec/resources/redirect_page.xml +59 -0
  144. data/spec/resources/revision_simplified.xml +13 -0
  145. data/spec/resources/sample_revision.txt +137 -0
  146. data/spec/resources/sample_revision_clean_text.txt +1 -0
  147. data/spec/resources/sample_revision_plain_text.txt +183 -0
  148. data/spec/resources/vandalism_on_wikipedia.xml +234 -0
  149. data/spec/resources/vandalism_on_wikipedia_simplified.xml +119 -0
  150. data/spec/resources/wikipedia_tokens.txt +30 -0
  151. data/spec/spec_helper.rb +38 -0
  152. data/spec/support/macros/file_reading.rb +6 -0
  153. data/spec/support/macros/test_configuration.rb +81 -0
  154. data/spec/vandalism_detection/algorithms/kullback_leibler_divergence_spec.rb +34 -0
  155. data/spec/vandalism_detection/classifier_spec.rb +330 -0
  156. data/spec/vandalism_detection/configuration_spec.rb +601 -0
  157. data/spec/vandalism_detection/diff_spec.rb +40 -0
  158. data/spec/vandalism_detection/edit_spec.rb +122 -0
  159. data/spec/vandalism_detection/evaluator_spec.rb +711 -0
  160. data/spec/vandalism_detection/feature_calculator_spec.rb +135 -0
  161. data/spec/vandalism_detection/features/all_wordlists_frequency_spec.rb +30 -0
  162. data/spec/vandalism_detection/features/all_wordlists_impact_spec.rb +52 -0
  163. data/spec/vandalism_detection/features/anonymity_previous_spec.rb +67 -0
  164. data/spec/vandalism_detection/features/anonymity_spec.rb +17 -0
  165. data/spec/vandalism_detection/features/article_size_spec.rb +29 -0
  166. data/spec/vandalism_detection/features/bad_frequency_spec.rb +30 -0
  167. data/spec/vandalism_detection/features/bad_impact_spec.rb +53 -0
  168. data/spec/vandalism_detection/features/base_spec.rb +41 -0
  169. data/spec/vandalism_detection/features/biased_frequency_spec.rb +30 -0
  170. data/spec/vandalism_detection/features/biased_impact_spec.rb +52 -0
  171. data/spec/vandalism_detection/features/blanking_spec.rb +35 -0
  172. data/spec/vandalism_detection/features/character_diversity_spec.rb +30 -0
  173. data/spec/vandalism_detection/features/character_sequence_spec.rb +31 -0
  174. data/spec/vandalism_detection/features/comment_bad_frequency_spec.rb +28 -0
  175. data/spec/vandalism_detection/features/comment_biased_frequency_spec.rb +28 -0
  176. data/spec/vandalism_detection/features/comment_length_spec.rb +21 -0
  177. data/spec/vandalism_detection/features/comment_markup_frequency_spec.rb +28 -0
  178. data/spec/vandalism_detection/features/comment_pronoun_frequency_spec.rb +28 -0
  179. data/spec/vandalism_detection/features/comment_sex_frequency_spec.rb +28 -0
  180. data/spec/vandalism_detection/features/comment_vulgarism_frequency_spec.rb +28 -0
  181. data/spec/vandalism_detection/features/compressibility_spec.rb +38 -0
  182. data/spec/vandalism_detection/features/contains_base_spec.rb +27 -0
  183. data/spec/vandalism_detection/features/copyedit_spec.rb +30 -0
  184. data/spec/vandalism_detection/features/digit_ratio_spec.rb +30 -0
  185. data/spec/vandalism_detection/features/edits_per_user_spec.rb +51 -0
  186. data/spec/vandalism_detection/features/emoticons_frequency_spec.rb +30 -0
  187. data/spec/vandalism_detection/features/emoticons_impact_spec.rb +45 -0
  188. data/spec/vandalism_detection/features/frequency_base_spec.rb +20 -0
  189. data/spec/vandalism_detection/features/impact_base_spec.rb +36 -0
  190. data/spec/vandalism_detection/features/inserted_character_distribution_spec.rb +40 -0
  191. data/spec/vandalism_detection/features/inserted_external_links_spec.rb +29 -0
  192. data/spec/vandalism_detection/features/inserted_internal_links_spec.rb +29 -0
  193. data/spec/vandalism_detection/features/inserted_size_spec.rb +29 -0
  194. data/spec/vandalism_detection/features/inserted_words_spec.rb +29 -0
  195. data/spec/vandalism_detection/features/longest_word_spec.rb +29 -0
  196. data/spec/vandalism_detection/features/markup_frequency_spec.rb +31 -0
  197. data/spec/vandalism_detection/features/markup_impact_spec.rb +53 -0
  198. data/spec/vandalism_detection/features/non_alphanumeric_ratio_spec.rb +30 -0
  199. data/spec/vandalism_detection/features/personal_life_spec.rb +22 -0
  200. data/spec/vandalism_detection/features/pronoun_frequency_spec.rb +30 -0
  201. data/spec/vandalism_detection/features/pronoun_impact_spec.rb +53 -0
  202. data/spec/vandalism_detection/features/removed_all_wordlists_frequency_spec.rb +30 -0
  203. data/spec/vandalism_detection/features/removed_bad_frequency_spec.rb +30 -0
  204. data/spec/vandalism_detection/features/removed_biased_frequency_spec.rb +30 -0
  205. data/spec/vandalism_detection/features/removed_character_distribution_spec.rb +40 -0
  206. data/spec/vandalism_detection/features/removed_emoticons_frequency_spec.rb +30 -0
  207. data/spec/vandalism_detection/features/removed_markup_frequency_spec.rb +30 -0
  208. data/spec/vandalism_detection/features/removed_pronoun_frequency_spec.rb +30 -0
  209. data/spec/vandalism_detection/features/removed_sex_frequency_spec.rb +30 -0
  210. data/spec/vandalism_detection/features/removed_size_spec.rb +29 -0
  211. data/spec/vandalism_detection/features/removed_vulgarism_frequency_spec.rb +30 -0
  212. data/spec/vandalism_detection/features/removed_words_spec.rb +29 -0
  213. data/spec/vandalism_detection/features/replacement_similarity_spec.rb +37 -0
  214. data/spec/vandalism_detection/features/reverted_spec.rb +24 -0
  215. data/spec/vandalism_detection/features/revisions_character_distribution_spec.rb +40 -0
  216. data/spec/vandalism_detection/features/same_editor_spec.rb +71 -0
  217. data/spec/vandalism_detection/features/sex_frequency_spec.rb +30 -0
  218. data/spec/vandalism_detection/features/sex_impact_spec.rb +53 -0
  219. data/spec/vandalism_detection/features/size_increment_spec.rb +29 -0
  220. data/spec/vandalism_detection/features/size_ratio_spec.rb +48 -0
  221. data/spec/vandalism_detection/features/term_frequency_spec.rb +32 -0
  222. data/spec/vandalism_detection/features/time_interval_spec.rb +56 -0
  223. data/spec/vandalism_detection/features/time_of_day_spec.rb +16 -0
  224. data/spec/vandalism_detection/features/upper_case_ratio_spec.rb +30 -0
  225. data/spec/vandalism_detection/features/upper_case_words_ratio_spec.rb +33 -0
  226. data/spec/vandalism_detection/features/upper_to_lower_case_ratio_spec.rb +30 -0
  227. data/spec/vandalism_detection/features/vulgarism_frequency_spec.rb +30 -0
  228. data/spec/vandalism_detection/features/vulgarism_impact_spec.rb +52 -0
  229. data/spec/vandalism_detection/features/weekday_spec.rb +16 -0
  230. data/spec/vandalism_detection/features/words_increment_spec.rb +29 -0
  231. data/spec/vandalism_detection/instances_spec.rb +146 -0
  232. data/spec/vandalism_detection/page_parser_spec.rb +190 -0
  233. data/spec/vandalism_detection/page_spec.rb +134 -0
  234. data/spec/vandalism_detection/revision_parser_spec.rb +53 -0
  235. data/spec/vandalism_detection/revision_spec.rb +148 -0
  236. data/spec/vandalism_detection/test_dataset_spec.rb +227 -0
  237. data/spec/vandalism_detection/text_spec.rb +29 -0
  238. data/spec/vandalism_detection/training_dataset_spec.rb +266 -0
  239. data/spec/vandalism_detection/wikitext_extractor_spec.rb +97 -0
  240. data/spec/weka/classifiers/meta/one_class_classifier_spec.rb +82 -0
  241. data/spec/weka/classifiers/meta/real_ada_boost_spec.rb +37 -0
  242. data/spec/weka/classifiers/trees/balanced_random_forest_spec.rb +37 -0
  243. data/spec/weka/filters/supervised/instance/smote_spec.rb +5 -0
  244. data/wikipedia-vandalism_detection.gemspec +37 -0
  245. metadata +550 -0
@@ -0,0 +1,121 @@
1
+ require 'weka'
2
+ require 'wikipedia/vandalism_detection/configuration'
3
+ require 'weka/classifiers/meta/one_class_classifier'
4
+
5
+ module Wikipedia
6
+ module VandalismDetection
7
+ class Instances
8
+ REGULAR_CLASS_INDEX = 0
9
+ VANDALISM_CLASS_INDEX = 1
10
+ NOT_KNOWN_INDEX = 2
11
+
12
+ CLASS = 'class'.freeze
13
+ VANDALISM = 'vandalism'.freeze
14
+ REGULAR = 'regular'.freeze
15
+ NOT_KNOWN = '?'.freeze
16
+ OUTLIER = Weka::Classifiers::Meta::OneClassClassifier::OUTLIER_LABEL
17
+
18
+ VANDALISM_SHORT = 'V'.freeze
19
+ REGULAR_SHORT = 'R'.freeze
20
+
21
+ OLD_REVISION_ID = 'oldrevisionid'.freeze
22
+ NEW_REVISION_ID = 'newrevisionid'.freeze
23
+
24
+ CLASSES = {
25
+ REGULAR_CLASS_INDEX => REGULAR,
26
+ VANDALISM_CLASS_INDEX => VANDALISM,
27
+ NOT_KNOWN_INDEX => NOT_KNOWN
28
+ }.freeze
29
+
30
+ CLASSES_SHORT = {
31
+ REGULAR_CLASS_INDEX => REGULAR_SHORT,
32
+ VANDALISM_CLASS_INDEX => VANDALISM_SHORT,
33
+ NOT_KNOWN_INDEX => NOT_KNOWN
34
+ }.freeze
35
+
36
+ class << self
37
+ # Returns an empty instances dataset of type Java::WekaCore::Instances.
38
+ # This dataset is used for feature computation and classification for
39
+ # Wikipedia vandalism detection while training.
40
+ #
41
+ # @example
42
+ # dataset = Wikipedia::VandalismDetection::Instances.empty
43
+ # => #<Java::WekaCore::Instances:0xf0f9a00
44
+ # @positions=[
45
+ # #<Java::WekaCore::Attribute:0x17207a76>,
46
+ # #<Java::WekaCore::Attribute:0x5547e4d6>,
47
+ # #<Java::WekaCore::Attribute:0x6300c957>,
48
+ # ...,
49
+ # #<Java::WekaCore::Attribute:0x5a74fae4>]>
50
+ def empty
51
+ features = Wikipedia::VandalismDetection.config.features
52
+ classes = dataset_classes
53
+
54
+ dataset = Weka::Core::Instances.new.with_attributes do
55
+ features.each do |name|
56
+ numeric name.tr(' ', '_')
57
+ end
58
+
59
+ nominal :class, values: classes, class_attribute: true
60
+ end
61
+
62
+ dataset
63
+ end
64
+
65
+ # Returns an empty instances dataset of type Java::WekaCore::Instances.
66
+ # This dataset is used for feature computation and classification for
67
+ # Wikipedia vandalism detection while training.
68
+ #
69
+ # @example
70
+ # dataset = Wikipedia::VandalismDetection::Instances.empty_for_feature(name)
71
+ # => #<Java::WekaCore::Instances:0xf0f9a00
72
+ # @positions=[
73
+ # #<Java::WekaCore::Attribute:0x17207a76>]>
74
+ def empty_for_feature(name)
75
+ classes = dataset_classes
76
+
77
+ Weka::Core::Instances.new.with_attributes do
78
+ numeric name.tr(' ', '_')
79
+ nominal :class, values: classes, class_attribute: true
80
+ end
81
+ end
82
+
83
+ # Returns an empty instances dataset of type Java::WekaCore::Instances.
84
+ # This dataset is used for feature computation and classification for
85
+ # Wikipedia vandalism detection while testing.
86
+ #
87
+ # @example
88
+ # dataset = Wikipedia::VandalismDetection::Instances.empty_for_test_feature(name)
89
+ # => #<Java::WekaCore::Instances:0xf0f9a00
90
+ # @positions=[
91
+ # #<Java::WekaCore::Attribute:0x17207a76>]>
92
+ def empty_for_test_feature(name)
93
+ Weka::Core::Instances.new.with_attributes do
94
+ numeric name.tr(' ', '_')
95
+ numeric OLD_REVISION_ID
96
+ numeric NEW_REVISION_ID
97
+ end
98
+ end
99
+
100
+ # Returns an empty instances dataset of type Java::WekaCore::Instances.
101
+ # This dataset is used for creating the ground truth classification.
102
+ def empty_for_test_class
103
+ classes = dataset_classes
104
+
105
+ Weka::Core::Instances.new.with_attributes do
106
+ nominal :class, values: classes
107
+ end
108
+ end
109
+
110
+ private
111
+
112
+ def dataset_classes
113
+ classes = []
114
+ classes[VANDALISM_CLASS_INDEX] = VANDALISM
115
+ classes[REGULAR_CLASS_INDEX] = REGULAR
116
+ classes
117
+ end
118
+ end
119
+ end
120
+ end
121
+ end
@@ -0,0 +1,101 @@
1
+ require 'nokogiri'
2
+ require_relative 'edit'
3
+
4
+ module Wikipedia
5
+ module VandalismDetection
6
+ # Represents a full wikipedia page history.
7
+ class Page
8
+ START_TAG = '<page>'.freeze
9
+ END_TAG = '</page>'.freeze
10
+
11
+ attr_accessor :id, :title
12
+ attr_reader :revisions
13
+
14
+ def initialize
15
+ @revisions = {}
16
+ @edits = []
17
+ @update_edits = false
18
+ @update_reverted_edits = false
19
+ end
20
+
21
+ def add_revision(revision)
22
+ @revisions[revision.id] = revision
23
+
24
+ @update_edits = true
25
+ @update_reverted_edits = true
26
+ end
27
+
28
+ def edits
29
+ @edits = create_edits_from @revisions if @update_edits
30
+ @edits
31
+ end
32
+
33
+ def reverted_edits
34
+ if @update_reverted_edits
35
+ @reverted_edits = create_reverted_edits_from @revisions
36
+ end
37
+
38
+ @reverted_edits
39
+ end
40
+
41
+ private
42
+
43
+ def create_edits_from(revisions)
44
+ @update_edits = false
45
+ edits = []
46
+
47
+ revisions.each do |_, new_revision|
48
+ old_revision = revisions[new_revision.parent_id]
49
+ edits << Edit.new(old_revision, new_revision) unless old_revision.nil?
50
+ end
51
+
52
+ edits.each { |edit| edit.instance_variable_set(:@page, self) }
53
+ edits
54
+ end
55
+
56
+ # Returns the reverted edits by comparing the text's sha1 hashes of
57
+ # multiple revisions.
58
+ # If the next but one revision has the same sha1 hash as a base revision
59
+ # and the base revision has another hash than the one before, the
60
+ # in-between revision is reverted.
61
+ # The resulting edit holds the base revision as old revision and the
62
+ # reverted as new revision.
63
+ def create_reverted_edits_from(revisions)
64
+ @update_reverted_edits = false
65
+ edits = []
66
+
67
+ revisions.each do |current_id, first_revision|
68
+ second_revision_select = revisions
69
+ .select { |_, value| value.parent_id == current_id }
70
+ .first
71
+
72
+ next unless second_revision_select
73
+
74
+ second_revision = second_revision_select[1]
75
+
76
+ third_revision_select = revisions
77
+ .select { |_, value| value.parent_id == second_revision.id }
78
+ .first
79
+
80
+ next unless third_revision_select
81
+
82
+ first_sha1 = first_revision.sha1
83
+ second_sha1 = second_revision.sha1
84
+ third_sha1 = third_revision_select[1].sha1
85
+
86
+ previous_revision_select = revisions
87
+ .select { |_, value| value.id == first_revision.parent_id }
88
+ .first
89
+
90
+ previous_sha1 = previous_revision_select && previous_revision_select[1].sha1
91
+
92
+ if (first_sha1 == third_sha1) && (second_sha1 != previous_sha1)
93
+ edits << Edit.new(first_revision, second_revision)
94
+ end
95
+ end
96
+
97
+ edits
98
+ end
99
+ end
100
+ end
101
+ end
@@ -0,0 +1,63 @@
1
+ require 'nokogiri'
2
+ require_relative 'page'
3
+ require_relative 'revision'
4
+
5
+ module Wikipedia
6
+ module VandalismDetection
7
+ class PageParser
8
+ # Parses an xml string and returns a Wikipedia::VandalismDetection::Page.
9
+ def parse(xml)
10
+ @page = Page.new
11
+ document = Nokogiri::XML(xml, nil, 'UTF-8')
12
+
13
+ @page.title = document.xpath('//page/title').inner_text
14
+ @page.id = document.xpath('//page/id').inner_text
15
+
16
+ build_revisions_from(document)
17
+
18
+ @page
19
+ end
20
+
21
+ private
22
+
23
+ def node_value(document, xpath)
24
+ node = document.xpath(xpath.to_s)
25
+ return if node.empty?
26
+
27
+ node.inner_text
28
+ end
29
+
30
+ def node_text(document, xpath)
31
+ value = node_value(document, xpath)
32
+ return if value.blank?
33
+
34
+ Text.new(value)
35
+ end
36
+
37
+ # Builds and saves the available revisions to the @page variable
38
+ def build_revisions_from(document)
39
+ elements = document.xpath('//revision')
40
+
41
+ elements.each do |element|
42
+ revision = Revision.new
43
+
44
+ revision.id = node_value(element, :id)
45
+ revision.timestamp = node_value(element, :timestamp)
46
+ revision.comment = node_text(element, :comment)
47
+ revision.text = node_text(element, :text)
48
+ revision.sha1 = node_value(element, :sha1)
49
+ revision.parent_id = node_value(element, :parentid)
50
+ revision.contributor_username = node_value(element, 'contributor/username')
51
+
52
+ contributor_id = node_value(element, 'contributor/id')
53
+ contributor_ip = node_value(element, 'contributor/ip')
54
+
55
+ revision.contributor = contributor_id if contributor_id
56
+ revision.contributor = contributor_ip if contributor_ip
57
+
58
+ @page.add_revision(revision)
59
+ end
60
+ end
61
+ end
62
+ end
63
+ end
@@ -0,0 +1,75 @@
1
+ require_relative 'text'
2
+ require 'zlib'
3
+
4
+ module Wikipedia
5
+ module VandalismDetection
6
+ class Revision
7
+
8
+ START_TAG = '<revision>'.freeze
9
+ END_TAG = '</revision>'.freeze
10
+ REDIRECT_PATTERN = /#REDIRECT\s+\[\[.*?\]\]/
11
+
12
+ attr_accessor :id,
13
+ :parent_id,
14
+ :timestamp,
15
+ :contributor_username,
16
+ :sha1
17
+
18
+ attr_reader :comment,
19
+ :contributor_id,
20
+ :contributor_ip
21
+
22
+ def initialize
23
+ @text = Zlib::Deflate.deflate('')
24
+ @comment = Text.new
25
+ end
26
+
27
+ def contributor=(contributor)
28
+ if ip_v4? contributor
29
+ @contributor_ip = contributor
30
+ else
31
+ @contributor_id = contributor
32
+ end
33
+ end
34
+
35
+ def contributor
36
+ @contributor_id || @contributor_ip
37
+ end
38
+
39
+ def anonymous_contributor?
40
+ !@contributor_ip.nil?
41
+ end
42
+
43
+ def redirect?
44
+ !!(text =~ REDIRECT_PATTERN)
45
+ end
46
+
47
+ # Compresses text when set
48
+ def text=(text)
49
+ text = '' unless text.is_a?(String)
50
+
51
+ # remove invalid utf-8 byte sequences
52
+ text.encode!('UTF-16', 'UTF-8', invalid: :replace, replace: '')
53
+ text.encode!('UTF-8', 'UTF-16')
54
+ @text = Zlib::Deflate.deflate(text)
55
+ end
56
+
57
+ # Decompresses text when called
58
+ def text
59
+ Text.new(Zlib::Inflate.inflate(@text).force_encoding('utf-8'))
60
+ end
61
+
62
+ def comment=(comment)
63
+ comment = '' unless comment.is_a?(String)
64
+ @comment = Text.new(comment)
65
+ end
66
+
67
+ private
68
+
69
+ # Returns whether the given value is an IPv4.
70
+ def ip_v4?(value)
71
+ !!value.to_s.match(/(\d+)\.(\d+)\.(\d+)\.(\d+)/)
72
+ end
73
+ end
74
+ end
75
+ end
@@ -0,0 +1,67 @@
1
+ # encoding: utf-8
2
+
3
+ require 'nokogiri'
4
+ require 'wikipedia/vandalism_detection/revision'
5
+
6
+ module Wikipedia
7
+ module VandalismDetection
8
+ class RevisionParser
9
+ DEFAULT_PROPERTIES = %i[
10
+ id
11
+ parent_id
12
+ timestamp
13
+ contributor
14
+ comment
15
+ text
16
+ sha1
17
+ ].freeze
18
+
19
+ # Parses an xml string and returns a Revision.
20
+ def parse(xml, options = {})
21
+ document = Nokogiri::XML(xml, nil, 'UTF-8').xpath('//revision')
22
+ revision = Revision.new
23
+
24
+ properties = options[:only] || DEFAULT_PROPERTIES
25
+
26
+ revision.id = node_value(document, properties, :id)
27
+ revision.timestamp = node_value(document, properties, :timestamp)
28
+ revision.comment = node_text(document, properties, :comment)
29
+ revision.text = node_text(document, properties, :text)
30
+ revision.sha1 = node_value(document, properties, :sha1)
31
+
32
+ if properties.include?(:contributor)
33
+ revision.contributor = node_presence(document, 'contributor/id')
34
+ revision.contributor = node_presence(document, 'contributor/ip')
35
+ revision.contributor_username = node_presence(document, 'contributor/username')
36
+ end
37
+
38
+ if properties.include?(:parent_id)
39
+ revision.parent_id = node_presence(document, 'parentid')
40
+ end
41
+
42
+ revision
43
+ end
44
+
45
+ private
46
+
47
+ def node_value(document, properties, attribute)
48
+ return unless properties.include?(attribute)
49
+ node_presence(document, attribute)
50
+ end
51
+
52
+ def node_text(document, properties, attribute)
53
+ value = node_value(document, properties, attribute)
54
+ return if value.blank?
55
+
56
+ Text.new(value)
57
+ end
58
+
59
+ def node_presence(document, xpath)
60
+ node = document.xpath(xpath.to_s)
61
+ return if node.empty?
62
+
63
+ node.inner_text
64
+ end
65
+ end
66
+ end
67
+ end