wikipedia-vandalism_detection 0.1.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (245) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +19 -0
  3. data/.rspec +2 -0
  4. data/.rubocop.yml +35 -0
  5. data/.travis.yml +6 -0
  6. data/Gemfile +4 -0
  7. data/LICENSE.txt +4 -0
  8. data/README.md +288 -0
  9. data/Rakefile +11 -0
  10. data/config/wikipedia-vandalism-detection.yml.example +103 -0
  11. data/lib/java/SMOTE.jar +0 -0
  12. data/lib/java/balancedRandomForest.jar +0 -0
  13. data/lib/java/diffutils-1.3.0.jar +0 -0
  14. data/lib/java/oneClassClassifier.jar +0 -0
  15. data/lib/java/realAdaBoost.jar +0 -0
  16. data/lib/java/swc-engine-1.1.0-jar-with-dependencies.jar +0 -0
  17. data/lib/java/sweble-wikitext-extractor.jar +0 -0
  18. data/lib/weka/classifiers/meta/one_class_classifier.rb +21 -0
  19. data/lib/weka/classifiers/meta/real_ada_boost.rb +15 -0
  20. data/lib/weka/classifiers/trees/balanced_random_forest.rb +16 -0
  21. data/lib/weka/filters/supervised/instance/smote.rb +15 -0
  22. data/lib/wikipedia/vandalism_detection/algorithms/kullback_leibler_divergence.rb +103 -0
  23. data/lib/wikipedia/vandalism_detection/algorithms.rb +12 -0
  24. data/lib/wikipedia/vandalism_detection/classifier.rb +202 -0
  25. data/lib/wikipedia/vandalism_detection/configuration.rb +350 -0
  26. data/lib/wikipedia/vandalism_detection/diff.rb +36 -0
  27. data/lib/wikipedia/vandalism_detection/edit.rb +81 -0
  28. data/lib/wikipedia/vandalism_detection/evaluator.rb +640 -0
  29. data/lib/wikipedia/vandalism_detection/exceptions.rb +47 -0
  30. data/lib/wikipedia/vandalism_detection/feature_calculator.rb +94 -0
  31. data/lib/wikipedia/vandalism_detection/features/all_wordlists_frequency.rb +22 -0
  32. data/lib/wikipedia/vandalism_detection/features/all_wordlists_impact.rb +21 -0
  33. data/lib/wikipedia/vandalism_detection/features/anonymity.rb +17 -0
  34. data/lib/wikipedia/vandalism_detection/features/anonymity_previous.rb +32 -0
  35. data/lib/wikipedia/vandalism_detection/features/article_size.rb +17 -0
  36. data/lib/wikipedia/vandalism_detection/features/bad_frequency.rb +21 -0
  37. data/lib/wikipedia/vandalism_detection/features/bad_impact.rb +20 -0
  38. data/lib/wikipedia/vandalism_detection/features/base.rb +61 -0
  39. data/lib/wikipedia/vandalism_detection/features/biased_frequency.rb +21 -0
  40. data/lib/wikipedia/vandalism_detection/features/biased_impact.rb +21 -0
  41. data/lib/wikipedia/vandalism_detection/features/blanking.rb +26 -0
  42. data/lib/wikipedia/vandalism_detection/features/character_diversity.rb +23 -0
  43. data/lib/wikipedia/vandalism_detection/features/character_sequence.rb +19 -0
  44. data/lib/wikipedia/vandalism_detection/features/comment_bad_frequency.rb +21 -0
  45. data/lib/wikipedia/vandalism_detection/features/comment_biased_frequency.rb +21 -0
  46. data/lib/wikipedia/vandalism_detection/features/comment_length.rb +16 -0
  47. data/lib/wikipedia/vandalism_detection/features/comment_markup_frequency.rb +30 -0
  48. data/lib/wikipedia/vandalism_detection/features/comment_pronoun_frequency.rb +21 -0
  49. data/lib/wikipedia/vandalism_detection/features/comment_sex_frequency.rb +21 -0
  50. data/lib/wikipedia/vandalism_detection/features/comment_vulgarism_frequency.rb +21 -0
  51. data/lib/wikipedia/vandalism_detection/features/compressibility.rb +31 -0
  52. data/lib/wikipedia/vandalism_detection/features/contains_base.rb +16 -0
  53. data/lib/wikipedia/vandalism_detection/features/copyedit.rb +17 -0
  54. data/lib/wikipedia/vandalism_detection/features/digit_ratio.rb +23 -0
  55. data/lib/wikipedia/vandalism_detection/features/edits_per_user.rb +72 -0
  56. data/lib/wikipedia/vandalism_detection/features/emoticons_frequency.rb +27 -0
  57. data/lib/wikipedia/vandalism_detection/features/emoticons_impact.rb +29 -0
  58. data/lib/wikipedia/vandalism_detection/features/frequency_base.rb +18 -0
  59. data/lib/wikipedia/vandalism_detection/features/impact_base.rb +25 -0
  60. data/lib/wikipedia/vandalism_detection/features/inserted_character_distribution.rb +23 -0
  61. data/lib/wikipedia/vandalism_detection/features/inserted_external_links.rb +19 -0
  62. data/lib/wikipedia/vandalism_detection/features/inserted_internal_links.rb +18 -0
  63. data/lib/wikipedia/vandalism_detection/features/inserted_size.rb +19 -0
  64. data/lib/wikipedia/vandalism_detection/features/inserted_words.rb +17 -0
  65. data/lib/wikipedia/vandalism_detection/features/longest_word.rb +21 -0
  66. data/lib/wikipedia/vandalism_detection/features/markup_frequency.rb +29 -0
  67. data/lib/wikipedia/vandalism_detection/features/markup_impact.rb +30 -0
  68. data/lib/wikipedia/vandalism_detection/features/non_alphanumeric_ratio.rb +23 -0
  69. data/lib/wikipedia/vandalism_detection/features/personal_life.rb +17 -0
  70. data/lib/wikipedia/vandalism_detection/features/pronoun_frequency.rb +21 -0
  71. data/lib/wikipedia/vandalism_detection/features/pronoun_impact.rb +20 -0
  72. data/lib/wikipedia/vandalism_detection/features/removed_all_wordlists_frequency.rb +22 -0
  73. data/lib/wikipedia/vandalism_detection/features/removed_bad_frequency.rb +21 -0
  74. data/lib/wikipedia/vandalism_detection/features/removed_biased_frequency.rb +21 -0
  75. data/lib/wikipedia/vandalism_detection/features/removed_character_distribution.rb +23 -0
  76. data/lib/wikipedia/vandalism_detection/features/removed_emoticons_frequency.rb +28 -0
  77. data/lib/wikipedia/vandalism_detection/features/removed_markup_frequency.rb +30 -0
  78. data/lib/wikipedia/vandalism_detection/features/removed_pronoun_frequency.rb +21 -0
  79. data/lib/wikipedia/vandalism_detection/features/removed_sex_frequency.rb +21 -0
  80. data/lib/wikipedia/vandalism_detection/features/removed_size.rb +19 -0
  81. data/lib/wikipedia/vandalism_detection/features/removed_vulgarism_frequency.rb +21 -0
  82. data/lib/wikipedia/vandalism_detection/features/removed_words.rb +17 -0
  83. data/lib/wikipedia/vandalism_detection/features/replacement_similarity.rb +20 -0
  84. data/lib/wikipedia/vandalism_detection/features/reverted.rb +16 -0
  85. data/lib/wikipedia/vandalism_detection/features/revisions_character_distribution.rb +25 -0
  86. data/lib/wikipedia/vandalism_detection/features/same_editor.rb +32 -0
  87. data/lib/wikipedia/vandalism_detection/features/sex_frequency.rb +21 -0
  88. data/lib/wikipedia/vandalism_detection/features/sex_impact.rb +20 -0
  89. data/lib/wikipedia/vandalism_detection/features/size_increment.rb +20 -0
  90. data/lib/wikipedia/vandalism_detection/features/size_ratio.rb +28 -0
  91. data/lib/wikipedia/vandalism_detection/features/term_frequency.rb +31 -0
  92. data/lib/wikipedia/vandalism_detection/features/time_interval.rb +38 -0
  93. data/lib/wikipedia/vandalism_detection/features/time_of_day.rb +21 -0
  94. data/lib/wikipedia/vandalism_detection/features/upper_case_ratio.rb +23 -0
  95. data/lib/wikipedia/vandalism_detection/features/upper_case_words_ratio.rb +33 -0
  96. data/lib/wikipedia/vandalism_detection/features/upper_to_lower_case_ratio.rb +23 -0
  97. data/lib/wikipedia/vandalism_detection/features/vulgarism_frequency.rb +21 -0
  98. data/lib/wikipedia/vandalism_detection/features/vulgarism_impact.rb +21 -0
  99. data/lib/wikipedia/vandalism_detection/features/weekday.rb +19 -0
  100. data/lib/wikipedia/vandalism_detection/features/words_increment.rb +20 -0
  101. data/lib/wikipedia/vandalism_detection/features.rb +66 -0
  102. data/lib/wikipedia/vandalism_detection/instances.rb +121 -0
  103. data/lib/wikipedia/vandalism_detection/page.rb +101 -0
  104. data/lib/wikipedia/vandalism_detection/page_parser.rb +63 -0
  105. data/lib/wikipedia/vandalism_detection/revision.rb +75 -0
  106. data/lib/wikipedia/vandalism_detection/revision_parser.rb +67 -0
  107. data/lib/wikipedia/vandalism_detection/test_dataset.rb +374 -0
  108. data/lib/wikipedia/vandalism_detection/text.rb +23 -0
  109. data/lib/wikipedia/vandalism_detection/training_dataset.rb +282 -0
  110. data/lib/wikipedia/vandalism_detection/version.rb +5 -0
  111. data/lib/wikipedia/vandalism_detection/wikitext_extractor.rb +80 -0
  112. data/lib/wikipedia/vandalism_detection/word_lists/bad.rb +11 -0
  113. data/lib/wikipedia/vandalism_detection/word_lists/biased.rb +20 -0
  114. data/lib/wikipedia/vandalism_detection/word_lists/emoticons.rb +26 -0
  115. data/lib/wikipedia/vandalism_detection/word_lists/markup.rb +19 -0
  116. data/lib/wikipedia/vandalism_detection/word_lists/pronouns.rb +11 -0
  117. data/lib/wikipedia/vandalism_detection/word_lists/sex.rb +10 -0
  118. data/lib/wikipedia/vandalism_detection/word_lists/vulgarism.rb +96 -0
  119. data/lib/wikipedia/vandalism_detection/word_lists.rb +17 -0
  120. data/lib/wikipedia/vandalism_detection.rb +29 -0
  121. data/lib/wikipedia.rb +41 -0
  122. data/spec/factories/edit.rb +19 -0
  123. data/spec/factories/page.rb +12 -0
  124. data/spec/factories/revision.rb +51 -0
  125. data/spec/resources/config/wikipedia-vandalism-detection.yml +35 -0
  126. data/spec/resources/corpora/test/edits.csv +8 -0
  127. data/spec/resources/corpora/test/ground-truth.txt +3 -0
  128. data/spec/resources/corpora/test/revisions/part-1/326471754.txt +199 -0
  129. data/spec/resources/corpora/test/revisions/part-1/326873205.txt +137 -0
  130. data/spec/resources/corpora/test/revisions/part-1/328774035.txt +162 -0
  131. data/spec/resources/corpora/test/revisions/part-2/307084144.txt +137 -0
  132. data/spec/resources/corpora/test/revisions/part-2/326978767.txt +199 -0
  133. data/spec/resources/corpora/test/revisions/part-2/328774110.txt +162 -0
  134. data/spec/resources/corpora/training/annotations.csv +7 -0
  135. data/spec/resources/corpora/training/edits.csv +7 -0
  136. data/spec/resources/corpora/training/revisions/part-1/326471754.txt +199 -0
  137. data/spec/resources/corpora/training/revisions/part-1/326873205.txt +137 -0
  138. data/spec/resources/corpora/training/revisions/part-1/328774035.txt +162 -0
  139. data/spec/resources/corpora/training/revisions/part-2/307084144.txt +137 -0
  140. data/spec/resources/corpora/training/revisions/part-2/326978767.txt +199 -0
  141. data/spec/resources/corpora/training/revisions/part-2/328774110.txt +162 -0
  142. data/spec/resources/page_with_redirects.xml +85 -0
  143. data/spec/resources/redirect_page.xml +59 -0
  144. data/spec/resources/revision_simplified.xml +13 -0
  145. data/spec/resources/sample_revision.txt +137 -0
  146. data/spec/resources/sample_revision_clean_text.txt +1 -0
  147. data/spec/resources/sample_revision_plain_text.txt +183 -0
  148. data/spec/resources/vandalism_on_wikipedia.xml +234 -0
  149. data/spec/resources/vandalism_on_wikipedia_simplified.xml +119 -0
  150. data/spec/resources/wikipedia_tokens.txt +30 -0
  151. data/spec/spec_helper.rb +38 -0
  152. data/spec/support/macros/file_reading.rb +6 -0
  153. data/spec/support/macros/test_configuration.rb +81 -0
  154. data/spec/vandalism_detection/algorithms/kullback_leibler_divergence_spec.rb +34 -0
  155. data/spec/vandalism_detection/classifier_spec.rb +330 -0
  156. data/spec/vandalism_detection/configuration_spec.rb +601 -0
  157. data/spec/vandalism_detection/diff_spec.rb +40 -0
  158. data/spec/vandalism_detection/edit_spec.rb +122 -0
  159. data/spec/vandalism_detection/evaluator_spec.rb +711 -0
  160. data/spec/vandalism_detection/feature_calculator_spec.rb +135 -0
  161. data/spec/vandalism_detection/features/all_wordlists_frequency_spec.rb +30 -0
  162. data/spec/vandalism_detection/features/all_wordlists_impact_spec.rb +52 -0
  163. data/spec/vandalism_detection/features/anonymity_previous_spec.rb +67 -0
  164. data/spec/vandalism_detection/features/anonymity_spec.rb +17 -0
  165. data/spec/vandalism_detection/features/article_size_spec.rb +29 -0
  166. data/spec/vandalism_detection/features/bad_frequency_spec.rb +30 -0
  167. data/spec/vandalism_detection/features/bad_impact_spec.rb +53 -0
  168. data/spec/vandalism_detection/features/base_spec.rb +41 -0
  169. data/spec/vandalism_detection/features/biased_frequency_spec.rb +30 -0
  170. data/spec/vandalism_detection/features/biased_impact_spec.rb +52 -0
  171. data/spec/vandalism_detection/features/blanking_spec.rb +35 -0
  172. data/spec/vandalism_detection/features/character_diversity_spec.rb +30 -0
  173. data/spec/vandalism_detection/features/character_sequence_spec.rb +31 -0
  174. data/spec/vandalism_detection/features/comment_bad_frequency_spec.rb +28 -0
  175. data/spec/vandalism_detection/features/comment_biased_frequency_spec.rb +28 -0
  176. data/spec/vandalism_detection/features/comment_length_spec.rb +21 -0
  177. data/spec/vandalism_detection/features/comment_markup_frequency_spec.rb +28 -0
  178. data/spec/vandalism_detection/features/comment_pronoun_frequency_spec.rb +28 -0
  179. data/spec/vandalism_detection/features/comment_sex_frequency_spec.rb +28 -0
  180. data/spec/vandalism_detection/features/comment_vulgarism_frequency_spec.rb +28 -0
  181. data/spec/vandalism_detection/features/compressibility_spec.rb +38 -0
  182. data/spec/vandalism_detection/features/contains_base_spec.rb +27 -0
  183. data/spec/vandalism_detection/features/copyedit_spec.rb +30 -0
  184. data/spec/vandalism_detection/features/digit_ratio_spec.rb +30 -0
  185. data/spec/vandalism_detection/features/edits_per_user_spec.rb +51 -0
  186. data/spec/vandalism_detection/features/emoticons_frequency_spec.rb +30 -0
  187. data/spec/vandalism_detection/features/emoticons_impact_spec.rb +45 -0
  188. data/spec/vandalism_detection/features/frequency_base_spec.rb +20 -0
  189. data/spec/vandalism_detection/features/impact_base_spec.rb +36 -0
  190. data/spec/vandalism_detection/features/inserted_character_distribution_spec.rb +40 -0
  191. data/spec/vandalism_detection/features/inserted_external_links_spec.rb +29 -0
  192. data/spec/vandalism_detection/features/inserted_internal_links_spec.rb +29 -0
  193. data/spec/vandalism_detection/features/inserted_size_spec.rb +29 -0
  194. data/spec/vandalism_detection/features/inserted_words_spec.rb +29 -0
  195. data/spec/vandalism_detection/features/longest_word_spec.rb +29 -0
  196. data/spec/vandalism_detection/features/markup_frequency_spec.rb +31 -0
  197. data/spec/vandalism_detection/features/markup_impact_spec.rb +53 -0
  198. data/spec/vandalism_detection/features/non_alphanumeric_ratio_spec.rb +30 -0
  199. data/spec/vandalism_detection/features/personal_life_spec.rb +22 -0
  200. data/spec/vandalism_detection/features/pronoun_frequency_spec.rb +30 -0
  201. data/spec/vandalism_detection/features/pronoun_impact_spec.rb +53 -0
  202. data/spec/vandalism_detection/features/removed_all_wordlists_frequency_spec.rb +30 -0
  203. data/spec/vandalism_detection/features/removed_bad_frequency_spec.rb +30 -0
  204. data/spec/vandalism_detection/features/removed_biased_frequency_spec.rb +30 -0
  205. data/spec/vandalism_detection/features/removed_character_distribution_spec.rb +40 -0
  206. data/spec/vandalism_detection/features/removed_emoticons_frequency_spec.rb +30 -0
  207. data/spec/vandalism_detection/features/removed_markup_frequency_spec.rb +30 -0
  208. data/spec/vandalism_detection/features/removed_pronoun_frequency_spec.rb +30 -0
  209. data/spec/vandalism_detection/features/removed_sex_frequency_spec.rb +30 -0
  210. data/spec/vandalism_detection/features/removed_size_spec.rb +29 -0
  211. data/spec/vandalism_detection/features/removed_vulgarism_frequency_spec.rb +30 -0
  212. data/spec/vandalism_detection/features/removed_words_spec.rb +29 -0
  213. data/spec/vandalism_detection/features/replacement_similarity_spec.rb +37 -0
  214. data/spec/vandalism_detection/features/reverted_spec.rb +24 -0
  215. data/spec/vandalism_detection/features/revisions_character_distribution_spec.rb +40 -0
  216. data/spec/vandalism_detection/features/same_editor_spec.rb +71 -0
  217. data/spec/vandalism_detection/features/sex_frequency_spec.rb +30 -0
  218. data/spec/vandalism_detection/features/sex_impact_spec.rb +53 -0
  219. data/spec/vandalism_detection/features/size_increment_spec.rb +29 -0
  220. data/spec/vandalism_detection/features/size_ratio_spec.rb +48 -0
  221. data/spec/vandalism_detection/features/term_frequency_spec.rb +32 -0
  222. data/spec/vandalism_detection/features/time_interval_spec.rb +56 -0
  223. data/spec/vandalism_detection/features/time_of_day_spec.rb +16 -0
  224. data/spec/vandalism_detection/features/upper_case_ratio_spec.rb +30 -0
  225. data/spec/vandalism_detection/features/upper_case_words_ratio_spec.rb +33 -0
  226. data/spec/vandalism_detection/features/upper_to_lower_case_ratio_spec.rb +30 -0
  227. data/spec/vandalism_detection/features/vulgarism_frequency_spec.rb +30 -0
  228. data/spec/vandalism_detection/features/vulgarism_impact_spec.rb +52 -0
  229. data/spec/vandalism_detection/features/weekday_spec.rb +16 -0
  230. data/spec/vandalism_detection/features/words_increment_spec.rb +29 -0
  231. data/spec/vandalism_detection/instances_spec.rb +146 -0
  232. data/spec/vandalism_detection/page_parser_spec.rb +190 -0
  233. data/spec/vandalism_detection/page_spec.rb +134 -0
  234. data/spec/vandalism_detection/revision_parser_spec.rb +53 -0
  235. data/spec/vandalism_detection/revision_spec.rb +148 -0
  236. data/spec/vandalism_detection/test_dataset_spec.rb +227 -0
  237. data/spec/vandalism_detection/text_spec.rb +29 -0
  238. data/spec/vandalism_detection/training_dataset_spec.rb +266 -0
  239. data/spec/vandalism_detection/wikitext_extractor_spec.rb +97 -0
  240. data/spec/weka/classifiers/meta/one_class_classifier_spec.rb +82 -0
  241. data/spec/weka/classifiers/meta/real_ada_boost_spec.rb +37 -0
  242. data/spec/weka/classifiers/trees/balanced_random_forest_spec.rb +37 -0
  243. data/spec/weka/filters/supervised/instance/smote_spec.rb +5 -0
  244. data/wikipedia-vandalism_detection.gemspec +37 -0
  245. metadata +550 -0
@@ -0,0 +1,121 @@
1
+ require 'weka'
2
+ require 'wikipedia/vandalism_detection/configuration'
3
+ require 'weka/classifiers/meta/one_class_classifier'
4
+
5
module Wikipedia
  module VandalismDetection
    # Factory for the empty Weka datasets used during vandalism detection,
    # together with the class labels and attribute names those datasets share.
    class Instances
      # Positions of the class labels.
      REGULAR_CLASS_INDEX = 0
      VANDALISM_CLASS_INDEX = 1
      NOT_KNOWN_INDEX = 2

      # Attribute name and long class labels.
      CLASS = 'class'.freeze
      VANDALISM = 'vandalism'.freeze
      REGULAR = 'regular'.freeze
      NOT_KNOWN = '?'.freeze
      OUTLIER = Weka::Classifiers::Meta::OneClassClassifier::OUTLIER_LABEL

      # Abbreviated class labels.
      VANDALISM_SHORT = 'V'.freeze
      REGULAR_SHORT = 'R'.freeze

      # Attribute names of the revision id columns in test datasets.
      OLD_REVISION_ID = 'oldrevisionid'.freeze
      NEW_REVISION_ID = 'newrevisionid'.freeze

      # Class index => long label.
      CLASSES = {
        REGULAR_CLASS_INDEX => REGULAR,
        VANDALISM_CLASS_INDEX => VANDALISM,
        NOT_KNOWN_INDEX => NOT_KNOWN
      }.freeze

      # Class index => short label.
      CLASSES_SHORT = {
        REGULAR_CLASS_INDEX => REGULAR_SHORT,
        VANDALISM_CLASS_INDEX => VANDALISM_SHORT,
        NOT_KNOWN_INDEX => NOT_KNOWN
      }.freeze

      class << self
        # Returns an empty dataset with one numeric attribute per configured
        # feature (spaces in feature names become underscores) followed by the
        # nominal class attribute, which is marked as the class attribute.
        #
        # @example
        #   dataset = Wikipedia::VandalismDetection::Instances.empty
        def empty
          attribute_names = Wikipedia::VandalismDetection.config.features
          class_values = dataset_classes

          Weka::Core::Instances.new.with_attributes do
            attribute_names.each { |feature| numeric feature.tr(' ', '_') }
            nominal :class, values: class_values, class_attribute: true
          end
        end

        # Returns an empty dataset holding a single numeric attribute for the
        # given feature name plus the nominal class attribute.
        #
        # @example
        #   dataset = Wikipedia::VandalismDetection::Instances
        #             .empty_for_feature('anonymity')
        def empty_for_feature(name)
          class_values = dataset_classes

          Weka::Core::Instances.new.with_attributes do
            numeric name.tr(' ', '_')
            nominal :class, values: class_values, class_attribute: true
          end
        end

        # Returns an empty dataset holding a numeric attribute for the given
        # feature name followed by the numeric old and new revision id
        # attributes. No class attribute is added.
        #
        # @example
        #   dataset = Wikipedia::VandalismDetection::Instances
        #             .empty_for_test_feature('anonymity')
        def empty_for_test_feature(name)
          Weka::Core::Instances.new.with_attributes do
            numeric name.tr(' ', '_')
            numeric OLD_REVISION_ID
            numeric NEW_REVISION_ID
          end
        end

        # Returns an empty dataset that only holds the nominal class
        # attribute. It is used for creating the ground truth classification.
        def empty_for_test_class
          class_values = dataset_classes

          Weka::Core::Instances.new.with_attributes do
            nominal :class, values: class_values
          end
        end

        private

        # Builds a fresh array of the two known class labels, each placed at
        # its class index (regular at 0, vandalism at 1).
        def dataset_classes
          known = [REGULAR_CLASS_INDEX, VANDALISM_CLASS_INDEX]
          known.each_with_object([]) do |index, labels|
            labels[index] = CLASSES[index]
          end
        end
      end
    end
  end
end
@@ -0,0 +1,101 @@
1
+ require 'nokogiri'
2
+ require_relative 'edit'
3
+
4
module Wikipedia
  module VandalismDetection
    # Represents a full wikipedia page history.
    #
    # Revisions are kept in a hash keyed by revision id. The derived edit
    # lists are computed lazily and recomputed after a revision was added.
    class Page
      START_TAG = '<page>'.freeze
      END_TAG = '</page>'.freeze

      attr_accessor :id, :title
      attr_reader :revisions

      def initialize
        @revisions = {}
        @edits = []
        # Initialised like @edits so that #reverted_edits returns an empty
        # array (not nil) for a page without revisions.
        @reverted_edits = []
        @update_edits = false
        @update_reverted_edits = false
      end

      # Stores the revision under its id and marks both derived edit lists
      # as stale.
      def add_revision(revision)
        @revisions[revision.id] = revision

        @update_edits = true
        @update_reverted_edits = true
      end

      # Returns the edits (parent revision => child revision) of this page.
      # The list is rebuilt only if revisions were added since the last call.
      def edits
        @edits = create_edits_from(@revisions) if @update_edits
        @edits
      end

      # Returns the edits whose new revision has been reverted, see
      # #create_reverted_edits_from. Always returns an array.
      def reverted_edits
        if @update_reverted_edits
          @reverted_edits = create_reverted_edits_from(@revisions)
        end

        @reverted_edits ||= []
      end

      private

      # Creates one Edit for every revision whose parent revision is part of
      # the given revisions hash and attaches this page to each edit.
      def create_edits_from(revisions)
        @update_edits = false

        revisions.each_value.each_with_object([]) do |new_revision, edits|
          old_revision = revisions[new_revision.parent_id]
          next if old_revision.nil?

          edit = Edit.new(old_revision, new_revision)
          # The page is injected directly into the edit's @page variable.
          edit.instance_variable_set(:@page, self)
          edits << edit
        end
      end

      # Returns the reverted edits by comparing the text's sha1 hashes of
      # multiple revisions.
      # If the next but one revision has the same sha1 hash as a base revision
      # and the base revision has another hash than the one before, the
      # in-between revision is reverted.
      # The resulting edit holds the base revision as old revision and the
      # reverted as new revision.
      #
      # NOTE(review): the implemented condition compares the in-between
      # revision's hash (not the base revision's hash) with the hash of the
      # revision before the base revision - confirm which one is intended.
      def create_reverted_edits_from(revisions)
        @update_reverted_edits = false

        # Index the first child of every revision once (in insertion order)
        # instead of scanning all revisions again for each lookup.
        first_child_of = {}
        revisions.each_value do |revision|
          first_child_of[revision.parent_id] ||= revision
        end

        revisions.each_with_object([]) do |(current_id, first_revision), edits|
          second_revision = first_child_of[current_id]
          next unless second_revision

          third_revision = first_child_of[second_revision.id]
          next unless third_revision

          # Revisions are keyed by their id, so the predecessor can be looked
          # up directly (as done in #create_edits_from).
          previous_revision = revisions[first_revision.parent_id]
          previous_sha1 = previous_revision && previous_revision.sha1

          next unless first_revision.sha1 == third_revision.sha1
          next if second_revision.sha1 == previous_sha1

          edits << Edit.new(first_revision, second_revision)
        end
      end
    end
  end
end
@@ -0,0 +1,63 @@
1
+ require 'nokogiri'
2
+ require_relative 'page'
3
+ require_relative 'revision'
4
+
5
module Wikipedia
  module VandalismDetection
    # Turns the XML of a wikipedia page into a Page holding its revisions.
    class PageParser
      # Parses an xml string and returns a Wikipedia::VandalismDetection::Page.
      # Title and id are read from the page element, every revision element
      # found in the document is added to the page.
      def parse(xml)
        @page = Page.new
        document = Nokogiri::XML(xml, nil, 'UTF-8')

        @page.title = document.xpath('//page/title').inner_text
        @page.id = document.xpath('//page/id').inner_text
        build_revisions_from(document)

        @page
      end

      private

      # Returns the inner text of the nodes matching the xpath, or nil if
      # nothing matches.
      def node_value(document, xpath)
        nodes = document.xpath(xpath.to_s)
        nodes.empty? ? nil : nodes.inner_text
      end

      # Like #node_value, but wraps the result in a Text. Returns nil for a
      # missing or blank value.
      def node_text(document, xpath)
        raw = node_value(document, xpath)
        Text.new(raw) unless raw.blank?
      end

      # Builds and saves the available revisions to the @page variable
      def build_revisions_from(document)
        document.xpath('//revision').each do |element|
          @page.add_revision(build_revision(element))
        end
      end

      # Creates a Revision from a single revision element.
      def build_revision(element)
        Revision.new.tap do |revision|
          revision.id = node_value(element, :id)
          revision.timestamp = node_value(element, :timestamp)
          revision.comment = node_text(element, :comment)
          revision.text = node_text(element, :text)
          revision.sha1 = node_value(element, :sha1)
          revision.parent_id = node_value(element, :parentid)
          revision.contributor_username = node_value(element, 'contributor/username')

          # Only present values are assigned, the id first and the ip second.
          %w[contributor/id contributor/ip].each do |path|
            contributor = node_value(element, path)
            revision.contributor = contributor if contributor
          end
        end
      end
    end
  end
end
@@ -0,0 +1,75 @@
1
+ require_relative 'text'
2
+ require 'zlib'
3
+
4
module Wikipedia
  module VandalismDetection
    # A single revision of a wikipedia page. The revision text is held
    # zlib-compressed and is decompressed on every read.
    class Revision
      START_TAG = '<revision>'.freeze
      END_TAG = '</revision>'.freeze
      REDIRECT_PATTERN = /#REDIRECT\s+\[\[.*?\]\]/

      attr_accessor :id,
                    :parent_id,
                    :timestamp,
                    :contributor_username,
                    :sha1

      attr_reader :comment,
                  :contributor_id,
                  :contributor_ip

      def initialize
        @text = Zlib::Deflate.deflate('')
        @comment = Text.new
      end

      # Stores the contributor as ip if the value looks like an IPv4 address,
      # otherwise as contributor id.
      def contributor=(contributor)
        if ip_v4? contributor
          @contributor_ip = contributor
        else
          @contributor_id = contributor
        end
      end

      # Returns the contributor id, or the contributor ip if no id is set.
      def contributor
        @contributor_id || @contributor_ip
      end

      # Returns whether a contributor ip has been set.
      def anonymous_contributor?
        !@contributor_ip.nil?
      end

      # Returns whether the text contains a redirect directive.
      def redirect?
        !!(text =~ REDIRECT_PATTERN)
      end

      # Compresses text when set. Non-string values are treated as an empty
      # text. The given string itself is never modified, so frozen strings
      # are accepted as well.
      def text=(text)
        text = '' unless text.is_a?(String)

        # remove invalid utf-8 byte sequences by a round trip through UTF-16;
        # the non-destructive #encode leaves the caller's string untouched
        cleaned = text
                  .encode('UTF-16', 'UTF-8', invalid: :replace, replace: '')
                  .encode('UTF-8', 'UTF-16')

        @text = Zlib::Deflate.deflate(cleaned)
      end

      # Decompresses text when called
      def text
        Text.new(Zlib::Inflate.inflate(@text).force_encoding('utf-8'))
      end

      # Wraps the comment in a Text. Non-string values become an empty Text.
      def comment=(comment)
        comment = '' unless comment.is_a?(String)
        @comment = Text.new(comment)
      end

      private

      # Returns whether the given value is an IPv4.
      # The pattern is not anchored: any value containing four dot-separated
      # digit groups is accepted.
      def ip_v4?(value)
        !!value.to_s.match(/(\d+)\.(\d+)\.(\d+)\.(\d+)/)
      end
    end
  end
end
@@ -0,0 +1,67 @@
1
+ # encoding: utf-8
2
+
3
+ require 'nokogiri'
4
+ require 'wikipedia/vandalism_detection/revision'
5
+
6
module Wikipedia
  module VandalismDetection
    # Turns the XML of a single revision into a Revision.
    class RevisionParser
      # Properties that are parsed if no :only option is given.
      DEFAULT_PROPERTIES = %i[
        id
        parent_id
        timestamp
        contributor
        comment
        text
        sha1
      ].freeze

      # Parses an xml string and returns a Revision.
      #
      # @param xml [String] xml containing a revision element
      # @param options [Hash] with the :only key the parsed properties can be
      #   restricted to a list of property names (see DEFAULT_PROPERTIES)
      # @return [Revision] properties that are missing in the xml or are not
      #   requested are set to nil (or left at the revision's defaults)
      def parse(xml, options = {})
        document = Nokogiri::XML(xml, nil, 'UTF-8').xpath('//revision')
        revision = Revision.new

        properties = options[:only] || DEFAULT_PROPERTIES

        revision.id = node_value(document, properties, :id)
        revision.timestamp = node_value(document, properties, :timestamp)
        revision.comment = node_text(document, properties, :comment)
        revision.text = node_text(document, properties, :text)
        revision.sha1 = node_value(document, properties, :sha1)

        if properties.include?(:contributor)
          contributor_id = node_presence(document, 'contributor/id')
          contributor_ip = node_presence(document, 'contributor/ip')

          # Only assign values that are present: Revision#contributor= files
          # a nil value as contributor id, so assigning a missing ip would
          # wipe the id that has just been set.
          revision.contributor = contributor_id if contributor_id
          revision.contributor = contributor_ip if contributor_ip
          revision.contributor_username = node_presence(document, 'contributor/username')
        end

        if properties.include?(:parent_id)
          revision.parent_id = node_presence(document, 'parentid')
        end

        revision
      end

      private

      # Returns the value of the attribute's node if the attribute is part of
      # the requested properties, otherwise nil.
      def node_value(document, properties, attribute)
        return unless properties.include?(attribute)
        node_presence(document, attribute)
      end

      # Like #node_value, but wraps the result in a Text. Returns nil for a
      # missing or blank value.
      def node_text(document, properties, attribute)
        value = node_value(document, properties, attribute)
        return if value.blank?

        Text.new(value)
      end

      # Returns the inner text of the nodes matching the xpath, or nil if
      # nothing matches.
      def node_presence(document, xpath)
        node = document.xpath(xpath.to_s)
        return if node.empty?

        node.inner_text
      end
    end
  end
end