wikipedia-vandalism_detection 0.1.0-java
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +19 -0
- data/.rspec +2 -0
- data/.rubocop.yml +35 -0
- data/.travis.yml +6 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +4 -0
- data/README.md +288 -0
- data/Rakefile +11 -0
- data/config/wikipedia-vandalism-detection.yml.example +103 -0
- data/lib/java/SMOTE.jar +0 -0
- data/lib/java/balancedRandomForest.jar +0 -0
- data/lib/java/diffutils-1.3.0.jar +0 -0
- data/lib/java/oneClassClassifier.jar +0 -0
- data/lib/java/realAdaBoost.jar +0 -0
- data/lib/java/swc-engine-1.1.0-jar-with-dependencies.jar +0 -0
- data/lib/java/sweble-wikitext-extractor.jar +0 -0
- data/lib/weka/classifiers/meta/one_class_classifier.rb +21 -0
- data/lib/weka/classifiers/meta/real_ada_boost.rb +15 -0
- data/lib/weka/classifiers/trees/balanced_random_forest.rb +16 -0
- data/lib/weka/filters/supervised/instance/smote.rb +15 -0
- data/lib/wikipedia/vandalism_detection/algorithms/kullback_leibler_divergence.rb +103 -0
- data/lib/wikipedia/vandalism_detection/algorithms.rb +12 -0
- data/lib/wikipedia/vandalism_detection/classifier.rb +202 -0
- data/lib/wikipedia/vandalism_detection/configuration.rb +350 -0
- data/lib/wikipedia/vandalism_detection/diff.rb +36 -0
- data/lib/wikipedia/vandalism_detection/edit.rb +81 -0
- data/lib/wikipedia/vandalism_detection/evaluator.rb +640 -0
- data/lib/wikipedia/vandalism_detection/exceptions.rb +47 -0
- data/lib/wikipedia/vandalism_detection/feature_calculator.rb +94 -0
- data/lib/wikipedia/vandalism_detection/features/all_wordlists_frequency.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/all_wordlists_impact.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/anonymity.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/anonymity_previous.rb +32 -0
- data/lib/wikipedia/vandalism_detection/features/article_size.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/bad_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/bad_impact.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/base.rb +61 -0
- data/lib/wikipedia/vandalism_detection/features/biased_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/biased_impact.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/blanking.rb +26 -0
- data/lib/wikipedia/vandalism_detection/features/character_diversity.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/character_sequence.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/comment_bad_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/comment_biased_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/comment_length.rb +16 -0
- data/lib/wikipedia/vandalism_detection/features/comment_markup_frequency.rb +30 -0
- data/lib/wikipedia/vandalism_detection/features/comment_pronoun_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/comment_sex_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/comment_vulgarism_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/compressibility.rb +31 -0
- data/lib/wikipedia/vandalism_detection/features/contains_base.rb +16 -0
- data/lib/wikipedia/vandalism_detection/features/copyedit.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/digit_ratio.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/edits_per_user.rb +72 -0
- data/lib/wikipedia/vandalism_detection/features/emoticons_frequency.rb +27 -0
- data/lib/wikipedia/vandalism_detection/features/emoticons_impact.rb +29 -0
- data/lib/wikipedia/vandalism_detection/features/frequency_base.rb +18 -0
- data/lib/wikipedia/vandalism_detection/features/impact_base.rb +25 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_character_distribution.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_external_links.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_internal_links.rb +18 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_size.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_words.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/longest_word.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/markup_frequency.rb +29 -0
- data/lib/wikipedia/vandalism_detection/features/markup_impact.rb +30 -0
- data/lib/wikipedia/vandalism_detection/features/non_alphanumeric_ratio.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/personal_life.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/pronoun_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/pronoun_impact.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/removed_all_wordlists_frequency.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/removed_bad_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_biased_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_character_distribution.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/removed_emoticons_frequency.rb +28 -0
- data/lib/wikipedia/vandalism_detection/features/removed_markup_frequency.rb +30 -0
- data/lib/wikipedia/vandalism_detection/features/removed_pronoun_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_sex_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_size.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/removed_vulgarism_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_words.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/replacement_similarity.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/reverted.rb +16 -0
- data/lib/wikipedia/vandalism_detection/features/revisions_character_distribution.rb +25 -0
- data/lib/wikipedia/vandalism_detection/features/same_editor.rb +32 -0
- data/lib/wikipedia/vandalism_detection/features/sex_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/sex_impact.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/size_increment.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/size_ratio.rb +28 -0
- data/lib/wikipedia/vandalism_detection/features/term_frequency.rb +31 -0
- data/lib/wikipedia/vandalism_detection/features/time_interval.rb +38 -0
- data/lib/wikipedia/vandalism_detection/features/time_of_day.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/upper_case_ratio.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/upper_case_words_ratio.rb +33 -0
- data/lib/wikipedia/vandalism_detection/features/upper_to_lower_case_ratio.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/vulgarism_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/vulgarism_impact.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/weekday.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/words_increment.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features.rb +66 -0
- data/lib/wikipedia/vandalism_detection/instances.rb +121 -0
- data/lib/wikipedia/vandalism_detection/page.rb +101 -0
- data/lib/wikipedia/vandalism_detection/page_parser.rb +63 -0
- data/lib/wikipedia/vandalism_detection/revision.rb +75 -0
- data/lib/wikipedia/vandalism_detection/revision_parser.rb +67 -0
- data/lib/wikipedia/vandalism_detection/test_dataset.rb +374 -0
- data/lib/wikipedia/vandalism_detection/text.rb +23 -0
- data/lib/wikipedia/vandalism_detection/training_dataset.rb +282 -0
- data/lib/wikipedia/vandalism_detection/version.rb +5 -0
- data/lib/wikipedia/vandalism_detection/wikitext_extractor.rb +80 -0
- data/lib/wikipedia/vandalism_detection/word_lists/bad.rb +11 -0
- data/lib/wikipedia/vandalism_detection/word_lists/biased.rb +20 -0
- data/lib/wikipedia/vandalism_detection/word_lists/emoticons.rb +26 -0
- data/lib/wikipedia/vandalism_detection/word_lists/markup.rb +19 -0
- data/lib/wikipedia/vandalism_detection/word_lists/pronouns.rb +11 -0
- data/lib/wikipedia/vandalism_detection/word_lists/sex.rb +10 -0
- data/lib/wikipedia/vandalism_detection/word_lists/vulgarism.rb +96 -0
- data/lib/wikipedia/vandalism_detection/word_lists.rb +17 -0
- data/lib/wikipedia/vandalism_detection.rb +29 -0
- data/lib/wikipedia.rb +41 -0
- data/spec/factories/edit.rb +19 -0
- data/spec/factories/page.rb +12 -0
- data/spec/factories/revision.rb +51 -0
- data/spec/resources/config/wikipedia-vandalism-detection.yml +35 -0
- data/spec/resources/corpora/test/edits.csv +8 -0
- data/spec/resources/corpora/test/ground-truth.txt +3 -0
- data/spec/resources/corpora/test/revisions/part-1/326471754.txt +199 -0
- data/spec/resources/corpora/test/revisions/part-1/326873205.txt +137 -0
- data/spec/resources/corpora/test/revisions/part-1/328774035.txt +162 -0
- data/spec/resources/corpora/test/revisions/part-2/307084144.txt +137 -0
- data/spec/resources/corpora/test/revisions/part-2/326978767.txt +199 -0
- data/spec/resources/corpora/test/revisions/part-2/328774110.txt +162 -0
- data/spec/resources/corpora/training/annotations.csv +7 -0
- data/spec/resources/corpora/training/edits.csv +7 -0
- data/spec/resources/corpora/training/revisions/part-1/326471754.txt +199 -0
- data/spec/resources/corpora/training/revisions/part-1/326873205.txt +137 -0
- data/spec/resources/corpora/training/revisions/part-1/328774035.txt +162 -0
- data/spec/resources/corpora/training/revisions/part-2/307084144.txt +137 -0
- data/spec/resources/corpora/training/revisions/part-2/326978767.txt +199 -0
- data/spec/resources/corpora/training/revisions/part-2/328774110.txt +162 -0
- data/spec/resources/page_with_redirects.xml +85 -0
- data/spec/resources/redirect_page.xml +59 -0
- data/spec/resources/revision_simplified.xml +13 -0
- data/spec/resources/sample_revision.txt +137 -0
- data/spec/resources/sample_revision_clean_text.txt +1 -0
- data/spec/resources/sample_revision_plain_text.txt +183 -0
- data/spec/resources/vandalism_on_wikipedia.xml +234 -0
- data/spec/resources/vandalism_on_wikipedia_simplified.xml +119 -0
- data/spec/resources/wikipedia_tokens.txt +30 -0
- data/spec/spec_helper.rb +38 -0
- data/spec/support/macros/file_reading.rb +6 -0
- data/spec/support/macros/test_configuration.rb +81 -0
- data/spec/vandalism_detection/algorithms/kullback_leibler_divergence_spec.rb +34 -0
- data/spec/vandalism_detection/classifier_spec.rb +330 -0
- data/spec/vandalism_detection/configuration_spec.rb +601 -0
- data/spec/vandalism_detection/diff_spec.rb +40 -0
- data/spec/vandalism_detection/edit_spec.rb +122 -0
- data/spec/vandalism_detection/evaluator_spec.rb +711 -0
- data/spec/vandalism_detection/feature_calculator_spec.rb +135 -0
- data/spec/vandalism_detection/features/all_wordlists_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/all_wordlists_impact_spec.rb +52 -0
- data/spec/vandalism_detection/features/anonymity_previous_spec.rb +67 -0
- data/spec/vandalism_detection/features/anonymity_spec.rb +17 -0
- data/spec/vandalism_detection/features/article_size_spec.rb +29 -0
- data/spec/vandalism_detection/features/bad_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/bad_impact_spec.rb +53 -0
- data/spec/vandalism_detection/features/base_spec.rb +41 -0
- data/spec/vandalism_detection/features/biased_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/biased_impact_spec.rb +52 -0
- data/spec/vandalism_detection/features/blanking_spec.rb +35 -0
- data/spec/vandalism_detection/features/character_diversity_spec.rb +30 -0
- data/spec/vandalism_detection/features/character_sequence_spec.rb +31 -0
- data/spec/vandalism_detection/features/comment_bad_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_biased_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_length_spec.rb +21 -0
- data/spec/vandalism_detection/features/comment_markup_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_pronoun_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_sex_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_vulgarism_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/compressibility_spec.rb +38 -0
- data/spec/vandalism_detection/features/contains_base_spec.rb +27 -0
- data/spec/vandalism_detection/features/copyedit_spec.rb +30 -0
- data/spec/vandalism_detection/features/digit_ratio_spec.rb +30 -0
- data/spec/vandalism_detection/features/edits_per_user_spec.rb +51 -0
- data/spec/vandalism_detection/features/emoticons_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/emoticons_impact_spec.rb +45 -0
- data/spec/vandalism_detection/features/frequency_base_spec.rb +20 -0
- data/spec/vandalism_detection/features/impact_base_spec.rb +36 -0
- data/spec/vandalism_detection/features/inserted_character_distribution_spec.rb +40 -0
- data/spec/vandalism_detection/features/inserted_external_links_spec.rb +29 -0
- data/spec/vandalism_detection/features/inserted_internal_links_spec.rb +29 -0
- data/spec/vandalism_detection/features/inserted_size_spec.rb +29 -0
- data/spec/vandalism_detection/features/inserted_words_spec.rb +29 -0
- data/spec/vandalism_detection/features/longest_word_spec.rb +29 -0
- data/spec/vandalism_detection/features/markup_frequency_spec.rb +31 -0
- data/spec/vandalism_detection/features/markup_impact_spec.rb +53 -0
- data/spec/vandalism_detection/features/non_alphanumeric_ratio_spec.rb +30 -0
- data/spec/vandalism_detection/features/personal_life_spec.rb +22 -0
- data/spec/vandalism_detection/features/pronoun_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/pronoun_impact_spec.rb +53 -0
- data/spec/vandalism_detection/features/removed_all_wordlists_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_bad_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_biased_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_character_distribution_spec.rb +40 -0
- data/spec/vandalism_detection/features/removed_emoticons_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_markup_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_pronoun_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_sex_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_size_spec.rb +29 -0
- data/spec/vandalism_detection/features/removed_vulgarism_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_words_spec.rb +29 -0
- data/spec/vandalism_detection/features/replacement_similarity_spec.rb +37 -0
- data/spec/vandalism_detection/features/reverted_spec.rb +24 -0
- data/spec/vandalism_detection/features/revisions_character_distribution_spec.rb +40 -0
- data/spec/vandalism_detection/features/same_editor_spec.rb +71 -0
- data/spec/vandalism_detection/features/sex_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/sex_impact_spec.rb +53 -0
- data/spec/vandalism_detection/features/size_increment_spec.rb +29 -0
- data/spec/vandalism_detection/features/size_ratio_spec.rb +48 -0
- data/spec/vandalism_detection/features/term_frequency_spec.rb +32 -0
- data/spec/vandalism_detection/features/time_interval_spec.rb +56 -0
- data/spec/vandalism_detection/features/time_of_day_spec.rb +16 -0
- data/spec/vandalism_detection/features/upper_case_ratio_spec.rb +30 -0
- data/spec/vandalism_detection/features/upper_case_words_ratio_spec.rb +33 -0
- data/spec/vandalism_detection/features/upper_to_lower_case_ratio_spec.rb +30 -0
- data/spec/vandalism_detection/features/vulgarism_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/vulgarism_impact_spec.rb +52 -0
- data/spec/vandalism_detection/features/weekday_spec.rb +16 -0
- data/spec/vandalism_detection/features/words_increment_spec.rb +29 -0
- data/spec/vandalism_detection/instances_spec.rb +146 -0
- data/spec/vandalism_detection/page_parser_spec.rb +190 -0
- data/spec/vandalism_detection/page_spec.rb +134 -0
- data/spec/vandalism_detection/revision_parser_spec.rb +53 -0
- data/spec/vandalism_detection/revision_spec.rb +148 -0
- data/spec/vandalism_detection/test_dataset_spec.rb +227 -0
- data/spec/vandalism_detection/text_spec.rb +29 -0
- data/spec/vandalism_detection/training_dataset_spec.rb +266 -0
- data/spec/vandalism_detection/wikitext_extractor_spec.rb +97 -0
- data/spec/weka/classifiers/meta/one_class_classifier_spec.rb +82 -0
- data/spec/weka/classifiers/meta/real_ada_boost_spec.rb +37 -0
- data/spec/weka/classifiers/trees/balanced_random_forest_spec.rb +37 -0
- data/spec/weka/filters/supervised/instance/smote_spec.rb +5 -0
- data/wikipedia-vandalism_detection.gemspec +37 -0
- metadata +550 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: bf756c5448798deaecad9dff7f1158124f1665eae7f65e6e3cd1c018dcb4b273
|
4
|
+
data.tar.gz: ec45e4a4a402eb9dadada7570f094cd5be294634da3e31ce28603bd48666e74c
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: a72ec32117e19bbac2764eb01022f608c4eb91121e6d552c1a05a230b559a5279e51fe8e7970b48667d6450ebb0b23fc36338ade74bb47d729018fbdb4b39868
|
7
|
+
data.tar.gz: 8eb0fb8fe4d2e0ed681543cf0a76dd9a806253cf8e43ce2dd224137ad0970d1f7e9f84caf2b1fd22f289d3553414e449799d5f50c010d435ac2d6a3d5afa4a93
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/.rubocop.yml
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
AllCops:
|
2
|
+
TargetRubyVersion: 2.4
|
3
|
+
Exclude:
|
4
|
+
- 'bin/**/*'
|
5
|
+
- '*.gemspec'
|
6
|
+
- 'Gemfile'
|
7
|
+
- 'Gemfile.lock'
|
8
|
+
|
9
|
+
Style/Copyright:
|
10
|
+
Enabled: false
|
11
|
+
|
12
|
+
Style/Documentation:
|
13
|
+
Enabled: false
|
14
|
+
|
15
|
+
Metrics/LineLength:
|
16
|
+
Max: 80
|
17
|
+
Exclude:
|
18
|
+
- '**/*_spec.rb'
|
19
|
+
- 'spec/factories/*.rb'
|
20
|
+
|
21
|
+
Layout/MultilineMethodCallIndentation:
|
22
|
+
EnforcedStyle: indented
|
23
|
+
|
24
|
+
Style/FrozenStringLiteralComment:
|
25
|
+
Enabled: false
|
26
|
+
|
27
|
+
Metrics/ModuleLength:
|
28
|
+
Exclude:
|
29
|
+
- '**/*_spec.rb'
|
30
|
+
- 'spec/factories/*.rb'
|
31
|
+
|
32
|
+
Metrics/BlockLength:
|
33
|
+
Exclude:
|
34
|
+
- '**/*_spec.rb'
|
35
|
+
- 'spec/factories/*.rb'
|
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
data/README.md
ADDED
@@ -0,0 +1,288 @@
|
|
1
|
+
# Wikipedia Vandalism Detection
|
2
|
+
|
3
|
+
Vandalism detection on the Wikipedia history with JRuby v9.1.0.0+.
|
4
|
+
|
5
|
+
The Wikipedia Vandalism Detection Gem uses the Weka Machine-Learning Library
|
6
|
+
via the [weka](https://github.com/paulgoetze/weka-jruby) gem.
|
7
|
+
|
8
|
+
[![Gem Version](https://badge.fury.io/rb/wikipedia-vandalism_detection.svg)](http://badge.fury.io/rb/wikipedia-vandalism_detection)
|
9
|
+
[![Build Status](https://travis-ci.org/paulgoetze/wikipedia-vandalism-detection.png?branch=develop)](https://travis-ci.org/paulgoetze/wikipedia-vandalism-detection)
|
10
|
+
|
11
|
+
## What You can do with it
|
12
|
+
|
13
|
+
* parsing Wikipedia history pages to get edits and revisions
|
14
|
+
* creating training and test ARFF files from
|
15
|
+
the [WVC-PAN-10](https://www.uni-weimar.de/en/media/chairs/computer-science-and-media/webis/corpora/corpus-pan-wvc-10) and
|
16
|
+
the [WVC-PAN-11](https://www.uni-weimar.de/en/media/chairs/computer-science-and-media/webis/corpora/corpus-pan-wvc-11)
|
17
|
+
(See also http://pan.webis.de under category Wikipedia Vandalism Detection: [CLEF 2010](http://pan.webis.de/clef10/pan10-web/wikipedia-vandalism-detection) & [CLEF 2011](http://pan.webis.de/clef11/pan11-web/wikipedia-vandalism-detection))
|
18
|
+
|
19
|
+
* calculating vandalism features for a Wikipedia page (XML) from the history dump
|
20
|
+
* creating and evaluating a classifier with the created training ARFF file
|
21
|
+
* classifying new instances of Wikipedia edits as 'regular' or 'vandalism'
|
22
|
+
|
23
|
+
## Installation
|
24
|
+
|
25
|
+
Add this line to your application's Gemfile:
|
26
|
+
|
27
|
+
gem 'wikipedia-vandalism_detection'
|
28
|
+
|
29
|
+
And then execute:
|
30
|
+
|
31
|
+
$ bundle
|
32
|
+
|
33
|
+
Or install it yourself as:
|
34
|
+
|
35
|
+
$ gem install wikipedia-vandalism_detection
|
36
|
+
|
37
|
+
## Usage
|
38
|
+
|
39
|
+
require 'wikipedia/vandalism_detection'
|
40
|
+
|
41
|
+
### Configuration
|
42
|
+
|
43
|
+
To configure the system put a `wikipedia-vandalism-detection.yml` file in the
|
44
|
+
`config/` or `lib/config/` directory.
|
45
|
+
|
46
|
+
You can configure:
|
47
|
+
|
48
|
+
A) the training and test corpora directories and essential input and output files
|
49
|
+
|
50
|
+
```YAML
|
51
|
+
corpora:
|
52
|
+
base_directory: /home/user/corpora
|
53
|
+
|
54
|
+
training:
|
55
|
+
base_directory: training
|
56
|
+
annotations_file: annotations.csv
|
57
|
+
edits_file: edits.csv
|
58
|
+
revisions_directory: revisions
|
59
|
+
|
60
|
+
test:
|
61
|
+
base_directory: test
|
62
|
+
edits_file: edits.csv
|
63
|
+
revisions_directory: revisions
|
64
|
+
|
65
|
+
output:
|
66
|
+
base_directory: /home/user/output_path
|
67
|
+
training:
|
68
|
+
arff_file: training.arff
|
69
|
+
index_file: training_index.yml
|
70
|
+
test:
|
71
|
+
arff_file: test.arff
|
72
|
+
index_file: test_index.yml
|
73
|
+
```
|
74
|
+
|
75
|
+
Evaluation outputs are saved under the output base directory path.
|
76
|
+
|
77
|
+
B) the features used by the feature calculator
|
78
|
+
|
79
|
+
```YAML
|
80
|
+
features:
|
81
|
+
- anonymity
|
82
|
+
- biased frequency
|
83
|
+
- character sequence
|
84
|
+
- ...
|
85
|
+
```
|
86
|
+
|
87
|
+
C) the classifier type and its options and the number of cross validation splits
|
88
|
+
for the classifier evaluation
|
89
|
+
|
90
|
+
```YAML
|
91
|
+
classifier:
|
92
|
+
type: Trees::RandomForest # Weka classifier class
|
93
|
+
options: -I 10 -K 0.5 # same as for Weka, for further classifier options see Weka-dev documentation
|
94
|
+
cross-validation-fold: 5 # default is 10
|
95
|
+
training-data-options: balanced # default is unbalanced
|
96
|
+
```
|
97
|
+
|
98
|
+
`training-data-options` is used to resample the training dataset:
|
99
|
+
|
100
|
+
* `unbalanced` is the default value and uses the original dataset
|
101
|
+
* `balanced` uses random undersampling of the majority class
|
102
|
+
* `oversampled` uses SMOTE oversampling (with percentage `-p`) and random undersampling (with minority/majority class balance `-u`)
|
103
|
+
|
104
|
+
Examples:
|
105
|
+
|
106
|
+
```YAML
|
107
|
+
# 200% SMOTE oversampling with 300% random undersampling
|
108
|
+
training-data-options: oversampled -p 200 -u true 300
|
109
|
+
|
110
|
+
# default 100% SMOTE oversampling with 300% random undersampling
|
111
|
+
training-data-options: oversampled -u true 300
|
112
|
+
|
113
|
+
# 200% SMOTE oversampling with default full (100% minority/majority class balance)
|
114
|
+
# random undersampling
|
115
|
+
training-data-options: oversampled -p 200
|
116
|
+
|
117
|
+
# default 100% SMOTE oversampling without undersampling
|
118
|
+
training-data-options: oversampled -u false
|
119
|
+
```
|
120
|
+
|
121
|
+
Instead of the `true` option you can also use `t`, `y` and `yes` as well as their upper case counterparts.
|
122
|
+
|
123
|
+
### Examples
|
124
|
+
|
125
|
+
**Create training and test ARFF file from configured corpus:**
|
126
|
+
|
127
|
+
```ruby
|
128
|
+
training_dataset = Wikipedia::VandalismDetection::TrainingDataset.build
|
129
|
+
test_dataset = Wikipedia::VandalismDetection::TestDataset.build
|
130
|
+
```
|
131
|
+
|
132
|
+
While creating the training and test datasets, a corpus file index is created for each of them in the configured `index_file`
|
133
|
+
directory.
|
134
|
+
To run the corpus file index creation manually use:
|
135
|
+
|
136
|
+
```ruby
|
137
|
+
Wikipedia::VandalismDetection::TrainingDataset.create_file_index!
|
138
|
+
Wikipedia::VandalismDetection::TestDataset.create_file_index!
|
139
|
+
```
|
140
|
+
|
141
|
+
**Parse a Wikipedia page content:**
|
142
|
+
|
143
|
+
At the moment no namespaces are supported while parsing a page.
|
144
|
+
So, the `<page>...</page>` tags should not be included in a namespaced xml tag!
|
145
|
+
|
146
|
+
```ruby
|
147
|
+
xml = File.read('wikipedia_page.xml')
|
148
|
+
parser = Wikipedia::VandalismDetection::PageParser.new
|
149
|
+
page = parser.parse(xml)
|
150
|
+
|
151
|
+
# Work with revisions and edits from the page
|
152
|
+
page.revisions.each do |revision|
|
153
|
+
puts revision.id
|
154
|
+
puts revision.parent_id
|
155
|
+
end
|
156
|
+
|
157
|
+
page.edits.each do |edit|
|
158
|
+
puts edit.new_revision.id
|
159
|
+
puts edit.old_revision.id
|
160
|
+
end
|
161
|
+
```
|
162
|
+
|
163
|
+
**Use a classifier of configured type:**
|
164
|
+
|
165
|
+
Create the classifier:
|
166
|
+
|
167
|
+
```ruby
|
168
|
+
classifier = Wikipedia::VandalismDetection::Classifier.new
|
169
|
+
```
|
170
|
+
|
171
|
+
Evaluation of the classifier against the configured training corpus:
|
172
|
+
|
173
|
+
```ruby
|
174
|
+
# classifier.classifier_instance returns the weka classifier instance
|
175
|
+
evaluation = classifier.classifier_instance.cross_validate(folds: 10)
|
176
|
+
puts evaluation.class_details
|
177
|
+
```
|
178
|
+
|
179
|
+
Classify a new edit:
|
180
|
+
|
181
|
+
```ruby
|
182
|
+
# Classification of a Wikipedia Edit or a feature set
|
183
|
+
# 'edit' is a Wikipedia::VandalismDetection::Edit, this can be built manually or by
|
184
|
+
# parsing a Wikipedia page content and getting its edits
|
185
|
+
# The returned confidence is a value between 0.0 and 1.0 where 0.0 means 'regular' and 1.0 means 'vandalism'
|
186
|
+
confidence = classifier.classify(edit)
|
187
|
+
|
188
|
+
feature_calculator = Wikipedia::VandalismDetection::FeatureCalculator.new
|
189
|
+
features = feature_calculator.calculate_features_for(edit)
|
190
|
+
confidence = classifier.classify(features)
|
191
|
+
```
|
192
|
+
|
193
|
+
Evaluate test corpus classification:
|
194
|
+
|
195
|
+
```ruby
|
196
|
+
evaluator = classifier.evaluator
|
197
|
+
# or create a new evaluator
|
198
|
+
evaluator = Wikipedia::VandalismDetection::Evaluator.new(classifier)
|
199
|
+
|
200
|
+
performance_data = evaluator.evaluate_testcorpus_classification #default sample_count = 100
|
201
|
+
performance_data = evaluator.evaluate_testcorpus_classification(sample_count: 200)
|
202
|
+
|
203
|
+
# following attributes can be used for further computations
|
204
|
+
recall_values = performance_data[:recalls] # recall values for e.g. x-values of PRC or y-values of ROC
|
205
|
+
precision_values = performance_data[:precisions] # precision values for e.g. y-values of PRC
|
206
|
+
fp_rate_values = performance_data[:fp_rates] # false positive rate values for e.g. x-values of ROC
|
207
|
+
area_under_curve_pr = performance_data[:pr_auc] # computed from the precision and recall values
|
208
|
+
area_under_curve_ro = performance_data[:roc_auc] # computed from the recall and fp-rate values
|
209
|
+
total_recall = performance_data[:total_recall] # precision and recall values with maximum area (rectangle area)
|
210
|
+
total_precision = performance_data[:total_precision]
|
211
|
+
```
|
212
|
+
|
213
|
+
Get each feature's predictive value for analysis:
|
214
|
+
|
215
|
+
```ruby
|
216
|
+
evaluator = classifier.evaluator
|
217
|
+
# or create a new evaluator
|
218
|
+
evaluator = Wikipedia::VandalismDetection::Evaluator.new(classifier)
|
219
|
+
|
220
|
+
analysis_data = evaluator.feature_analysis #default sample_count = 100
|
221
|
+
analysis_data = evaluator.feature_analysis(sample_count: 1000)
|
222
|
+
```
|
223
|
+
|
224
|
+
This returns a hash comprising all feature names as configured as keys and the threshold hashes as values.
|
225
|
+
|
226
|
+
```ruby
|
227
|
+
{
|
228
|
+
feature_name_1:
|
229
|
+
{
|
230
|
+
0.0 => {fp:… , fn:… , tp:… , tn:… },
|
231
|
+
…,
|
232
|
+
1.0 => {fp:… , fn:… , tp:… , tn:… }
|
233
|
+
},
|
234
|
+
…,
|
235
|
+
feature_name_n:
|
236
|
+
{
|
237
|
+
0.0 => {fp:… , fn:… , tp:… , tn:… },
|
238
|
+
…,
|
239
|
+
1.0 => {fp:… , fn:… , tp:… , tn:… }
|
240
|
+
},
|
241
|
+
}
|
242
|
+
```
|
243
|
+
|
244
|
+
**Creating new Features:**
|
245
|
+
|
246
|
+
You can define your own new Feature classes and use them by configuration in the config.yml.
|
247
|
+
|
248
|
+
Make sure to define the Feature class inside of the `Wikipedia::VandalismDetection::Features` module
|
249
|
+
and to implement the `calculate` method
|
250
|
+
(also refer to the `Wikipedia::VandalismDetection::Features::Base` class definition).
|
251
|
+
|
252
|
+
```ruby
|
253
|
+
module Wikipedia
|
254
|
+
module VandalismDetection
|
255
|
+
module Features
|
256
|
+
class MyNewFeature < Base
|
257
|
+
def calculate(edit)
|
258
|
+
super # ensures raising an error if 'edit' is not an Edit.
|
259
|
+
|
260
|
+
# ...your implementation
|
261
|
+
end
|
262
|
+
end
|
263
|
+
end
|
264
|
+
end
|
265
|
+
end
|
266
|
+
```
|
267
|
+
|
268
|
+
While creating new Feature classes you should be aware of the following naming convention:
|
269
|
+
The feature's name in the config.yml is the *downcased name with spaces or dashes* of the feature class name
|
270
|
+
|
271
|
+
E.g.:
|
272
|
+
|
273
|
+
```YAML
|
274
|
+
features:
|
275
|
+
- my new feature
|
276
|
+
- my-new-feature
|
277
|
+
```
|
278
|
+
|
279
|
+
both search for a Feature class with the name `MyNewFeature`.
|
280
|
+
|
281
|
+
|
282
|
+
## Contributing
|
283
|
+
|
284
|
+
1. Fork it ( http://github.com/paulgoetze/wikipedia-vandalism_detection/fork )
|
285
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
286
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
287
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
288
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
@@ -0,0 +1,103 @@
|
|
1
|
+
# Configuring the training and test corpora directories and essential input and output files.
|
2
|
+
# As corpora the WVC-PAN-10 and WVC-PAN-11 can be used (see http://webis.de/ under Research -> Corpora).
|
3
|
+
|
4
|
+
corpora:
|
5
|
+
base_directory: /home/user/corpora
|
6
|
+
|
7
|
+
training:
|
8
|
+
base_directory: training
|
9
|
+
annotations_file: annotations.csv
|
10
|
+
edits_file: edits.csv
|
11
|
+
revisions_directory: revisions
|
12
|
+
|
13
|
+
test:
|
14
|
+
base_directory: test
|
15
|
+
edits_file: edits.csv
|
16
|
+
revisions_directory: revisions
|
17
|
+
|
18
|
+
output:
|
19
|
+
base_directory: /home/user/output_path
|
20
|
+
training:
|
21
|
+
arff_file: training.arff
|
22
|
+
index_file: training_index.yml
|
23
|
+
test:
|
24
|
+
arff_file: test.arff
|
25
|
+
index_file: test_index.yml
|
26
|
+
|
27
|
+
|
28
|
+
# Configuring the used features.
|
29
|
+
# See
|
30
|
+
|
31
|
+
features:
|
32
|
+
- anonymity
|
33
|
+
- anonymity previous
|
34
|
+
- all wordlists frequency
|
35
|
+
- all wordlists impact
|
36
|
+
- article size
|
37
|
+
- bad frequency
|
38
|
+
- bad impact
|
39
|
+
- biased frequency
|
40
|
+
- biased impact
|
41
|
+
- blanking
|
42
|
+
- character sequence
|
43
|
+
- character diversity
|
44
|
+
- comment length
|
45
|
+
- comment biased frequency
|
46
|
+
- comment pronoun frequency
|
47
|
+
- comment vulgarism frequency
|
48
|
+
- compressibility
|
49
|
+
- copyedit
|
50
|
+
- digit ratio
|
51
|
+
- edits per user
|
52
|
+
- emoticons frequency
|
53
|
+
- emoticons impact
|
54
|
+
- inserted size
|
55
|
+
- inserted words
|
56
|
+
- inserted character distribution
|
57
|
+
- inserted external links
|
58
|
+
- inserted internal links
|
59
|
+
- longest word
|
60
|
+
- markup frequency
|
61
|
+
- markup impact
|
62
|
+
- non-alphanumeric ratio
|
63
|
+
- personal life
|
64
|
+
- pronoun frequency
|
65
|
+
- pronoun impact
|
66
|
+
- removed size
|
67
|
+
- removed words
|
68
|
+
- removed all wordlists frequency
|
69
|
+
- removed bad frequency
|
70
|
+
- removed biased frequency
|
71
|
+
- removed character distribution
|
72
|
+
- removed emoticons frequency
|
73
|
+
- removed markup frequency
|
74
|
+
- removed pronoun frequency
|
75
|
+
- removed sex frequency
|
76
|
+
- removed vulgarism frequency
|
77
|
+
- replacement similarity
|
78
|
+
- reverted
|
79
|
+
- revisions character distribution
|
80
|
+
- sex frequency
|
81
|
+
- sex impact
|
82
|
+
- same editor
|
83
|
+
- size increment
|
84
|
+
- size ratio
|
85
|
+
- term frequency
|
86
|
+
- time interval
|
87
|
+
- time of day
|
88
|
+
- upper case ratio
|
89
|
+
- upper case words ratio
|
90
|
+
- upper to lower case ratio
|
91
|
+
- vulgarism frequency
|
92
|
+
- vulgarism impact
|
93
|
+
- weekday
|
94
|
+
- words increment
|
95
|
+
|
96
|
+
|
97
|
+
# Configuring the used classifier
|
98
|
+
|
99
|
+
classifier:
|
100
|
+
type: Trees::RandomForest # Weka classifier class
|
101
|
+
options: -I 10 -K 0.5 # same as for Weka, for further classifier options see the Weka-dev documentation
|
102
|
+
cross-validation-fold: 5 # default is 10
|
103
|
+
training-data-options: balanced # default is unbalanced
|
data/lib/java/SMOTE.jar
ADDED
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
@@ -0,0 +1,21 @@
|
|
1
|
+
require 'weka'
|
2
|
+
require 'weka/class_builder'
|
3
|
+
|
4
|
+
module Weka
  module Classifiers
    module Meta
      # The jar is required ahead of build_class; presumably the Java class
      # has to be loadable when the wrapper is built (confirm before reordering).
      require('java/oneClassClassifier.jar')
      include ClassBuilder

      # One class classifier by C. Hempstalk
      # (cite: http://dl.acm.org/citation.cfm?id=1431987).
      # The jar can be downloaded at:
      # http://sourceforge.net/projects/weka/files/weka-packages/oneClassClassifier1.0.4.zip
      build_class(:OneClassClassifier)

      # Reopens the built class to expose its type name.
      class OneClassClassifier
        class << self
          # Returns the namespaced type name of this classifier as a String.
          def type
            'Meta::OneClassClassifier'
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'weka'
|
2
|
+
require 'weka/class_builder'
|
3
|
+
|
4
|
+
module Weka
  module Classifiers
    module Meta
      # The jar is required ahead of build_class; presumably the Java class
      # has to be loadable when the wrapper is built (confirm before reordering).
      require('java/realAdaBoost.jar')
      include ClassBuilder

      # Real AdaBoost classifier, see:
      # http://www.stanford.edu/~hastie/Papers/AdditiveLogisticRegression/alr.pdf
      # The jar can be downloaded at:
      # http://prdownloads.sourceforge.net/weka/realAdaBoost1.0.1.zip?download
      build_class(:RealAdaBoost)
    end
  end
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
require 'weka'
|
2
|
+
require 'weka/class_builder'
|
3
|
+
|
4
|
+
module Weka
  module Classifiers
    module Trees
      # The jar is required ahead of build_class; presumably the Java class
      # has to be loadable when the wrapper is built (confirm before reordering).
      require('java/balancedRandomForest.jar')
      include ClassBuilder

      # Balanced RandomForest classifier, modified from
      # https://github.com/jdurbin/durbinlib/blob/master/src/durbin/weka/BalancedRandomForest.java
      # and
      # https://github.com/jdurbin/durbinlib/blob/master/src/durbin/weka/BalancedRandomTree.java
      build_class(:BalancedRandomForest)
    end
  end
end
|
@@ -0,0 +1,103 @@
|
|
1
|
+
require 'wikipedia/vandalism_detection/features/base'
|
2
|
+
|
3
|
+
module Wikipedia
  module VandalismDetection
    module Algorithms
      # Symmetric Kullback-Leibler divergence (with simple back-off) between
      # the alphanumeric character distributions of two texts.
      class KullbackLeiblerDivergence
        # Maximum tolerated deviation of a distribution's total from 1.0.
        ALLOWED_ERROR = 9e-6

        # Matches a single alphanumeric character.
        ALNUM_PATTERN = /[[:alnum:]]/

        # Returns the Symmetric Kullback-Leibler divergence with simple back-off
        # of the given texts' character distributions. For implementation
        # details see:
        # https://web.archive.org/web/20130508191111/http://staff.science.uva.nl/~tsagias/?p=185.
        #
        # text_a - String whose distribution is iterated over.
        # text_b - String whose distribution is smoothed and compared against.
        #
        # Returns a Float, or Features::MISSING_VALUE if either text contains
        # no alphanumeric character after cleanup.
        # Raises Exception if a distribution does not sum up to 1.0.
        def of(text_a, text_b)
          text_a = cleanup_text(text_a)
          text_b = cleanup_text(text_b)

          unless text_a =~ ALNUM_PATTERN && text_b =~ ALNUM_PATTERN
            return Features::MISSING_VALUE
          end

          distribution_a = character_distribution(text_a)
          distribution_b = character_distribution(text_b)

          sum_a = distribution_a.values.inject(0, :+)
          sum_b = distribution_b.values.inject(0, :+)

          epsilon = [
            distribution_a.values.min / sum_a,
            distribution_b.values.min / sum_b
          ].min * 0.001

          # Every character of A that B lacks is assigned probability epsilon
          # in B's smoothed distribution, so B's observed probabilities are
          # discounted by exactly that mass to keep the smoothed total at 1.
          missing_in_b = distribution_a.keys - distribution_b.keys
          gamma = 1 - missing_in_b.size * epsilon

          check_integrity(distribution_a, sum_a)
          check_integrity(distribution_b, sum_b)

          distribution_a.inject(0.0) do |divergence, (character, share)|
            prob_a = share / sum_a

            prob_b =
              if distribution_b.key?(character)
                gamma * (distribution_b[character] / sum_b)
              else
                epsilon
              end

            divergence + (prob_a - prob_b) * Math.log(prob_a / prob_b)
          end
        end

        private

        # Re-encodes the text from binary to UTF-8, replacing every invalid or
        # undefined byte with the empty string.
        # NOTE(review): since the source encoding is declared as binary, this
        # presumably drops all non-ASCII bytes, not only invalid sequences -
        # confirm on the targeted Ruby/JRuby version.
        def cleanup_text(text)
          text.encode(
            'UTF-8',
            'binary',
            invalid: :replace,
            undef: :replace,
            replace: ''
          )
        end

        # Returns a hash mapping each downcased alphanumeric character (as a
        # Symbol) to its relative frequency among all alphanumeric characters
        # of the text. Returns an empty hash if there are none.
        def character_distribution(text)
          characters = text.downcase.scan(ALNUM_PATTERN)
          return {} if characters.empty?

          counts = Hash.new(0)
          characters.each { |character| counts[character.to_sym] += 1 }

          total = characters.size.to_f
          Hash[counts.map { |character, count| [character, count / total] }]
        end

        # Checks if values (each divided by sum) add up to 1.0 within
        # ALLOWED_ERROR, raises an error if they don't.
        # The exception class is kept as Exception so existing callers that
        # rescue it keep working.
        def check_integrity(distribution, sum)
          total = distribution.values.inject(0) do |result, value|
            result + (value / sum)
          end

          # The absolute value is taken of the difference, so totals above 1.0
          # are rejected as well as totals below it.
          return if (1.0 - total).abs <= ALLOWED_ERROR

          raise(Exception, 'Text distribution does not sum up to 1.0')
        end
      end
    end
  end
end
|