wikipedia-vandalism_detection 0.1.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +19 -0
- data/.rspec +2 -0
- data/.rubocop.yml +35 -0
- data/.travis.yml +6 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +4 -0
- data/README.md +288 -0
- data/Rakefile +11 -0
- data/config/wikipedia-vandalism-detection.yml.example +103 -0
- data/lib/java/SMOTE.jar +0 -0
- data/lib/java/balancedRandomForest.jar +0 -0
- data/lib/java/diffutils-1.3.0.jar +0 -0
- data/lib/java/oneClassClassifier.jar +0 -0
- data/lib/java/realAdaBoost.jar +0 -0
- data/lib/java/swc-engine-1.1.0-jar-with-dependencies.jar +0 -0
- data/lib/java/sweble-wikitext-extractor.jar +0 -0
- data/lib/weka/classifiers/meta/one_class_classifier.rb +21 -0
- data/lib/weka/classifiers/meta/real_ada_boost.rb +15 -0
- data/lib/weka/classifiers/trees/balanced_random_forest.rb +16 -0
- data/lib/weka/filters/supervised/instance/smote.rb +15 -0
- data/lib/wikipedia/vandalism_detection/algorithms/kullback_leibler_divergence.rb +103 -0
- data/lib/wikipedia/vandalism_detection/algorithms.rb +12 -0
- data/lib/wikipedia/vandalism_detection/classifier.rb +202 -0
- data/lib/wikipedia/vandalism_detection/configuration.rb +350 -0
- data/lib/wikipedia/vandalism_detection/diff.rb +36 -0
- data/lib/wikipedia/vandalism_detection/edit.rb +81 -0
- data/lib/wikipedia/vandalism_detection/evaluator.rb +640 -0
- data/lib/wikipedia/vandalism_detection/exceptions.rb +47 -0
- data/lib/wikipedia/vandalism_detection/feature_calculator.rb +94 -0
- data/lib/wikipedia/vandalism_detection/features/all_wordlists_frequency.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/all_wordlists_impact.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/anonymity.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/anonymity_previous.rb +32 -0
- data/lib/wikipedia/vandalism_detection/features/article_size.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/bad_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/bad_impact.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/base.rb +61 -0
- data/lib/wikipedia/vandalism_detection/features/biased_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/biased_impact.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/blanking.rb +26 -0
- data/lib/wikipedia/vandalism_detection/features/character_diversity.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/character_sequence.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/comment_bad_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/comment_biased_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/comment_length.rb +16 -0
- data/lib/wikipedia/vandalism_detection/features/comment_markup_frequency.rb +30 -0
- data/lib/wikipedia/vandalism_detection/features/comment_pronoun_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/comment_sex_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/comment_vulgarism_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/compressibility.rb +31 -0
- data/lib/wikipedia/vandalism_detection/features/contains_base.rb +16 -0
- data/lib/wikipedia/vandalism_detection/features/copyedit.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/digit_ratio.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/edits_per_user.rb +72 -0
- data/lib/wikipedia/vandalism_detection/features/emoticons_frequency.rb +27 -0
- data/lib/wikipedia/vandalism_detection/features/emoticons_impact.rb +29 -0
- data/lib/wikipedia/vandalism_detection/features/frequency_base.rb +18 -0
- data/lib/wikipedia/vandalism_detection/features/impact_base.rb +25 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_character_distribution.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_external_links.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_internal_links.rb +18 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_size.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_words.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/longest_word.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/markup_frequency.rb +29 -0
- data/lib/wikipedia/vandalism_detection/features/markup_impact.rb +30 -0
- data/lib/wikipedia/vandalism_detection/features/non_alphanumeric_ratio.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/personal_life.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/pronoun_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/pronoun_impact.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/removed_all_wordlists_frequency.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/removed_bad_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_biased_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_character_distribution.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/removed_emoticons_frequency.rb +28 -0
- data/lib/wikipedia/vandalism_detection/features/removed_markup_frequency.rb +30 -0
- data/lib/wikipedia/vandalism_detection/features/removed_pronoun_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_sex_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_size.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/removed_vulgarism_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_words.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/replacement_similarity.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/reverted.rb +16 -0
- data/lib/wikipedia/vandalism_detection/features/revisions_character_distribution.rb +25 -0
- data/lib/wikipedia/vandalism_detection/features/same_editor.rb +32 -0
- data/lib/wikipedia/vandalism_detection/features/sex_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/sex_impact.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/size_increment.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/size_ratio.rb +28 -0
- data/lib/wikipedia/vandalism_detection/features/term_frequency.rb +31 -0
- data/lib/wikipedia/vandalism_detection/features/time_interval.rb +38 -0
- data/lib/wikipedia/vandalism_detection/features/time_of_day.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/upper_case_ratio.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/upper_case_words_ratio.rb +33 -0
- data/lib/wikipedia/vandalism_detection/features/upper_to_lower_case_ratio.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/vulgarism_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/vulgarism_impact.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/weekday.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/words_increment.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features.rb +66 -0
- data/lib/wikipedia/vandalism_detection/instances.rb +121 -0
- data/lib/wikipedia/vandalism_detection/page.rb +101 -0
- data/lib/wikipedia/vandalism_detection/page_parser.rb +63 -0
- data/lib/wikipedia/vandalism_detection/revision.rb +75 -0
- data/lib/wikipedia/vandalism_detection/revision_parser.rb +67 -0
- data/lib/wikipedia/vandalism_detection/test_dataset.rb +374 -0
- data/lib/wikipedia/vandalism_detection/text.rb +23 -0
- data/lib/wikipedia/vandalism_detection/training_dataset.rb +282 -0
- data/lib/wikipedia/vandalism_detection/version.rb +5 -0
- data/lib/wikipedia/vandalism_detection/wikitext_extractor.rb +80 -0
- data/lib/wikipedia/vandalism_detection/word_lists/bad.rb +11 -0
- data/lib/wikipedia/vandalism_detection/word_lists/biased.rb +20 -0
- data/lib/wikipedia/vandalism_detection/word_lists/emoticons.rb +26 -0
- data/lib/wikipedia/vandalism_detection/word_lists/markup.rb +19 -0
- data/lib/wikipedia/vandalism_detection/word_lists/pronouns.rb +11 -0
- data/lib/wikipedia/vandalism_detection/word_lists/sex.rb +10 -0
- data/lib/wikipedia/vandalism_detection/word_lists/vulgarism.rb +96 -0
- data/lib/wikipedia/vandalism_detection/word_lists.rb +17 -0
- data/lib/wikipedia/vandalism_detection.rb +29 -0
- data/lib/wikipedia.rb +41 -0
- data/spec/factories/edit.rb +19 -0
- data/spec/factories/page.rb +12 -0
- data/spec/factories/revision.rb +51 -0
- data/spec/resources/config/wikipedia-vandalism-detection.yml +35 -0
- data/spec/resources/corpora/test/edits.csv +8 -0
- data/spec/resources/corpora/test/ground-truth.txt +3 -0
- data/spec/resources/corpora/test/revisions/part-1/326471754.txt +199 -0
- data/spec/resources/corpora/test/revisions/part-1/326873205.txt +137 -0
- data/spec/resources/corpora/test/revisions/part-1/328774035.txt +162 -0
- data/spec/resources/corpora/test/revisions/part-2/307084144.txt +137 -0
- data/spec/resources/corpora/test/revisions/part-2/326978767.txt +199 -0
- data/spec/resources/corpora/test/revisions/part-2/328774110.txt +162 -0
- data/spec/resources/corpora/training/annotations.csv +7 -0
- data/spec/resources/corpora/training/edits.csv +7 -0
- data/spec/resources/corpora/training/revisions/part-1/326471754.txt +199 -0
- data/spec/resources/corpora/training/revisions/part-1/326873205.txt +137 -0
- data/spec/resources/corpora/training/revisions/part-1/328774035.txt +162 -0
- data/spec/resources/corpora/training/revisions/part-2/307084144.txt +137 -0
- data/spec/resources/corpora/training/revisions/part-2/326978767.txt +199 -0
- data/spec/resources/corpora/training/revisions/part-2/328774110.txt +162 -0
- data/spec/resources/page_with_redirects.xml +85 -0
- data/spec/resources/redirect_page.xml +59 -0
- data/spec/resources/revision_simplified.xml +13 -0
- data/spec/resources/sample_revision.txt +137 -0
- data/spec/resources/sample_revision_clean_text.txt +1 -0
- data/spec/resources/sample_revision_plain_text.txt +183 -0
- data/spec/resources/vandalism_on_wikipedia.xml +234 -0
- data/spec/resources/vandalism_on_wikipedia_simplified.xml +119 -0
- data/spec/resources/wikipedia_tokens.txt +30 -0
- data/spec/spec_helper.rb +38 -0
- data/spec/support/macros/file_reading.rb +6 -0
- data/spec/support/macros/test_configuration.rb +81 -0
- data/spec/vandalism_detection/algorithms/kullback_leibler_divergence_spec.rb +34 -0
- data/spec/vandalism_detection/classifier_spec.rb +330 -0
- data/spec/vandalism_detection/configuration_spec.rb +601 -0
- data/spec/vandalism_detection/diff_spec.rb +40 -0
- data/spec/vandalism_detection/edit_spec.rb +122 -0
- data/spec/vandalism_detection/evaluator_spec.rb +711 -0
- data/spec/vandalism_detection/feature_calculator_spec.rb +135 -0
- data/spec/vandalism_detection/features/all_wordlists_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/all_wordlists_impact_spec.rb +52 -0
- data/spec/vandalism_detection/features/anonymity_previous_spec.rb +67 -0
- data/spec/vandalism_detection/features/anonymity_spec.rb +17 -0
- data/spec/vandalism_detection/features/article_size_spec.rb +29 -0
- data/spec/vandalism_detection/features/bad_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/bad_impact_spec.rb +53 -0
- data/spec/vandalism_detection/features/base_spec.rb +41 -0
- data/spec/vandalism_detection/features/biased_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/biased_impact_spec.rb +52 -0
- data/spec/vandalism_detection/features/blanking_spec.rb +35 -0
- data/spec/vandalism_detection/features/character_diversity_spec.rb +30 -0
- data/spec/vandalism_detection/features/character_sequence_spec.rb +31 -0
- data/spec/vandalism_detection/features/comment_bad_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_biased_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_length_spec.rb +21 -0
- data/spec/vandalism_detection/features/comment_markup_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_pronoun_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_sex_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_vulgarism_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/compressibility_spec.rb +38 -0
- data/spec/vandalism_detection/features/contains_base_spec.rb +27 -0
- data/spec/vandalism_detection/features/copyedit_spec.rb +30 -0
- data/spec/vandalism_detection/features/digit_ratio_spec.rb +30 -0
- data/spec/vandalism_detection/features/edits_per_user_spec.rb +51 -0
- data/spec/vandalism_detection/features/emoticons_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/emoticons_impact_spec.rb +45 -0
- data/spec/vandalism_detection/features/frequency_base_spec.rb +20 -0
- data/spec/vandalism_detection/features/impact_base_spec.rb +36 -0
- data/spec/vandalism_detection/features/inserted_character_distribution_spec.rb +40 -0
- data/spec/vandalism_detection/features/inserted_external_links_spec.rb +29 -0
- data/spec/vandalism_detection/features/inserted_internal_links_spec.rb +29 -0
- data/spec/vandalism_detection/features/inserted_size_spec.rb +29 -0
- data/spec/vandalism_detection/features/inserted_words_spec.rb +29 -0
- data/spec/vandalism_detection/features/longest_word_spec.rb +29 -0
- data/spec/vandalism_detection/features/markup_frequency_spec.rb +31 -0
- data/spec/vandalism_detection/features/markup_impact_spec.rb +53 -0
- data/spec/vandalism_detection/features/non_alphanumeric_ratio_spec.rb +30 -0
- data/spec/vandalism_detection/features/personal_life_spec.rb +22 -0
- data/spec/vandalism_detection/features/pronoun_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/pronoun_impact_spec.rb +53 -0
- data/spec/vandalism_detection/features/removed_all_wordlists_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_bad_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_biased_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_character_distribution_spec.rb +40 -0
- data/spec/vandalism_detection/features/removed_emoticons_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_markup_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_pronoun_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_sex_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_size_spec.rb +29 -0
- data/spec/vandalism_detection/features/removed_vulgarism_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_words_spec.rb +29 -0
- data/spec/vandalism_detection/features/replacement_similarity_spec.rb +37 -0
- data/spec/vandalism_detection/features/reverted_spec.rb +24 -0
- data/spec/vandalism_detection/features/revisions_character_distribution_spec.rb +40 -0
- data/spec/vandalism_detection/features/same_editor_spec.rb +71 -0
- data/spec/vandalism_detection/features/sex_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/sex_impact_spec.rb +53 -0
- data/spec/vandalism_detection/features/size_increment_spec.rb +29 -0
- data/spec/vandalism_detection/features/size_ratio_spec.rb +48 -0
- data/spec/vandalism_detection/features/term_frequency_spec.rb +32 -0
- data/spec/vandalism_detection/features/time_interval_spec.rb +56 -0
- data/spec/vandalism_detection/features/time_of_day_spec.rb +16 -0
- data/spec/vandalism_detection/features/upper_case_ratio_spec.rb +30 -0
- data/spec/vandalism_detection/features/upper_case_words_ratio_spec.rb +33 -0
- data/spec/vandalism_detection/features/upper_to_lower_case_ratio_spec.rb +30 -0
- data/spec/vandalism_detection/features/vulgarism_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/vulgarism_impact_spec.rb +52 -0
- data/spec/vandalism_detection/features/weekday_spec.rb +16 -0
- data/spec/vandalism_detection/features/words_increment_spec.rb +29 -0
- data/spec/vandalism_detection/instances_spec.rb +146 -0
- data/spec/vandalism_detection/page_parser_spec.rb +190 -0
- data/spec/vandalism_detection/page_spec.rb +134 -0
- data/spec/vandalism_detection/revision_parser_spec.rb +53 -0
- data/spec/vandalism_detection/revision_spec.rb +148 -0
- data/spec/vandalism_detection/test_dataset_spec.rb +227 -0
- data/spec/vandalism_detection/text_spec.rb +29 -0
- data/spec/vandalism_detection/training_dataset_spec.rb +266 -0
- data/spec/vandalism_detection/wikitext_extractor_spec.rb +97 -0
- data/spec/weka/classifiers/meta/one_class_classifier_spec.rb +82 -0
- data/spec/weka/classifiers/meta/real_ada_boost_spec.rb +37 -0
- data/spec/weka/classifiers/trees/balanced_random_forest_spec.rb +37 -0
- data/spec/weka/filters/supervised/instance/smote_spec.rb +5 -0
- data/wikipedia-vandalism_detection.gemspec +37 -0
- metadata +550 -0
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
# The WikitextExtractor imports the WikitextExtractor class from the
|
|
2
|
+
# sweble-wikitext-extractor.jar
|
|
3
|
+
# The sweble-wikitext-extractor.jar is a custom Java project which uses the
|
|
4
|
+
# Sweble wikitext parser to extract plaintext out of wikimarkup text.
|
|
5
|
+
#
|
|
6
|
+
# The Sweble WikitextExtractor currently depends on the swc-engine -v1.1.0 with
|
|
7
|
+
# dependencies,
|
|
8
|
+
# see: http://sweble.org/downloads/swc-devel/master-latest/ to download it.
|
|
9
|
+
#
|
|
10
|
+
# The Java source code can be found on:
|
|
11
|
+
# webis.uni-weimar.de:/srv/cvsroot/code-in-progress/wikipedia-vandalism-detection/sweble-wikitext-extractor
|
|
12
|
+
module Wikipedia
|
|
13
|
+
module VandalismDetection
|
|
14
|
+
require 'java'
|
|
15
|
+
require 'java/swc-engine-1.1.0-jar-with-dependencies.jar'
|
|
16
|
+
require 'java/sweble-wikitext-extractor.jar'
|
|
17
|
+
|
|
18
|
+
java_import 'de.webis.sweble.WikitextExtractor'
|
|
19
|
+
|
|
20
|
+
class WikitextExtractionError < StandardError; end
|
|
21
|
+
|
|
22
|
+
# This class wraps the de.webis.sweble.WikitextExtractor Java class and
|
|
23
|
+
# provides methods to extract plaintext from wiki markup text both space
|
|
24
|
+
# preserving and cleaned without line breaks and whitespace.
|
|
25
|
+
class WikitextExtractor
|
|
26
|
+
REDIRECT = '#REDIRECT'.freeze
|
|
27
|
+
|
|
28
|
+
class << self
|
|
29
|
+
# Returns the extracted text from the given wiki markup preserving
|
|
30
|
+
# spacing with added section numbers.
|
|
31
|
+
def extract(wiki_text)
  # Re-encode to UTF-8 treating the input as binary; bytes that are invalid
  # or have no UTF-8 mapping are replaced with '' (i.e. dropped). Afterwards
  # every occurrence of the REDIRECT marker is stripped.
  sanitized = wiki_text
              .encode('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '')
              .gsub(REDIRECT, '')

  # Delegate the actual markup-to-plaintext conversion to a fresh extractor
  # instance.
  WikitextExtractor.new.extract(sanitized)
rescue => error
  # Any failure above (including a non-String argument) is re-raised as a
  # WikitextExtractionError carrying the original message and this method's
  # caller backtrace.
  raise WikitextExtractionError, "Wikitext extraction failed: \n#{error.message}", caller
end
|
|
40
|
+
|
|
41
|
+
# Returns the cleaned extracted text from the given wiki markup.
|
|
42
|
+
# Cleaned means a single string without breaks, multiple spaces and
|
|
43
|
+
# section numbers.
|
|
44
|
+
def extract_clean(wiki_text)
  # Cleaning helpers, listed in the order they are applied. Section numbers
  # are stripped first, while the text still has its line breaks.
  cleaning_steps = %i[
    remove_section_numbering_from
    remove_line_breaks_from
    remove_uris_from
    remove_special_signes_from
    remove_multiple_spaces_from
  ]

  plain_text = extract(wiki_text)

  # Feed the output of each helper into the next one, then trim the
  # surrounding whitespace of the final result.
  cleaning_steps.reduce(plain_text) { |text, step| send(step, text) }.strip
end
|
|
54
|
+
|
|
55
|
+
private
|
|
56
|
+
|
|
57
|
+
# removes 1., 1.1., 2.3.4. etc. at the beginning of a line
|
|
58
|
+
# Strips a leading section number such as "1.", "1.1.", "10." or "2.10.3."
# from the beginning of every line in +text+ and returns the new string.
#
# Each numbering component may have any number of digits (\d+); a
# single-digit pattern would leave "10." untouched and turn "2.10.3." into
# "10.3.".
def remove_section_numbering_from(text)
  text.gsub(/^(\d+\.)+/, '')
end
|
|
61
|
+
|
|
62
|
+
# Returns a copy of +text+ in which every run of one or more newline
# characters is replaced by a single space.
def remove_line_breaks_from(text)
  text.gsub(/\n+/) { ' ' }
end
|
|
65
|
+
|
|
66
|
+
# Returns a copy of +text+ in which every run of whitespace characters
# (spaces, tabs, newlines, ...) is collapsed into a single space.
def remove_multiple_spaces_from(text)
  whitespace_run = /\s+/
  text.gsub(whitespace_run, ' ')
end
|
|
69
|
+
|
|
70
|
+
# Returns a copy of +text+ with http, https and ftp URIs removed. The match
# is case-insensitive and tolerates one optional whitespace character on
# either side of the colon after the scheme; a URI ends at the next
# whitespace character.
def remove_uris_from(text)
  uri_pattern = %r{(https?|ftp)\s?:\s?\/\/[^\s\/$.?#].[^\s]*}i
  text.gsub(uri_pattern, '')
end
|
|
73
|
+
|
|
74
|
+
# Returns a copy of +text+ in which each of the characters
# [ ] { } | = is replaced by a single space.
#
# A character class is required here: a plain sequence of escaped
# characters would only match the literal six-character string "[]{}|=".
def remove_special_signes_from(text)
  text.gsub(/[\[\]{}|=]/, ' ')
end
|
|
77
|
+
end
|
|
78
|
+
end
|
|
79
|
+
end
|
|
80
|
+
end
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
module Wikipedia
|
|
2
|
+
module VandalismDetection
|
|
3
|
+
module WordLists
|
|
4
|
+
BAD = %i[
|
|
5
|
+
666 da dont dosent whatever guy hi nazi sup guise loser thats ugly wanna
|
|
6
|
+
whats wont gotta bloody fart pot prick stink smells smelly alot dunno
|
|
7
|
+
gotcha
|
|
8
|
+
].freeze
|
|
9
|
+
end
|
|
10
|
+
end
|
|
11
|
+
end
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
module Wikipedia
|
|
2
|
+
module VandalismDetection
|
|
3
|
+
module WordLists
|
|
4
|
+
BIASED = %i[
|
|
5
|
+
acclaimed amazing astonishing authoritative beautiful best brilliant
|
|
6
|
+
canonical celebrated charismatic classic cutting-edge defining
|
|
7
|
+
definitive eminent enigma exciting extraordinary fabulous famous
|
|
8
|
+
infamous fantastic fully genius global great greatest iconic immensely
|
|
9
|
+
impactful incendiary indisputable influential innovative inspired
|
|
10
|
+
intriguing leader leading legendary major masterly mature memorable
|
|
11
|
+
notable outstanding pioneer popular prestigious really remarkable
|
|
12
|
+
renowned respected seminal significant skillful solution single-handedly
|
|
13
|
+
staunch talented most top transcendent undoubtedly unique visionary
|
|
14
|
+
virtually virtuoso well-known well-established world-class worst coolest
|
|
15
|
+
super probably hate ugly fat lame weird strange everyone cares boring
|
|
16
|
+
boreing ever huge like idiotic absolute total totally
|
|
17
|
+
].freeze
|
|
18
|
+
end
|
|
19
|
+
end
|
|
20
|
+
end
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
module Wikipedia
  module VandalismDetection
    module WordLists
      # Emoticons as symbols. Characters such as parentheses, brackets and
      # dashes are backslash-escaped inside the symbols.
      # NOTE(review): presumably these are interpolated into a regular
      # expression by the emoticon features - confirm against the consumers.
      EMOTICONS = [
        :':\)', :':p', :':\(', :';\)', :':D', :';D', :';P', :';p', :':\-\)',
        :':\-\(', :';\-\)', :':\-D', :':\-p', :':\-P', :'8\-\)', :'8\)',
        :'\^\^', :'\*_\*', :'\^_\^', :':\-I', :':\-X', :':\-x', :'X\-p',
        :'X\-P', :':\-\]', :'\^\.\^', :':\*', :':\-\*', :XD, :'X\-D', :'8\-D',
        :'8D', :':\-O', :':\-o', :':\-\|', :'X\-\(', :'X\(', :'\-_\-', :':o\)',
        :':O\)', :'B\-\)', :':O', :':o', :':\-s', :':\-S', :':\-\/', :':\-\\',
        :T_T, :':\*\(', :':\*\-\(', :':\(\(', :'\*\-\*', :':\-\[', :':\->',
        :':\|', :':\-\|', :':\]', :':\[', :'\/:\(', :'\\:\(', :':\-$', :':$',
        :':\-6', :':\-9', :'@_@', :'<3', :'\|\-D', :':0', :':\-0', :o_O,
        :oO, :'\(\-:', :'\(\-;', :'\(:', :'\):', :'\)\-:', :'\(;', :'\(y\)',
        :'\(\.\)\(\.\)', :O_O, :'0_0', :'8\-\[', :'8\-\]', :'8\[', :'8\]',
        :'8\-\(', :'8\(', :':\-', :'%\)', :'%\-\)', :'8\|', :'8\-\|', :'=\)',
        :':\]', :':>', :':c\)', :'\[:', :'<:', :'c:', :'\(x', :'\(o:', :'\(c:',
        :'D:', :':\'\(', :':\'C', :';\(', :';o\)', :'\(o;', :':b', :':p', :'=P',
        :':P', :dx, :xP, :'d\-:', :'d:', :'q:', :'d=', :'d;', :'c\(:', :'=D',
        :'=\-D', :'=O', :'=o', :'=0', :'o=', :'O=', :'0=', :'\^_~', :'>_<',
        # '=[' and '=]' are two separate entries; a misplaced quote used to
        # fuse them into the single bogus symbol '=\[, :=\]'.
        :'~_~', :'>:', :':<', :'\(Y\)', :'\(=', :'\)=', :'=\(', :'=\[', :'=\]',
        :'\[=', :'\]='
      ].freeze
    end
  end
end
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
module Wikipedia
  module VandalismDetection
    module WordLists
      # Wiki markup tokens as symbols. The two opening-bracket tokens are
      # backslash-escaped.
      # NOTE(review): presumably consumed as regular-expression fragments by
      # the markup features - confirm against the consumers.
      #
      # Each token is listed exactly once (:defaultsort used to appear
      # twice).
      MARKUP = [
        :'\{\{',
        :'\[\[',
        :infobox,
        :category,
        :defaultsort,
        :'<ref>',
        :cite,
        :__toc__,
        :__forcetoc__,
        :reflist
      ].freeze
    end
  end
end
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
module Wikipedia
  module VandalismDetection
    module WordLists
      # First and second person pronouns (including archaic and dialect
      # forms) as symbols. Each pronoun is listed exactly once; y'all, youse
      # and yourselves used to appear twice.
      PRONOUNS = %i[
        i me myself mine my we us ourselves ourself ours our you yourself yours
        your thou thee thyself thine thy yourselves y'all youse you-uns
        yous yis y'all's selves yous's
      ].freeze
    end
  end
end
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
module Wikipedia
|
|
2
|
+
module VandalismDetection
|
|
3
|
+
module WordLists
|
|
4
|
+
# This list is taken from https://github.com/snipe/banbuilder and can be
|
|
5
|
+
# downloaded from:
|
|
6
|
+
# https://github.com/snipe/banbuilder/blob/master/word-dbs/wordlist.csv
|
|
7
|
+
VULGARISM = %i[
|
|
8
|
+
$#!+ $1ut $h1t $hit $lut 'ho 'hobag a$$ anus ass assmunch b1tch
|
|
9
|
+
ballsack bastard beaner beastiality biatch beeyotch bitchy
|
|
10
|
+
blow blowjob bollock bollocks bollok boner boob bugger buttplug
|
|
11
|
+
c-0-c-k c-o-c-k c-u-n-t c.0.c.k c.o.c.k. c.u.n. jerk jackoff
|
|
12
|
+
jackhole j3rk0ff homo hom0 hobag hell h0mo h0m0 goddamn goddammit
|
|
13
|
+
godamnit ghey ghay gfy gay fudgepacker fuckwad fucktard fuckoff
|
|
14
|
+
fucker fuck-tard fuck fellatio fellate felching felcher felch
|
|
15
|
+
fartknocker fart fannybandit fanny faggot fagg fag f.u.c.k f-u-c-k
|
|
16
|
+
dyke douchebag douche douch3 doosh dike dick damnit damn dammit d1ldo
|
|
17
|
+
d1ld0 d1ck d0uche d0uch3 cunt cumstain cum crap coon cock clitoris
|
|
18
|
+
clit cl1t cawk c0ck jerk0ff jerkoff jizz knobend labia lmfao moolie
|
|
19
|
+
muff nigga nigger p.u.s.s.y. piss piss-off pissoff prick pube pussy
|
|
20
|
+
queer retard retarded s-h-1-t s-h-i-t s.h.i.t. scrotum sh1t shit slut
|
|
21
|
+
smegma t1t tard terd tit tits titties turd twat vag wank wetback
|
|
22
|
+
whore whoreface 'f*ck' sh*t pu$$y p*ssy diligaf wtf stfu fu*ck fack
|
|
23
|
+
shite fxck sh!t @sshole assh0le assho!e a$$hole a$$h0le a$$h0!e
|
|
24
|
+
a$$h01e assho1e wh0re f@g f@gg0t f@ggot motherf*cker mofo cuntlicker
|
|
25
|
+
cuntface dickbag cockknocker beatch fucknut nucking futs mams cunny
|
|
26
|
+
quim clitty kike spic wop chink humper feltch feltcher fvck ahole
|
|
27
|
+
nads spick douchey bullturds gonads bitch butt fellatio lmao s-o-b
|
|
28
|
+
spunk he11 jizm jism bukkake shiz wigger gook ritard reetard
|
|
29
|
+
masterbate masturbate goatse masterbating masturbating hitler nazi
|
|
30
|
+
tubgirl gtfo foad r-tard rtard hoor g-spot gspot vulva assmaster
|
|
31
|
+
viagra phuck frack fuckwit assbang assbanged assbangs asshole
|
|
32
|
+
assholes asswipe asswipes b1tch bastards bitched bitches boners
|
|
33
|
+
bullshit bullshits bullshitted cameltoe chinc chincs chink chode
|
|
34
|
+
chodes clit clits cocks coons cumming cunts d1ck dickhead dickheads
|
|
35
|
+
doggie-style douchebags dumass dumbass dumbasses dykes faggit fags
|
|
36
|
+
fucked fucker fuckface fucks godamnit gooks humped humping jackass
|
|
37
|
+
jap japs jerk jizzed kikes knobend kooch kooches kootch fuckers
|
|
38
|
+
motherfucking niggah niggas niggers p.u.s.s.y. pussies queers rim s0b
|
|
39
|
+
shitface shithead shits shitted s.o.b. spik spiks twats whack whores
|
|
40
|
+
zoophile m-fucking mthrfucking muthrfucking mutherfucking
|
|
41
|
+
mutherfucker mtherfucker mthrfucker mthrf*cker whorehopper copulator
|
|
42
|
+
whoralicious whorealicious aeolus analprobe areola areole aryan arian
|
|
43
|
+
asses assfuck azazel baal babes bang banger barf bawdy beardedclam
|
|
44
|
+
beater beaver beer bigtits bimbo blew blow blowjobs blowup bod bodily
|
|
45
|
+
boink bone boned bong boobies boobs booby booger bookie booky bootee
|
|
46
|
+
bootie booty booze boozer boozy bosom bosomy bowel bowels bra
|
|
47
|
+
brassiere bung babe bush buttfuck cocaine kinky klan panties
|
|
48
|
+
pedophile pedophilia pedophiliac punkass queaf rape scantily essohbee
|
|
49
|
+
shithouse smut snatch toots doggie anorexia bulimia bulimiic burp
|
|
50
|
+
busty buttfucker caca cahone carnal carpetmuncher cervix climax
|
|
51
|
+
cocain cocksucker coital coke commie condom corpse coven crabs crack
|
|
52
|
+
crackwhore crappy cuervo cummin cumshot cumshots cunnilingus dago
|
|
53
|
+
dagos damned dick-ish dickish dickweed anorexic prostitute marijuana
|
|
54
|
+
lsd pcp diddle dawgie-style dimwit dingle doofus dopey douche drunk
|
|
55
|
+
dummy ejaculate enlargement erect erotic exotic extacy extasy faerie
|
|
56
|
+
faery fagged fagot fairy fisted fisting fisty floozy fondle foobar
|
|
57
|
+
foreskin frigg frigga fubar fucking fuckup ganja gays glans godamn
|
|
58
|
+
goddam goldenshower gonad gonads handjob hebe hemp heroin herpes
|
|
59
|
+
hijack hiv homey honky hooch hookah hooker hootch hooter hooters hump
|
|
60
|
+
hussy hymen inbred incest injun jerked jiz jizm horny junkie junky
|
|
61
|
+
kill kkk kraut kyke lech leper lesbians lesbos lez lezbian lezbians
|
|
62
|
+
lezbo lezbos lezzie lezzies lezzy loin loins lube lust lusty massa
|
|
63
|
+
masterbation masturbation maxi menses menstruate menstruation meth
|
|
64
|
+
molest moron motherfucka motherfucker murder muthafucker nad naked
|
|
65
|
+
napalm nappy nazism negro niggle nimrod ninny nooky nympho opiate
|
|
66
|
+
opium oral orally organ orgasm orgies orgy ovary ovum ovums paddy
|
|
67
|
+
pantie panty pastie pasty pecker pedo pee peepee penetrate
|
|
68
|
+
penetration penial penile perversion peyote phalli phallic
|
|
69
|
+
pillowbiter pimp pinko pissed pms polack porn porno pornography pot
|
|
70
|
+
potty prig prude pubic pubis punky puss queef queefing quife quicky
|
|
71
|
+
racist racy raped raper rapist raunch rectal rectum rectus reefer
|
|
72
|
+
reich revue risque rum rump sadism sadist satan scag schizo screw
|
|
73
|
+
screwed scrog scrot scrote scrud scum seaman seamen seduce semen
|
|
74
|
+
sex_story sexual shithole shitter shitty s*o*b sissy skag slave
|
|
75
|
+
sleaze sleazy sluts smutty sniper snuff sodom souse soused sperm
|
|
76
|
+
spooge stab steamy stiffy stoned strip stroke whacking suck sucked
|
|
77
|
+
sucking tampon tawdry teat teste testee testes testis thrust thug
|
|
78
|
+
tinkle titfuck titi titty whacked toke tramp trashy tush undies unwed
|
|
79
|
+
urinal urine uterus uzi valium virgin vixen vodka vomit voyeur vulgar
|
|
80
|
+
wad wazoo wedgie weed weenie weewee weiner weirdo wench whitey whiz
|
|
81
|
+
whored whorehouse whoring womb woody x-rated xxx b@lls yeasty yobbo
|
|
82
|
+
sumofabiatch doggy-style doggy wang dong d0ng w@ng wh0reface
|
|
83
|
+
wh0ref@ce wh0r3f@ce tittyfuck tittyfucker tittiefucker cockholster
|
|
84
|
+
cockblock gai gey faig faigt a55 a55hole gae corksucker rumprammer
|
|
85
|
+
slutdumper niggaz muthafuckaz gigolo pussypounder herp herpy
|
|
86
|
+
transsexual orgasmic cunilingus anilingus dickdipper dickwhipper
|
|
87
|
+
dicksipper dickripper dickflipper dickzipper homoey queero freex
|
|
88
|
+
cunthunter shamedame slutkiss shiteater fuckass fucka$$ clitorus
|
|
89
|
+
assfucker assfuckers dillweed cracker teabagging shitt azz fuk
|
|
90
|
+
fucknugget cuntlick g@y @ss beotch pussys 's***' paedophile
|
|
91
|
+
pedophiles pedophile sucks licker lickers bitchface idiot tosser
|
|
92
|
+
idiots tossers
|
|
93
|
+
].freeze
|
|
94
|
+
end
|
|
95
|
+
end
|
|
96
|
+
end
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/word_lists/bad'
|
|
2
|
+
require 'wikipedia/vandalism_detection/word_lists/biased'
|
|
3
|
+
require 'wikipedia/vandalism_detection/word_lists/pronouns'
|
|
4
|
+
require 'wikipedia/vandalism_detection/word_lists/sex'
|
|
5
|
+
require 'wikipedia/vandalism_detection/word_lists/vulgarism'
|
|
6
|
+
require 'wikipedia/vandalism_detection/word_lists/markup'
|
|
7
|
+
|
|
8
|
+
module Wikipedia
  module VandalismDetection
    module WordLists
      # Returns a new array containing the words of the BAD, BIASED,
      # PRONOUNS, SEX and VULGARISM lists, in that order, with duplicates
      # removed.
      #
      # Uses the non-destructive Array#uniq: Array#uniq! returns nil when the
      # array contains no duplicates, which would make this method return nil
      # instead of the list.
      def self.all
        [*BAD, *BIASED, *PRONOUNS, *SEX, *VULGARISM].uniq
      end
    end
  end
end
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
require 'wikipedia'
|
|
2
|
+
require 'wikipedia/vandalism_detection/version'
|
|
3
|
+
require 'wikipedia/vandalism_detection/configuration'
|
|
4
|
+
require 'wikipedia/vandalism_detection/exceptions'
|
|
5
|
+
|
|
6
|
+
require 'wikipedia/vandalism_detection/text'
|
|
7
|
+
require 'wikipedia/vandalism_detection/revision'
|
|
8
|
+
require 'wikipedia/vandalism_detection/edit'
|
|
9
|
+
require 'wikipedia/vandalism_detection/page'
|
|
10
|
+
require 'wikipedia/vandalism_detection/page_parser'
|
|
11
|
+
require 'wikipedia/vandalism_detection/revision_parser'
|
|
12
|
+
|
|
13
|
+
require 'wikipedia/vandalism_detection/word_lists'
|
|
14
|
+
require 'wikipedia/vandalism_detection/diff'
|
|
15
|
+
require 'wikipedia/vandalism_detection/wikitext_extractor'
|
|
16
|
+
require 'wikipedia/vandalism_detection/features'
|
|
17
|
+
require 'wikipedia/vandalism_detection/feature_calculator'
|
|
18
|
+
|
|
19
|
+
require 'wikipedia/vandalism_detection/instances'
|
|
20
|
+
require 'wikipedia/vandalism_detection/training_dataset'
|
|
21
|
+
require 'wikipedia/vandalism_detection/test_dataset'
|
|
22
|
+
require 'wikipedia/vandalism_detection/classifier'
|
|
23
|
+
require 'wikipedia/vandalism_detection/evaluator'
|
|
24
|
+
|
|
25
|
+
require 'weka/classifiers/meta/one_class_classifier'
|
|
26
|
+
require 'weka/classifiers/meta/real_ada_boost'
|
|
27
|
+
require 'weka/classifiers/trees/balanced_random_forest'
|
|
28
|
+
|
|
29
|
+
require 'weka/filters/supervised/instance/smote'
|
data/lib/wikipedia.rb
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
require 'open-uri'
|
|
2
|
+
require 'nokogiri'
|
|
3
|
+
require 'timeout'
|
|
4
|
+
|
|
5
|
+
# Small client for the query API of the English Wikipedia.
module Wikipedia
  # Returns the base URI (including the fixed query parameters) that every
  # API request starts with.
  def self.api_base_uri
    'https://en.wikipedia.org/w/api.php?format=xml&action=query&'
  end

  # Joins the given params into a "key=value&key=value" query string.
  # Values are not escaped here; escaping is applied to the complete URI.
  def self.param_string(params)
    params.map { |k, v| "#{k}=#{v}" }.join('&')
  end

  # Builds the escaped request URI for the given query params.
  #
  # URI.encode was removed in Ruby 3.0. URI::RFC2396_Parser#escape applies
  # the same escaping rules, so the produced URIs stay the same.
  def self.api_request_uri(params = {})
    URI::RFC2396_Parser.new.escape(api_base_uri + param_string(params))
  end

  # Reads the content behind the given uri and returns it as a String.
  #
  # Every attempt is limited to +timeout+ seconds. If an attempt raises a
  # StandardError (timeouts included), the request is retried up to +times+
  # additional times. When all attempts fail, a warning is printed and an
  # empty String is returned.
  def self.request_with_retry(uri, times = 1, timeout = 5)
    content = ''

    begin
      Timeout.timeout(timeout) do
        content = URI.parse(uri).read
      end
    rescue StandardError => error
      if times > 0
        times -= 1
        retry
      else
        warn "Requesting '#{uri}' failed multiple times.\n#{error.message}"
      end
    end

    content
  end

  # Requests the API with the given params and returns the response parsed
  # as a Nokogiri XML document.
  #
  # The helpers are called with an explicit receiver because they only exist
  # as module methods; an implicit call would fail when this method is used
  # through `include Wikipedia`.
  def api_request(params = {})
    uri = Wikipedia.api_request_uri(params)
    content = Wikipedia.request_with_retry(uri, 3)
    Nokogiri::XML(content)
  end

  module_function :api_request
end
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
# Factories for edits, i.e. pairs of an old and a new revision.
FactoryBot.define do
  edit_class = Wikipedia::VandalismDetection::Edit

  # Edit between the default old and new revision.
  factory :edit, class: edit_class do
    old_revision { FactoryBot.build(:old_revision) }
    new_revision { FactoryBot.build(:new_revision) }
    page_id { nil }
    page_title { nil }

    # The revisions are passed positionally, the page data as keywords.
    initialize_with do
      new(old_revision, new_revision, page_id: page_id, page_title: page_title)
    end
  end

  # Same as :edit, but the new revision comes from the :anonymous_revision
  # factory.
  factory :anonymous_edit, class: edit_class do
    old_revision { FactoryBot.build(:old_revision) }
    new_revision { FactoryBot.build(:anonymous_revision) }
    page_id { nil }
    page_title { nil }

    initialize_with do
      new(old_revision, new_revision, page_id: page_id, page_title: page_title)
    end
  end
end
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
# Factory for a page that is pre-filled with three consecutive revisions.
FactoryBot.define do
  factory :page, class: Wikipedia::VandalismDetection::Page do
    id { nil }
    title { nil }

    # Revisions are added oldest first, all built with the contributor 'User'.
    after(:build) do |page|
      %i[old_revision new_revision even_newer_revision].each do |revision|
        page.add_revision FactoryBot.build(revision, contributor: 'User')
      end
    end
  end
end
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
# Factories for revisions used throughout the specs.
FactoryBot.define do
  revision_class = Wikipedia::VandalismDetection::Revision

  # Revision without ids, timestamp or content.
  factory :empty_revision, class: revision_class do
    id { nil }
    parent_id { nil }
    timestamp { nil }
    text { Wikipedia::VandalismDetection::Text.new }
    comment { Wikipedia::VandalismDetection::Text.new }
  end

  # First revision in the chain (id '1', no parent).
  factory :old_revision, class: revision_class do
    id { '1' }
    parent_id { nil }
    timestamp { nil }
    text { Wikipedia::VandalismDetection::Text.new('text 1') }
    comment { Wikipedia::VandalismDetection::Text.new }
  end

  # Successor of :old_revision (id '2', parent '1').
  factory :new_revision, class: revision_class do
    id { '2' }
    parent_id { '1' }
    timestamp { '2014-11-27T18:00:00Z' }
    text { Wikipedia::VandalismDetection::Text.new('text 2') }
    comment { Wikipedia::VandalismDetection::Text.new }
  end

  # Successor of :new_revision (id '3', parent '2'), one day later.
  factory :even_newer_revision, class: revision_class do
    id { '3' }
    parent_id { '2' }
    timestamp { '2014-11-28T18:00:00Z' }
    text { Wikipedia::VandalismDetection::Text.new('text 3') }
    comment { Wikipedia::VandalismDetection::Text.new }
  end

  # Like :new_revision, with an IP address as contributor.
  factory :anonymous_revision, class: revision_class do
    id { '2' }
    parent_id { '1' }
    timestamp { '2014-11-27T18:00:00Z' }
    text { Wikipedia::VandalismDetection::Text.new('text 2') }
    comment { Wikipedia::VandalismDetection::Text.new }
    contributor { '127.0.0.1' }
  end

  # Like :new_revision, with the contributor '12345'.
  factory :registered_revision, class: revision_class do
    id { '2' }
    parent_id { '1' }
    timestamp { '2014-11-27T18:00:00Z' }
    text { Wikipedia::VandalismDetection::Text.new('text 2') }
    comment { Wikipedia::VandalismDetection::Text.new }
    contributor { '12345' }
  end
end
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
corpora:
|
|
2
|
+
base_directory: ../../../../spec/resources/corpora
|
|
3
|
+
|
|
4
|
+
training:
|
|
5
|
+
base_directory: training
|
|
6
|
+
annotations_file: annotations.csv
|
|
7
|
+
edits_file: edits.csv
|
|
8
|
+
revisions_directory: revisions
|
|
9
|
+
|
|
10
|
+
test:
|
|
11
|
+
base_directory: test
|
|
12
|
+
edits_file: edits.csv
|
|
13
|
+
revisions_directory: revisions
|
|
14
|
+
ground_truth_file: ground-truth.txt
|
|
15
|
+
|
|
16
|
+
output:
|
|
17
|
+
base_directory: ../../../../spec/resources/build
|
|
18
|
+
training:
|
|
19
|
+
index_file: training_index.yml
|
|
20
|
+
arff_file: training.arff
|
|
21
|
+
test:
|
|
22
|
+
index_file: test_index.yml
|
|
23
|
+
arff_file: test.arff
|
|
24
|
+
classification_file: classification.txt
|
|
25
|
+
|
|
26
|
+
features:
|
|
27
|
+
- anonymity
|
|
28
|
+
- character sequence
|
|
29
|
+
- comment length
|
|
30
|
+
|
|
31
|
+
classifier:
|
|
32
|
+
type: Trees::RandomForest
|
|
33
|
+
options: -I 10
|
|
34
|
+
cross-validation-fold: 2
|
|
35
|
+
training-data-options: unbalanced
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
"editid","editor","oldrevisionid","newrevisionid","diffurl","edittime","editcomment","articleid","articletitle"
|
|
2
|
+
1641,"137.163.16.199",328774088,328774188,"http://en.wikipedia.org/w/index.php?diff=328774188&oldid=328774088","2009-11-30T10:23:13Z","/* Location */",100935,"Not annotated article"
|
|
3
|
+
1642,"J04n",307084144,326873205,"http://en.wikipedia.org/w/index.php?diff=326873205&oldid=307084144","2009-11-20T04:42:24Z","Repairing links to disambiguation pages - [[Wikipedia:Disambiguation pages with links|You can help!]]",19490449,"The Soundstage Sessions"
|
|
4
|
+
1643,"64.186.73.198",326471754,326978767,"http://en.wikipedia.org/w/index.php?diff=326978767&oldid=326471754","2009-11-20T19:32:23Z","/* Non-electric telephones */",2193804,"Invention of the telephone"
|
|
5
|
+
1644,"64.186.73.198",326471754,326978767,"http://en.wikipedia.org/w/index.php?diff=326978767&oldid=326471754","2009-11-20T19:32:23Z","/* Non-electric telephones */",2193804,"Invention of the telephone"
|
|
6
|
+
1647,"137.163.16.199",328774035,328774110,"http://en.wikipedia.org/w/index.php?diff=328774110&oldid=328774035","2009-11-30T10:23:13Z","/* Location */",100935,"Chinatown, Manhattan"
|
|
7
|
+
1648,"137.163.16.199",328774035,328774110,"http://en.wikipedia.org/w/index.php?diff=328774110&oldid=328774035","2009-11-30T10:23:13Z","/* Location */",100935,"Chinatown, Manhattan"
|
|
8
|
+
1649,"137.163.16.199",328774035,328774110,"http://en.wikipedia.org/w/index.php?diff=328774110&oldid=328774035","2009-11-30T10:23:13Z","/* Location */",100935,"Chinatown, Manhattan"
|