wikipedia-vandalism_detection 0.1.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +19 -0
- data/.rspec +2 -0
- data/.rubocop.yml +35 -0
- data/.travis.yml +6 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +4 -0
- data/README.md +288 -0
- data/Rakefile +11 -0
- data/config/wikipedia-vandalism-detection.yml.example +103 -0
- data/lib/java/SMOTE.jar +0 -0
- data/lib/java/balancedRandomForest.jar +0 -0
- data/lib/java/diffutils-1.3.0.jar +0 -0
- data/lib/java/oneClassClassifier.jar +0 -0
- data/lib/java/realAdaBoost.jar +0 -0
- data/lib/java/swc-engine-1.1.0-jar-with-dependencies.jar +0 -0
- data/lib/java/sweble-wikitext-extractor.jar +0 -0
- data/lib/weka/classifiers/meta/one_class_classifier.rb +21 -0
- data/lib/weka/classifiers/meta/real_ada_boost.rb +15 -0
- data/lib/weka/classifiers/trees/balanced_random_forest.rb +16 -0
- data/lib/weka/filters/supervised/instance/smote.rb +15 -0
- data/lib/wikipedia/vandalism_detection/algorithms/kullback_leibler_divergence.rb +103 -0
- data/lib/wikipedia/vandalism_detection/algorithms.rb +12 -0
- data/lib/wikipedia/vandalism_detection/classifier.rb +202 -0
- data/lib/wikipedia/vandalism_detection/configuration.rb +350 -0
- data/lib/wikipedia/vandalism_detection/diff.rb +36 -0
- data/lib/wikipedia/vandalism_detection/edit.rb +81 -0
- data/lib/wikipedia/vandalism_detection/evaluator.rb +640 -0
- data/lib/wikipedia/vandalism_detection/exceptions.rb +47 -0
- data/lib/wikipedia/vandalism_detection/feature_calculator.rb +94 -0
- data/lib/wikipedia/vandalism_detection/features/all_wordlists_frequency.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/all_wordlists_impact.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/anonymity.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/anonymity_previous.rb +32 -0
- data/lib/wikipedia/vandalism_detection/features/article_size.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/bad_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/bad_impact.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/base.rb +61 -0
- data/lib/wikipedia/vandalism_detection/features/biased_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/biased_impact.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/blanking.rb +26 -0
- data/lib/wikipedia/vandalism_detection/features/character_diversity.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/character_sequence.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/comment_bad_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/comment_biased_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/comment_length.rb +16 -0
- data/lib/wikipedia/vandalism_detection/features/comment_markup_frequency.rb +30 -0
- data/lib/wikipedia/vandalism_detection/features/comment_pronoun_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/comment_sex_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/comment_vulgarism_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/compressibility.rb +31 -0
- data/lib/wikipedia/vandalism_detection/features/contains_base.rb +16 -0
- data/lib/wikipedia/vandalism_detection/features/copyedit.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/digit_ratio.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/edits_per_user.rb +72 -0
- data/lib/wikipedia/vandalism_detection/features/emoticons_frequency.rb +27 -0
- data/lib/wikipedia/vandalism_detection/features/emoticons_impact.rb +29 -0
- data/lib/wikipedia/vandalism_detection/features/frequency_base.rb +18 -0
- data/lib/wikipedia/vandalism_detection/features/impact_base.rb +25 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_character_distribution.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_external_links.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_internal_links.rb +18 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_size.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_words.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/longest_word.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/markup_frequency.rb +29 -0
- data/lib/wikipedia/vandalism_detection/features/markup_impact.rb +30 -0
- data/lib/wikipedia/vandalism_detection/features/non_alphanumeric_ratio.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/personal_life.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/pronoun_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/pronoun_impact.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/removed_all_wordlists_frequency.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/removed_bad_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_biased_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_character_distribution.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/removed_emoticons_frequency.rb +28 -0
- data/lib/wikipedia/vandalism_detection/features/removed_markup_frequency.rb +30 -0
- data/lib/wikipedia/vandalism_detection/features/removed_pronoun_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_sex_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_size.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/removed_vulgarism_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_words.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/replacement_similarity.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/reverted.rb +16 -0
- data/lib/wikipedia/vandalism_detection/features/revisions_character_distribution.rb +25 -0
- data/lib/wikipedia/vandalism_detection/features/same_editor.rb +32 -0
- data/lib/wikipedia/vandalism_detection/features/sex_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/sex_impact.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/size_increment.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/size_ratio.rb +28 -0
- data/lib/wikipedia/vandalism_detection/features/term_frequency.rb +31 -0
- data/lib/wikipedia/vandalism_detection/features/time_interval.rb +38 -0
- data/lib/wikipedia/vandalism_detection/features/time_of_day.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/upper_case_ratio.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/upper_case_words_ratio.rb +33 -0
- data/lib/wikipedia/vandalism_detection/features/upper_to_lower_case_ratio.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/vulgarism_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/vulgarism_impact.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/weekday.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/words_increment.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features.rb +66 -0
- data/lib/wikipedia/vandalism_detection/instances.rb +121 -0
- data/lib/wikipedia/vandalism_detection/page.rb +101 -0
- data/lib/wikipedia/vandalism_detection/page_parser.rb +63 -0
- data/lib/wikipedia/vandalism_detection/revision.rb +75 -0
- data/lib/wikipedia/vandalism_detection/revision_parser.rb +67 -0
- data/lib/wikipedia/vandalism_detection/test_dataset.rb +374 -0
- data/lib/wikipedia/vandalism_detection/text.rb +23 -0
- data/lib/wikipedia/vandalism_detection/training_dataset.rb +282 -0
- data/lib/wikipedia/vandalism_detection/version.rb +5 -0
- data/lib/wikipedia/vandalism_detection/wikitext_extractor.rb +80 -0
- data/lib/wikipedia/vandalism_detection/word_lists/bad.rb +11 -0
- data/lib/wikipedia/vandalism_detection/word_lists/biased.rb +20 -0
- data/lib/wikipedia/vandalism_detection/word_lists/emoticons.rb +26 -0
- data/lib/wikipedia/vandalism_detection/word_lists/markup.rb +19 -0
- data/lib/wikipedia/vandalism_detection/word_lists/pronouns.rb +11 -0
- data/lib/wikipedia/vandalism_detection/word_lists/sex.rb +10 -0
- data/lib/wikipedia/vandalism_detection/word_lists/vulgarism.rb +96 -0
- data/lib/wikipedia/vandalism_detection/word_lists.rb +17 -0
- data/lib/wikipedia/vandalism_detection.rb +29 -0
- data/lib/wikipedia.rb +41 -0
- data/spec/factories/edit.rb +19 -0
- data/spec/factories/page.rb +12 -0
- data/spec/factories/revision.rb +51 -0
- data/spec/resources/config/wikipedia-vandalism-detection.yml +35 -0
- data/spec/resources/corpora/test/edits.csv +8 -0
- data/spec/resources/corpora/test/ground-truth.txt +3 -0
- data/spec/resources/corpora/test/revisions/part-1/326471754.txt +199 -0
- data/spec/resources/corpora/test/revisions/part-1/326873205.txt +137 -0
- data/spec/resources/corpora/test/revisions/part-1/328774035.txt +162 -0
- data/spec/resources/corpora/test/revisions/part-2/307084144.txt +137 -0
- data/spec/resources/corpora/test/revisions/part-2/326978767.txt +199 -0
- data/spec/resources/corpora/test/revisions/part-2/328774110.txt +162 -0
- data/spec/resources/corpora/training/annotations.csv +7 -0
- data/spec/resources/corpora/training/edits.csv +7 -0
- data/spec/resources/corpora/training/revisions/part-1/326471754.txt +199 -0
- data/spec/resources/corpora/training/revisions/part-1/326873205.txt +137 -0
- data/spec/resources/corpora/training/revisions/part-1/328774035.txt +162 -0
- data/spec/resources/corpora/training/revisions/part-2/307084144.txt +137 -0
- data/spec/resources/corpora/training/revisions/part-2/326978767.txt +199 -0
- data/spec/resources/corpora/training/revisions/part-2/328774110.txt +162 -0
- data/spec/resources/page_with_redirects.xml +85 -0
- data/spec/resources/redirect_page.xml +59 -0
- data/spec/resources/revision_simplified.xml +13 -0
- data/spec/resources/sample_revision.txt +137 -0
- data/spec/resources/sample_revision_clean_text.txt +1 -0
- data/spec/resources/sample_revision_plain_text.txt +183 -0
- data/spec/resources/vandalism_on_wikipedia.xml +234 -0
- data/spec/resources/vandalism_on_wikipedia_simplified.xml +119 -0
- data/spec/resources/wikipedia_tokens.txt +30 -0
- data/spec/spec_helper.rb +38 -0
- data/spec/support/macros/file_reading.rb +6 -0
- data/spec/support/macros/test_configuration.rb +81 -0
- data/spec/vandalism_detection/algorithms/kullback_leibler_divergence_spec.rb +34 -0
- data/spec/vandalism_detection/classifier_spec.rb +330 -0
- data/spec/vandalism_detection/configuration_spec.rb +601 -0
- data/spec/vandalism_detection/diff_spec.rb +40 -0
- data/spec/vandalism_detection/edit_spec.rb +122 -0
- data/spec/vandalism_detection/evaluator_spec.rb +711 -0
- data/spec/vandalism_detection/feature_calculator_spec.rb +135 -0
- data/spec/vandalism_detection/features/all_wordlists_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/all_wordlists_impact_spec.rb +52 -0
- data/spec/vandalism_detection/features/anonymity_previous_spec.rb +67 -0
- data/spec/vandalism_detection/features/anonymity_spec.rb +17 -0
- data/spec/vandalism_detection/features/article_size_spec.rb +29 -0
- data/spec/vandalism_detection/features/bad_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/bad_impact_spec.rb +53 -0
- data/spec/vandalism_detection/features/base_spec.rb +41 -0
- data/spec/vandalism_detection/features/biased_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/biased_impact_spec.rb +52 -0
- data/spec/vandalism_detection/features/blanking_spec.rb +35 -0
- data/spec/vandalism_detection/features/character_diversity_spec.rb +30 -0
- data/spec/vandalism_detection/features/character_sequence_spec.rb +31 -0
- data/spec/vandalism_detection/features/comment_bad_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_biased_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_length_spec.rb +21 -0
- data/spec/vandalism_detection/features/comment_markup_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_pronoun_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_sex_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_vulgarism_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/compressibility_spec.rb +38 -0
- data/spec/vandalism_detection/features/contains_base_spec.rb +27 -0
- data/spec/vandalism_detection/features/copyedit_spec.rb +30 -0
- data/spec/vandalism_detection/features/digit_ratio_spec.rb +30 -0
- data/spec/vandalism_detection/features/edits_per_user_spec.rb +51 -0
- data/spec/vandalism_detection/features/emoticons_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/emoticons_impact_spec.rb +45 -0
- data/spec/vandalism_detection/features/frequency_base_spec.rb +20 -0
- data/spec/vandalism_detection/features/impact_base_spec.rb +36 -0
- data/spec/vandalism_detection/features/inserted_character_distribution_spec.rb +40 -0
- data/spec/vandalism_detection/features/inserted_external_links_spec.rb +29 -0
- data/spec/vandalism_detection/features/inserted_internal_links_spec.rb +29 -0
- data/spec/vandalism_detection/features/inserted_size_spec.rb +29 -0
- data/spec/vandalism_detection/features/inserted_words_spec.rb +29 -0
- data/spec/vandalism_detection/features/longest_word_spec.rb +29 -0
- data/spec/vandalism_detection/features/markup_frequency_spec.rb +31 -0
- data/spec/vandalism_detection/features/markup_impact_spec.rb +53 -0
- data/spec/vandalism_detection/features/non_alphanumeric_ratio_spec.rb +30 -0
- data/spec/vandalism_detection/features/personal_life_spec.rb +22 -0
- data/spec/vandalism_detection/features/pronoun_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/pronoun_impact_spec.rb +53 -0
- data/spec/vandalism_detection/features/removed_all_wordlists_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_bad_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_biased_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_character_distribution_spec.rb +40 -0
- data/spec/vandalism_detection/features/removed_emoticons_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_markup_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_pronoun_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_sex_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_size_spec.rb +29 -0
- data/spec/vandalism_detection/features/removed_vulgarism_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_words_spec.rb +29 -0
- data/spec/vandalism_detection/features/replacement_similarity_spec.rb +37 -0
- data/spec/vandalism_detection/features/reverted_spec.rb +24 -0
- data/spec/vandalism_detection/features/revisions_character_distribution_spec.rb +40 -0
- data/spec/vandalism_detection/features/same_editor_spec.rb +71 -0
- data/spec/vandalism_detection/features/sex_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/sex_impact_spec.rb +53 -0
- data/spec/vandalism_detection/features/size_increment_spec.rb +29 -0
- data/spec/vandalism_detection/features/size_ratio_spec.rb +48 -0
- data/spec/vandalism_detection/features/term_frequency_spec.rb +32 -0
- data/spec/vandalism_detection/features/time_interval_spec.rb +56 -0
- data/spec/vandalism_detection/features/time_of_day_spec.rb +16 -0
- data/spec/vandalism_detection/features/upper_case_ratio_spec.rb +30 -0
- data/spec/vandalism_detection/features/upper_case_words_ratio_spec.rb +33 -0
- data/spec/vandalism_detection/features/upper_to_lower_case_ratio_spec.rb +30 -0
- data/spec/vandalism_detection/features/vulgarism_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/vulgarism_impact_spec.rb +52 -0
- data/spec/vandalism_detection/features/weekday_spec.rb +16 -0
- data/spec/vandalism_detection/features/words_increment_spec.rb +29 -0
- data/spec/vandalism_detection/instances_spec.rb +146 -0
- data/spec/vandalism_detection/page_parser_spec.rb +190 -0
- data/spec/vandalism_detection/page_spec.rb +134 -0
- data/spec/vandalism_detection/revision_parser_spec.rb +53 -0
- data/spec/vandalism_detection/revision_spec.rb +148 -0
- data/spec/vandalism_detection/test_dataset_spec.rb +227 -0
- data/spec/vandalism_detection/text_spec.rb +29 -0
- data/spec/vandalism_detection/training_dataset_spec.rb +266 -0
- data/spec/vandalism_detection/wikitext_extractor_spec.rb +97 -0
- data/spec/weka/classifiers/meta/one_class_classifier_spec.rb +82 -0
- data/spec/weka/classifiers/meta/real_ada_boost_spec.rb +37 -0
- data/spec/weka/classifiers/trees/balanced_random_forest_spec.rb +37 -0
- data/spec/weka/filters/supervised/instance/smote_spec.rb +5 -0
- data/wikipedia-vandalism_detection.gemspec +37 -0
- metadata +550 -0
|
@@ -0,0 +1,234 @@
|
|
|
1
|
+
<mediawiki>
|
|
2
|
+
<siteinfo>
|
|
3
|
+
<sitename>Wikipedia</sitename>
|
|
4
|
+
<base>http://en.wikipedia.org/wiki/Main_Page</base>
|
|
5
|
+
<generator>MediaWiki 1.23wmf12</generator>
|
|
6
|
+
<case>first-letter</case>
|
|
7
|
+
<namespaces>
|
|
8
|
+
<namespace key="-2" case="first-letter">Media</namespace>
|
|
9
|
+
<namespace key="-1" case="first-letter">Special</namespace>
|
|
10
|
+
<namespace key="0" case="first-letter"/>
|
|
11
|
+
<namespace key="1" case="first-letter">Talk</namespace>
|
|
12
|
+
<namespace key="2" case="first-letter">User</namespace>
|
|
13
|
+
<namespace key="3" case="first-letter">User talk</namespace>
|
|
14
|
+
<namespace key="4" case="first-letter">Wikipedia</namespace>
|
|
15
|
+
<namespace key="5" case="first-letter">Wikipedia talk</namespace>
|
|
16
|
+
<namespace key="6" case="first-letter">File</namespace>
|
|
17
|
+
<namespace key="7" case="first-letter">File talk</namespace>
|
|
18
|
+
<namespace key="8" case="first-letter">MediaWiki</namespace>
|
|
19
|
+
<namespace key="9" case="first-letter">MediaWiki talk</namespace>
|
|
20
|
+
<namespace key="10" case="first-letter">Template</namespace>
|
|
21
|
+
<namespace key="11" case="first-letter">Template talk</namespace>
|
|
22
|
+
<namespace key="12" case="first-letter">Help</namespace>
|
|
23
|
+
<namespace key="13" case="first-letter">Help talk</namespace>
|
|
24
|
+
<namespace key="14" case="first-letter">Category</namespace>
|
|
25
|
+
<namespace key="15" case="first-letter">Category talk</namespace>
|
|
26
|
+
<namespace key="100" case="first-letter">Portal</namespace>
|
|
27
|
+
<namespace key="101" case="first-letter">Portal talk</namespace>
|
|
28
|
+
<namespace key="108" case="first-letter">Book</namespace>
|
|
29
|
+
<namespace key="109" case="first-letter">Book talk</namespace>
|
|
30
|
+
<namespace key="118" case="first-letter">Draft</namespace>
|
|
31
|
+
<namespace key="119" case="first-letter">Draft talk</namespace>
|
|
32
|
+
<namespace key="446" case="first-letter">Education Program</namespace>
|
|
33
|
+
<namespace key="447" case="first-letter">Education Program talk</namespace>
|
|
34
|
+
<namespace key="710" case="first-letter">TimedText</namespace>
|
|
35
|
+
<namespace key="711" case="first-letter">TimedText talk</namespace>
|
|
36
|
+
<namespace key="828" case="first-letter">Module</namespace>
|
|
37
|
+
<namespace key="829" case="first-letter">Module talk</namespace>
|
|
38
|
+
</namespaces>
|
|
39
|
+
</siteinfo>
|
|
40
|
+
<page>
|
|
41
|
+
<title>Vandalism on Wikipedia</title>
|
|
42
|
+
<ns>0</ns>
|
|
43
|
+
<id>29753790</id>
|
|
44
|
+
<revision>
|
|
45
|
+
<id>398880281</id>
|
|
46
|
+
<timestamp>2010-11-25T23:50:21Z</timestamp>
|
|
47
|
+
<contributor>
|
|
48
|
+
<username>Hellno2</username>
|
|
49
|
+
<id>3020504</id>
|
|
50
|
+
</contributor>
|
|
51
|
+
<comment>[[WP:AES|←]]Created page with '{{newpage}} On [[Wikipedia]], '''Vandalism''' is the act of editing the project in a manner that is intentionally disruptive. Vandalism includes the addition, remov...'</comment>
|
|
52
|
+
<text xml:space="preserve" bytes="1880">{{newpage}}
|
|
53
|
+
On [[Wikipedia]], '''Vandalism''' is the act of editing the project in a manner that is intentionally disruptive. Vandalism includes the addition, removal, or other modification of the text or other material in a manner that consists either of humor, nonsense, [[hoax]]es, [[spam]] or promotion of a subject, or is of an offensive or degrading nature.
|
|
54
|
+
|
|
55
|
+
Frequent targets of vandalism include articles on hot and controversial topics and current events<ref name=Newzealand/>.
|
|
56
|
+
|
|
57
|
+
==Fighting vandalism==
|
|
58
|
+
The are various measures taken by Wikipedia to prevent or reduce the amount of vandalism. These include:
|
|
59
|
+
*Reverting the vandalism by restoring the article to the last version before the vandalism occurred<ref name=Newzealand/>
|
|
60
|
+
*Locking articles so only established users, or in some cases, only administrators can edit them<ref name=Newzealand/>
|
|
61
|
+
*Blocking and banning those who have repeatedly committed acts of vandalism from editing for a period of time or in some cases, indefinitely<ref name=Newzealand/>
|
|
62
|
+
|
|
63
|
+
==Notable acts of vandalism==
|
|
64
|
+
*In 2006, [[Rolling Stone Magazine]] printed a false story from which they learned the information from an act of Wikipedia vandalism. Their article's title stated that [[Halle Berry]] was set to ruin her reputation<ref>http://en.wikinews.org/wiki/Rolling_Stone_prints_story_based_on_Wikipedia_vandalism</ref>.
|
|
65
|
+
*Professional golfer [[Fuzzy Zoeller]] sued a Miami company whose IP-based edits to the Wikipedia site included negative information about him<ref>http://www.tomshardware.com/news/golfer-sues-wikipedia-vandalism,4377.html</ref>.
|
|
66
|
+
*Soon after the death of [[Steve Irwin]] in 2007, the [[stingray]] article was vandalized, stating that stingrays "hate Australian people."<ref name=Newzealand>http://www.nzherald.co.nz/technology/news/article.cfm?c_id=5&objectid=10432042</ref>
|
|
67
|
+
|
|
68
|
+
==References==
|
|
69
|
+
{{reflist}}
|
|
70
|
+
|
|
71
|
+
{{Wikipedia}}</text>
|
|
72
|
+
<sha1>eju7ojn2omej7atr11ll7k64hzhpkaq</sha1>
|
|
73
|
+
<model>wikitext</model>
|
|
74
|
+
<format>text/x-wiki</format>
|
|
75
|
+
</revision>
|
|
76
|
+
<revision>
|
|
77
|
+
<id>398880502</id>
|
|
78
|
+
<parentid>398880281</parentid>
|
|
79
|
+
<timestamp>2010-11-25T23:52:13Z</timestamp>
|
|
80
|
+
<contributor>
|
|
81
|
+
<username>Hellno2</username>
|
|
82
|
+
<id>3020504</id>
|
|
83
|
+
</contributor>
|
|
84
|
+
<comment>inuse</comment>
|
|
85
|
+
<text xml:space="preserve" bytes="1914">{{inuse}}
|
|
86
|
+
{{newpage}}
|
|
87
|
+
On [[Wikipedia]], '''Vandalism''' is the act of editing the project in a manner that is intentionally disruptive. Vandalism includes the addition, removal, or other modification of the text or other material in a manner that consists either of humor, nonsense, [[hoax]]es, [[spam]] or promotion of a subject, or is of an offensive or degrading nature.
|
|
88
|
+
|
|
89
|
+
Frequent targets of vandalism include articles on hot and controversial topics and current events<ref name=Newzealand/>.
|
|
90
|
+
|
|
91
|
+
==Fighting vandalism==
|
|
92
|
+
The are various measures taken by Wikipedia to prevent or reduce the amount of vandalism. These include:
|
|
93
|
+
*Reverting the vandalism by restoring the article to the last version before the vandalism occurred<ref name=Newzealand/>
|
|
94
|
+
*Locking articles so only established users, or in some cases, only administrators can edit them<ref name=Newzealand/>
|
|
95
|
+
*Blocking and banning those who have repeatedly committed acts of vandalism from editing for a period of time or in some cases, indefinitely<ref name=Newzealand/>
|
|
96
|
+
|
|
97
|
+
==Notable acts of vandalism==
|
|
98
|
+
*In 2006, [[Rolling Stone Magazine]] printed a false story from which they learned the information from an act of Wikipedia vandalism. Their article's title stated that [[Halle Berry]] was set to ruin her reputation<ref>http://en.wikinews.org/wiki/Rolling_Stone_prints_story_based_on_Wikipedia_vandalism</ref>.
|
|
99
|
+
*Professional golfer [[Fuzzy Zoeller]] sued a Miami company whose IP-based edits to the Wikipedia site included negative information about him<ref>http://www.tomshardware.com/news/golfer-sues-wikipedia-vandalism,4377.html</ref>.
|
|
100
|
+
*Soon after the death of [[Steve Irwin]] in 2007, the [[stingray]] article was vandalized, stating that stingrays "hate Australian people."<ref name=Newzealand>http://www.nzherald.co.nz/technology/news/article.cfm?c_id=5&objectid=10432042</ref>
|
|
101
|
+
|
|
102
|
+
==References==
|
|
103
|
+
{{reflist}}
|
|
104
|
+
|
|
105
|
+
{{Wikipedia}}
|
|
106
|
+
|
|
107
|
+
[[Category:Wikipedia]]</text>
|
|
108
|
+
<sha1>rwmi3pu1ormoc1mqgs7mej6r63u9uxk</sha1>
|
|
109
|
+
<model>wikitext</model>
|
|
110
|
+
<format>text/x-wiki</format>
|
|
111
|
+
</revision>
|
|
112
|
+
<revision>
|
|
113
|
+
<id>398883278</id>
|
|
114
|
+
<parentid>398880502</parentid>
|
|
115
|
+
<timestamp>2010-11-26T00:13:45Z</timestamp>
|
|
116
|
+
<contributor>
|
|
117
|
+
<username>Hellno2</username>
|
|
118
|
+
<id>3020504</id>
|
|
119
|
+
</contributor>
|
|
120
|
+
<text xml:space="preserve" bytes="3177">On [[Wikipedia]], '''Vandalism''' is the act of editing the project in a manner that is intentionally disruptive. Vandalism includes the addition, removal, or other modification of the text or other material in a manner that consists either of humor, nonsense, [[hoax]]es, [[spam]] or promotion of a subject, or is of an offensive or degrading nature.
|
|
121
|
+
|
|
122
|
+
Vandalism is easy to commit on Wikipedia due to the fact that anyone can edit the site<ref name=newscientist/>. Founder [[Jimmy Wales]] is very much aware of the fact that the open editing policy allows the addition of false information<ref name=BBC>http://news.bbc.co.uk/2/hi/4502846.stm</ref>.
|
|
123
|
+
|
|
124
|
+
Most vandalism is committed on impulse<ref name=BBC/>. Frequent targets of vandalism include articles on hot and controversial topics and current events<ref name=Newzealand/><ref>http://www.guardian.co.uk/technology/2006/jun/18/wikipedia.news</ref>.
|
|
125
|
+
|
|
126
|
+
==Fighting vandalism==
|
|
127
|
+
The are various measures taken by Wikipedia to prevent or reduce the amount of vandalism. These include:
|
|
128
|
+
*Reverting the vandalism by restoring the article to the last version before the vandalism occurred<ref name=Newzealand/>
|
|
129
|
+
*Locking articles so only established users, or in some cases, only administrators can edit them<ref name=Newzealand/>
|
|
130
|
+
*Blocking and banning those who have repeatedly committed acts of vandalism from editing for a period of time or in some cases, indefinitely<ref name=Newzealand/>
|
|
131
|
+
|
|
132
|
+
In 2005, Wikipedia started to require those who create new articles to have a registered account in an effort to fight some vandalism. This occurred after inaccurate information was added to Wikipedia in which a journalist was accused of taking part in Kennedy's assassination<ref name=newscientist>http://www.newscientist.com/article/dn8425-wikipedia-tightens-editorial-rules-after-complaint.html</ref>.
|
|
133
|
+
|
|
134
|
+
In 2009, Wikipedia instituted a new policy in which the posting of edits to articles on living people would be delayed until they could be reviewed for inclusion of a source to verify accuracy. This was in an effort to prevent inaccurate and potentially damaging information about living people from appearing on the site<ref>http://news.ebrandz.com/miscellaneous/2009/2824-wikipedia-plans-to-enforce-new-editing-policy-to-thwart-vandals-.html</ref>.
|
|
135
|
+
|
|
136
|
+
==Notable acts of vandalism==
|
|
137
|
+
*In 2006, [[Rolling Stone Magazine]] printed a false story from which they learned the information from an act of Wikipedia vandalism. Their article's title stated that [[Halle Berry]] was set to ruin her reputation<ref>http://en.wikinews.org/wiki/Rolling_Stone_prints_story_based_on_Wikipedia_vandalism</ref>.
|
|
138
|
+
*Professional golfer [[Fuzzy Zoeller]] sued a Miami company whose IP-based edits to the Wikipedia site included negative information about him<ref>http://www.tomshardware.com/news/golfer-sues-wikipedia-vandalism,4377.html</ref>.
|
|
139
|
+
*Soon after the death of [[Steve Irwin]] in 2007, the [[stingray]] article was vandalized, stating that stingrays "hate Australian people."<ref name=Newzealand>http://www.nzherald.co.nz/technology/news/article.cfm?c_id=5&objectid=10432042</ref>
|
|
140
|
+
|
|
141
|
+
==References==
|
|
142
|
+
{{reflist}}
|
|
143
|
+
|
|
144
|
+
{{Wikipedia}}
|
|
145
|
+
|
|
146
|
+
[[Category:Wikipedia]]</text>
|
|
147
|
+
<sha1>hya1xftsfkq6wml6uigb6j480p4x2nt</sha1>
|
|
148
|
+
<model>wikitext</model>
|
|
149
|
+
<format>text/x-wiki</format>
|
|
150
|
+
</revision>
|
|
151
|
+
<revision>
|
|
152
|
+
<id>398883675</id>
|
|
153
|
+
<parentid>398883278</parentid>
|
|
154
|
+
<timestamp>2010-11-26T00:17:04Z</timestamp>
|
|
155
|
+
<contributor>
|
|
156
|
+
<username>Hellno2</username>
|
|
157
|
+
<id>3020504</id>
|
|
158
|
+
</contributor>
|
|
159
|
+
<text xml:space="preserve" bytes="3261">[[Image:Wikipedia vandalism.svg|thumb|300 px|[[Vandalism]] of a Wikipedia article]]
|
|
160
|
+
On [[Wikipedia]], '''Vandalism''' is the act of editing the project in a manner that is intentionally disruptive. Vandalism includes the addition, removal, or other modification of the text or other material in a manner that consists either of humor, nonsense, [[hoax]]es, [[spam]] or promotion of a subject, or is of an offensive or degrading nature.
|
|
161
|
+
|
|
162
|
+
Vandalism is easy to commit on Wikipedia due to the fact that anyone can edit the site<ref name=newscientist/>. Founder [[Jimmy Wales]] is very much aware of the fact that the open editing policy allows the addition of false information<ref name=BBC>http://news.bbc.co.uk/2/hi/4502846.stm</ref>.
|
|
163
|
+
|
|
164
|
+
Most vandalism is committed on impulse<ref name=BBC/>. Frequent targets of vandalism include articles on hot and controversial topics and current events<ref name=Newzealand/><ref>http://www.guardian.co.uk/technology/2006/jun/18/wikipedia.news</ref>.
|
|
165
|
+
|
|
166
|
+
==Fighting vandalism==
|
|
167
|
+
The are various measures taken by Wikipedia to prevent or reduce the amount of vandalism. These include:
|
|
168
|
+
*Reverting the vandalism by restoring the article to the last version before the vandalism occurred<ref name=Newzealand/>
|
|
169
|
+
*Locking articles so only established users, or in some cases, only administrators can edit them<ref name=Newzealand/>
|
|
170
|
+
*Blocking and banning those who have repeatedly committed acts of vandalism from editing for a period of time or in some cases, indefinitely<ref name=Newzealand/>
|
|
171
|
+
|
|
172
|
+
In 2005, Wikipedia started to require those who create new articles to have a registered account in an effort to fight some vandalism. This occurred after inaccurate information was added to Wikipedia in which a journalist was accused of taking part in Kennedy's assassination<ref name=newscientist>http://www.newscientist.com/article/dn8425-wikipedia-tightens-editorial-rules-after-complaint.html</ref>.
|
|
173
|
+
|
|
174
|
+
In 2009, Wikipedia instituted a new policy in which the posting of edits to articles on living people would be delayed until they could be reviewed for inclusion of a source to verify accuracy. This was in an effort to prevent inaccurate and potentially damaging information about living people from appearing on the site<ref>http://news.ebrandz.com/miscellaneous/2009/2824-wikipedia-plans-to-enforce-new-editing-policy-to-thwart-vandals-.html</ref>.
|
|
175
|
+
|
|
176
|
+
==Notable acts of vandalism==
|
|
177
|
+
*In 2006, [[Rolling Stone Magazine]] printed a false story from which they learned the information from an act of Wikipedia vandalism. Their article's title stated that [[Halle Berry]] was set to ruin her reputation<ref>http://en.wikinews.org/wiki/Rolling_Stone_prints_story_based_on_Wikipedia_vandalism</ref>.
|
|
178
|
+
*Professional golfer [[Fuzzy Zoeller]] sued a Miami company whose IP-based edits to the Wikipedia site included negative information about him<ref>http://www.tomshardware.com/news/golfer-sues-wikipedia-vandalism,4377.html</ref>.
|
|
179
|
+
*Soon after the death of [[Steve Irwin]] in 2007, the [[stingray]] article was vandalized, stating that stingrays "hate Australian people."<ref name=Newzealand>http://www.nzherald.co.nz/technology/news/article.cfm?c_id=5&objectid=10432042</ref>
|
|
180
|
+
|
|
181
|
+
==References==
|
|
182
|
+
{{reflist}}
|
|
183
|
+
|
|
184
|
+
{{Wikipedia}}
|
|
185
|
+
|
|
186
|
+
[[Category:Wikipedia]]</text>
|
|
187
|
+
<sha1>ebb1e4tgy49mqdwtyk0rafzdcokp4lh</sha1>
|
|
188
|
+
<model>wikitext</model>
|
|
189
|
+
<format>text/x-wiki</format>
|
|
190
|
+
</revision>
|
|
191
|
+
<revision>
|
|
192
|
+
<id>398885233</id>
|
|
193
|
+
<parentid>398883675</parentid>
|
|
194
|
+
<timestamp>2010-11-26T00:29:53Z</timestamp>
|
|
195
|
+
<contributor>
|
|
196
|
+
<username>Hellno2</username>
|
|
197
|
+
<id>3020504</id>
|
|
198
|
+
</contributor>
|
|
199
|
+
<comment>/* Notable acts of vandalism */</comment>
|
|
200
|
+
<text xml:space="preserve" bytes="3541">[[Image:Wikipedia vandalism.svg|thumb|300 px|[[Vandalism]] of a Wikipedia article]]
|
|
201
|
+
On [[Wikipedia]], '''Vandalism''' is the act of editing the project in a manner that is intentionally disruptive. Vandalism includes the addition, removal, or other modification of the text or other material in a manner that consists either of humor, nonsense, [[hoax]]es, [[spam]] or promotion of a subject, or is of an offensive or degrading nature.
|
|
202
|
+
|
|
203
|
+
Vandalism is easy to commit on Wikipedia due to the fact that anyone can edit the site<ref name=newscientist/>. Founder [[Jimmy Wales]] is very much aware of the fact that the open editing policy allows the addition of false information<ref name=BBC>http://news.bbc.co.uk/2/hi/4502846.stm</ref>.
|
|
204
|
+
|
|
205
|
+
Most vandalism is committed on impulse<ref name=BBC/>. Frequent targets of vandalism include articles on hot and controversial topics and current events<ref name=Newzealand/><ref>http://www.guardian.co.uk/technology/2006/jun/18/wikipedia.news</ref>.
|
|
206
|
+
|
|
207
|
+
==Fighting vandalism==
|
|
208
|
+
The are various measures taken by Wikipedia to prevent or reduce the amount of vandalism. These include:
|
|
209
|
+
*Reverting the vandalism by restoring the article to the last version before the vandalism occurred<ref name=Newzealand/>
|
|
210
|
+
*Locking articles so only established users, or in some cases, only administrators can edit them<ref name=Newzealand/>
|
|
211
|
+
*Blocking and banning those who have repeatedly committed acts of vandalism from editing for a period of time or in some cases, indefinitely<ref name=Newzealand/>
|
|
212
|
+
|
|
213
|
+
In 2005, Wikipedia started to require those who create new articles to have a registered account in an effort to fight some vandalism. This occurred after inaccurate information was added to Wikipedia in which a journalist was accused of taking part in Kennedy's assassination<ref name=newscientist>http://www.newscientist.com/article/dn8425-wikipedia-tightens-editorial-rules-after-complaint.html</ref>.
|
|
214
|
+
|
|
215
|
+
In 2009, Wikipedia instituted a new policy in which the posting of edits to articles on living people would be delayed until they could be reviewed for inclusion of a source to verify accuracy. This was in an effort to prevent inaccurate and potentially damaging information about living people from appearing on the site<ref>http://news.ebrandz.com/miscellaneous/2009/2824-wikipedia-plans-to-enforce-new-editing-policy-to-thwart-vandals-.html</ref>.
|
|
216
|
+
|
|
217
|
+
==Notable acts of vandalism==
|
|
218
|
+
*In 2006, comedian [[Steve Colbert]] vandalized the article [[elephant]] publicly on the air. This resulted in Colbert being blocked from editing, and a lot of elephant-related articles being protected<ref>http://www.tvsquad.com/2006/08/01/did-colbert-hack-wikipedia-video/</ref>.
|
|
219
|
+
*In 2006, [[Rolling Stone Magazine]] printed a false story from which they learned the information from an act of Wikipedia vandalism. Their article's title stated that [[Halle Berry]] was set to ruin her reputation<ref>http://en.wikinews.org/wiki/Rolling_Stone_prints_story_based_on_Wikipedia_vandalism</ref>.
|
|
220
|
+
*Professional golfer [[Fuzzy Zoeller]] sued a Miami company whose IP-based edits to the Wikipedia site included negative information about him<ref>http://www.tomshardware.com/news/golfer-sues-wikipedia-vandalism,4377.html</ref>.
|
|
221
|
+
*Soon after the death of [[Steve Irwin]] in 2007, the [[stingray]] article was vandalized, stating that stingrays "hate Australian people."<ref name=Newzealand>http://www.nzherald.co.nz/technology/news/article.cfm?c_id=5&objectid=10432042</ref>
|
|
222
|
+
|
|
223
|
+
==References==
|
|
224
|
+
{{reflist}}
|
|
225
|
+
|
|
226
|
+
{{Wikipedia}}
|
|
227
|
+
|
|
228
|
+
[[Category:Wikipedia]]</text>
|
|
229
|
+
<sha1>t8s84rnkje13fkdkw0exui4hrs3fx8x</sha1>
|
|
230
|
+
<model>wikitext</model>
|
|
231
|
+
<format>text/x-wiki</format>
|
|
232
|
+
</revision>
|
|
233
|
+
</page>
|
|
234
|
+
</mediawiki>
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
<mediawiki>
|
|
2
|
+
<siteinfo>
|
|
3
|
+
<sitename>Wikipedia</sitename>
|
|
4
|
+
<base>http://en.wikipedia.org/wiki/Main_Page</base>
|
|
5
|
+
<generator>MediaWiki 1.23wmf12</generator>
|
|
6
|
+
<case>first-letter</case>
|
|
7
|
+
<namespaces>
|
|
8
|
+
<namespace key="-2" case="first-letter">Media</namespace>
|
|
9
|
+
<namespace key="-1" case="first-letter">Special</namespace>
|
|
10
|
+
<namespace key="0" case="first-letter"/>
|
|
11
|
+
<namespace key="1" case="first-letter">Talk</namespace>
|
|
12
|
+
<namespace key="2" case="first-letter">User</namespace>
|
|
13
|
+
<namespace key="3" case="first-letter">User talk</namespace>
|
|
14
|
+
<namespace key="4" case="first-letter">Wikipedia</namespace>
|
|
15
|
+
<namespace key="5" case="first-letter">Wikipedia talk</namespace>
|
|
16
|
+
<namespace key="6" case="first-letter">File</namespace>
|
|
17
|
+
<namespace key="7" case="first-letter">File talk</namespace>
|
|
18
|
+
<namespace key="8" case="first-letter">MediaWiki</namespace>
|
|
19
|
+
<namespace key="9" case="first-letter">MediaWiki talk</namespace>
|
|
20
|
+
<namespace key="10" case="first-letter">Template</namespace>
|
|
21
|
+
<namespace key="11" case="first-letter">Template talk</namespace>
|
|
22
|
+
<namespace key="12" case="first-letter">Help</namespace>
|
|
23
|
+
<namespace key="13" case="first-letter">Help talk</namespace>
|
|
24
|
+
<namespace key="14" case="first-letter">Category</namespace>
|
|
25
|
+
<namespace key="15" case="first-letter">Category talk</namespace>
|
|
26
|
+
<namespace key="100" case="first-letter">Portal</namespace>
|
|
27
|
+
<namespace key="101" case="first-letter">Portal talk</namespace>
|
|
28
|
+
<namespace key="108" case="first-letter">Book</namespace>
|
|
29
|
+
<namespace key="109" case="first-letter">Book talk</namespace>
|
|
30
|
+
<namespace key="118" case="first-letter">Draft</namespace>
|
|
31
|
+
<namespace key="119" case="first-letter">Draft talk</namespace>
|
|
32
|
+
<namespace key="446" case="first-letter">Education Program</namespace>
|
|
33
|
+
<namespace key="447" case="first-letter">Education Program talk</namespace>
|
|
34
|
+
<namespace key="710" case="first-letter">TimedText</namespace>
|
|
35
|
+
<namespace key="711" case="first-letter">TimedText talk</namespace>
|
|
36
|
+
<namespace key="828" case="first-letter">Module</namespace>
|
|
37
|
+
<namespace key="829" case="first-letter">Module talk</namespace>
|
|
38
|
+
</namespaces>
|
|
39
|
+
</siteinfo>
|
|
40
|
+
<page>
|
|
41
|
+
<title>Vandalism on Wikipedia</title>
|
|
42
|
+
<ns>0</ns>
|
|
43
|
+
<id>100</id>
|
|
44
|
+
<revision>
|
|
45
|
+
<id>1</id>
|
|
46
|
+
<timestamp>time 1</timestamp>
|
|
47
|
+
<contributor>
|
|
48
|
+
<ip>1</ip>
|
|
49
|
+
</contributor>
|
|
50
|
+
<comment>comment
|
|
51
|
+
|
|
52
|
+
1
|
|
53
|
+
|
|
54
|
+
</comment>
|
|
55
|
+
<text xml:space="preserve" bytes="1880">text
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
1
|
|
59
|
+
|
|
60
|
+
</text>
|
|
61
|
+
<sha1>hash1</sha1>
|
|
62
|
+
<model>wikitext</model>
|
|
63
|
+
<format>text/x-wiki</format>
|
|
64
|
+
</revision>
|
|
65
|
+
<revision>
|
|
66
|
+
<id>2</id>
|
|
67
|
+
<parentid>1</parentid>
|
|
68
|
+
<timestamp>time 2</timestamp>
|
|
69
|
+
<contributor>
|
|
70
|
+
<username>user</username>
|
|
71
|
+
<id>10</id>
|
|
72
|
+
</contributor>
|
|
73
|
+
<comment>comment 2</comment>
|
|
74
|
+
<text xml:space="preserve" bytes="1914">text 2</text>
|
|
75
|
+
<sha1>hash2</sha1>
|
|
76
|
+
<model>wikitext</model>
|
|
77
|
+
<format>text/x-wiki</format>
|
|
78
|
+
</revision>
|
|
79
|
+
<revision>
|
|
80
|
+
<id>3</id>
|
|
81
|
+
<parentid>2</parentid>
|
|
82
|
+
<timestamp>time 3</timestamp>
|
|
83
|
+
<contributor>
|
|
84
|
+
<username>user</username>
|
|
85
|
+
<id>11</id>
|
|
86
|
+
</contributor>
|
|
87
|
+
<text xml:space="preserve" bytes="3177">text 3</text>
|
|
88
|
+
<sha1>hash3</sha1>
|
|
89
|
+
<model>wikitext</model>
|
|
90
|
+
<format>text/x-wiki</format>
|
|
91
|
+
</revision>
|
|
92
|
+
<revision>
|
|
93
|
+
<id>4</id>
|
|
94
|
+
<parentid>3</parentid>
|
|
95
|
+
<timestamp>time 4</timestamp>
|
|
96
|
+
<contributor>
|
|
97
|
+
<username>user</username>
|
|
98
|
+
<id>12</id>
|
|
99
|
+
</contributor>
|
|
100
|
+
<text xml:space="preserve" bytes="3261">text 4</text>
|
|
101
|
+
<sha1>hash4</sha1>
|
|
102
|
+
<model>wikitext</model>
|
|
103
|
+
<format>text/x-wiki</format>
|
|
104
|
+
</revision>
|
|
105
|
+
<revision>
|
|
106
|
+
<id>5</id>
|
|
107
|
+
<parentid>4</parentid>
|
|
108
|
+
<timestamp>time 5</timestamp>
|
|
109
|
+
<contributor>
|
|
110
|
+
<ip>2</ip>
|
|
111
|
+
</contributor>
|
|
112
|
+
<comment>comment 3</comment>
|
|
113
|
+
<text xml:space="preserve" bytes="3541">text 5</text>
|
|
114
|
+
<sha1>hash5</sha1>
|
|
115
|
+
<model>wikitext</model>
|
|
116
|
+
<format>text/x-wiki</format>
|
|
117
|
+
</revision>
|
|
118
|
+
</page>
|
|
119
|
+
</mediawiki>
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
{{text}}
|
|
2
|
+
[[text]]
|
|
3
|
+
[[text:text]]
|
|
4
|
+
[[text|text]]
|
|
5
|
+
[http://domain.com]
|
|
6
|
+
=text=
|
|
7
|
+
==text==
|
|
8
|
+
===text===
|
|
9
|
+
====text====
|
|
10
|
+
=====text=====
|
|
11
|
+
======text======
|
|
12
|
+
----
|
|
13
|
+
<text>
|
|
14
|
+
:text
|
|
15
|
+
::text
|
|
16
|
+
:::text
|
|
17
|
+
::::text
|
|
18
|
+
'''text'''
|
|
19
|
+
''text''
|
|
20
|
+
* text
|
|
21
|
+
** text
|
|
22
|
+
*** text
|
|
23
|
+
**** text
|
|
24
|
+
# text
|
|
25
|
+
## text
|
|
26
|
+
### text
|
|
27
|
+
#### text
|
|
28
|
+
‹
|
|
29
|
+
"
|
|
30
|
+
[http://www.wikipedia.com/images/uploads/beaver.jpg text text text]
|
data/spec/spec_helper.rb
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
require 'rspec'
|
|
2
|
+
require 'factory_bot'
|
|
3
|
+
require 'fileutils'
|
|
4
|
+
|
|
5
|
+
# Requires every Ruby file matched by the given glob prefixes.
#
# Each entry of +paths+ is a glob prefix (e.g. '../support/**/') to which
# '*.rb' is appended. Relative prefixes are expanded with this file's own
# path as the base (hence the leading '../' used by callers); absolute
# prefixes are used as they are.
#
# Matches are required in sorted order so that the load order does not depend
# on the file system or the Ruby version (Dir[] only sorts since Ruby 3.0).
#
# @param paths [Array<String>] glob prefixes, each ending in a path separator
# @return [Array<String>] the given +paths+
def require_files_from(paths = [])
  paths.each do |path|
    pattern = File.expand_path("#{path}*.rb", __FILE__)
    Dir[pattern].sort.each { |file| require file }
  end
end
|
|
12
|
+
|
|
13
|
+
# Global RSpec setup: loads the library under test plus all factory and
# support files, mixes the shared helper modules into the examples, deletes
# the resources/build directory after the suite if it exists, and defines
# short aliases for the library constants used throughout the specs.
RSpec.configure do |config|
  # All relative paths below are expanded with this file itself as the base,
  # which is why they carry one extra leading '..' segment.
  require File.expand_path('../../lib/wikipedia/vandalism_detection', __FILE__)

  require_files_from %w[../factories/**/ ../support/**/]

  config.include FileReading
  config.include TestConfiguration
  config.include FactoryBot::Syntax::Methods

  config.after(:suite) do
    build_dir = File.expand_path('../resources/build', __FILE__)
    FileUtils.remove_dir(build_dir) if Dir.exist?(build_dir)
  end

  # Short names so the specs do not have to spell out the full namespace.
  Classifier = Wikipedia::VandalismDetection::Classifier
  Edit = Wikipedia::VandalismDetection::Edit
  Evaluator = Wikipedia::VandalismDetection::Evaluator
  Features = Wikipedia::VandalismDetection::Features
  Instances = Wikipedia::VandalismDetection::Instances
  Page = Wikipedia::VandalismDetection::Page
  Text = Wikipedia::VandalismDetection::Text
  TrainingDataset = Wikipedia::VandalismDetection::TrainingDataset
end
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
# Spec helpers for replacing the library configuration inside examples and
# for reading the test configuration file that lives in the resources
# directory.
module TestConfiguration
  require 'yaml'

  SOURCE_DIR = File.expand_path('../../../../spec/resources/', __FILE__)
  CONFIG_DEFAULTS = Wikipedia::VandalismDetection::DefaultConfiguration::DEFAULTS

  # @return [String] absolute path of the spec resources directory
  def source_dir
    SOURCE_DIR
  end

  # Stubs Wikipedia::VandalismDetection.config so that it returns +override+.
  def use_configuration(override)
    library = Wikipedia::VandalismDetection
    allow(library).to receive(:config).and_return(override)
  end

  # Stubs the library configuration with the one built by #test_config.
  def use_test_configuration
    use_configuration(test_config)
  end

  # Stubs the library configuration with the library's DEFAULTS.
  def use_default_configuration
    use_configuration(CONFIG_DEFAULTS)
  end

  # @return [Object] the parsed content of the test configuration YAML file
  def test_configuration_content
    relative_path = 'config/wikipedia-vandalism-detection.yml'
    YAML.load_file(File.join(source_dir, relative_path))
  end

  # Returns the defaults (with 'source' set to the resources directory)
  # deep-merged with +override+, which defaults to the test configuration
  # file content.
  def merged_configuration(override = test_configuration_content)
    CONFIG_DEFAULTS.merge('source' => source_dir).deep_merge(override)
  end

  # Builds a fresh Configuration while every DefaultConfiguration instance is
  # stubbed to report the resources directory as its source.
  def test_config
    defaults_class = Wikipedia::VandalismDetection::DefaultConfiguration
    allow_any_instance_of(defaults_class)
      .to receive(:source).and_return(source_dir)

    # The constructor is invoked through #send, presumably because it is not
    # public - TODO confirm against Configuration.
    Wikipedia::VandalismDetection::Configuration.send(:new)
  end

  # Returns the corpus and output path settings derived from the 'corpora'
  # section of the test configuration file.
  #
  # @return [Hash] with the symbol keys :corpora and :output, each holding a
  #   hash with string keys
  def paths
    corpus_settings = test_configuration_content['corpora']
    base_directory = File.expand_path(corpus_settings['base_directory'], __FILE__)
    training = corpus_settings['training']
    test = corpus_settings['test']

    corpora_paths = {
      'base_directory' => base_directory,
      'training' => {
        'base_directory' => 'training',
        'edits_file' => training['edits_file'],
        'annotations_file' => training['annotations_file'],
        'revisions_directory' => training['revisions_directory']
      },
      'test' => {
        'base_directory' => 'test',
        'edits_file' => test['edits_file'],
        'revisions_directory' => test['revisions_directory']
      }
    }

    # NOTE(review): the index/arff file names are looked up in the 'corpora'
    # training/test hashes rather than in a separate output section - confirm
    # that this is where the configuration file defines them.
    output_paths = {
      'base_directory' => base_directory,
      'training' => {
        'index_file' => training['index_file'],
        'arff_file' => training['arff_file']
      },
      'test' => {
        'index_file' => test['index_file'],
        'arff_file' => test['arff_file']
      }
    }

    { corpora: corpora_paths, output: output_paths }
  end
end
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
require 'spec_helper'
|
|
2
|
+
|
|
3
|
+
# Specifies the contract of KullbackLeiblerDivergence#of as exercised below:
# a missing value for the inputs '&' and '?', zero for identical texts, and
# a larger value the more two texts differ.
describe Wikipedia::VandalismDetection::Algorithms::KullbackLeiblerDivergence do
  it { is_expected.to respond_to(:of) }

  describe '#of' do
    it 'returns missing value if no character in either of the texts' do
      divergence = subject.of('&', '?')

      expect(divergence).to eq Features::MISSING_VALUE
    end

    it 'returns zero for equal texts' do
      sample = 'Text sample'

      expect(subject.of(sample, sample)).to eq 0.0
    end

    it 'returns a value bigger than zero for different texts' do
      divergence = subject.of('Text 1', 'Text 2')

      expect(divergence).to be > 0.0
    end

    it 'returns a higher value for a more different text' do
      lower_divergence = subject.of('text a', 'text b')
      higher_divergence = subject.of('text a', 'bla bla bla')

      expect(lower_divergence).to be < higher_divergence
    end

    it 'can handle invalid byte sequences' do
      # "\255" is a lone byte that is not valid UTF-8 on its own.
      broken_text = "text \255".force_encoding('UTF-8')

      expect(subject.of(broken_text, broken_text)).to eq 0.0
    end
  end
end
|