wikipedia-vandalism_detection 0.1.0-java
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +19 -0
- data/.rspec +2 -0
- data/.rubocop.yml +35 -0
- data/.travis.yml +6 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +4 -0
- data/README.md +288 -0
- data/Rakefile +11 -0
- data/config/wikipedia-vandalism-detection.yml.example +103 -0
- data/lib/java/SMOTE.jar +0 -0
- data/lib/java/balancedRandomForest.jar +0 -0
- data/lib/java/diffutils-1.3.0.jar +0 -0
- data/lib/java/oneClassClassifier.jar +0 -0
- data/lib/java/realAdaBoost.jar +0 -0
- data/lib/java/swc-engine-1.1.0-jar-with-dependencies.jar +0 -0
- data/lib/java/sweble-wikitext-extractor.jar +0 -0
- data/lib/weka/classifiers/meta/one_class_classifier.rb +21 -0
- data/lib/weka/classifiers/meta/real_ada_boost.rb +15 -0
- data/lib/weka/classifiers/trees/balanced_random_forest.rb +16 -0
- data/lib/weka/filters/supervised/instance/smote.rb +15 -0
- data/lib/wikipedia/vandalism_detection/algorithms/kullback_leibler_divergence.rb +103 -0
- data/lib/wikipedia/vandalism_detection/algorithms.rb +12 -0
- data/lib/wikipedia/vandalism_detection/classifier.rb +202 -0
- data/lib/wikipedia/vandalism_detection/configuration.rb +350 -0
- data/lib/wikipedia/vandalism_detection/diff.rb +36 -0
- data/lib/wikipedia/vandalism_detection/edit.rb +81 -0
- data/lib/wikipedia/vandalism_detection/evaluator.rb +640 -0
- data/lib/wikipedia/vandalism_detection/exceptions.rb +47 -0
- data/lib/wikipedia/vandalism_detection/feature_calculator.rb +94 -0
- data/lib/wikipedia/vandalism_detection/features/all_wordlists_frequency.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/all_wordlists_impact.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/anonymity.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/anonymity_previous.rb +32 -0
- data/lib/wikipedia/vandalism_detection/features/article_size.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/bad_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/bad_impact.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/base.rb +61 -0
- data/lib/wikipedia/vandalism_detection/features/biased_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/biased_impact.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/blanking.rb +26 -0
- data/lib/wikipedia/vandalism_detection/features/character_diversity.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/character_sequence.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/comment_bad_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/comment_biased_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/comment_length.rb +16 -0
- data/lib/wikipedia/vandalism_detection/features/comment_markup_frequency.rb +30 -0
- data/lib/wikipedia/vandalism_detection/features/comment_pronoun_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/comment_sex_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/comment_vulgarism_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/compressibility.rb +31 -0
- data/lib/wikipedia/vandalism_detection/features/contains_base.rb +16 -0
- data/lib/wikipedia/vandalism_detection/features/copyedit.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/digit_ratio.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/edits_per_user.rb +72 -0
- data/lib/wikipedia/vandalism_detection/features/emoticons_frequency.rb +27 -0
- data/lib/wikipedia/vandalism_detection/features/emoticons_impact.rb +29 -0
- data/lib/wikipedia/vandalism_detection/features/frequency_base.rb +18 -0
- data/lib/wikipedia/vandalism_detection/features/impact_base.rb +25 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_character_distribution.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_external_links.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_internal_links.rb +18 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_size.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_words.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/longest_word.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/markup_frequency.rb +29 -0
- data/lib/wikipedia/vandalism_detection/features/markup_impact.rb +30 -0
- data/lib/wikipedia/vandalism_detection/features/non_alphanumeric_ratio.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/personal_life.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/pronoun_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/pronoun_impact.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/removed_all_wordlists_frequency.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/removed_bad_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_biased_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_character_distribution.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/removed_emoticons_frequency.rb +28 -0
- data/lib/wikipedia/vandalism_detection/features/removed_markup_frequency.rb +30 -0
- data/lib/wikipedia/vandalism_detection/features/removed_pronoun_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_sex_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_size.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/removed_vulgarism_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_words.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/replacement_similarity.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/reverted.rb +16 -0
- data/lib/wikipedia/vandalism_detection/features/revisions_character_distribution.rb +25 -0
- data/lib/wikipedia/vandalism_detection/features/same_editor.rb +32 -0
- data/lib/wikipedia/vandalism_detection/features/sex_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/sex_impact.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/size_increment.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/size_ratio.rb +28 -0
- data/lib/wikipedia/vandalism_detection/features/term_frequency.rb +31 -0
- data/lib/wikipedia/vandalism_detection/features/time_interval.rb +38 -0
- data/lib/wikipedia/vandalism_detection/features/time_of_day.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/upper_case_ratio.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/upper_case_words_ratio.rb +33 -0
- data/lib/wikipedia/vandalism_detection/features/upper_to_lower_case_ratio.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/vulgarism_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/vulgarism_impact.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/weekday.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/words_increment.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features.rb +66 -0
- data/lib/wikipedia/vandalism_detection/instances.rb +121 -0
- data/lib/wikipedia/vandalism_detection/page.rb +101 -0
- data/lib/wikipedia/vandalism_detection/page_parser.rb +63 -0
- data/lib/wikipedia/vandalism_detection/revision.rb +75 -0
- data/lib/wikipedia/vandalism_detection/revision_parser.rb +67 -0
- data/lib/wikipedia/vandalism_detection/test_dataset.rb +374 -0
- data/lib/wikipedia/vandalism_detection/text.rb +23 -0
- data/lib/wikipedia/vandalism_detection/training_dataset.rb +282 -0
- data/lib/wikipedia/vandalism_detection/version.rb +5 -0
- data/lib/wikipedia/vandalism_detection/wikitext_extractor.rb +80 -0
- data/lib/wikipedia/vandalism_detection/word_lists/bad.rb +11 -0
- data/lib/wikipedia/vandalism_detection/word_lists/biased.rb +20 -0
- data/lib/wikipedia/vandalism_detection/word_lists/emoticons.rb +26 -0
- data/lib/wikipedia/vandalism_detection/word_lists/markup.rb +19 -0
- data/lib/wikipedia/vandalism_detection/word_lists/pronouns.rb +11 -0
- data/lib/wikipedia/vandalism_detection/word_lists/sex.rb +10 -0
- data/lib/wikipedia/vandalism_detection/word_lists/vulgarism.rb +96 -0
- data/lib/wikipedia/vandalism_detection/word_lists.rb +17 -0
- data/lib/wikipedia/vandalism_detection.rb +29 -0
- data/lib/wikipedia.rb +41 -0
- data/spec/factories/edit.rb +19 -0
- data/spec/factories/page.rb +12 -0
- data/spec/factories/revision.rb +51 -0
- data/spec/resources/config/wikipedia-vandalism-detection.yml +35 -0
- data/spec/resources/corpora/test/edits.csv +8 -0
- data/spec/resources/corpora/test/ground-truth.txt +3 -0
- data/spec/resources/corpora/test/revisions/part-1/326471754.txt +199 -0
- data/spec/resources/corpora/test/revisions/part-1/326873205.txt +137 -0
- data/spec/resources/corpora/test/revisions/part-1/328774035.txt +162 -0
- data/spec/resources/corpora/test/revisions/part-2/307084144.txt +137 -0
- data/spec/resources/corpora/test/revisions/part-2/326978767.txt +199 -0
- data/spec/resources/corpora/test/revisions/part-2/328774110.txt +162 -0
- data/spec/resources/corpora/training/annotations.csv +7 -0
- data/spec/resources/corpora/training/edits.csv +7 -0
- data/spec/resources/corpora/training/revisions/part-1/326471754.txt +199 -0
- data/spec/resources/corpora/training/revisions/part-1/326873205.txt +137 -0
- data/spec/resources/corpora/training/revisions/part-1/328774035.txt +162 -0
- data/spec/resources/corpora/training/revisions/part-2/307084144.txt +137 -0
- data/spec/resources/corpora/training/revisions/part-2/326978767.txt +199 -0
- data/spec/resources/corpora/training/revisions/part-2/328774110.txt +162 -0
- data/spec/resources/page_with_redirects.xml +85 -0
- data/spec/resources/redirect_page.xml +59 -0
- data/spec/resources/revision_simplified.xml +13 -0
- data/spec/resources/sample_revision.txt +137 -0
- data/spec/resources/sample_revision_clean_text.txt +1 -0
- data/spec/resources/sample_revision_plain_text.txt +183 -0
- data/spec/resources/vandalism_on_wikipedia.xml +234 -0
- data/spec/resources/vandalism_on_wikipedia_simplified.xml +119 -0
- data/spec/resources/wikipedia_tokens.txt +30 -0
- data/spec/spec_helper.rb +38 -0
- data/spec/support/macros/file_reading.rb +6 -0
- data/spec/support/macros/test_configuration.rb +81 -0
- data/spec/vandalism_detection/algorithms/kullback_leibler_divergence_spec.rb +34 -0
- data/spec/vandalism_detection/classifier_spec.rb +330 -0
- data/spec/vandalism_detection/configuration_spec.rb +601 -0
- data/spec/vandalism_detection/diff_spec.rb +40 -0
- data/spec/vandalism_detection/edit_spec.rb +122 -0
- data/spec/vandalism_detection/evaluator_spec.rb +711 -0
- data/spec/vandalism_detection/feature_calculator_spec.rb +135 -0
- data/spec/vandalism_detection/features/all_wordlists_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/all_wordlists_impact_spec.rb +52 -0
- data/spec/vandalism_detection/features/anonymity_previous_spec.rb +67 -0
- data/spec/vandalism_detection/features/anonymity_spec.rb +17 -0
- data/spec/vandalism_detection/features/article_size_spec.rb +29 -0
- data/spec/vandalism_detection/features/bad_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/bad_impact_spec.rb +53 -0
- data/spec/vandalism_detection/features/base_spec.rb +41 -0
- data/spec/vandalism_detection/features/biased_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/biased_impact_spec.rb +52 -0
- data/spec/vandalism_detection/features/blanking_spec.rb +35 -0
- data/spec/vandalism_detection/features/character_diversity_spec.rb +30 -0
- data/spec/vandalism_detection/features/character_sequence_spec.rb +31 -0
- data/spec/vandalism_detection/features/comment_bad_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_biased_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_length_spec.rb +21 -0
- data/spec/vandalism_detection/features/comment_markup_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_pronoun_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_sex_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_vulgarism_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/compressibility_spec.rb +38 -0
- data/spec/vandalism_detection/features/contains_base_spec.rb +27 -0
- data/spec/vandalism_detection/features/copyedit_spec.rb +30 -0
- data/spec/vandalism_detection/features/digit_ratio_spec.rb +30 -0
- data/spec/vandalism_detection/features/edits_per_user_spec.rb +51 -0
- data/spec/vandalism_detection/features/emoticons_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/emoticons_impact_spec.rb +45 -0
- data/spec/vandalism_detection/features/frequency_base_spec.rb +20 -0
- data/spec/vandalism_detection/features/impact_base_spec.rb +36 -0
- data/spec/vandalism_detection/features/inserted_character_distribution_spec.rb +40 -0
- data/spec/vandalism_detection/features/inserted_external_links_spec.rb +29 -0
- data/spec/vandalism_detection/features/inserted_internal_links_spec.rb +29 -0
- data/spec/vandalism_detection/features/inserted_size_spec.rb +29 -0
- data/spec/vandalism_detection/features/inserted_words_spec.rb +29 -0
- data/spec/vandalism_detection/features/longest_word_spec.rb +29 -0
- data/spec/vandalism_detection/features/markup_frequency_spec.rb +31 -0
- data/spec/vandalism_detection/features/markup_impact_spec.rb +53 -0
- data/spec/vandalism_detection/features/non_alphanumeric_ratio_spec.rb +30 -0
- data/spec/vandalism_detection/features/personal_life_spec.rb +22 -0
- data/spec/vandalism_detection/features/pronoun_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/pronoun_impact_spec.rb +53 -0
- data/spec/vandalism_detection/features/removed_all_wordlists_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_bad_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_biased_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_character_distribution_spec.rb +40 -0
- data/spec/vandalism_detection/features/removed_emoticons_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_markup_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_pronoun_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_sex_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_size_spec.rb +29 -0
- data/spec/vandalism_detection/features/removed_vulgarism_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_words_spec.rb +29 -0
- data/spec/vandalism_detection/features/replacement_similarity_spec.rb +37 -0
- data/spec/vandalism_detection/features/reverted_spec.rb +24 -0
- data/spec/vandalism_detection/features/revisions_character_distribution_spec.rb +40 -0
- data/spec/vandalism_detection/features/same_editor_spec.rb +71 -0
- data/spec/vandalism_detection/features/sex_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/sex_impact_spec.rb +53 -0
- data/spec/vandalism_detection/features/size_increment_spec.rb +29 -0
- data/spec/vandalism_detection/features/size_ratio_spec.rb +48 -0
- data/spec/vandalism_detection/features/term_frequency_spec.rb +32 -0
- data/spec/vandalism_detection/features/time_interval_spec.rb +56 -0
- data/spec/vandalism_detection/features/time_of_day_spec.rb +16 -0
- data/spec/vandalism_detection/features/upper_case_ratio_spec.rb +30 -0
- data/spec/vandalism_detection/features/upper_case_words_ratio_spec.rb +33 -0
- data/spec/vandalism_detection/features/upper_to_lower_case_ratio_spec.rb +30 -0
- data/spec/vandalism_detection/features/vulgarism_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/vulgarism_impact_spec.rb +52 -0
- data/spec/vandalism_detection/features/weekday_spec.rb +16 -0
- data/spec/vandalism_detection/features/words_increment_spec.rb +29 -0
- data/spec/vandalism_detection/instances_spec.rb +146 -0
- data/spec/vandalism_detection/page_parser_spec.rb +190 -0
- data/spec/vandalism_detection/page_spec.rb +134 -0
- data/spec/vandalism_detection/revision_parser_spec.rb +53 -0
- data/spec/vandalism_detection/revision_spec.rb +148 -0
- data/spec/vandalism_detection/test_dataset_spec.rb +227 -0
- data/spec/vandalism_detection/text_spec.rb +29 -0
- data/spec/vandalism_detection/training_dataset_spec.rb +266 -0
- data/spec/vandalism_detection/wikitext_extractor_spec.rb +97 -0
- data/spec/weka/classifiers/meta/one_class_classifier_spec.rb +82 -0
- data/spec/weka/classifiers/meta/real_ada_boost_spec.rb +37 -0
- data/spec/weka/classifiers/trees/balanced_random_forest_spec.rb +37 -0
- data/spec/weka/filters/supervised/instance/smote_spec.rb +5 -0
- data/wikipedia-vandalism_detection.gemspec +37 -0
- metadata +550 -0
@@ -0,0 +1,80 @@
|
|
1
|
+
# The WikitextExtractor imports the WikitextExtractor class from the
|
2
|
+
# sweble-wikitext-extractor.jar
|
3
|
+
# The sweble-wikitext-extractor.jar is a custom Java project which uses the
|
4
|
+
# Sweble wikitext parser to extract plaintext out of wikimarkup text.
|
5
|
+
#
|
6
|
+
# The Sweble WikitextExtractor currently depends on the swc-engine -v1.1.0 with
|
7
|
+
# dependencies,
|
8
|
+
# see: http://sweble.org/downloads/swc-devel/master-latest/ to download it.
|
9
|
+
#
|
10
|
+
# The Java source code can be found on:
|
11
|
+
# webis.uni-weimar.de:/srv/cvsroot/code-in-progress/wikipedia-vandalism-detection/sweble-wikitext-extractor
|
12
|
+
module Wikipedia
|
13
|
+
module VandalismDetection
|
14
|
+
require 'java'
|
15
|
+
require 'java/swc-engine-1.1.0-jar-with-dependencies.jar'
|
16
|
+
require 'java/sweble-wikitext-extractor.jar'
|
17
|
+
|
18
|
+
java_import 'de.webis.sweble.WikitextExtractor'
|
19
|
+
|
20
|
+
class WikitextExtractionError < StandardError; end
|
21
|
+
|
22
|
+
# This class wraps the de.webis.sweble.WikitextExtractor Java class and
|
23
|
+
# provides methods to extract plaintext from wiki markup text both space
|
24
|
+
# preserving and cleaned without line breaks and whitespace.
|
25
|
+
class WikitextExtractor
  # Redirect marker that is stripped from the markup before extraction.
  REDIRECT = '#REDIRECT'.freeze

  class << self
    # Returns the extracted text from the given wiki markup preserving
    # spacing with added section numbers.
    #
    # The input is first transcoded to UTF-8 (bytes that are invalid or
    # undefined in the conversion are removed) and every '#REDIRECT' marker
    # is stripped. The actual extraction is delegated to
    # WikitextExtractor.new.extract.
    #
    # NOTE(review): the source encoding is declared as 'binary', so
    # non-ASCII bytes appear to be dropped as well - confirm this is intended.
    #
    # Raises WikitextExtractionError (carrying the original message) when
    # anything in the process fails.
    def extract(wiki_text)
      wiki_text = wiki_text.encode('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '')
      wiki_text = wiki_text.gsub(REDIRECT, '')

      WikitextExtractor.new.extract(wiki_text)
    rescue => exception
      message = "Wikitext extraction failed: \n#{exception.message}"
      raise WikitextExtractionError, message, caller
    end

    # Returns the cleaned extracted text from the given wiki markup.
    # Cleaned means a single stripped string without line breaks, URIs,
    # markup signs ([ ] { } | =), multiple spaces and section numbers.
    def extract_clean(wiki_text)
      wiki_text = extract wiki_text

      # Order matters: section numbers are anchored at line starts, so they
      # have to be removed before the line breaks are collapsed.
      wiki_text = remove_section_numbering_from wiki_text
      wiki_text = remove_line_breaks_from wiki_text
      wiki_text = remove_uris_from wiki_text
      wiki_text = remove_special_signes_from wiki_text
      wiki_text = remove_multiple_spaces_from wiki_text
      wiki_text.strip
    end

    private

    # Removes 1., 1.1., 2.3.4. etc. at the beginning of a line.
    # NOTE(review): only single-digit components are matched, so a section
    # number such as "10." is left in place - confirm whether that is wanted.
    def remove_section_numbering_from(text)
      text.gsub(/^(\d\.)+/, '')
    end

    # Replaces every run of line breaks by a single space.
    def remove_line_breaks_from(text)
      text.gsub(/\n+/, ' ')
    end

    # Collapses every run of whitespace into a single space.
    def remove_multiple_spaces_from(text)
      text.gsub(/\s+/, ' ')
    end

    # Removes http(s) and ftp URIs (optional spaces around the colon are
    # tolerated).
    def remove_uris_from(text)
      text.gsub(%r{(https?|ftp)\s?:\s?\/\/[^\s\/$.?#].[^\s]*}i, '')
    end

    # Replaces each of the signs [ ] { } | = by a space.
    # A character class is required here: the former pattern
    # /\[\]\{\}\|\=/ only matched the literal sequence "[]{}|=" and thus
    # left single markup signs untouched.
    def remove_special_signes_from(text)
      text.gsub(/[\[\]{}|=]/, ' ')
    end
  end
end
|
79
|
+
end
|
80
|
+
end
|
@@ -0,0 +1,11 @@
|
|
1
|
+
module Wikipedia
  module VandalismDetection
    module WordLists
      # Colloquial and sloppy terms, stored as a frozen array of symbols.
      BAD = %i[
        666 da dont dosent whatever guy hi nazi sup
        guise loser thats ugly wanna whats wont gotta bloody
        fart pot prick stink smells smelly alot dunno gotcha
      ].freeze
    end
  end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
module Wikipedia
|
2
|
+
module VandalismDetection
|
3
|
+
module WordLists
|
4
|
+
BIASED = %i[
|
5
|
+
acclaimed amazing astonishing authoritative beautiful best brilliant
|
6
|
+
canonical celebrated charismatic classic cutting-edge defining
|
7
|
+
definitive eminent enigma exciting extraordinary fabulous famous
|
8
|
+
infamous fantastic fully genius global great greatest iconic immensely
|
9
|
+
impactful incendiary indisputable influential innovative inspired
|
10
|
+
intriguing leader leading legendary major masterly mature memorable
|
11
|
+
notable outstanding pioneer popular prestigious really remarkable
|
12
|
+
renowned respected seminal significant skillful solution single-handedly
|
13
|
+
staunch talented most top transcendent undoubtedly unique visionary
|
14
|
+
virtually virtuoso well-known well-established world-class worst coolest
|
15
|
+
super probably hate ugly fat lame weird strange everyone cares boring
|
16
|
+
boreing ever huge like idiotic absolute total totally
|
17
|
+
].freeze
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
module Wikipedia
  module VandalismDetection
    module WordLists
      # Emoticons stored as a frozen array of symbols. Most entries carry
      # backslash escapes in front of punctuation.
      #
      # NOTE(review): the escaping suggests the entries are meant to be used
      # as regular expression fragments. If so, entries such as :':\-\\'
      # (ends in a single backslash), :'\\:\(' and the unescaped "$" in
      # :':\-$' / :':$' probably do not match what was intended - verify
      # against the consuming features.
      EMOTICONS = [
        :':\)', :':p', :':\(', :';\)', :':D', :';D', :';P', :';p', :':\-\)',
        :':\-\(', :';\-\)', :':\-D', :':\-p', :':\-P', :'8\-\)', :'8\)',
        :'\^\^', :'\*_\*', :'\^_\^', :':\-I', :':\-X', :':\-x', :'X\-p',
        :'X\-P', :':\-\]', :'\^\.\^', :':\*', :':\-\*', :XD, :'X\-D', :'8\-D',
        :'8D', :':\-O', :':\-o', :':\-\|', :'X\-\(', :'X\(', :'\-_\-', :':o\)',
        :':O\)', :'B\-\)', :':O', :':o', :':\-s', :':\-S', :':\-\/', :':\-\\',
        :T_T, :':\*\(', :':\*\-\(', :':\(\(', :'\*\-\*', :':\-\[', :':\->',
        :':\|', :':\-\|', :':\]', :':\[', :'\/:\(', :'\\:\(', :':\-$', :':$',
        :':\-6', :':\-9', :'@_@', :'<3', :'\|\-D', :':0', :':\-0', :o_O,
        :oO, :'\(\-:', :'\(\-;', :'\(:', :'\):', :'\)\-:', :'\(;', :'\(y\)',
        :'\(\.\)\(\.\)', :O_O, :'0_0', :'8\-\[', :'8\-\]', :'8\[', :'8\]',
        :'8\-\(', :'8\(', :':\-', :'%\)', :'%\-\)', :'8\|', :'8\-\|', :'=\)',
        :':\]', :':>', :':c\)', :'\[:', :'<:', :'c:', :'\(x', :'\(o:', :'\(c:',
        :'D:', :':\'\(', :':\'C', :';\(', :';o\)', :'\(o;', :':b', :':p', :'=P',
        :':P', :dx, :xP, :'d\-:', :'d:', :'q:', :'d=', :'d;', :'c\(:', :'=D',
        :'=\-D', :'=O', :'=o', :'=0', :'o=', :'O=', :'0=', :'\^_~', :'>_<',
        # "=[" and "=]" are two separate emoticons; a misplaced quote used to
        # fuse them into the single bogus symbol :'=\[, :=\]'.
        :'~_~', :'>:', :':<', :'\(Y\)', :'\(=', :'\)=', :'=\(', :'=\[', :'=\]',
        :'\[=', :'\]='
      ].freeze
    end
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module Wikipedia
  module VandalismDetection
    module WordLists
      # Wiki markup tokens stored as a frozen array of symbols. Every token
      # is listed exactly once (:defaultsort used to appear twice).
      MARKUP = [
        :'\{\{',
        :'\[\[',
        :infobox,
        :category,
        :defaultsort,
        :'<ref>',
        :cite,
        :__toc__,
        :__forcetoc__,
        :reflist
      ].freeze
    end
  end
end
|
@@ -0,0 +1,11 @@
|
|
1
|
+
module Wikipedia
  module VandalismDetection
    module WordLists
      # First and second person pronouns (including dialect forms), stored as
      # a frozen array of symbols. Every pronoun is listed exactly once; the
      # repeated entries y'all, youse and yourselves have been dropped.
      PRONOUNS = %i[
        i me myself mine my we us ourselves ourself ours our you yourself yours
        your thou thee thyself thine thy yourselves y'all youse you-uns
        yous yis y'all's selves yous's
      ].freeze
    end
  end
end
|
@@ -0,0 +1,96 @@
|
|
1
|
+
module Wikipedia
|
2
|
+
module VandalismDetection
|
3
|
+
module WordLists
|
4
|
+
# This list is taken from https://github.com/snipe/banbuilder and can be
|
5
|
+
# downloaded from:
|
6
|
+
# https://github.com/snipe/banbuilder/blob/master/word-dbs/wordlist.csv
|
7
|
+
VULGARISM = %i[
|
8
|
+
$#!+ $1ut $h1t $hit $lut 'ho 'hobag a$$ anus ass assmunch b1tch
|
9
|
+
ballsack bastard beaner beastiality biatch beeyotch bitchy
|
10
|
+
blow blowjob bollock bollocks bollok boner boob bugger buttplug
|
11
|
+
c-0-c-k c-o-c-k c-u-n-t c.0.c.k c.o.c.k. c.u.n. jerk jackoff
|
12
|
+
jackhole j3rk0ff homo hom0 hobag hell h0mo h0m0 goddamn goddammit
|
13
|
+
godamnit ghey ghay gfy gay fudgepacker fuckwad fucktard fuckoff
|
14
|
+
fucker fuck-tard fuck fellatio fellate felching felcher felch
|
15
|
+
fartknocker fart fannybandit fanny faggot fagg fag f.u.c.k f-u-c-k
|
16
|
+
dyke douchebag douche douch3 doosh dike dick damnit damn dammit d1ldo
|
17
|
+
d1ld0 d1ck d0uche d0uch3 cunt cumstain cum crap coon cock clitoris
|
18
|
+
clit cl1t cawk c0ck jerk0ff jerkoff jizz knobend labia lmfao moolie
|
19
|
+
muff nigga nigger p.u.s.s.y. piss piss-off pissoff prick pube pussy
|
20
|
+
queer retard retarded s-h-1-t s-h-i-t s.h.i.t. scrotum sh1t shit slut
|
21
|
+
smegma t1t tard terd tit tits titties turd twat vag wank wetback
|
22
|
+
whore whoreface 'f*ck' sh*t pu$$y p*ssy diligaf wtf stfu fu*ck fack
|
23
|
+
shite fxck sh!t @sshole assh0le assho!e a$$hole a$$h0le a$$h0!e
|
24
|
+
a$$h01e assho1e wh0re f@g f@gg0t f@ggot motherf*cker mofo cuntlicker
|
25
|
+
cuntface dickbag cockknocker beatch fucknut nucking futs mams cunny
|
26
|
+
quim clitty kike spic wop chink humper feltch feltcher fvck ahole
|
27
|
+
nads spick douchey bullturds gonads bitch butt fellatio lmao s-o-b
|
28
|
+
spunk he11 jizm jism bukkake shiz wigger gook ritard reetard
|
29
|
+
masterbate masturbate goatse masterbating masturbating hitler nazi
|
30
|
+
tubgirl gtfo foad r-tard rtard hoor g-spot gspot vulva assmaster
|
31
|
+
viagra phuck frack fuckwit assbang assbanged assbangs asshole
|
32
|
+
assholes asswipe asswipes b1tch bastards bitched bitches boners
|
33
|
+
bullshit bullshits bullshitted cameltoe chinc chincs chink chode
|
34
|
+
chodes clit clits cocks coons cumming cunts d1ck dickhead dickheads
|
35
|
+
doggie-style douchebags dumass dumbass dumbasses dykes faggit fags
|
36
|
+
fucked fucker fuckface fucks godamnit gooks humped humping jackass
|
37
|
+
jap japs jerk jizzed kikes knobend kooch kooches kootch fuckers
|
38
|
+
motherfucking niggah niggas niggers p.u.s.s.y. pussies queers rim s0b
|
39
|
+
shitface shithead shits shitted s.o.b. spik spiks twats whack whores
|
40
|
+
zoophile m-fucking mthrfucking muthrfucking mutherfucking
|
41
|
+
mutherfucker mtherfucker mthrfucker mthrf*cker whorehopper copulator
|
42
|
+
whoralicious whorealicious aeolus analprobe areola areole aryan arian
|
43
|
+
asses assfuck azazel baal babes bang banger barf bawdy beardedclam
|
44
|
+
beater beaver beer bigtits bimbo blew blow blowjobs blowup bod bodily
|
45
|
+
boink bone boned bong boobies boobs booby booger bookie booky bootee
|
46
|
+
bootie booty booze boozer boozy bosom bosomy bowel bowels bra
|
47
|
+
brassiere bung babe bush buttfuck cocaine kinky klan panties
|
48
|
+
pedophile pedophilia pedophiliac punkass queaf rape scantily essohbee
|
49
|
+
shithouse smut snatch toots doggie anorexia bulimia bulimiic burp
|
50
|
+
busty buttfucker caca cahone carnal carpetmuncher cervix climax
|
51
|
+
cocain cocksucker coital coke commie condom corpse coven crabs crack
|
52
|
+
crackwhore crappy cuervo cummin cumshot cumshots cunnilingus dago
|
53
|
+
dagos damned dick-ish dickish dickweed anorexic prostitute marijuana
|
54
|
+
lsd pcp diddle dawgie-style dimwit dingle doofus dopey douche drunk
|
55
|
+
dummy ejaculate enlargement erect erotic exotic extacy extasy faerie
|
56
|
+
faery fagged fagot fairy fisted fisting fisty floozy fondle foobar
|
57
|
+
foreskin frigg frigga fubar fucking fuckup ganja gays glans godamn
|
58
|
+
goddam goldenshower gonad gonads handjob hebe hemp heroin herpes
|
59
|
+
hijack hiv homey honky hooch hookah hooker hootch hooter hooters hump
|
60
|
+
hussy hymen inbred incest injun jerked jiz jizm horny junkie junky
|
61
|
+
kill kkk kraut kyke lech leper lesbians lesbos lez lezbian lezbians
|
62
|
+
lezbo lezbos lezzie lezzies lezzy loin loins lube lust lusty massa
|
63
|
+
masterbation masturbation maxi menses menstruate menstruation meth
|
64
|
+
molest moron motherfucka motherfucker murder muthafucker nad naked
|
65
|
+
napalm nappy nazism negro niggle nimrod ninny nooky nympho opiate
|
66
|
+
opium oral orally organ orgasm orgies orgy ovary ovum ovums paddy
|
67
|
+
pantie panty pastie pasty pecker pedo pee peepee penetrate
|
68
|
+
penetration penial penile perversion peyote phalli phallic
|
69
|
+
pillowbiter pimp pinko pissed pms polack porn porno pornography pot
|
70
|
+
potty prig prude pubic pubis punky puss queef queefing quife quicky
|
71
|
+
racist racy raped raper rapist raunch rectal rectum rectus reefer
|
72
|
+
reich revue risque rum rump sadism sadist satan scag schizo screw
|
73
|
+
screwed scrog scrot scrote scrud scum seaman seamen seduce semen
|
74
|
+
sex_story sexual shithole shitter shitty s*o*b sissy skag slave
|
75
|
+
sleaze sleazy sluts smutty sniper snuff sodom souse soused sperm
|
76
|
+
spooge stab steamy stiffy stoned strip stroke whacking suck sucked
|
77
|
+
sucking tampon tawdry teat teste testee testes testis thrust thug
|
78
|
+
tinkle titfuck titi titty whacked toke tramp trashy tush undies unwed
|
79
|
+
urinal urine uterus uzi valium virgin vixen vodka vomit voyeur vulgar
|
80
|
+
wad wazoo wedgie weed weenie weewee weiner weirdo wench whitey whiz
|
81
|
+
whored whorehouse whoring womb woody x-rated xxx b@lls yeasty yobbo
|
82
|
+
sumofabiatch doggy-style doggy wang dong d0ng w@ng wh0reface
|
83
|
+
wh0ref@ce wh0r3f@ce tittyfuck tittyfucker tittiefucker cockholster
|
84
|
+
cockblock gai gey faig faigt a55 a55hole gae corksucker rumprammer
|
85
|
+
slutdumper niggaz muthafuckaz gigolo pussypounder herp herpy
|
86
|
+
transsexual orgasmic cunilingus anilingus dickdipper dickwhipper
|
87
|
+
dicksipper dickripper dickflipper dickzipper homoey queero freex
|
88
|
+
cunthunter shamedame slutkiss shiteater fuckass fucka$$ clitorus
|
89
|
+
assfucker assfuckers dillweed cracker teabagging shitt azz fuk
|
90
|
+
fucknugget cuntlick g@y @ss beotch pussys 's***' paedophile
|
91
|
+
pedophiles pedophile sucks licker lickers bitchface idiot tosser
|
92
|
+
idiots tossers
|
93
|
+
].freeze
|
94
|
+
end
|
95
|
+
end
|
96
|
+
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
require 'wikipedia/vandalism_detection/word_lists/bad'
|
2
|
+
require 'wikipedia/vandalism_detection/word_lists/biased'
|
3
|
+
require 'wikipedia/vandalism_detection/word_lists/pronouns'
|
4
|
+
require 'wikipedia/vandalism_detection/word_lists/sex'
|
5
|
+
require 'wikipedia/vandalism_detection/word_lists/vulgarism'
|
6
|
+
require 'wikipedia/vandalism_detection/word_lists/markup'
|
7
|
+
|
8
|
+
module Wikipedia
  module VandalismDetection
    module WordLists
      # Returns an array of all wordlist words (BAD, BIASED, PRONOUNS, SEX
      # and VULGARISM) without duplicates.
      #
      # Array#uniq is used on purpose: Array#uniq! returns nil when the array
      # contains no duplicates, which would make this method return nil as
      # soon as the lists stop overlapping.
      def self.all
        [*BAD, *BIASED, *PRONOUNS, *SEX, *VULGARISM].uniq
      end
    end
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
require 'wikipedia'
|
2
|
+
require 'wikipedia/vandalism_detection/version'
|
3
|
+
require 'wikipedia/vandalism_detection/configuration'
|
4
|
+
require 'wikipedia/vandalism_detection/exceptions'
|
5
|
+
|
6
|
+
require 'wikipedia/vandalism_detection/text'
|
7
|
+
require 'wikipedia/vandalism_detection/revision'
|
8
|
+
require 'wikipedia/vandalism_detection/edit'
|
9
|
+
require 'wikipedia/vandalism_detection/page'
|
10
|
+
require 'wikipedia/vandalism_detection/page_parser'
|
11
|
+
require 'wikipedia/vandalism_detection/revision_parser'
|
12
|
+
|
13
|
+
require 'wikipedia/vandalism_detection/word_lists'
|
14
|
+
require 'wikipedia/vandalism_detection/diff'
|
15
|
+
require 'wikipedia/vandalism_detection/wikitext_extractor'
|
16
|
+
require 'wikipedia/vandalism_detection/features'
|
17
|
+
require 'wikipedia/vandalism_detection/feature_calculator'
|
18
|
+
|
19
|
+
require 'wikipedia/vandalism_detection/instances'
|
20
|
+
require 'wikipedia/vandalism_detection/training_dataset'
|
21
|
+
require 'wikipedia/vandalism_detection/test_dataset'
|
22
|
+
require 'wikipedia/vandalism_detection/classifier'
|
23
|
+
require 'wikipedia/vandalism_detection/evaluator'
|
24
|
+
|
25
|
+
require 'weka/classifiers/meta/one_class_classifier'
|
26
|
+
require 'weka/classifiers/meta/real_ada_boost'
|
27
|
+
require 'weka/classifiers/trees/balanced_random_forest'
|
28
|
+
|
29
|
+
require 'weka/filters/supervised/instance/smote'
|
data/lib/wikipedia.rb
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
require 'open-uri'
|
2
|
+
require 'nokogiri'
|
3
|
+
require 'timeout'
|
4
|
+
|
5
|
+
# Helpers for querying the English Wikipedia API and parsing its XML answer.
module Wikipedia
  # Returns the base URI (ending in '&') that query parameters are appended to.
  def self.api_base_uri
    'https://en.wikipedia.org/w/api.php?format=xml&action=query&'
  end

  # Joins the given params into a raw "key=value&key=value" string.
  # No escaping is applied to keys or values.
  def self.param_string(params)
    params.map { |k, v| "#{k}=#{v}" }.join('&')
  end

  # Builds the complete, escaped request URI for the given query params.
  #
  # URI.encode_www_form escapes every key and value individually, so
  # characters such as '&', '=' or '+' inside a value can no longer break
  # the query string. It also replaces URI.encode, which was removed in
  # Ruby 3.0.
  def self.build_request_uri(params = {})
    api_base_uri + URI.encode_www_form(params)
  end
  private_class_method :build_request_uri

  # Reads the content behind +uri+ and returns it as a string.
  #
  # Every attempt is limited to +timeout+ seconds. If an attempt raises any
  # StandardError (timeouts included), the request is repeated up to +times+
  # additional times. When all attempts fail, a warning is printed and an
  # empty string is returned.
  def self.request_with_retry(uri, times = 1, timeout = 5)
    content = ''

    begin
      Timeout.timeout(timeout) do
        content = URI.parse(uri).read
      end
    rescue => error
      if times > 0
        times -= 1
        retry
      else
        warn "Requesting '#{uri}' failed multiple times.\n#{error.message}"
      end
    end

    content
  end

  # Sends a query with the given params to the API (retrying up to 3 times)
  # and returns the response parsed as a Nokogiri XML document.
  def api_request(params = {})
    uri = build_request_uri(params)
    content = request_with_retry(uri, 3)
    Nokogiri::XML(content)
  end

  module_function :api_request
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
# Factories for Edit objects, each built from an old and a new revision.
FactoryBot.define do
  # Edit between the :old_revision and :new_revision factories.
  factory(:edit, class: Wikipedia::VandalismDetection::Edit) do
    old_revision { FactoryBot.build(:old_revision) }
    new_revision { FactoryBot.build(:new_revision) }
    page_id { nil }
    page_title { nil }

    # Edit takes the revisions positionally and the page data as keywords.
    initialize_with do
      new(old_revision, new_revision, page_id: page_id, page_title: page_title)
    end
  end

  # Same as :edit, but the new revision comes from :anonymous_revision.
  factory(:anonymous_edit, class: Wikipedia::VandalismDetection::Edit) do
    old_revision { FactoryBot.build(:old_revision) }
    new_revision { FactoryBot.build(:anonymous_revision) }
    page_id { nil }
    page_title { nil }

    initialize_with do
      new(old_revision, new_revision, page_id: page_id, page_title: page_title)
    end
  end
end
|
@@ -0,0 +1,12 @@
|
|
1
|
+
# Factory for a Page that already holds three consecutive revisions.
FactoryBot.define do
  factory(:page, class: Wikipedia::VandalismDetection::Page) do
    id { nil }
    title { nil }

    # After building, attach the revisions in chronological factory order,
    # all with the contributor 'User'.
    after(:build) do |page|
      %i[old_revision new_revision even_newer_revision].each do |revision_factory|
        page.add_revision FactoryBot.build(revision_factory, contributor: 'User')
      end
    end
  end
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
# Factories for Revision objects used across the specs.
FactoryBot.define do
  # Revision without id, parent, timestamp and with empty text and comment.
  factory(:empty_revision, class: Wikipedia::VandalismDetection::Revision) do
    id { nil }
    parent_id { nil }
    timestamp { nil }
    text { Wikipedia::VandalismDetection::Text.new }
    comment { Wikipedia::VandalismDetection::Text.new }
  end

  # First revision (id '1') without a parent.
  factory(:old_revision, class: Wikipedia::VandalismDetection::Revision) do
    id { '1' }
    parent_id { nil }
    timestamp { nil }
    text { Wikipedia::VandalismDetection::Text.new('text 1') }
    comment { Wikipedia::VandalismDetection::Text.new }
  end

  # Second revision (id '2') following revision '1'.
  factory(:new_revision, class: Wikipedia::VandalismDetection::Revision) do
    id { '2' }
    parent_id { '1' }
    timestamp { '2014-11-27T18:00:00Z' }
    text { Wikipedia::VandalismDetection::Text.new('text 2') }
    comment { Wikipedia::VandalismDetection::Text.new }
  end

  # Third revision (id '3') following revision '2', one day later.
  factory(:even_newer_revision, class: Wikipedia::VandalismDetection::Revision) do
    id { '3' }
    parent_id { '2' }
    timestamp { '2014-11-28T18:00:00Z' }
    text { Wikipedia::VandalismDetection::Text.new('text 3') }
    comment { Wikipedia::VandalismDetection::Text.new }
  end

  # Revision '2' whose contributor is an IP address.
  factory(:anonymous_revision, class: Wikipedia::VandalismDetection::Revision) do
    id { '2' }
    parent_id { '1' }
    timestamp { '2014-11-27T18:00:00Z' }
    text { Wikipedia::VandalismDetection::Text.new('text 2') }
    comment { Wikipedia::VandalismDetection::Text.new }
    contributor { '127.0.0.1' }
  end

  # Revision '2' whose contributor is the non-IP value '12345'.
  factory(:registered_revision, class: Wikipedia::VandalismDetection::Revision) do
    id { '2' }
    parent_id { '1' }
    timestamp { '2014-11-27T18:00:00Z' }
    text { Wikipedia::VandalismDetection::Text.new('text 2') }
    comment { Wikipedia::VandalismDetection::Text.new }
    contributor { '12345' }
  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
corpora:
|
2
|
+
base_directory: ../../../../spec/resources/corpora
|
3
|
+
|
4
|
+
training:
|
5
|
+
base_directory: training
|
6
|
+
annotations_file: annotations.csv
|
7
|
+
edits_file: edits.csv
|
8
|
+
revisions_directory: revisions
|
9
|
+
|
10
|
+
test:
|
11
|
+
base_directory: test
|
12
|
+
edits_file: edits.csv
|
13
|
+
revisions_directory: revisions
|
14
|
+
ground_truth_file: ground-truth.txt
|
15
|
+
|
16
|
+
output:
|
17
|
+
base_directory: ../../../../spec/resources/build
|
18
|
+
training:
|
19
|
+
index_file: training_index.yml
|
20
|
+
arff_file: training.arff
|
21
|
+
test:
|
22
|
+
index_file: test_index.yml
|
23
|
+
arff_file: test.arff
|
24
|
+
classification_file: classification.txt
|
25
|
+
|
26
|
+
features:
|
27
|
+
- anonymity
|
28
|
+
- character sequence
|
29
|
+
- comment length
|
30
|
+
|
31
|
+
classifier:
|
32
|
+
type: Trees::RandomForest
|
33
|
+
options: -I 10
|
34
|
+
cross-validation-fold: 2
|
35
|
+
training-data-options: unbalanced
|
@@ -0,0 +1,8 @@
|
|
1
|
+
"editid","editor","oldrevisionid","newrevisionid","diffurl","edittime","editcomment","articleid","articletitle"
|
2
|
+
1641,"137.163.16.199",328774088,328774188,"http://en.wikipedia.org/w/index.php?diff=328774188&oldid=328774088","2009-11-30T10:23:13Z","/* Location */",100935,"Not annotated article"
|
3
|
+
1642,"J04n",307084144,326873205,"http://en.wikipedia.org/w/index.php?diff=326873205&oldid=307084144","2009-11-20T04:42:24Z","Repairing links to disambiguation pages - [[Wikipedia:Disambiguation pages with links|You can help!]]",19490449,"The Soundstage Sessions"
|
4
|
+
1643,"64.186.73.198",326471754,326978767,"http://en.wikipedia.org/w/index.php?diff=326978767&oldid=326471754","2009-11-20T19:32:23Z","/* Non-electric telephones */",2193804,"Invention of the telephone"
|
5
|
+
1644,"64.186.73.198",326471754,326978767,"http://en.wikipedia.org/w/index.php?diff=326978767&oldid=326471754","2009-11-20T19:32:23Z","/* Non-electric telephones */",2193804,"Invention of the telephone"
|
6
|
+
1647,"137.163.16.199",328774035,328774110,"http://en.wikipedia.org/w/index.php?diff=328774110&oldid=328774035","2009-11-30T10:23:13Z","/* Location */",100935,"Chinatown, Manhattan"
|
7
|
+
1648,"137.163.16.199",328774035,328774110,"http://en.wikipedia.org/w/index.php?diff=328774110&oldid=328774035","2009-11-30T10:23:13Z","/* Location */",100935,"Chinatown, Manhattan"
|
8
|
+
1649,"137.163.16.199",328774035,328774110,"http://en.wikipedia.org/w/index.php?diff=328774110&oldid=328774035","2009-11-30T10:23:13Z","/* Location */",100935,"Chinatown, Manhattan"
|