wikipedia-vandalism_detection 0.1.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +19 -0
- data/.rspec +2 -0
- data/.rubocop.yml +35 -0
- data/.travis.yml +6 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +4 -0
- data/README.md +288 -0
- data/Rakefile +11 -0
- data/config/wikipedia-vandalism-detection.yml.example +103 -0
- data/lib/java/SMOTE.jar +0 -0
- data/lib/java/balancedRandomForest.jar +0 -0
- data/lib/java/diffutils-1.3.0.jar +0 -0
- data/lib/java/oneClassClassifier.jar +0 -0
- data/lib/java/realAdaBoost.jar +0 -0
- data/lib/java/swc-engine-1.1.0-jar-with-dependencies.jar +0 -0
- data/lib/java/sweble-wikitext-extractor.jar +0 -0
- data/lib/weka/classifiers/meta/one_class_classifier.rb +21 -0
- data/lib/weka/classifiers/meta/real_ada_boost.rb +15 -0
- data/lib/weka/classifiers/trees/balanced_random_forest.rb +16 -0
- data/lib/weka/filters/supervised/instance/smote.rb +15 -0
- data/lib/wikipedia/vandalism_detection/algorithms/kullback_leibler_divergence.rb +103 -0
- data/lib/wikipedia/vandalism_detection/algorithms.rb +12 -0
- data/lib/wikipedia/vandalism_detection/classifier.rb +202 -0
- data/lib/wikipedia/vandalism_detection/configuration.rb +350 -0
- data/lib/wikipedia/vandalism_detection/diff.rb +36 -0
- data/lib/wikipedia/vandalism_detection/edit.rb +81 -0
- data/lib/wikipedia/vandalism_detection/evaluator.rb +640 -0
- data/lib/wikipedia/vandalism_detection/exceptions.rb +47 -0
- data/lib/wikipedia/vandalism_detection/feature_calculator.rb +94 -0
- data/lib/wikipedia/vandalism_detection/features/all_wordlists_frequency.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/all_wordlists_impact.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/anonymity.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/anonymity_previous.rb +32 -0
- data/lib/wikipedia/vandalism_detection/features/article_size.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/bad_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/bad_impact.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/base.rb +61 -0
- data/lib/wikipedia/vandalism_detection/features/biased_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/biased_impact.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/blanking.rb +26 -0
- data/lib/wikipedia/vandalism_detection/features/character_diversity.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/character_sequence.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/comment_bad_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/comment_biased_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/comment_length.rb +16 -0
- data/lib/wikipedia/vandalism_detection/features/comment_markup_frequency.rb +30 -0
- data/lib/wikipedia/vandalism_detection/features/comment_pronoun_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/comment_sex_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/comment_vulgarism_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/compressibility.rb +31 -0
- data/lib/wikipedia/vandalism_detection/features/contains_base.rb +16 -0
- data/lib/wikipedia/vandalism_detection/features/copyedit.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/digit_ratio.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/edits_per_user.rb +72 -0
- data/lib/wikipedia/vandalism_detection/features/emoticons_frequency.rb +27 -0
- data/lib/wikipedia/vandalism_detection/features/emoticons_impact.rb +29 -0
- data/lib/wikipedia/vandalism_detection/features/frequency_base.rb +18 -0
- data/lib/wikipedia/vandalism_detection/features/impact_base.rb +25 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_character_distribution.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_external_links.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_internal_links.rb +18 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_size.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/inserted_words.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/longest_word.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/markup_frequency.rb +29 -0
- data/lib/wikipedia/vandalism_detection/features/markup_impact.rb +30 -0
- data/lib/wikipedia/vandalism_detection/features/non_alphanumeric_ratio.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/personal_life.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/pronoun_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/pronoun_impact.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/removed_all_wordlists_frequency.rb +22 -0
- data/lib/wikipedia/vandalism_detection/features/removed_bad_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_biased_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_character_distribution.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/removed_emoticons_frequency.rb +28 -0
- data/lib/wikipedia/vandalism_detection/features/removed_markup_frequency.rb +30 -0
- data/lib/wikipedia/vandalism_detection/features/removed_pronoun_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_sex_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_size.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/removed_vulgarism_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/removed_words.rb +17 -0
- data/lib/wikipedia/vandalism_detection/features/replacement_similarity.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/reverted.rb +16 -0
- data/lib/wikipedia/vandalism_detection/features/revisions_character_distribution.rb +25 -0
- data/lib/wikipedia/vandalism_detection/features/same_editor.rb +32 -0
- data/lib/wikipedia/vandalism_detection/features/sex_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/sex_impact.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/size_increment.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features/size_ratio.rb +28 -0
- data/lib/wikipedia/vandalism_detection/features/term_frequency.rb +31 -0
- data/lib/wikipedia/vandalism_detection/features/time_interval.rb +38 -0
- data/lib/wikipedia/vandalism_detection/features/time_of_day.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/upper_case_ratio.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/upper_case_words_ratio.rb +33 -0
- data/lib/wikipedia/vandalism_detection/features/upper_to_lower_case_ratio.rb +23 -0
- data/lib/wikipedia/vandalism_detection/features/vulgarism_frequency.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/vulgarism_impact.rb +21 -0
- data/lib/wikipedia/vandalism_detection/features/weekday.rb +19 -0
- data/lib/wikipedia/vandalism_detection/features/words_increment.rb +20 -0
- data/lib/wikipedia/vandalism_detection/features.rb +66 -0
- data/lib/wikipedia/vandalism_detection/instances.rb +121 -0
- data/lib/wikipedia/vandalism_detection/page.rb +101 -0
- data/lib/wikipedia/vandalism_detection/page_parser.rb +63 -0
- data/lib/wikipedia/vandalism_detection/revision.rb +75 -0
- data/lib/wikipedia/vandalism_detection/revision_parser.rb +67 -0
- data/lib/wikipedia/vandalism_detection/test_dataset.rb +374 -0
- data/lib/wikipedia/vandalism_detection/text.rb +23 -0
- data/lib/wikipedia/vandalism_detection/training_dataset.rb +282 -0
- data/lib/wikipedia/vandalism_detection/version.rb +5 -0
- data/lib/wikipedia/vandalism_detection/wikitext_extractor.rb +80 -0
- data/lib/wikipedia/vandalism_detection/word_lists/bad.rb +11 -0
- data/lib/wikipedia/vandalism_detection/word_lists/biased.rb +20 -0
- data/lib/wikipedia/vandalism_detection/word_lists/emoticons.rb +26 -0
- data/lib/wikipedia/vandalism_detection/word_lists/markup.rb +19 -0
- data/lib/wikipedia/vandalism_detection/word_lists/pronouns.rb +11 -0
- data/lib/wikipedia/vandalism_detection/word_lists/sex.rb +10 -0
- data/lib/wikipedia/vandalism_detection/word_lists/vulgarism.rb +96 -0
- data/lib/wikipedia/vandalism_detection/word_lists.rb +17 -0
- data/lib/wikipedia/vandalism_detection.rb +29 -0
- data/lib/wikipedia.rb +41 -0
- data/spec/factories/edit.rb +19 -0
- data/spec/factories/page.rb +12 -0
- data/spec/factories/revision.rb +51 -0
- data/spec/resources/config/wikipedia-vandalism-detection.yml +35 -0
- data/spec/resources/corpora/test/edits.csv +8 -0
- data/spec/resources/corpora/test/ground-truth.txt +3 -0
- data/spec/resources/corpora/test/revisions/part-1/326471754.txt +199 -0
- data/spec/resources/corpora/test/revisions/part-1/326873205.txt +137 -0
- data/spec/resources/corpora/test/revisions/part-1/328774035.txt +162 -0
- data/spec/resources/corpora/test/revisions/part-2/307084144.txt +137 -0
- data/spec/resources/corpora/test/revisions/part-2/326978767.txt +199 -0
- data/spec/resources/corpora/test/revisions/part-2/328774110.txt +162 -0
- data/spec/resources/corpora/training/annotations.csv +7 -0
- data/spec/resources/corpora/training/edits.csv +7 -0
- data/spec/resources/corpora/training/revisions/part-1/326471754.txt +199 -0
- data/spec/resources/corpora/training/revisions/part-1/326873205.txt +137 -0
- data/spec/resources/corpora/training/revisions/part-1/328774035.txt +162 -0
- data/spec/resources/corpora/training/revisions/part-2/307084144.txt +137 -0
- data/spec/resources/corpora/training/revisions/part-2/326978767.txt +199 -0
- data/spec/resources/corpora/training/revisions/part-2/328774110.txt +162 -0
- data/spec/resources/page_with_redirects.xml +85 -0
- data/spec/resources/redirect_page.xml +59 -0
- data/spec/resources/revision_simplified.xml +13 -0
- data/spec/resources/sample_revision.txt +137 -0
- data/spec/resources/sample_revision_clean_text.txt +1 -0
- data/spec/resources/sample_revision_plain_text.txt +183 -0
- data/spec/resources/vandalism_on_wikipedia.xml +234 -0
- data/spec/resources/vandalism_on_wikipedia_simplified.xml +119 -0
- data/spec/resources/wikipedia_tokens.txt +30 -0
- data/spec/spec_helper.rb +38 -0
- data/spec/support/macros/file_reading.rb +6 -0
- data/spec/support/macros/test_configuration.rb +81 -0
- data/spec/vandalism_detection/algorithms/kullback_leibler_divergence_spec.rb +34 -0
- data/spec/vandalism_detection/classifier_spec.rb +330 -0
- data/spec/vandalism_detection/configuration_spec.rb +601 -0
- data/spec/vandalism_detection/diff_spec.rb +40 -0
- data/spec/vandalism_detection/edit_spec.rb +122 -0
- data/spec/vandalism_detection/evaluator_spec.rb +711 -0
- data/spec/vandalism_detection/feature_calculator_spec.rb +135 -0
- data/spec/vandalism_detection/features/all_wordlists_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/all_wordlists_impact_spec.rb +52 -0
- data/spec/vandalism_detection/features/anonymity_previous_spec.rb +67 -0
- data/spec/vandalism_detection/features/anonymity_spec.rb +17 -0
- data/spec/vandalism_detection/features/article_size_spec.rb +29 -0
- data/spec/vandalism_detection/features/bad_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/bad_impact_spec.rb +53 -0
- data/spec/vandalism_detection/features/base_spec.rb +41 -0
- data/spec/vandalism_detection/features/biased_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/biased_impact_spec.rb +52 -0
- data/spec/vandalism_detection/features/blanking_spec.rb +35 -0
- data/spec/vandalism_detection/features/character_diversity_spec.rb +30 -0
- data/spec/vandalism_detection/features/character_sequence_spec.rb +31 -0
- data/spec/vandalism_detection/features/comment_bad_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_biased_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_length_spec.rb +21 -0
- data/spec/vandalism_detection/features/comment_markup_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_pronoun_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_sex_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/comment_vulgarism_frequency_spec.rb +28 -0
- data/spec/vandalism_detection/features/compressibility_spec.rb +38 -0
- data/spec/vandalism_detection/features/contains_base_spec.rb +27 -0
- data/spec/vandalism_detection/features/copyedit_spec.rb +30 -0
- data/spec/vandalism_detection/features/digit_ratio_spec.rb +30 -0
- data/spec/vandalism_detection/features/edits_per_user_spec.rb +51 -0
- data/spec/vandalism_detection/features/emoticons_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/emoticons_impact_spec.rb +45 -0
- data/spec/vandalism_detection/features/frequency_base_spec.rb +20 -0
- data/spec/vandalism_detection/features/impact_base_spec.rb +36 -0
- data/spec/vandalism_detection/features/inserted_character_distribution_spec.rb +40 -0
- data/spec/vandalism_detection/features/inserted_external_links_spec.rb +29 -0
- data/spec/vandalism_detection/features/inserted_internal_links_spec.rb +29 -0
- data/spec/vandalism_detection/features/inserted_size_spec.rb +29 -0
- data/spec/vandalism_detection/features/inserted_words_spec.rb +29 -0
- data/spec/vandalism_detection/features/longest_word_spec.rb +29 -0
- data/spec/vandalism_detection/features/markup_frequency_spec.rb +31 -0
- data/spec/vandalism_detection/features/markup_impact_spec.rb +53 -0
- data/spec/vandalism_detection/features/non_alphanumeric_ratio_spec.rb +30 -0
- data/spec/vandalism_detection/features/personal_life_spec.rb +22 -0
- data/spec/vandalism_detection/features/pronoun_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/pronoun_impact_spec.rb +53 -0
- data/spec/vandalism_detection/features/removed_all_wordlists_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_bad_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_biased_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_character_distribution_spec.rb +40 -0
- data/spec/vandalism_detection/features/removed_emoticons_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_markup_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_pronoun_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_sex_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_size_spec.rb +29 -0
- data/spec/vandalism_detection/features/removed_vulgarism_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/removed_words_spec.rb +29 -0
- data/spec/vandalism_detection/features/replacement_similarity_spec.rb +37 -0
- data/spec/vandalism_detection/features/reverted_spec.rb +24 -0
- data/spec/vandalism_detection/features/revisions_character_distribution_spec.rb +40 -0
- data/spec/vandalism_detection/features/same_editor_spec.rb +71 -0
- data/spec/vandalism_detection/features/sex_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/sex_impact_spec.rb +53 -0
- data/spec/vandalism_detection/features/size_increment_spec.rb +29 -0
- data/spec/vandalism_detection/features/size_ratio_spec.rb +48 -0
- data/spec/vandalism_detection/features/term_frequency_spec.rb +32 -0
- data/spec/vandalism_detection/features/time_interval_spec.rb +56 -0
- data/spec/vandalism_detection/features/time_of_day_spec.rb +16 -0
- data/spec/vandalism_detection/features/upper_case_ratio_spec.rb +30 -0
- data/spec/vandalism_detection/features/upper_case_words_ratio_spec.rb +33 -0
- data/spec/vandalism_detection/features/upper_to_lower_case_ratio_spec.rb +30 -0
- data/spec/vandalism_detection/features/vulgarism_frequency_spec.rb +30 -0
- data/spec/vandalism_detection/features/vulgarism_impact_spec.rb +52 -0
- data/spec/vandalism_detection/features/weekday_spec.rb +16 -0
- data/spec/vandalism_detection/features/words_increment_spec.rb +29 -0
- data/spec/vandalism_detection/instances_spec.rb +146 -0
- data/spec/vandalism_detection/page_parser_spec.rb +190 -0
- data/spec/vandalism_detection/page_spec.rb +134 -0
- data/spec/vandalism_detection/revision_parser_spec.rb +53 -0
- data/spec/vandalism_detection/revision_spec.rb +148 -0
- data/spec/vandalism_detection/test_dataset_spec.rb +227 -0
- data/spec/vandalism_detection/text_spec.rb +29 -0
- data/spec/vandalism_detection/training_dataset_spec.rb +266 -0
- data/spec/vandalism_detection/wikitext_extractor_spec.rb +97 -0
- data/spec/weka/classifiers/meta/one_class_classifier_spec.rb +82 -0
- data/spec/weka/classifiers/meta/real_ada_boost_spec.rb +37 -0
- data/spec/weka/classifiers/trees/balanced_random_forest_spec.rb +37 -0
- data/spec/weka/filters/supervised/instance/smote_spec.rb +5 -0
- data/wikipedia-vandalism_detection.gemspec +37 -0
- metadata +550 -0
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/features/base'
|
|
2
|
+
|
|
3
|
+
module Wikipedia
|
|
4
|
+
module VandalismDetection
|
|
5
|
+
module Features
|
|
6
|
+
# This feature computes the digit to all letters ratio of the edit's new
|
|
7
|
+
# revision inserted text.
|
|
8
|
+
class DigitRatio < Base
|
|
9
|
+
# Returns the smoothed ratio of digit characters to alphanumeric characters
# in the edit's inserted text, or 0.0 when the inserted text is empty.
def calculate(edit)
  super

  inserted = edit.inserted_text

  if inserted.empty?
    0.0
  else
    digits = inserted.scan(/[[:digit:]]/).size
    alphanumerics = inserted.scan(/[[:alnum:]]/).size

    # One is added to numerator and denominator, so the division is always
    # defined even when the text holds no alphanumeric character.
    (1.0 + digits) / (1.0 + alphanumerics)
  end
end
|
|
20
|
+
end
|
|
21
|
+
end
|
|
22
|
+
end
|
|
23
|
+
end
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/features/base'
|
|
2
|
+
require 'open-uri'
|
|
3
|
+
require 'nokogiri'
|
|
4
|
+
require 'date'
|
|
5
|
+
|
|
6
|
+
module Wikipedia
|
|
7
|
+
module VandalismDetection
|
|
8
|
+
module Features
|
|
9
|
+
# This feature calculates the number of submitted edits by the same editor
|
|
10
|
+
# (IP or ID) as the edit's editor.
|
|
11
|
+
class EditsPerUser < Base
|
|
12
|
+
# Returns the number of edits the editor of the edit's new revision made in
# the same article.
#
# When the edit carries a page that has an id, the count is taken from that
# page's edits. Otherwise it is looked up through an API request, which is
# pretty time consuming (~2sec) due to the url request.
def calculate(edit)
  super

  revision = edit.new_revision
  page = edit.page

  return edits_count_from_api_request(revision) unless page && page.id

  edits_count_from_page(edit)
end
|
|
27
|
+
|
|
28
|
+
protected
|
|
29
|
+
|
|
30
|
+
# Counts the edits of the edit's page whose new revision has the same
# contributor as the edit's new revision and a timestamp lying before that
# revision's timestamp (negative time_diff).
def edits_count_from_page(edit)
  edit_revision = edit.new_revision

  edit.page.edits.count do |page_edit|
    page_revision = page_edit.new_revision

    # The timestamps are only compared for edits of the same contributor.
    page_revision.contributor == edit_revision.contributor &&
      time_diff(page_revision.timestamp, edit_revision.timestamp) < 0
  end
end
|
|
43
|
+
|
|
44
|
+
# Counts the contributor's earlier edits on the revision's page using the
# 'usercontribs' list returned by Wikipedia.api_request.
#
# Returns 0 when the response holds no item whose revid equals the
# revision's id. Otherwise the page id of that item is read, and every item
# with the same page id and a timestamp before the revision's timestamp is
# counted.
def edits_count_from_api_request(revision)
  params = {
    list: 'usercontribs',
    ucuser: revision.contributor,
    ucprop: 'ids|timestamp'
  }

  xml = Wikipedia.api_request(params)

  page_item = xml.xpath("//item[@revid='#{revision.id}']").first
  return 0 unless page_item

  page_id = page_item.xpath('@pageid').text

  # count only edits before current
  xml.xpath("//item[@pageid='#{page_id}']").count do |item|
    time_diff(item.attr('timestamp'), revision.timestamp) < 0
  end
end
|
|
65
|
+
|
|
66
|
+
# Returns the difference time1 - time2 in whole seconds (truncated to an
# integer). Both arguments are parsed with DateTime.parse; the result is
# negative when time1 lies before time2.
def time_diff(time1, time2)
  earlier_or_later = DateTime.parse(time1)
  reference = DateTime.parse(time2)

  # Subtracting DateTimes yields a number of days; scale it to seconds.
  days_apart = earlier_or_later - reference
  (days_apart * 24 * 60 * 60).to_i
end
|
|
69
|
+
end
|
|
70
|
+
end
|
|
71
|
+
end
|
|
72
|
+
end
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/features/base'
|
|
2
|
+
require 'wikipedia/vandalism_detection/word_lists/emoticons'
|
|
3
|
+
|
|
4
|
+
module Wikipedia
|
|
5
|
+
module VandalismDetection
|
|
6
|
+
module Features
|
|
7
|
+
# This feature computes frequency of emoticon words in the inserted text.
|
|
8
|
+
class EmoticonsFrequency < Base
|
|
9
|
+
EMOJI_REGEX = /(^|\s)(#{WordLists::EMOTICONS.join('|')})(?=\s|$|\Z|[\.,!?]\s|[\.!?]\Z)/
|
|
10
|
+
|
|
11
|
+
# Returns the number of emoticon matches in the inserted text divided by the
# number of its whitespace-separated tokens.
# Returns 0.0 if the inserted text contains no tokens.
def calculate(edit)
  super

  text = edit.inserted_text

  # Every scan hit contributes its captured groups; only captures of at
  # least two characters are counted.
  captures = text.scan(EMOJI_REGEX).flatten
  emoticons = captures.count { |capture| capture.size >= 2 }

  tokens = text.split.count
  return 0.0 unless tokens > 0

  emoticons.to_f / tokens.to_f
end
|
|
24
|
+
end
|
|
25
|
+
end
|
|
26
|
+
end
|
|
27
|
+
end
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/features/base'
|
|
2
|
+
require 'wikipedia/vandalism_detection/word_lists/emoticons'
|
|
3
|
+
require 'wikipedia/vandalism_detection/text'
|
|
4
|
+
|
|
5
|
+
module Wikipedia
|
|
6
|
+
module VandalismDetection
|
|
7
|
+
module Features
|
|
8
|
+
# This feature computes impact of emoticons words in the inserted text.
|
|
9
|
+
class EmoticonsImpact < Base
|
|
10
|
+
EMOJI_REGEX = /(^|\s)(#{WordLists::EMOTICONS.join('|')})(?=\s|$|\Z|[\.,!?]\s|[\.!?]\Z)/
|
|
11
|
+
|
|
12
|
+
# Returns old_count / (old_count + new_count), where the counts are the
# emoticon matches in the old and the new revision's text.
# Returns 0.5 if neither text contains an emoticon match.
def calculate(edit)
  super

  old_text = edit.old_revision.text
  new_text = edit.new_revision.text

  # Every scan hit contributes its captured groups; only captures of at
  # least two characters are counted.
  emoticon_total = lambda do |text|
    text.scan(EMOJI_REGEX).flatten.count { |capture| capture.size >= 2 }.to_f
  end

  before = emoticon_total.call(old_text)
  after = emoticon_total.call(new_text)

  if before.zero? && after.zero?
    0.5
  else
    before / (before + after)
  end
end
|
|
26
|
+
end
|
|
27
|
+
end
|
|
28
|
+
end
|
|
29
|
+
end
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/features/base'
|
|
2
|
+
|
|
3
|
+
module Wikipedia
|
|
4
|
+
module VandalismDetection
|
|
5
|
+
module Features
|
|
6
|
+
class FrequencyBase < Base
|
|
7
|
+
# Returns the number of occurrences of the given terms in the text (as
# reported by count) divided by the number of whitespace-separated tokens
# of the text.
# Returns 0.0 if the text contains no tokens.
def frequency(text, terms)
  tokens_total = text.split.count
  matches = count(terms, in: text)

  return 0.0 unless tokens_total > 0

  matches.to_f / tokens_total.to_f
end
|
|
15
|
+
end
|
|
16
|
+
end
|
|
17
|
+
end
|
|
18
|
+
end
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/features/base'
|
|
2
|
+
|
|
3
|
+
module Wikipedia
|
|
4
|
+
module VandalismDetection
|
|
5
|
+
module Features
|
|
6
|
+
class ImpactBase < Base
|
|
7
|
+
# Returns old_count / (old_count + new_count), where the counts are the
# occurrences of the given terms in the old and the new text (as reported
# by count).
# Returns 0.5 if the terms occur in neither the old nor the new text.
def impact(old_text, new_text, terms)
  before = count(terms, in: old_text).to_f
  after = count(terms, in: new_text).to_f

  return 0.5 if before.zero? && after.zero?

  before / (before + after)
end
|
|
22
|
+
end
|
|
23
|
+
end
|
|
24
|
+
end
|
|
25
|
+
end
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/features/base'
|
|
2
|
+
require 'wikipedia/vandalism_detection/algorithms'
|
|
3
|
+
|
|
4
|
+
module Wikipedia
|
|
5
|
+
module VandalismDetection
|
|
6
|
+
module Features
|
|
7
|
+
# This feature computes the Kullback-Leibler Divergence of the inserted
|
|
8
|
+
# text's character distribution
|
|
9
|
+
# relative to the character distribution of the old revision's text.
|
|
10
|
+
# The smaller the divergence, the higher the similarity of the
|
|
11
|
+
# distributions and conversely.
|
|
12
|
+
class InsertedCharacterDistribution < Base
|
|
13
|
+
include Algorithms
|
|
14
|
+
|
|
15
|
+
# Returns the result of kullback_leibler_divergence for the old revision's
# text and the edit's inserted text (passed in that order).
def calculate(edit)
  super

  reference_text = edit.old_revision.text
  inserted = edit.inserted_text

  kullback_leibler_divergence(reference_text, inserted)
end
|
|
20
|
+
end
|
|
21
|
+
end
|
|
22
|
+
end
|
|
23
|
+
end
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/features/base'
|
|
2
|
+
|
|
3
|
+
module Wikipedia
|
|
4
|
+
module VandalismDetection
|
|
5
|
+
module Features
|
|
6
|
+
# This feature computes the number of inserted external links of format
|
|
7
|
+
# [http://example.com].
|
|
8
|
+
class InsertedExternalLinks < Base
|
|
9
|
+
URL_REGEX = %r{\[?(https?|ftp)\s?:\s?\/\/[^\s\/$.?#].[^\s]*]?}i
|
|
10
|
+
|
|
11
|
+
# Returns the number of URL_REGEX matches in the edit's inserted text.
def calculate(edit)
  super

  link_matches = edit.inserted_text.scan(URL_REGEX)
  link_matches.count
end
|
|
16
|
+
end
|
|
17
|
+
end
|
|
18
|
+
end
|
|
19
|
+
end
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/features/base'
|
|
2
|
+
|
|
3
|
+
module Wikipedia
|
|
4
|
+
module VandalismDetection
|
|
5
|
+
module Features
|
|
6
|
+
# This feature computes the number of inserted internal links [[link]].
|
|
7
|
+
class InsertedInternalLinks < Base
|
|
8
|
+
INTERNAL_LINK_REGEX = /\[{2}([^\[].*?)\]{2}/
|
|
9
|
+
|
|
10
|
+
# Returns the number of INTERNAL_LINK_REGEX matches in the edit's inserted
# text.
def calculate(edit)
  super

  link_matches = edit.inserted_text.scan(INTERNAL_LINK_REGEX)
  link_matches.count
end
|
|
15
|
+
end
|
|
16
|
+
end
|
|
17
|
+
end
|
|
18
|
+
end
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/features/base'
|
|
2
|
+
|
|
3
|
+
module Wikipedia
|
|
4
|
+
module VandalismDetection
|
|
5
|
+
module Features
|
|
6
|
+
# This feature computes the size of inserted text in the edit's new
|
|
7
|
+
# revision.
|
|
8
|
+
class InsertedSize < Base
|
|
9
|
+
# Returns the size of the edit's inserted text, floored at zero.
def calculate(edit)
  super

  inserted_size = edit.inserted_text.size

  # Defensive floor: a negative size is reported as 0.
  return 0 if inserted_size < 0

  inserted_size
end
|
|
16
|
+
end
|
|
17
|
+
end
|
|
18
|
+
end
|
|
19
|
+
end
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/features/base'
|
|
2
|
+
|
|
3
|
+
module Wikipedia
|
|
4
|
+
module VandalismDetection
|
|
5
|
+
module Features
|
|
6
|
+
# This feature computes the number of inserted words in the edit's new
|
|
7
|
+
# revision.
|
|
8
|
+
class InsertedWords < Base
|
|
9
|
+
# Returns how many elements the edit's inserted_words collection holds.
def calculate(edit)
  super

  words = edit.inserted_words
  words.count
end
|
|
14
|
+
end
|
|
15
|
+
end
|
|
16
|
+
end
|
|
17
|
+
end
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/features/base'
|
|
2
|
+
require 'wikipedia/vandalism_detection/text'
|
|
3
|
+
|
|
4
|
+
module Wikipedia
|
|
5
|
+
module VandalismDetection
|
|
6
|
+
module Features
|
|
7
|
+
# This feature computes the length of the longest word in the inserted
|
|
8
|
+
# text.
|
|
9
|
+
class LongestWord < Base
|
|
10
|
+
# Returns the length of the longest word in the edit's inserted words, or 0
# when there is nothing to measure.
#
# The inserted words are joined with newlines, wrapped in Text, cleaned and
# split into pieces; the largest piece length is returned.
def calculate(edit)
  super

  cleaned = Text.new(edit.inserted_words.join("\n")).clean

  # NOTE(review): inside a character class "\b" is a backspace and "+" a
  # literal plus, so this splits on backspace, whitespace, '+', ',', ';' and
  # ':' one character at a time. Kept as is to preserve the feature values.
  lengths = cleaned.split(/[\b\s+,;:]/).map(&:length)

  # max is nil for an empty list of pieces.
  lengths.max || 0
end
|
|
18
|
+
end
|
|
19
|
+
end
|
|
20
|
+
end
|
|
21
|
+
end
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/features/base'
|
|
2
|
+
require 'wikipedia/vandalism_detection/word_lists/markup'
|
|
3
|
+
|
|
4
|
+
module Wikipedia
|
|
5
|
+
module VandalismDetection
|
|
6
|
+
module Features
|
|
7
|
+
# This feature computes frequency of markup words in the inserted text.
|
|
8
|
+
class MarkupFrequency < Base
|
|
9
|
+
MARKUP_REGEX = /(#{WordLists::MARKUP.join('|')})/
|
|
10
|
+
|
|
11
|
+
# Returns the number of MARKUP_REGEX matches in the inserted text divided by
# the number of inserted words.
# Returns 0.0 if the edit has no inserted words.
def calculate(edit)
  super

  inserted = edit.inserted_text
  words_total = edit.inserted_words.count
  markup_total = inserted.scan(MARKUP_REGEX).count

  return 0.0 unless words_total > 0

  markup_total.to_f / words_total.to_f
end
|
|
26
|
+
end
|
|
27
|
+
end
|
|
28
|
+
end
|
|
29
|
+
end
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/features/base'
|
|
2
|
+
require 'wikipedia/vandalism_detection/word_lists/markup'
|
|
3
|
+
|
|
4
|
+
module Wikipedia
|
|
5
|
+
module VandalismDetection
|
|
6
|
+
module Features
|
|
7
|
+
# This feature computes the percentage by which the edit increases the
|
|
8
|
+
# number of markup words in the text.
|
|
9
|
+
class MarkupImpact < Base
|
|
10
|
+
MARKUP_REGEX = /(#{WordLists::MARKUP.join('|')})/
|
|
11
|
+
|
|
12
|
+
# Returns old_count / (old_count + new_count), where the counts are the
# MARKUP_REGEX matches in the old and the new revision's text.
# Returns 0.5 if neither text contains a match.
def calculate(edit)
  super

  old_text = edit.old_revision.text
  new_text = edit.new_revision.text

  before = old_text.scan(MARKUP_REGEX).count.to_f
  after = new_text.scan(MARKUP_REGEX).count.to_f

  return 0.5 if before.zero? && after.zero?

  before / (before + after)
end
|
|
27
|
+
end
|
|
28
|
+
end
|
|
29
|
+
end
|
|
30
|
+
end
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/features/base'
|
|
2
|
+
|
|
3
|
+
module Wikipedia
|
|
4
|
+
module VandalismDetection
|
|
5
|
+
module Features
|
|
6
|
+
# This feature computes the non-alphanumeric to all letters ratio of the
|
|
7
|
+
# edit's new revision inserted text.
|
|
8
|
+
class NonAlphanumericRatio < Base
|
|
9
|
+
# Returns the smoothed ratio of non-alphanumeric characters to all
# non-whitespace characters of the edit's inserted text, or 0.0 when the
# inserted text is empty.
def calculate(edit)
  super

  text = edit.inserted_text
  return 0.0 if text.empty?

  # The POSIX class [[:alnum:]] also covers letters and digits outside
  # ASCII (e.g. "ä", "é"), so such characters are not miscounted as
  # non-alphanumeric. This matches the character class DigitRatio uses.
  non_alpha_count = text.scan(/[^[:alnum:]\s]/).size
  all_letters_count = text.scan(/[^\s]/).size

  # One is added to numerator and denominator, so the division is always
  # defined even for whitespace-only text.
  (1.0 + non_alpha_count) / (1.0 + all_letters_count)
end
|
|
20
|
+
end
|
|
21
|
+
end
|
|
22
|
+
end
|
|
23
|
+
end
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/features/contains_base'
|
|
2
|
+
|
|
3
|
+
module Wikipedia
|
|
4
|
+
module VandalismDetection
|
|
5
|
+
module Features
|
|
6
|
+
# This feature returns whether the edit's comment includes
|
|
7
|
+
# 'personal life'.
|
|
8
|
+
class PersonalLife < ContainsBase
|
|
9
|
+
# Returns the result of contains for the new revision's comment and the
# term 'personal life'.
def calculate(edit)
  super

  comment = edit.new_revision.comment
  contains(comment, 'personal life')
end
|
|
14
|
+
end
|
|
15
|
+
end
|
|
16
|
+
end
|
|
17
|
+
end
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/features/frequency_base'
|
|
2
|
+
require 'wikipedia/vandalism_detection/word_lists/pronouns'
|
|
3
|
+
require 'wikipedia/vandalism_detection/text'
|
|
4
|
+
|
|
5
|
+
module Wikipedia
|
|
6
|
+
module VandalismDetection
|
|
7
|
+
module Features
|
|
8
|
+
# This feature computes the frequency of pronouns in the inserted text.
|
|
9
|
+
class PronounFrequency < FrequencyBase
|
|
10
|
+
# Returns the frequency of WordLists::PRONOUNS terms in the inserted text.
# The inserted words are joined with newlines, wrapped in Text and cleaned
# before being handed to frequency.
def calculate(edit)
  super

  joined_words = edit.inserted_words.join("\n")
  cleaned = Text.new(joined_words).clean

  frequency(cleaned, WordLists::PRONOUNS)
end
|
|
18
|
+
end
|
|
19
|
+
end
|
|
20
|
+
end
|
|
21
|
+
end
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/features/impact_base'
|
|
2
|
+
require 'wikipedia/vandalism_detection/word_lists/pronouns'
|
|
3
|
+
|
|
4
|
+
module Wikipedia
|
|
5
|
+
module VandalismDetection
|
|
6
|
+
module Features
|
|
7
|
+
# This feature computes the percentage by which the edit increases the
|
|
8
|
+
# number of pronouns in the text.
|
|
9
|
+
class PronounImpact < ImpactBase
  # Cleans the text of the old and of the new revision and returns what
  # the `impact` helper reports for both texts and the pronoun word list.
  def calculate(edit)
    super

    impact(
      edit.old_revision.text.clean,
      edit.new_revision.text.clean,
      WordLists::PRONOUNS
    )
  end
end
|
|
18
|
+
end
|
|
19
|
+
end
|
|
20
|
+
end
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/features/frequency_base'
|
|
2
|
+
require 'wikipedia/vandalism_detection/word_lists'
|
|
3
|
+
require 'wikipedia/vandalism_detection/text'
|
|
4
|
+
|
|
5
|
+
module Wikipedia
|
|
6
|
+
module VandalismDetection
|
|
7
|
+
module Features
|
|
8
|
+
# This feature computes frequency of all wordlists words in the removed
|
|
9
|
+
# text.
|
|
10
|
+
class RemovedAllWordlistsFrequency < FrequencyBase
  # Joins the edit's removed words with newlines, cleans the result via
  # Text#clean and returns what the `frequency` helper reports for the
  # combined word lists (WordLists.all).
  # The helper is expected to yield 0.0 for an empty cleaned text.
  def calculate(edit)
    super

    joined_words = edit.removed_words.join("\n")
    cleaned = Text.new(joined_words).clean

    frequency(cleaned, WordLists.all)
  end
end
|
|
20
|
+
end
|
|
21
|
+
end
|
|
22
|
+
end
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/features/frequency_base'
require 'wikipedia/vandalism_detection/word_lists/bad'
require 'wikipedia/vandalism_detection/word_lists/sex'
require 'wikipedia/vandalism_detection/text'
|
|
4
|
+
|
|
5
|
+
module Wikipedia
|
|
6
|
+
module VandalismDetection
|
|
7
|
+
module Features
|
|
8
|
+
# This feature computes the frequency of bad words in the removed text.
|
|
9
|
+
class RemovedBadFrequency < FrequencyBase
  # Joins the edit's removed words with newlines, cleans the result via
  # Text#clean and returns what the `frequency` helper reports for the
  # bad word list.
  # The helper is expected to yield 0.0 for an empty cleaned text.
  def calculate(edit)
    super

    joined_words = edit.removed_words.join("\n")
    cleaned = Text.new(joined_words).clean

    frequency(cleaned, WordLists::BAD)
  end
end
|
|
19
|
+
end
|
|
20
|
+
end
|
|
21
|
+
end
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/features/frequency_base'
|
|
2
|
+
require 'wikipedia/vandalism_detection/word_lists/biased'
|
|
3
|
+
require 'wikipedia/vandalism_detection/text'
|
|
4
|
+
|
|
5
|
+
module Wikipedia
|
|
6
|
+
module VandalismDetection
|
|
7
|
+
module Features
|
|
8
|
+
# This feature computes frequency of biased words in the removed text.
|
|
9
|
+
class RemovedBiasedFrequency < FrequencyBase
  # Joins the edit's removed words with newlines, cleans the result via
  # Text#clean and returns what the `frequency` helper reports for the
  # biased word list.
  # The helper is expected to yield 0.0 for an empty cleaned text.
  def calculate(edit)
    super

    joined_words = edit.removed_words.join("\n")
    cleaned = Text.new(joined_words).clean

    frequency(cleaned, WordLists::BIASED)
  end
end
|
|
19
|
+
end
|
|
20
|
+
end
|
|
21
|
+
end
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/features/base'
|
|
2
|
+
require 'wikipedia/vandalism_detection/algorithms'
|
|
3
|
+
|
|
4
|
+
module Wikipedia
|
|
5
|
+
module VandalismDetection
|
|
6
|
+
module Features
|
|
7
|
+
# This feature computes the Kullback-Leibler Divergence of the removed
|
|
8
|
+
# text's character distribution relative to the character distribution
|
|
9
|
+
# of the new revision's text.
|
|
10
|
+
# The smaller the divergence, the higher the similarity of the
|
|
11
|
+
# distributions and conversely.
|
|
12
|
+
class RemovedCharacterDistribution < Base
  include Algorithms

  # Passes the new revision's text and the edit's removed text, in that
  # order, to `kullback_leibler_divergence` and returns its result.
  def calculate(edit)
    super

    reference_text = edit.new_revision.text
    removed_text = edit.removed_text

    kullback_leibler_divergence(reference_text, removed_text)
  end
end
|
|
21
|
+
end
|
|
22
|
+
end
|
|
23
|
+
end
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/features/base'
|
|
2
|
+
require 'wikipedia/vandalism_detection/word_lists/emoticons'
|
|
3
|
+
|
|
4
|
+
module Wikipedia
|
|
5
|
+
module VandalismDetection
|
|
6
|
+
module Features
|
|
7
|
+
# This feature computes the frequency of emoticon words in the removed
|
|
8
|
+
# text.
|
|
9
|
+
class RemovedEmoticonsFrequency < Base
  # Matches an entry of WordLists::EMOTICONS that starts at a line start
  # or after whitespace and is followed by whitespace, a line or string
  # end, or punctuation in front of whitespace or the string end.
  # Compiled once when the class is loaded instead of on every call.
  EMOTICONS_REGEX =
    /(^|\s)(#{WordLists::EMOTICONS.join('|')})(?=\s|$|\Z|[\.,!?]\s|[\.!?]\Z)/

  # Returns the ratio of emoticons to whitespace-separated tokens in the
  # removed text.
  # Returns 0.0 if the removed text contains no tokens.
  def calculate(edit)
    super

    removed_text = edit.removed_text

    # Every match yields a [prefix, emoticon] pair. The prefix is empty or
    # a single whitespace character, so the length check filters it out.
    emoticons_count = removed_text.scan(EMOTICONS_REGEX).flatten
                                  .count { |token| token.size >= 2 }
    total_count = removed_text.split.count

    total_count > 0 ? emoticons_count.to_f / total_count : 0.0
  end
end
|
|
26
|
+
end
|
|
27
|
+
end
|
|
28
|
+
end
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/features/base'
|
|
2
|
+
require 'wikipedia/vandalism_detection/word_lists/markup'
|
|
3
|
+
require 'wikipedia/vandalism_detection/text'
|
|
4
|
+
|
|
5
|
+
module Wikipedia
|
|
6
|
+
module VandalismDetection
|
|
7
|
+
module Features
|
|
8
|
+
# This feature computes the frequency of markup words in the removed text.
|
|
9
|
+
class RemovedMarkupFrequency < Base
  MARKUP_REGEX = /(#{WordLists::MARKUP.join('|')})/

  # Returns the number of MARKUP_REGEX matches in the removed text
  # divided by the number of removed words.
  # Returns 0.0 if the edit has no removed words.
  def calculate(edit)
    super

    removed = edit.removed_text
    word_total = edit.removed_words.count
    markup_total = removed.scan(MARKUP_REGEX).count

    return 0.0 unless word_total > 0

    markup_total.to_f / word_total.to_f
  end
end
|
|
28
|
+
end
|
|
29
|
+
end
|
|
30
|
+
end
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/features/frequency_base'
|
|
2
|
+
require 'wikipedia/vandalism_detection/word_lists/pronouns'
|
|
3
|
+
require 'wikipedia/vandalism_detection/text'
|
|
4
|
+
|
|
5
|
+
module Wikipedia
|
|
6
|
+
module VandalismDetection
|
|
7
|
+
module Features
|
|
8
|
+
# This feature computes the frequency of pronouns in the removed text.
|
|
9
|
+
class RemovedPronounFrequency < FrequencyBase
  # Joins the edit's removed words with newlines, cleans the result via
  # Text#clean and returns what the `frequency` helper reports for the
  # pronoun word list.
  # The helper is expected to yield 0.0 for an empty cleaned text.
  def calculate(edit)
    super

    joined_words = edit.removed_words.join("\n")
    cleaned = Text.new(joined_words).clean

    frequency(cleaned, WordLists::PRONOUNS)
  end
end
|
|
19
|
+
end
|
|
20
|
+
end
|
|
21
|
+
end
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
require 'wikipedia/vandalism_detection/features/frequency_base'
|
|
2
|
+
require 'wikipedia/vandalism_detection/word_lists/sex'
|
|
3
|
+
require 'wikipedia/vandalism_detection/text'
|
|
4
|
+
|
|
5
|
+
module Wikipedia
|
|
6
|
+
module VandalismDetection
|
|
7
|
+
module Features
|
|
8
|
+
# This feature computes the frequency of sex words in the removed text.
|
|
9
|
+
class RemovedSexFrequency < FrequencyBase
  # Joins the edit's removed words with newlines, cleans the result via
  # Text#clean and returns what the `frequency` helper reports for the
  # sex word list.
  # The helper is expected to yield 0.0 for an empty cleaned text.
  def calculate(edit)
    super

    joined_words = edit.removed_words.join("\n")
    cleaned = Text.new(joined_words).clean

    frequency(cleaned, WordLists::SEX)
  end
end
|
|
19
|
+
end
|
|
20
|
+
end
|
|
21
|
+
end
|