despamilator 2.0.1 → 2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. data/History.txt +7 -0
  2. data/lib/despamilator.rb +38 -7
  3. data/lib/despamilator/filter.rb +39 -23
  4. data/lib/despamilator/filter/gtubs_test_filter.rb +4 -4
  5. data/lib/despamilator/filter/html_tags.rb +9 -7
  6. data/lib/despamilator/filter/ip_address_url.rb +6 -4
  7. data/lib/despamilator/filter/long_words.rb +7 -5
  8. data/lib/despamilator/filter/mixed_case.rb +21 -0
  9. data/lib/despamilator/filter/naughty_words.rb +5 -5
  10. data/lib/despamilator/filter/numbers_and_words.rb +19 -11
  11. data/lib/despamilator/filter/obfuscated_urls.rb +41 -0
  12. data/lib/despamilator/filter/prices.rb +19 -0
  13. data/lib/despamilator/filter/script_tag.rb +4 -4
  14. data/lib/despamilator/filter/shouting.rb +9 -6
  15. data/lib/despamilator/filter/spammy_tlds.rb +22 -0
  16. data/lib/despamilator/filter/square_brackets.rb +5 -5
  17. data/lib/despamilator/filter/trailing_number.rb +4 -4
  18. data/lib/despamilator/filter/unusual_characters.rb +5 -5
  19. data/lib/despamilator/filter/urls.rb +7 -9
  20. data/lib/despamilator/filter/very_long_domain_name.rb +27 -0
  21. data/lib/despamilator/filter/weird_punctuation.rb +44 -0
  22. data/lib/despamilator/subject.rb +30 -0
  23. data/lib/despamilator/subject/text.rb +32 -0
  24. data/lib/despamilator/version.rb +3 -0
  25. metadata +29 -75
  26. data/.rspec +0 -2
  27. data/.rvmrc +0 -1
  28. data/Gemfile +0 -12
  29. data/Gemfile.lock +0 -47
  30. data/Manifest.txt +0 -46
  31. data/PostInstall.txt +0 -1
  32. data/Rakefile +0 -39
  33. data/conf/unusual_characters.txt +0 -6674
  34. data/despamilator.gemspec +0 -38
  35. data/lib/despamilator/filter_base.rb +0 -82
  36. data/scripts/despamilator_score.rb +0 -25
  37. data/scripts/from_file.rb +0 -26
  38. data/spec/despamilator_spec.rb +0 -13
  39. data/spec/filter_base_spec.rb +0 -30
  40. data/spec/filters/gtubs_test_filter_spec.rb +0 -9
  41. data/spec/filters/html_tags_spec.rb +0 -129
  42. data/spec/filters/ip_address_url_spec.rb +0 -11
  43. data/spec/filters/long_words_spec.rb +0 -11
  44. data/spec/filters/naughty_words_spec.rb +0 -11
  45. data/spec/filters/numbers_and_words_spec.rb +0 -34
  46. data/spec/filters/script_tag_spec.rb +0 -22
  47. data/spec/filters/shouting_spec.rb +0 -45
  48. data/spec/filters/square_brackets_spec.rb +0 -11
  49. data/spec/filters/trailing_number_spec.rb +0 -10
  50. data/spec/filters/unusual_characters_spec.rb +0 -9
  51. data/spec/filters/urls_spec.rb +0 -11
  52. data/spec/helpers/corpus_helper.rb +0 -5
  53. data/spec/helpers/filter_helper.rb +0 -59
  54. data/spec/helpers/spec_helper.rb +0 -6
  55. data/tasks/test.rake +0 -6
@@ -1,45 +0,0 @@
1
- describe DespamilatorFilter::Shouting do
2
-
3
- the_name_should_be 'Shouting'
4
- the_description_should_be 'Detects and scores shouting (all caps)'
5
-
6
- despamilator_should_apply_the_filter_for('this lil string is 50 PERCENT SHOUTING')
7
-
8
- a_single_match_of('this lil string is 50 PERCENT SHOUTING', should_score: 0.25)
9
- a_multiple_match_of('HELLO THERE!! THIS IS SHOUTING!!', should_score: [0.5, 1.times])
10
-
11
- describe "exceptions" do
12
-
13
- before :all do
14
- @filter = DespamilatorFilter::Shouting.new
15
- end
16
-
17
- it "should strip out HTML" do
18
- @filter.parse('<H1>this is a flipping html tag whose contents is very long</h1>')
19
- @filter.score.should == 0
20
- end
21
-
22
- it "should ignore strings less than 20 characters long" do
23
- @filter.parse('ABCD EFG HIJKLM NOP')
24
- @filter.score.should == 0
25
- end
26
-
27
- end
28
-
29
- [
30
- ['this is a lowercased string', 0],
31
- ['This is a String with Capital Letters', 0],
32
- ['this lil string is 50 PERCENT SHOUTING', 0.25],
33
- ['THIS LIL STRING IS 100 PERCENT SHOUTING', 0.5]
34
- ].each do |string, expected_score|
35
-
36
- it "should score the string '#{string}' based on a percentage of uppercase words" do
37
- filter = DespamilatorFilter::Shouting.new
38
-
39
- filter.parse(string)
40
- filter.score.should == expected_score
41
- end
42
-
43
- end
44
-
45
- end
@@ -1,11 +0,0 @@
1
- describe DespamilatorFilter::SquareBrackets do
2
-
3
- the_name_should_be 'Square Brackets'
4
- the_description_should_be 'Detects each square bracket in a string'
5
-
6
- despamilator_should_apply_the_filter_for('[')
7
-
8
- a_single_match_of('[', should_score: 0.05)
9
- a_multiple_match_of('[]', should_score: [0.1, 2.times])
10
-
11
- end
@@ -1,10 +0,0 @@
1
- describe DespamilatorFilter::TrailingNumber do
2
-
3
- the_name_should_be 'Trailing Number'
4
- the_description_should_be 'Detects a trailing cache busting number'
5
-
6
- despamilator_should_apply_the_filter_for('hello 123 ')
7
-
8
- a_single_match_of('hello 123', should_score: 0.1)
9
-
10
- end
@@ -1,9 +0,0 @@
1
- describe DespamilatorFilter::UnusualCharacters do
2
- the_name_should_be 'Unusual Characters'
3
- the_description_should_be 'Detects and scores each occurrence of an unusual 2 or 3 character combination'
4
-
5
- despamilator_should_apply_the_filter_for('sx')
6
-
7
- a_single_match_of('sx', should_score: 0.05)
8
- a_multiple_match_of('sxsx', should_score: [0.1, 2.times])
9
- end
@@ -1,11 +0,0 @@
1
- describe DespamilatorFilter::URLs do
2
-
3
- the_name_should_be 'URLs'
4
- the_description_should_be 'Detects each url in a string'
5
-
6
- despamilator_should_apply_the_filter_for('http://www.blah.com')
7
-
8
- a_single_match_of('http://www.blah.com', should_score: 0.4)
9
- a_multiple_match_of('http://www.blah.com http://www.poop.com', should_score: [0.8, 2.times])
10
-
11
- end
@@ -1,5 +0,0 @@
1
- require 'zlib'
2
-
3
- def unzip_file filename
4
- Zlib::GzipReader.open(filename).read
5
- end
@@ -1,59 +0,0 @@
1
- def the_name_should_be expected_name
2
- it "should have a name" do
3
- described_class.new.name.should == expected_name
4
- end
5
- end
6
-
7
- def the_description_should_be expected_description
8
- it "should have a description" do
9
- described_class.new.description.should == expected_description
10
- end
11
- end
12
-
13
- def a_single_match_of string, expectation
14
- describe 'detecting a single match' do
15
-
16
- before :all do
17
- @filter = described_class.new
18
- @filter.parse(string)
19
- end
20
-
21
- it "should only match once" do
22
- @filter.matches.should == 1
23
- end
24
-
25
- it "should have a score" do
26
- @filter.score.should == expectation[:should_score]
27
- end
28
-
29
- end
30
- end
31
-
32
- def a_multiple_match_of string, expectation
33
- describe 'detecting a multiple matches' do
34
-
35
- before :all do
36
- @filter = described_class.new
37
- @filter.parse(string)
38
- end
39
-
40
- it "should match many times" do
41
- @filter.matches.should == expectation[:should_score].last.count
42
- end
43
-
44
- it "should have a score" do
45
- @filter.score.should == expectation[:should_score].first
46
- end
47
-
48
- end
49
- end
50
-
51
- def despamilator_should_apply_the_filter_for string
52
-
53
- it "should be applied during filtering" do
54
- filter_name = described_class.new.name
55
- despamilator = Despamilator.new(string)
56
- despamilator.matched_by.collect { |f| f.name == filter_name }.should_not be_empty
57
- end
58
-
59
- end
@@ -1,6 +0,0 @@
1
- require 'one_hundred_percent_coverage' if ENV['WITH_COVERAGE'].to_i == 1
2
- require File.join(File.dirname(__FILE__), '..', '..', 'lib', 'despamilator')
3
-
4
- Dir.glob(File.join(File.dirname(__FILE__), '*.rb')).each do |file|
5
- require file
6
- end
data/tasks/test.rake DELETED
@@ -1,6 +0,0 @@
1
- ENV['WITH_COVERAGE'] = '1'
2
-
3
- desc "Run the spec tests with coverage"
4
- task :test do
5
- Rake::Task[:spec].invoke
6
- end