despamilator 0.8 → 1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.rspec +3 -0
- data/.rvmrc +1 -0
- data/Gemfile +12 -0
- data/Gemfile.lock +47 -0
- data/History.txt +14 -0
- data/Manifest.txt +9 -605
- data/README.rdoc +37 -37
- data/Rakefile +10 -3
- data/despamilator.gemspec +8 -11
- data/lib/despamilator.rb +26 -1
- data/lib/despamilator/filter.rb +15 -26
- data/lib/despamilator/filter/funky_consonant.rb +25 -15
- data/lib/despamilator/filter/html_tags.rb +122 -111
- data/lib/despamilator/filter/ip_address_url.rb +18 -8
- data/lib/despamilator/filter/long_words.rb +20 -10
- data/lib/despamilator/filter/naughty_q.rb +24 -14
- data/lib/despamilator/filter/naughty_words.rb +25 -16
- data/lib/despamilator/filter/numbers_and_words.rb +39 -29
- data/lib/despamilator/filter/script_tag.rb +18 -10
- data/lib/despamilator/filter/shouting.rb +25 -15
- data/lib/despamilator/filter/square_brackets.rb +19 -9
- data/lib/despamilator/filter/urls.rb +24 -8
- data/lib/despamilator/filter_base.rb +54 -9
- data/spec/despamilator_spec.rb +0 -2
- data/spec/filter_base_spec.rb +30 -0
- data/spec/filters/funky_consonant_spec.rb +6 -36
- data/spec/filters/html_tags_spec.rb +120 -138
- data/spec/filters/ip_address_url_spec.rb +6 -24
- data/spec/filters/long_words_spec.rb +6 -29
- data/spec/filters/naughty_q_spec.rb +6 -34
- data/spec/filters/naughty_words_spec.rb +6 -34
- data/spec/filters/numbers_and_words_spec.rb +21 -46
- data/spec/filters/script_tag_spec.rb +10 -20
- data/spec/filters/shouting_spec.rb +28 -33
- data/spec/filters/square_brackets_spec.rb +6 -30
- data/spec/filters/urls_spec.rb +6 -34
- data/spec/helpers/corpus_helper.rb +5 -0
- data/spec/helpers/filter_helper.rb +59 -0
- data/spec/helpers/spec_helper.rb +6 -0
- data/tasks/test.rake +6 -0
- metadata +19 -611
- data/lib/despamilator/validation.rb +0 -12
- data/spec/clean_corpus/101.txt.gz +0 -0
- data/spec/clean_corpus/103.txt.gz +0 -0
- data/spec/clean_corpus/105.txt.gz +0 -0
- data/spec/clean_corpus/107.txt.gz +0 -0
- data/spec/clean_corpus/109.txt.gz +0 -0
- data/spec/clean_corpus/111.txt.gz +0 -0
- data/spec/clean_corpus/113.txt.gz +0 -0
- data/spec/clean_corpus/115.txt.gz +0 -0
- data/spec/clean_corpus/117.txt.gz +0 -0
- data/spec/clean_corpus/119.txt.gz +0 -0
- data/spec/clean_corpus/121.txt.gz +0 -0
- data/spec/clean_corpus/123.txt.gz +0 -0
- data/spec/clean_corpus/125.txt.gz +0 -0
- data/spec/clean_corpus/127.txt.gz +0 -0
- data/spec/clean_corpus/129.txt.gz +0 -0
- data/spec/clean_corpus/131.txt.gz +0 -0
- data/spec/clean_corpus/133.txt.gz +0 -0
- data/spec/clean_corpus/135.txt.gz +0 -0
- data/spec/clean_corpus/137.txt.gz +0 -0
- data/spec/clean_corpus/139.txt.gz +0 -0
- data/spec/clean_corpus/141.txt.gz +0 -0
- data/spec/clean_corpus/143.txt.gz +0 -0
- data/spec/clean_corpus/145.txt.gz +0 -0
- data/spec/clean_corpus/147.txt.gz +0 -0
- data/spec/clean_corpus/149.txt.gz +0 -0
- data/spec/clean_corpus/151.txt.gz +0 -0
- data/spec/clean_corpus/153.txt.gz +0 -0
- data/spec/clean_corpus/155.txt.gz +0 -0
- data/spec/clean_corpus/157.txt.gz +0 -0
- data/spec/clean_corpus/159.txt.gz +0 -0
- data/spec/clean_corpus/161.txt.gz +0 -0
- data/spec/clean_corpus/163.txt.gz +0 -0
- data/spec/clean_corpus/165.txt.gz +0 -0
- data/spec/clean_corpus/167.txt.gz +0 -0
- data/spec/clean_corpus/169.txt.gz +0 -0
- data/spec/clean_corpus/171.txt.gz +0 -0
- data/spec/clean_corpus/173.txt.gz +0 -0
- data/spec/clean_corpus/175.txt.gz +0 -0
- data/spec/clean_corpus/177.txt.gz +0 -0
- data/spec/clean_corpus/179.txt.gz +0 -0
- data/spec/clean_corpus/18.txt.gz +0 -0
- data/spec/clean_corpus/181.txt.gz +0 -0
- data/spec/clean_corpus/183.txt.gz +0 -0
- data/spec/clean_corpus/185.txt.gz +0 -0
- data/spec/clean_corpus/187.txt.gz +0 -0
- data/spec/clean_corpus/189.txt.gz +0 -0
- data/spec/clean_corpus/191.txt.gz +0 -0
- data/spec/clean_corpus/193.txt.gz +0 -0
- data/spec/clean_corpus/195.txt.gz +0 -0
- data/spec/clean_corpus/197.txt.gz +0 -0
- data/spec/clean_corpus/199.txt.gz +0 -0
- data/spec/clean_corpus/20.txt.gz +0 -0
- data/spec/clean_corpus/201.txt.gz +0 -0
- data/spec/clean_corpus/203.txt.gz +0 -0
- data/spec/clean_corpus/205.txt.gz +0 -0
- data/spec/clean_corpus/207.txt.gz +0 -0
- data/spec/clean_corpus/209.txt.gz +0 -0
- data/spec/clean_corpus/211.txt.gz +0 -0
- data/spec/clean_corpus/213.txt.gz +0 -0
- data/spec/clean_corpus/215.txt.gz +0 -0
- data/spec/clean_corpus/217.txt.gz +0 -0
- data/spec/clean_corpus/219.txt.gz +0 -0
- data/spec/clean_corpus/22.txt.gz +0 -0
- data/spec/clean_corpus/221.txt.gz +0 -0
- data/spec/clean_corpus/223.txt.gz +0 -0
- data/spec/clean_corpus/225.txt.gz +0 -0
- data/spec/clean_corpus/24.txt.gz +0 -0
- data/spec/clean_corpus/26.txt.gz +0 -0
- data/spec/clean_corpus/27.txt.gz +0 -0
- data/spec/clean_corpus/29.txt.gz +0 -0
- data/spec/clean_corpus/31.txt.gz +0 -0
- data/spec/clean_corpus/33.txt.gz +0 -0
- data/spec/clean_corpus/35.txt.gz +0 -0
- data/spec/clean_corpus/37.txt.gz +0 -0
- data/spec/clean_corpus/39.txt.gz +0 -0
- data/spec/clean_corpus/41.txt.gz +0 -0
- data/spec/clean_corpus/43.txt.gz +0 -0
- data/spec/clean_corpus/45.txt.gz +0 -0
- data/spec/clean_corpus/47.txt.gz +0 -0
- data/spec/clean_corpus/49.txt.gz +0 -0
- data/spec/clean_corpus/51.txt.gz +0 -0
- data/spec/clean_corpus/53.txt.gz +0 -0
- data/spec/clean_corpus/55.txt.gz +0 -0
- data/spec/clean_corpus/57.txt.gz +0 -0
- data/spec/clean_corpus/59.txt.gz +0 -0
- data/spec/clean_corpus/61.txt.gz +0 -0
- data/spec/clean_corpus/63.txt.gz +0 -0
- data/spec/clean_corpus/65.txt.gz +0 -0
- data/spec/clean_corpus/67.txt.gz +0 -0
- data/spec/clean_corpus/69.txt.gz +0 -0
- data/spec/clean_corpus/71.txt.gz +0 -0
- data/spec/clean_corpus/73.txt.gz +0 -0
- data/spec/clean_corpus/75.txt.gz +0 -0
- data/spec/clean_corpus/77.txt.gz +0 -0
- data/spec/clean_corpus/79.txt.gz +0 -0
- data/spec/clean_corpus/81.txt.gz +0 -0
- data/spec/clean_corpus/83.txt.gz +0 -0
- data/spec/clean_corpus/85.txt.gz +0 -0
- data/spec/clean_corpus/87.txt.gz +0 -0
- data/spec/clean_corpus/89.txt.gz +0 -0
- data/spec/clean_corpus/91.txt.gz +0 -0
- data/spec/clean_corpus/93.txt.gz +0 -0
- data/spec/clean_corpus/95.txt.gz +0 -0
- data/spec/clean_corpus/97.txt.gz +0 -0
- data/spec/clean_corpus/99.txt.gz +0 -0
- data/spec/clean_corpus_spec.rb +0 -11
- data/spec/despamilator_validation_spec.rb +0 -27
- data/spec/spam_corpus/0.txt.gz +0 -0
- data/spec/spam_corpus/1.txt.gz +0 -0
- data/spec/spam_corpus/10.txt.gz +0 -0
- data/spec/spam_corpus/100.txt.gz +0 -0
- data/spec/spam_corpus/102.txt.gz +0 -0
- data/spec/spam_corpus/104.txt.gz +0 -0
- data/spec/spam_corpus/106.txt.gz +0 -0
- data/spec/spam_corpus/108.txt.gz +0 -0
- data/spec/spam_corpus/11.txt.gz +0 -0
- data/spec/spam_corpus/110.txt.gz +0 -0
- data/spec/spam_corpus/112.txt.gz +0 -0
- data/spec/spam_corpus/114.txt.gz +0 -0
- data/spec/spam_corpus/116.txt.gz +0 -0
- data/spec/spam_corpus/118.txt.gz +0 -0
- data/spec/spam_corpus/12.txt.gz +0 -0
- data/spec/spam_corpus/120.txt.gz +0 -0
- data/spec/spam_corpus/122.txt.gz +0 -0
- data/spec/spam_corpus/124.txt.gz +0 -0
- data/spec/spam_corpus/126.txt.gz +0 -0
- data/spec/spam_corpus/128.txt.gz +0 -0
- data/spec/spam_corpus/13.txt.gz +0 -0
- data/spec/spam_corpus/130.txt.gz +0 -0
- data/spec/spam_corpus/132.txt.gz +0 -0
- data/spec/spam_corpus/134.txt.gz +0 -0
- data/spec/spam_corpus/136.txt.gz +0 -0
- data/spec/spam_corpus/138.txt.gz +0 -0
- data/spec/spam_corpus/14.txt.gz +0 -0
- data/spec/spam_corpus/140.txt.gz +0 -0
- data/spec/spam_corpus/142.txt.gz +0 -0
- data/spec/spam_corpus/144.txt.gz +0 -0
- data/spec/spam_corpus/146.txt.gz +0 -0
- data/spec/spam_corpus/148.txt.gz +0 -0
- data/spec/spam_corpus/15.txt.gz +0 -0
- data/spec/spam_corpus/150.txt.gz +0 -0
- data/spec/spam_corpus/152.txt.gz +0 -0
- data/spec/spam_corpus/154.txt.gz +0 -0
- data/spec/spam_corpus/156.txt.gz +0 -0
- data/spec/spam_corpus/158.txt.gz +0 -0
- data/spec/spam_corpus/16.txt.gz +0 -0
- data/spec/spam_corpus/160.txt.gz +0 -0
- data/spec/spam_corpus/162.txt.gz +0 -0
- data/spec/spam_corpus/164.txt.gz +0 -0
- data/spec/spam_corpus/166.txt.gz +0 -0
- data/spec/spam_corpus/168.txt.gz +0 -0
- data/spec/spam_corpus/170.txt.gz +0 -0
- data/spec/spam_corpus/172.txt.gz +0 -0
- data/spec/spam_corpus/174.txt.gz +0 -0
- data/spec/spam_corpus/176.txt.gz +0 -0
- data/spec/spam_corpus/178.txt.gz +0 -0
- data/spec/spam_corpus/180.txt.gz +0 -0
- data/spec/spam_corpus/182.txt.gz +0 -0
- data/spec/spam_corpus/184.txt.gz +0 -0
- data/spec/spam_corpus/186.txt.gz +0 -0
- data/spec/spam_corpus/188.txt.gz +0 -0
- data/spec/spam_corpus/190.txt.gz +0 -0
- data/spec/spam_corpus/192.txt.gz +0 -0
- data/spec/spam_corpus/194.txt.gz +0 -0
- data/spec/spam_corpus/196.txt.gz +0 -0
- data/spec/spam_corpus/198.txt.gz +0 -0
- data/spec/spam_corpus/2.txt.gz +0 -0
- data/spec/spam_corpus/200.txt.gz +0 -0
- data/spec/spam_corpus/202.txt.gz +0 -0
- data/spec/spam_corpus/204.txt.gz +0 -0
- data/spec/spam_corpus/206.txt.gz +0 -0
- data/spec/spam_corpus/208.txt.gz +0 -0
- data/spec/spam_corpus/210.txt.gz +0 -0
- data/spec/spam_corpus/212.txt.gz +0 -0
- data/spec/spam_corpus/214.txt.gz +0 -0
- data/spec/spam_corpus/216.txt.gz +0 -0
- data/spec/spam_corpus/218.txt.gz +0 -0
- data/spec/spam_corpus/220.txt.gz +0 -0
- data/spec/spam_corpus/222.txt.gz +0 -0
- data/spec/spam_corpus/224.txt.gz +0 -0
- data/spec/spam_corpus/226.txt.gz +0 -0
- data/spec/spam_corpus/228.txt.gz +0 -0
- data/spec/spam_corpus/230.txt.gz +0 -0
- data/spec/spam_corpus/232.txt.gz +0 -0
- data/spec/spam_corpus/234.txt.gz +0 -0
- data/spec/spam_corpus/236.txt.gz +0 -0
- data/spec/spam_corpus/238.txt.gz +0 -0
- data/spec/spam_corpus/240.txt.gz +0 -0
- data/spec/spam_corpus/242.txt.gz +0 -0
- data/spec/spam_corpus/244.txt.gz +0 -0
- data/spec/spam_corpus/246.txt.gz +0 -0
- data/spec/spam_corpus/248.txt.gz +0 -0
- data/spec/spam_corpus/250.txt.gz +0 -0
- data/spec/spam_corpus/252.txt.gz +0 -0
- data/spec/spam_corpus/254.txt.gz +0 -0
- data/spec/spam_corpus/256.txt.gz +0 -0
- data/spec/spam_corpus/258.txt.gz +0 -0
- data/spec/spam_corpus/260.txt.gz +0 -0
- data/spec/spam_corpus/262.txt.gz +0 -0
- data/spec/spam_corpus/264.txt.gz +0 -0
- data/spec/spam_corpus/266.txt.gz +0 -0
- data/spec/spam_corpus/268.txt.gz +0 -0
- data/spec/spam_corpus/270.txt.gz +0 -0
- data/spec/spam_corpus/272.txt.gz +0 -0
- data/spec/spam_corpus/274.txt.gz +0 -0
- data/spec/spam_corpus/276.txt.gz +0 -0
- data/spec/spam_corpus/278.txt.gz +0 -0
- data/spec/spam_corpus/28.txt.gz +0 -0
- data/spec/spam_corpus/280.txt.gz +0 -0
- data/spec/spam_corpus/282.txt.gz +0 -0
- data/spec/spam_corpus/284.txt.gz +0 -0
- data/spec/spam_corpus/286.txt.gz +0 -0
- data/spec/spam_corpus/288.txt.gz +0 -0
- data/spec/spam_corpus/290.txt.gz +0 -0
- data/spec/spam_corpus/292.txt.gz +0 -0
- data/spec/spam_corpus/294.txt.gz +0 -0
- data/spec/spam_corpus/296.txt.gz +0 -0
- data/spec/spam_corpus/298.txt.gz +0 -0
- data/spec/spam_corpus/3.txt.gz +0 -0
- data/spec/spam_corpus/30.txt.gz +0 -0
- data/spec/spam_corpus/300.txt.gz +0 -0
- data/spec/spam_corpus/302.txt.gz +0 -0
- data/spec/spam_corpus/304.txt.gz +0 -0
- data/spec/spam_corpus/306.txt.gz +0 -0
- data/spec/spam_corpus/308.txt.gz +0 -0
- data/spec/spam_corpus/310.txt.gz +0 -0
- data/spec/spam_corpus/312.txt.gz +0 -0
- data/spec/spam_corpus/314.txt.gz +0 -0
- data/spec/spam_corpus/316.txt.gz +0 -0
- data/spec/spam_corpus/318.txt.gz +0 -0
- data/spec/spam_corpus/32.txt.gz +0 -0
- data/spec/spam_corpus/320.txt.gz +0 -0
- data/spec/spam_corpus/322.txt.gz +0 -0
- data/spec/spam_corpus/324.txt.gz +0 -0
- data/spec/spam_corpus/326.txt.gz +0 -0
- data/spec/spam_corpus/328.txt.gz +0 -0
- data/spec/spam_corpus/330.txt.gz +0 -0
- data/spec/spam_corpus/332.txt.gz +0 -0
- data/spec/spam_corpus/334.txt.gz +0 -0
- data/spec/spam_corpus/336.txt.gz +0 -0
- data/spec/spam_corpus/338.txt.gz +0 -0
- data/spec/spam_corpus/34.txt.gz +0 -0
- data/spec/spam_corpus/340.txt.gz +0 -0
- data/spec/spam_corpus/342.txt.gz +0 -0
- data/spec/spam_corpus/344.txt.gz +0 -0
- data/spec/spam_corpus/346.txt.gz +0 -0
- data/spec/spam_corpus/348.txt.gz +0 -0
- data/spec/spam_corpus/350.txt.gz +0 -0
- data/spec/spam_corpus/352.txt.gz +0 -0
- data/spec/spam_corpus/354.txt.gz +0 -0
- data/spec/spam_corpus/356.txt.gz +0 -0
- data/spec/spam_corpus/358.txt.gz +0 -0
- data/spec/spam_corpus/36.txt.gz +0 -0
- data/spec/spam_corpus/360.txt.gz +0 -0
- data/spec/spam_corpus/362.txt.gz +0 -0
- data/spec/spam_corpus/364.txt.gz +0 -0
- data/spec/spam_corpus/366.txt.gz +0 -0
- data/spec/spam_corpus/368.txt.gz +0 -0
- data/spec/spam_corpus/370.txt.gz +0 -0
- data/spec/spam_corpus/372.txt.gz +0 -0
- data/spec/spam_corpus/374.txt.gz +0 -0
- data/spec/spam_corpus/376.txt.gz +0 -0
- data/spec/spam_corpus/378.txt.gz +0 -0
- data/spec/spam_corpus/38.txt.gz +0 -0
- data/spec/spam_corpus/380.txt.gz +0 -0
- data/spec/spam_corpus/382.txt.gz +0 -0
- data/spec/spam_corpus/384.txt.gz +0 -0
- data/spec/spam_corpus/386.txt.gz +0 -0
- data/spec/spam_corpus/388.txt.gz +0 -0
- data/spec/spam_corpus/390.txt.gz +0 -0
- data/spec/spam_corpus/392.txt.gz +0 -0
- data/spec/spam_corpus/394.txt.gz +0 -0
- data/spec/spam_corpus/396.txt.gz +0 -0
- data/spec/spam_corpus/398.txt.gz +0 -0
- data/spec/spam_corpus/4.txt.gz +0 -0
- data/spec/spam_corpus/40.txt.gz +0 -0
- data/spec/spam_corpus/400.txt.gz +0 -0
- data/spec/spam_corpus/402.txt.gz +0 -0
- data/spec/spam_corpus/404.txt.gz +0 -0
- data/spec/spam_corpus/406.txt.gz +0 -0
- data/spec/spam_corpus/408.txt.gz +0 -0
- data/spec/spam_corpus/410.txt.gz +0 -0
- data/spec/spam_corpus/412.txt.gz +0 -0
- data/spec/spam_corpus/414.txt.gz +0 -0
- data/spec/spam_corpus/416.txt.gz +0 -0
- data/spec/spam_corpus/418.txt.gz +0 -0
- data/spec/spam_corpus/42.txt.gz +0 -0
- data/spec/spam_corpus/420.txt.gz +0 -0
- data/spec/spam_corpus/422.txt.gz +0 -0
- data/spec/spam_corpus/424.txt.gz +0 -0
- data/spec/spam_corpus/426.txt.gz +0 -0
- data/spec/spam_corpus/428.txt.gz +0 -0
- data/spec/spam_corpus/430.txt.gz +0 -0
- data/spec/spam_corpus/432.txt.gz +0 -0
- data/spec/spam_corpus/434.txt.gz +0 -0
- data/spec/spam_corpus/436.txt.gz +0 -0
- data/spec/spam_corpus/438.txt.gz +0 -0
- data/spec/spam_corpus/44.txt.gz +0 -0
- data/spec/spam_corpus/440.txt.gz +0 -0
- data/spec/spam_corpus/442.txt.gz +0 -0
- data/spec/spam_corpus/444.txt.gz +0 -0
- data/spec/spam_corpus/446.txt.gz +0 -0
- data/spec/spam_corpus/448.txt.gz +0 -0
- data/spec/spam_corpus/450.txt.gz +0 -0
- data/spec/spam_corpus/452.txt.gz +0 -0
- data/spec/spam_corpus/454.txt.gz +0 -0
- data/spec/spam_corpus/456.txt.gz +0 -0
- data/spec/spam_corpus/458.txt.gz +0 -0
- data/spec/spam_corpus/46.txt.gz +0 -0
- data/spec/spam_corpus/460.txt.gz +0 -0
- data/spec/spam_corpus/462.txt.gz +0 -0
- data/spec/spam_corpus/464.txt.gz +0 -0
- data/spec/spam_corpus/466.txt.gz +0 -0
- data/spec/spam_corpus/468.txt.gz +0 -0
- data/spec/spam_corpus/470.txt.gz +0 -0
- data/spec/spam_corpus/472.txt.gz +0 -0
- data/spec/spam_corpus/474.txt.gz +0 -0
- data/spec/spam_corpus/476.txt.gz +0 -0
- data/spec/spam_corpus/478.txt.gz +0 -0
- data/spec/spam_corpus/48.txt.gz +0 -0
- data/spec/spam_corpus/480.txt.gz +0 -0
- data/spec/spam_corpus/482.txt.gz +0 -0
- data/spec/spam_corpus/484.txt.gz +0 -0
- data/spec/spam_corpus/486.txt.gz +0 -0
- data/spec/spam_corpus/488.txt.gz +0 -0
- data/spec/spam_corpus/490.txt.gz +0 -0
- data/spec/spam_corpus/492.txt.gz +0 -0
- data/spec/spam_corpus/494.txt.gz +0 -0
- data/spec/spam_corpus/496.txt.gz +0 -0
- data/spec/spam_corpus/498.txt.gz +0 -0
- data/spec/spam_corpus/5.txt.gz +0 -0
- data/spec/spam_corpus/50.txt.gz +0 -0
- data/spec/spam_corpus/500.txt.gz +0 -0
- data/spec/spam_corpus/502.txt.gz +0 -0
- data/spec/spam_corpus/504.txt.gz +0 -0
- data/spec/spam_corpus/506.txt.gz +0 -0
- data/spec/spam_corpus/508.txt.gz +0 -0
- data/spec/spam_corpus/510.txt.gz +0 -0
- data/spec/spam_corpus/512.txt.gz +0 -0
- data/spec/spam_corpus/514.txt.gz +0 -0
- data/spec/spam_corpus/516.txt.gz +0 -0
- data/spec/spam_corpus/518.txt.gz +0 -0
- data/spec/spam_corpus/52.txt.gz +0 -0
- data/spec/spam_corpus/520.txt.gz +0 -0
- data/spec/spam_corpus/522.txt.gz +0 -0
- data/spec/spam_corpus/524.txt.gz +0 -0
- data/spec/spam_corpus/526.txt.gz +0 -0
- data/spec/spam_corpus/528.txt.gz +0 -0
- data/spec/spam_corpus/530.txt.gz +0 -0
- data/spec/spam_corpus/532.txt.gz +0 -0
- data/spec/spam_corpus/534.txt.gz +0 -0
- data/spec/spam_corpus/536.txt.gz +0 -0
- data/spec/spam_corpus/538.txt.gz +0 -0
- data/spec/spam_corpus/54.txt.gz +0 -0
- data/spec/spam_corpus/540.txt.gz +0 -0
- data/spec/spam_corpus/542.txt.gz +0 -0
- data/spec/spam_corpus/544.txt.gz +0 -0
- data/spec/spam_corpus/546.txt.gz +0 -0
- data/spec/spam_corpus/548.txt.gz +0 -0
- data/spec/spam_corpus/550.txt.gz +0 -0
- data/spec/spam_corpus/552.txt.gz +0 -0
- data/spec/spam_corpus/554.txt.gz +0 -0
- data/spec/spam_corpus/556.txt.gz +0 -0
- data/spec/spam_corpus/558.txt.gz +0 -0
- data/spec/spam_corpus/56.txt.gz +0 -0
- data/spec/spam_corpus/560.txt.gz +0 -0
- data/spec/spam_corpus/562.txt.gz +0 -0
- data/spec/spam_corpus/564.txt.gz +0 -0
- data/spec/spam_corpus/566.txt.gz +0 -0
- data/spec/spam_corpus/568.txt.gz +0 -0
- data/spec/spam_corpus/570.txt.gz +0 -0
- data/spec/spam_corpus/572.txt.gz +0 -0
- data/spec/spam_corpus/574.txt.gz +0 -0
- data/spec/spam_corpus/576.txt.gz +0 -0
- data/spec/spam_corpus/578.txt.gz +0 -0
- data/spec/spam_corpus/58.txt.gz +0 -0
- data/spec/spam_corpus/580.txt.gz +0 -0
- data/spec/spam_corpus/582.txt.gz +0 -0
- data/spec/spam_corpus/584.txt.gz +0 -0
- data/spec/spam_corpus/586.txt.gz +0 -0
- data/spec/spam_corpus/588.txt.gz +0 -0
- data/spec/spam_corpus/590.txt.gz +0 -0
- data/spec/spam_corpus/592.txt.gz +0 -0
- data/spec/spam_corpus/594.txt.gz +0 -0
- data/spec/spam_corpus/596.txt.gz +0 -0
- data/spec/spam_corpus/598.txt.gz +0 -0
- data/spec/spam_corpus/6.txt.gz +0 -0
- data/spec/spam_corpus/60.txt.gz +0 -0
- data/spec/spam_corpus/600.txt.gz +0 -0
- data/spec/spam_corpus/602.txt.gz +0 -0
- data/spec/spam_corpus/604.txt.gz +0 -0
- data/spec/spam_corpus/606.txt.gz +0 -0
- data/spec/spam_corpus/608.txt.gz +0 -0
- data/spec/spam_corpus/610.txt.gz +0 -0
- data/spec/spam_corpus/612.txt.gz +0 -0
- data/spec/spam_corpus/614.txt.gz +0 -0
- data/spec/spam_corpus/616.txt.gz +0 -0
- data/spec/spam_corpus/618.txt.gz +0 -0
- data/spec/spam_corpus/62.txt.gz +0 -0
- data/spec/spam_corpus/620.txt.gz +0 -0
- data/spec/spam_corpus/622.txt.gz +0 -0
- data/spec/spam_corpus/624.txt.gz +0 -0
- data/spec/spam_corpus/626.txt.gz +0 -0
- data/spec/spam_corpus/628.txt.gz +0 -0
- data/spec/spam_corpus/630.txt.gz +0 -0
- data/spec/spam_corpus/632.txt.gz +0 -0
- data/spec/spam_corpus/634.txt.gz +0 -0
- data/spec/spam_corpus/636.txt.gz +0 -0
- data/spec/spam_corpus/638.txt.gz +0 -0
- data/spec/spam_corpus/64.txt.gz +0 -0
- data/spec/spam_corpus/640.txt.gz +0 -0
- data/spec/spam_corpus/642.txt.gz +0 -0
- data/spec/spam_corpus/644.txt.gz +0 -0
- data/spec/spam_corpus/646.txt.gz +0 -0
- data/spec/spam_corpus/648.txt.gz +0 -0
- data/spec/spam_corpus/650.txt.gz +0 -0
- data/spec/spam_corpus/652.txt.gz +0 -0
- data/spec/spam_corpus/654.txt.gz +0 -0
- data/spec/spam_corpus/656.txt.gz +0 -0
- data/spec/spam_corpus/658.txt.gz +0 -0
- data/spec/spam_corpus/66.txt.gz +0 -0
- data/spec/spam_corpus/660.txt.gz +0 -0
- data/spec/spam_corpus/662.txt.gz +0 -0
- data/spec/spam_corpus/664.txt.gz +0 -0
- data/spec/spam_corpus/666.txt.gz +0 -0
- data/spec/spam_corpus/668.txt.gz +0 -0
- data/spec/spam_corpus/670.txt.gz +0 -0
- data/spec/spam_corpus/672.txt.gz +0 -0
- data/spec/spam_corpus/674.txt.gz +0 -0
- data/spec/spam_corpus/676.txt.gz +0 -0
- data/spec/spam_corpus/678.txt.gz +0 -0
- data/spec/spam_corpus/68.txt.gz +0 -0
- data/spec/spam_corpus/680.txt.gz +0 -0
- data/spec/spam_corpus/682.txt.gz +0 -0
- data/spec/spam_corpus/684.txt.gz +0 -0
- data/spec/spam_corpus/686.txt.gz +0 -0
- data/spec/spam_corpus/688.txt.gz +0 -0
- data/spec/spam_corpus/690.txt.gz +0 -0
- data/spec/spam_corpus/692.txt.gz +0 -0
- data/spec/spam_corpus/694.txt.gz +0 -0
- data/spec/spam_corpus/696.txt.gz +0 -0
- data/spec/spam_corpus/698.txt.gz +0 -0
- data/spec/spam_corpus/7.txt.gz +0 -0
- data/spec/spam_corpus/70.txt.gz +0 -0
- data/spec/spam_corpus/700.txt.gz +0 -0
- data/spec/spam_corpus/702.txt.gz +0 -0
- data/spec/spam_corpus/704.txt.gz +0 -0
- data/spec/spam_corpus/706.txt.gz +0 -0
- data/spec/spam_corpus/708.txt.gz +0 -0
- data/spec/spam_corpus/710.txt.gz +0 -0
- data/spec/spam_corpus/712.txt.gz +0 -0
- data/spec/spam_corpus/714.txt.gz +0 -0
- data/spec/spam_corpus/716.txt.gz +0 -0
- data/spec/spam_corpus/718.txt.gz +0 -0
- data/spec/spam_corpus/72.txt.gz +0 -0
- data/spec/spam_corpus/720.txt.gz +0 -0
- data/spec/spam_corpus/722.txt.gz +0 -0
- data/spec/spam_corpus/724.txt.gz +0 -0
- data/spec/spam_corpus/726.txt.gz +0 -0
- data/spec/spam_corpus/728.txt.gz +0 -0
- data/spec/spam_corpus/730.txt.gz +0 -0
- data/spec/spam_corpus/732.txt.gz +0 -0
- data/spec/spam_corpus/734.txt.gz +0 -0
- data/spec/spam_corpus/736.txt.gz +0 -0
- data/spec/spam_corpus/738.txt.gz +0 -0
- data/spec/spam_corpus/74.txt.gz +0 -0
- data/spec/spam_corpus/740.txt.gz +0 -0
- data/spec/spam_corpus/742.txt.gz +0 -0
- data/spec/spam_corpus/744.txt.gz +0 -0
- data/spec/spam_corpus/746.txt.gz +0 -0
- data/spec/spam_corpus/748.txt.gz +0 -0
- data/spec/spam_corpus/750.txt.gz +0 -0
- data/spec/spam_corpus/752.txt.gz +0 -0
- data/spec/spam_corpus/754.txt.gz +0 -0
- data/spec/spam_corpus/756.txt.gz +0 -0
- data/spec/spam_corpus/758.txt.gz +0 -0
- data/spec/spam_corpus/76.txt.gz +0 -0
- data/spec/spam_corpus/760.txt.gz +0 -0
- data/spec/spam_corpus/762.txt.gz +0 -0
- data/spec/spam_corpus/764.txt.gz +0 -0
- data/spec/spam_corpus/766.txt.gz +0 -0
- data/spec/spam_corpus/768.txt.gz +0 -0
- data/spec/spam_corpus/770.txt.gz +0 -0
- data/spec/spam_corpus/772.txt.gz +0 -0
- data/spec/spam_corpus/774.txt.gz +0 -0
- data/spec/spam_corpus/776.txt.gz +0 -0
- data/spec/spam_corpus/778.txt.gz +0 -0
- data/spec/spam_corpus/78.txt.gz +0 -0
- data/spec/spam_corpus/780.txt.gz +0 -0
- data/spec/spam_corpus/782.txt.gz +0 -0
- data/spec/spam_corpus/784.txt.gz +0 -0
- data/spec/spam_corpus/786.txt.gz +0 -0
- data/spec/spam_corpus/788.txt.gz +0 -0
- data/spec/spam_corpus/790.txt.gz +0 -0
- data/spec/spam_corpus/792.txt.gz +0 -0
- data/spec/spam_corpus/794.txt.gz +0 -0
- data/spec/spam_corpus/796.txt.gz +0 -0
- data/spec/spam_corpus/798.txt.gz +0 -0
- data/spec/spam_corpus/8.txt.gz +0 -0
- data/spec/spam_corpus/80.txt.gz +0 -0
- data/spec/spam_corpus/800.txt.gz +0 -0
- data/spec/spam_corpus/802.txt.gz +0 -0
- data/spec/spam_corpus/804.txt.gz +0 -0
- data/spec/spam_corpus/806.txt.gz +0 -0
- data/spec/spam_corpus/808.txt.gz +0 -0
- data/spec/spam_corpus/810.txt.gz +0 -0
- data/spec/spam_corpus/812.txt.gz +0 -0
- data/spec/spam_corpus/814.txt.gz +0 -0
- data/spec/spam_corpus/816.txt.gz +0 -0
- data/spec/spam_corpus/818.txt.gz +0 -0
- data/spec/spam_corpus/82.txt.gz +0 -0
- data/spec/spam_corpus/820.txt.gz +0 -0
- data/spec/spam_corpus/822.txt.gz +0 -0
- data/spec/spam_corpus/824.txt.gz +0 -0
- data/spec/spam_corpus/826.txt.gz +0 -0
- data/spec/spam_corpus/828.txt.gz +0 -0
- data/spec/spam_corpus/830.txt.gz +0 -0
- data/spec/spam_corpus/832.txt.gz +0 -0
- data/spec/spam_corpus/834.txt.gz +0 -0
- data/spec/spam_corpus/836.txt.gz +0 -0
- data/spec/spam_corpus/838.txt.gz +0 -0
- data/spec/spam_corpus/84.txt.gz +0 -0
- data/spec/spam_corpus/840.txt.gz +0 -0
- data/spec/spam_corpus/842.txt.gz +0 -0
- data/spec/spam_corpus/844.txt.gz +0 -0
- data/spec/spam_corpus/846.txt.gz +0 -0
- data/spec/spam_corpus/848.txt.gz +0 -0
- data/spec/spam_corpus/850.txt.gz +0 -0
- data/spec/spam_corpus/852.txt.gz +0 -0
- data/spec/spam_corpus/854.txt.gz +0 -0
- data/spec/spam_corpus/856.txt.gz +0 -0
- data/spec/spam_corpus/858.txt.gz +0 -0
- data/spec/spam_corpus/86.txt.gz +0 -0
- data/spec/spam_corpus/860.txt.gz +0 -0
- data/spec/spam_corpus/862.txt.gz +0 -0
- data/spec/spam_corpus/864.txt.gz +0 -0
- data/spec/spam_corpus/866.txt.gz +0 -0
- data/spec/spam_corpus/868.txt.gz +0 -0
- data/spec/spam_corpus/870.txt.gz +0 -0
- data/spec/spam_corpus/872.txt.gz +0 -0
- data/spec/spam_corpus/874.txt.gz +0 -0
- data/spec/spam_corpus/876.txt.gz +0 -0
- data/spec/spam_corpus/878.txt.gz +0 -0
- data/spec/spam_corpus/88.txt.gz +0 -0
- data/spec/spam_corpus/880.txt.gz +0 -0
- data/spec/spam_corpus/882.txt.gz +0 -0
- data/spec/spam_corpus/884.txt.gz +0 -0
- data/spec/spam_corpus/886.txt.gz +0 -0
- data/spec/spam_corpus/888.txt.gz +0 -0
- data/spec/spam_corpus/890.txt.gz +0 -0
- data/spec/spam_corpus/892.txt.gz +0 -0
- data/spec/spam_corpus/894.txt.gz +0 -0
- data/spec/spam_corpus/896.txt.gz +0 -0
- data/spec/spam_corpus/898.txt.gz +0 -0
- data/spec/spam_corpus/9.txt.gz +0 -0
- data/spec/spam_corpus/90.txt.gz +0 -0
- data/spec/spam_corpus/900.txt.gz +0 -0
- data/spec/spam_corpus/902.txt.gz +0 -0
- data/spec/spam_corpus/904.txt.gz +0 -0
- data/spec/spam_corpus/906.txt.gz +0 -0
- data/spec/spam_corpus/908.txt.gz +0 -0
- data/spec/spam_corpus/910.txt.gz +0 -0
- data/spec/spam_corpus/912.txt.gz +0 -0
- data/spec/spam_corpus/914.txt.gz +0 -0
- data/spec/spam_corpus/916.txt.gz +0 -0
- data/spec/spam_corpus/918.txt.gz +0 -0
- data/spec/spam_corpus/92.txt.gz +0 -0
- data/spec/spam_corpus/920.txt.gz +0 -0
- data/spec/spam_corpus/922.txt.gz +0 -0
- data/spec/spam_corpus/924.txt.gz +0 -0
- data/spec/spam_corpus/926.txt.gz +0 -0
- data/spec/spam_corpus/928.txt.gz +0 -0
- data/spec/spam_corpus/930.txt.gz +0 -0
- data/spec/spam_corpus/932.txt.gz +0 -0
- data/spec/spam_corpus/934.txt.gz +0 -0
- data/spec/spam_corpus/936.txt.gz +0 -0
- data/spec/spam_corpus/938.txt.gz +0 -0
- data/spec/spam_corpus/94.txt.gz +0 -0
- data/spec/spam_corpus/940.txt.gz +0 -0
- data/spec/spam_corpus/942.txt.gz +0 -0
- data/spec/spam_corpus/944.txt.gz +0 -0
- data/spec/spam_corpus/946.txt.gz +0 -0
- data/spec/spam_corpus/948.txt.gz +0 -0
- data/spec/spam_corpus/950.txt.gz +0 -0
- data/spec/spam_corpus/952.txt.gz +0 -0
- data/spec/spam_corpus/954.txt.gz +0 -0
- data/spec/spam_corpus/956.txt.gz +0 -0
- data/spec/spam_corpus/958.txt.gz +0 -0
- data/spec/spam_corpus/96.txt.gz +0 -0
- data/spec/spam_corpus/960.txt.gz +0 -0
- data/spec/spam_corpus/962.txt.gz +0 -0
- data/spec/spam_corpus/964.txt.gz +0 -0
- data/spec/spam_corpus/966.txt.gz +0 -0
- data/spec/spam_corpus/968.txt.gz +0 -0
- data/spec/spam_corpus/970.txt.gz +0 -0
- data/spec/spam_corpus/972.txt.gz +0 -0
- data/spec/spam_corpus/974.txt.gz +0 -0
- data/spec/spam_corpus/98.txt.gz +0 -0
- data/spec/spam_corpus/debugyouradd.com.txt.gz +0 -0
- data/spec/spam_corpus/humandesignconsulting.comm.txt.gz +0 -0
- data/spec/spam_corpus_spec.rb +0 -11
- data/spec/spec.opts +0 -1
- data/spec/spec_helper.rb +0 -16
- data/tasks/rspec.rake +0 -21
data/README.rdoc
CHANGED
@@ -10,13 +10,15 @@ some commonly used heuristics from the world of anti-spam to help you decide whe
|
|
10
10
|
|
11
11
|
== FEATURES/PROBLEMS:
|
12
12
|
|
13
|
-
*
|
13
|
+
* Moved Rails-esque validation gem to the despamilator-rails gem.
|
14
14
|
|
15
15
|
== SYNOPSIS:
|
16
16
|
|
17
|
-
|
18
|
-
|
19
|
-
require '
|
17
|
+
Using Despamilator:
|
18
|
+
|
19
|
+
require 'despamilator'
|
20
|
+
|
21
|
+
# some time later...
|
20
22
|
|
21
23
|
dspam = Despamilator.new('some text with an <h2> tag qthhg')
|
22
24
|
|
@@ -27,35 +29,6 @@ some commonly used heuristics from the world of anti-spam to help you decide whe
|
|
27
29
|
first_match.description #=> some string to describe
|
28
30
|
first_match.score #=> the individual score assigned by this filter
|
29
31
|
|
30
|
-
# adding a new filter! example: detecting the letter "a"
|
31
|
-
# put the following code in lib/despamilator/filter/detect_letter_a.rb
|
32
|
-
def name
|
33
|
-
'Detecting the letter A'
|
34
|
-
end
|
35
|
-
|
36
|
-
def description
|
37
|
-
'Detects the letter "a" in a string for no reason other than a demo'
|
38
|
-
end
|
39
|
-
|
40
|
-
def parse
|
41
|
-
if self.text.downcase.scan(/a/)
|
42
|
-
# add 0.1 to the score of the text
|
43
|
-
self.append_score = 0.1
|
44
|
-
end
|
45
|
-
end
|
46
|
-
|
47
|
-
== RAILS SYNOPSIS:
|
48
|
-
|
49
|
-
# in your environment.rb
|
50
|
-
require 'despamilator/validation'
|
51
|
-
|
52
|
-
# in your model
|
53
|
-
include DespamilatorValidation
|
54
|
-
|
55
|
-
def validate
|
56
|
-
validates_despamilation_of :text, :threshold => 1
|
57
|
-
end
|
58
|
-
|
59
32
|
== FILTERING:
|
60
33
|
|
61
34
|
As stated, this is a heuristic scanner so it's up to the user to decide the thresholds of the scanner. I usually
|
@@ -76,17 +49,44 @@ They should always supply the following methods:
|
|
76
49
|
|
77
50
|
* name #=> the name of your filter.
|
78
51
|
* description #=> what your filter will look for.
|
79
|
-
* parse #=> the method that will be called when parsing.
|
52
|
+
* parse(text) #=> the method that will be called when parsing. A copy of the message is passed in.
|
80
53
|
|
81
54
|
Alongside the above, the following methods are made available to each filter:
|
82
55
|
|
83
|
-
* text #=> a copy of the text your parser will parse
|
84
56
|
* append_score= #=> method to append a score to the text if there are matches in your parser.
|
85
57
|
* matched? #=> whether or not any filter has so far detected something suspect
|
86
58
|
* score #=> the current score assigned to the text
|
87
59
|
|
88
|
-
|
60
|
+
Take a look at the "naughty_q" code in "lib/despamilator/filter/naughty_q.rb" and its tests in "spec/filters/naughty_q_spec.rb".
|
61
|
+
|
62
|
+
==== Example Filter:
|
63
|
+
|
64
|
+
This example is to detect the letter "a". Put the code in
|
65
|
+
lib/despamilator/filter/detect_letter_a.rb:
|
66
|
+
|
67
|
+
require 'despamilator/filter_base'
|
68
|
+
|
69
|
+
module DespamilatorFilter
|
70
|
+
|
71
|
+
class DetectLetterA < Despamilator::FilterBase
|
72
|
+
|
73
|
+
def name
|
74
|
+
'Detecting the letter A'
|
75
|
+
end
|
76
|
+
|
77
|
+
def description
|
78
|
+
'Detects the letter "a" in a string for no reason other than a demo'
|
79
|
+
end
|
80
|
+
|
81
|
+
def parse text
|
82
|
+
if text.downcase.scan(/a/)
|
83
|
+
# add 0.1 to the score of the text
|
84
|
+
self.append_score = 0.1
|
85
|
+
end
|
86
|
+
end
|
87
|
+
end
|
89
88
|
|
89
|
+
As previously stated, ensure you put a spec test together as well!
|
90
90
|
|
91
91
|
== REQUIREMENTS:
|
92
92
|
|
@@ -100,7 +100,7 @@ spec tests are an absolute must!
|
|
100
100
|
|
101
101
|
== LICENSE:
|
102
102
|
|
103
|
-
Copyright (c)
|
103
|
+
Copyright (c) 2011 Stephen Hardisty
|
104
104
|
|
105
105
|
Permission is hereby granted, free of charge, to any person obtaining
|
106
106
|
a copy of this software and associated documentation files (the
|
data/Rakefile
CHANGED
@@ -27,7 +27,14 @@ task :test => [:spec]
|
|
27
27
|
task :default => [:test]
|
28
28
|
task :install => [:install_gem]
|
29
29
|
|
30
|
+
desc 'Generate relevant documentation.'
|
31
|
+
task :rdoc do
|
32
|
+
sh 'rdoc lib/despamilator.rb lib/despamilator/filter_base.rb'
|
33
|
+
end
|
34
|
+
|
30
35
|
task :cultivate do
|
31
|
-
|
32
|
-
|
33
|
-
|
36
|
+
sh "touch Manifest.txt; rake check_manifest |grep -v \"(in \" | patch"
|
37
|
+
sh "cat Manifest.txt | grep -v 'bundle/config' | grep -v '_corpus' > Manifest.txt2"
|
38
|
+
sh "mv Manifest.txt2 Manifest.txt"
|
39
|
+
sh "rake debug_gem | grep -v \"(in \" > `basename \\`pwd\\``.gemspec"
|
40
|
+
end
|
data/despamilator.gemspec
CHANGED
@@ -2,38 +2,35 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |s|
|
4
4
|
s.name = %q{despamilator}
|
5
|
-
s.version = "0
|
5
|
+
s.version = "1.0"
|
6
6
|
|
7
7
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
8
8
|
s.authors = ["Stephen Hardisty"]
|
9
|
-
s.date = %q{2010-
|
9
|
+
s.date = %q{2010-12-11}
|
10
10
|
s.description = %q{Despamilator is a plugin based spam detector designed for use on your web forms borne out of two annoyances:
|
11
11
|
Spam being submitted in my web forms and CAPTCHAS being intrusive. Despamilator will apply
|
12
12
|
some commonly used heuristics from the world of anti-spam to help you decide whether your users are human or machine.}
|
13
13
|
s.email = ["moowahaha@hotmail.com"]
|
14
14
|
s.extra_rdoc_files = ["History.txt", "Manifest.txt", "PostInstall.txt"]
|
15
|
-
s.files = ["History.txt", "Manifest.txt", "PostInstall.txt", "README.rdoc", "Rakefile", "despamilator.gemspec", "lib/despamilator.rb", "lib/despamilator/filter.rb", "lib/despamilator/filter/funky_consonant.rb", "lib/despamilator/filter/html_tags.rb", "lib/despamilator/filter/ip_address_url.rb", "lib/despamilator/filter/long_words.rb", "lib/despamilator/filter/naughty_q.rb", "lib/despamilator/filter/naughty_words.rb", "lib/despamilator/filter/numbers_and_words.rb", "lib/despamilator/filter/script_tag.rb", "lib/despamilator/filter/shouting.rb", "lib/despamilator/filter/square_brackets.rb", "lib/despamilator/filter/urls.rb", "lib/despamilator/filter_base.rb", "lib/despamilator/validation.rb", "scripts/despamilator_score.rb", "spec/clean_corpus/101.txt.gz", "spec/clean_corpus/103.txt.gz", "spec/clean_corpus/105.txt.gz", "spec/clean_corpus/107.txt.gz", "spec/clean_corpus/109.txt.gz", "spec/clean_corpus/111.txt.gz", "spec/clean_corpus/113.txt.gz", "spec/clean_corpus/115.txt.gz", "spec/clean_corpus/117.txt.gz", "spec/clean_corpus/119.txt.gz", "spec/clean_corpus/121.txt.gz", "spec/clean_corpus/123.txt.gz", "spec/clean_corpus/125.txt.gz", "spec/clean_corpus/127.txt.gz", "spec/clean_corpus/129.txt.gz", "spec/clean_corpus/131.txt.gz", "spec/clean_corpus/133.txt.gz", "spec/clean_corpus/135.txt.gz", "spec/clean_corpus/137.txt.gz", "spec/clean_corpus/139.txt.gz", "spec/clean_corpus/141.txt.gz", "spec/clean_corpus/143.txt.gz", "spec/clean_corpus/145.txt.gz", "spec/clean_corpus/147.txt.gz", "spec/clean_corpus/149.txt.gz", "spec/clean_corpus/151.txt.gz", "spec/clean_corpus/153.txt.gz", "spec/clean_corpus/155.txt.gz", "spec/clean_corpus/157.txt.gz", "spec/clean_corpus/159.txt.gz", "spec/clean_corpus/161.txt.gz", "spec/clean_corpus/163.txt.gz", "spec/clean_corpus/165.txt.gz", "spec/clean_corpus/167.txt.gz", "spec/clean_corpus/169.txt.gz", "spec/clean_corpus/171.txt.gz", "spec/clean_corpus/173.txt.gz", "spec/clean_corpus/175.txt.gz", "spec/clean_corpus/177.txt.gz", 
"spec/clean_corpus/179.txt.gz", "spec/clean_corpus/18.txt.gz", "spec/clean_corpus/181.txt.gz", "spec/clean_corpus/183.txt.gz", "spec/clean_corpus/185.txt.gz", "spec/clean_corpus/187.txt.gz", "spec/clean_corpus/189.txt.gz", "spec/clean_corpus/191.txt.gz", "spec/clean_corpus/193.txt.gz", "spec/clean_corpus/195.txt.gz", "spec/clean_corpus/197.txt.gz", "spec/clean_corpus/199.txt.gz", "spec/clean_corpus/20.txt.gz", "spec/clean_corpus/201.txt.gz", "spec/clean_corpus/203.txt.gz", "spec/clean_corpus/205.txt.gz", "spec/clean_corpus/207.txt.gz", "spec/clean_corpus/209.txt.gz", "spec/clean_corpus/211.txt.gz", "spec/clean_corpus/213.txt.gz", "spec/clean_corpus/215.txt.gz", "spec/clean_corpus/217.txt.gz", "spec/clean_corpus/219.txt.gz", "spec/clean_corpus/22.txt.gz", "spec/clean_corpus/221.txt.gz", "spec/clean_corpus/223.txt.gz", "spec/clean_corpus/225.txt.gz", "spec/clean_corpus/24.txt.gz", "spec/clean_corpus/26.txt.gz", "spec/clean_corpus/27.txt.gz", "spec/clean_corpus/29.txt.gz", "spec/clean_corpus/31.txt.gz", "spec/clean_corpus/33.txt.gz", "spec/clean_corpus/35.txt.gz", "spec/clean_corpus/37.txt.gz", "spec/clean_corpus/39.txt.gz", "spec/clean_corpus/41.txt.gz", "spec/clean_corpus/43.txt.gz", "spec/clean_corpus/45.txt.gz", "spec/clean_corpus/47.txt.gz", "spec/clean_corpus/49.txt.gz", "spec/clean_corpus/51.txt.gz", "spec/clean_corpus/53.txt.gz", "spec/clean_corpus/55.txt.gz", "spec/clean_corpus/57.txt.gz", "spec/clean_corpus/59.txt.gz", "spec/clean_corpus/61.txt.gz", "spec/clean_corpus/63.txt.gz", "spec/clean_corpus/65.txt.gz", "spec/clean_corpus/67.txt.gz", "spec/clean_corpus/69.txt.gz", "spec/clean_corpus/71.txt.gz", "spec/clean_corpus/73.txt.gz", "spec/clean_corpus/75.txt.gz", "spec/clean_corpus/77.txt.gz", "spec/clean_corpus/79.txt.gz", "spec/clean_corpus/81.txt.gz", "spec/clean_corpus/83.txt.gz", "spec/clean_corpus/85.txt.gz", "spec/clean_corpus/87.txt.gz", "spec/clean_corpus/89.txt.gz", "spec/clean_corpus/91.txt.gz", "spec/clean_corpus/93.txt.gz", 
"spec/clean_corpus/95.txt.gz", "spec/clean_corpus/97.txt.gz", "spec/clean_corpus/99.txt.gz", "spec/clean_corpus_spec.rb", "spec/despamilator_spec.rb", "spec/despamilator_validation_spec.rb", "spec/filters/funky_consonant_spec.rb", "spec/filters/html_tags_spec.rb", "spec/filters/ip_address_url_spec.rb", "spec/filters/long_words_spec.rb", "spec/filters/naughty_q_spec.rb", "spec/filters/naughty_words_spec.rb", "spec/filters/numbers_and_words_spec.rb", "spec/filters/script_tag_spec.rb", "spec/filters/shouting_spec.rb", "spec/filters/square_brackets_spec.rb", "spec/filters/urls_spec.rb", "spec/spam_corpus/0.txt.gz", "spec/spam_corpus/1.txt.gz", "spec/spam_corpus/10.txt.gz", "spec/spam_corpus/100.txt.gz", "spec/spam_corpus/102.txt.gz", "spec/spam_corpus/104.txt.gz", "spec/spam_corpus/106.txt.gz", "spec/spam_corpus/108.txt.gz", "spec/spam_corpus/11.txt.gz", "spec/spam_corpus/110.txt.gz", "spec/spam_corpus/112.txt.gz", "spec/spam_corpus/114.txt.gz", "spec/spam_corpus/116.txt.gz", "spec/spam_corpus/118.txt.gz", "spec/spam_corpus/12.txt.gz", "spec/spam_corpus/120.txt.gz", "spec/spam_corpus/122.txt.gz", "spec/spam_corpus/124.txt.gz", "spec/spam_corpus/126.txt.gz", "spec/spam_corpus/128.txt.gz", "spec/spam_corpus/13.txt.gz", "spec/spam_corpus/130.txt.gz", "spec/spam_corpus/132.txt.gz", "spec/spam_corpus/134.txt.gz", "spec/spam_corpus/136.txt.gz", "spec/spam_corpus/138.txt.gz", "spec/spam_corpus/14.txt.gz", "spec/spam_corpus/140.txt.gz", "spec/spam_corpus/142.txt.gz", "spec/spam_corpus/144.txt.gz", "spec/spam_corpus/146.txt.gz", "spec/spam_corpus/148.txt.gz", "spec/spam_corpus/15.txt.gz", "spec/spam_corpus/150.txt.gz", "spec/spam_corpus/152.txt.gz", "spec/spam_corpus/154.txt.gz", "spec/spam_corpus/156.txt.gz", "spec/spam_corpus/158.txt.gz", "spec/spam_corpus/16.txt.gz", "spec/spam_corpus/160.txt.gz", "spec/spam_corpus/162.txt.gz", "spec/spam_corpus/164.txt.gz", "spec/spam_corpus/166.txt.gz", "spec/spam_corpus/168.txt.gz", "spec/spam_corpus/170.txt.gz", 
"spec/spam_corpus/172.txt.gz", "spec/spam_corpus/174.txt.gz", "spec/spam_corpus/176.txt.gz", "spec/spam_corpus/178.txt.gz", "spec/spam_corpus/180.txt.gz", "spec/spam_corpus/182.txt.gz", "spec/spam_corpus/184.txt.gz", "spec/spam_corpus/186.txt.gz", "spec/spam_corpus/188.txt.gz", "spec/spam_corpus/190.txt.gz", "spec/spam_corpus/192.txt.gz", "spec/spam_corpus/194.txt.gz", "spec/spam_corpus/196.txt.gz", "spec/spam_corpus/198.txt.gz", "spec/spam_corpus/2.txt.gz", "spec/spam_corpus/200.txt.gz", "spec/spam_corpus/202.txt.gz", "spec/spam_corpus/204.txt.gz", "spec/spam_corpus/206.txt.gz", "spec/spam_corpus/208.txt.gz", "spec/spam_corpus/210.txt.gz", "spec/spam_corpus/212.txt.gz", "spec/spam_corpus/214.txt.gz", "spec/spam_corpus/216.txt.gz", "spec/spam_corpus/218.txt.gz", "spec/spam_corpus/220.txt.gz", "spec/spam_corpus/222.txt.gz", "spec/spam_corpus/224.txt.gz", "spec/spam_corpus/226.txt.gz", "spec/spam_corpus/228.txt.gz", "spec/spam_corpus/230.txt.gz", "spec/spam_corpus/232.txt.gz", "spec/spam_corpus/234.txt.gz", "spec/spam_corpus/236.txt.gz", "spec/spam_corpus/238.txt.gz", "spec/spam_corpus/240.txt.gz", "spec/spam_corpus/242.txt.gz", "spec/spam_corpus/244.txt.gz", "spec/spam_corpus/246.txt.gz", "spec/spam_corpus/248.txt.gz", "spec/spam_corpus/250.txt.gz", "spec/spam_corpus/252.txt.gz", "spec/spam_corpus/254.txt.gz", "spec/spam_corpus/256.txt.gz", "spec/spam_corpus/258.txt.gz", "spec/spam_corpus/260.txt.gz", "spec/spam_corpus/262.txt.gz", "spec/spam_corpus/264.txt.gz", "spec/spam_corpus/266.txt.gz", "spec/spam_corpus/268.txt.gz", "spec/spam_corpus/270.txt.gz", "spec/spam_corpus/272.txt.gz", "spec/spam_corpus/274.txt.gz", "spec/spam_corpus/276.txt.gz", "spec/spam_corpus/278.txt.gz", "spec/spam_corpus/28.txt.gz", "spec/spam_corpus/280.txt.gz", "spec/spam_corpus/282.txt.gz", "spec/spam_corpus/284.txt.gz", "spec/spam_corpus/286.txt.gz", "spec/spam_corpus/288.txt.gz", "spec/spam_corpus/290.txt.gz", "spec/spam_corpus/292.txt.gz", "spec/spam_corpus/294.txt.gz", 
"spec/spam_corpus/296.txt.gz", "spec/spam_corpus/298.txt.gz", "spec/spam_corpus/3.txt.gz", "spec/spam_corpus/30.txt.gz", "spec/spam_corpus/300.txt.gz", "spec/spam_corpus/302.txt.gz", "spec/spam_corpus/304.txt.gz", "spec/spam_corpus/306.txt.gz", "spec/spam_corpus/308.txt.gz", "spec/spam_corpus/310.txt.gz", "spec/spam_corpus/312.txt.gz", "spec/spam_corpus/314.txt.gz", "spec/spam_corpus/316.txt.gz", "spec/spam_corpus/318.txt.gz", "spec/spam_corpus/32.txt.gz", "spec/spam_corpus/320.txt.gz", "spec/spam_corpus/322.txt.gz", "spec/spam_corpus/324.txt.gz", "spec/spam_corpus/326.txt.gz", "spec/spam_corpus/328.txt.gz", "spec/spam_corpus/330.txt.gz", "spec/spam_corpus/332.txt.gz", "spec/spam_corpus/334.txt.gz", "spec/spam_corpus/336.txt.gz", "spec/spam_corpus/338.txt.gz", "spec/spam_corpus/34.txt.gz", "spec/spam_corpus/340.txt.gz", "spec/spam_corpus/342.txt.gz", "spec/spam_corpus/344.txt.gz", "spec/spam_corpus/346.txt.gz", "spec/spam_corpus/348.txt.gz", "spec/spam_corpus/350.txt.gz", "spec/spam_corpus/352.txt.gz", "spec/spam_corpus/354.txt.gz", "spec/spam_corpus/356.txt.gz", "spec/spam_corpus/358.txt.gz", "spec/spam_corpus/36.txt.gz", "spec/spam_corpus/360.txt.gz", "spec/spam_corpus/362.txt.gz", "spec/spam_corpus/364.txt.gz", "spec/spam_corpus/366.txt.gz", "spec/spam_corpus/368.txt.gz", "spec/spam_corpus/370.txt.gz", "spec/spam_corpus/372.txt.gz", "spec/spam_corpus/374.txt.gz", "spec/spam_corpus/376.txt.gz", "spec/spam_corpus/378.txt.gz", "spec/spam_corpus/38.txt.gz", "spec/spam_corpus/380.txt.gz", "spec/spam_corpus/382.txt.gz", "spec/spam_corpus/384.txt.gz", "spec/spam_corpus/386.txt.gz", "spec/spam_corpus/388.txt.gz", "spec/spam_corpus/390.txt.gz", "spec/spam_corpus/392.txt.gz", "spec/spam_corpus/394.txt.gz", "spec/spam_corpus/396.txt.gz", "spec/spam_corpus/398.txt.gz", "spec/spam_corpus/4.txt.gz", "spec/spam_corpus/40.txt.gz", "spec/spam_corpus/400.txt.gz", "spec/spam_corpus/402.txt.gz", "spec/spam_corpus/404.txt.gz", "spec/spam_corpus/406.txt.gz", 
"spec/spam_corpus/408.txt.gz", "spec/spam_corpus/410.txt.gz", "spec/spam_corpus/412.txt.gz", "spec/spam_corpus/414.txt.gz", "spec/spam_corpus/416.txt.gz", "spec/spam_corpus/418.txt.gz", "spec/spam_corpus/42.txt.gz", "spec/spam_corpus/420.txt.gz", "spec/spam_corpus/422.txt.gz", "spec/spam_corpus/424.txt.gz", "spec/spam_corpus/426.txt.gz", "spec/spam_corpus/428.txt.gz", "spec/spam_corpus/430.txt.gz", "spec/spam_corpus/432.txt.gz", "spec/spam_corpus/434.txt.gz", "spec/spam_corpus/436.txt.gz", "spec/spam_corpus/438.txt.gz", "spec/spam_corpus/44.txt.gz", "spec/spam_corpus/440.txt.gz", "spec/spam_corpus/442.txt.gz", "spec/spam_corpus/444.txt.gz", "spec/spam_corpus/446.txt.gz", "spec/spam_corpus/448.txt.gz", "spec/spam_corpus/450.txt.gz", "spec/spam_corpus/452.txt.gz", "spec/spam_corpus/454.txt.gz", "spec/spam_corpus/456.txt.gz", "spec/spam_corpus/458.txt.gz", "spec/spam_corpus/46.txt.gz", "spec/spam_corpus/460.txt.gz", "spec/spam_corpus/462.txt.gz", "spec/spam_corpus/464.txt.gz", "spec/spam_corpus/466.txt.gz", "spec/spam_corpus/468.txt.gz", "spec/spam_corpus/470.txt.gz", "spec/spam_corpus/472.txt.gz", "spec/spam_corpus/474.txt.gz", "spec/spam_corpus/476.txt.gz", "spec/spam_corpus/478.txt.gz", "spec/spam_corpus/48.txt.gz", "spec/spam_corpus/480.txt.gz", "spec/spam_corpus/482.txt.gz", "spec/spam_corpus/484.txt.gz", "spec/spam_corpus/486.txt.gz", "spec/spam_corpus/488.txt.gz", "spec/spam_corpus/490.txt.gz", "spec/spam_corpus/492.txt.gz", "spec/spam_corpus/494.txt.gz", "spec/spam_corpus/496.txt.gz", "spec/spam_corpus/498.txt.gz", "spec/spam_corpus/5.txt.gz", "spec/spam_corpus/50.txt.gz", "spec/spam_corpus/500.txt.gz", "spec/spam_corpus/502.txt.gz", "spec/spam_corpus/504.txt.gz", "spec/spam_corpus/506.txt.gz", "spec/spam_corpus/508.txt.gz", "spec/spam_corpus/510.txt.gz", "spec/spam_corpus/512.txt.gz", "spec/spam_corpus/514.txt.gz", "spec/spam_corpus/516.txt.gz", "spec/spam_corpus/518.txt.gz", "spec/spam_corpus/52.txt.gz", "spec/spam_corpus/520.txt.gz", 
"spec/spam_corpus/522.txt.gz", "spec/spam_corpus/524.txt.gz", "spec/spam_corpus/526.txt.gz", "spec/spam_corpus/528.txt.gz", "spec/spam_corpus/530.txt.gz", "spec/spam_corpus/532.txt.gz", "spec/spam_corpus/534.txt.gz", "spec/spam_corpus/536.txt.gz", "spec/spam_corpus/538.txt.gz", "spec/spam_corpus/54.txt.gz", "spec/spam_corpus/540.txt.gz", "spec/spam_corpus/542.txt.gz", "spec/spam_corpus/544.txt.gz", "spec/spam_corpus/546.txt.gz", "spec/spam_corpus/548.txt.gz", "spec/spam_corpus/550.txt.gz", "spec/spam_corpus/552.txt.gz", "spec/spam_corpus/554.txt.gz", "spec/spam_corpus/556.txt.gz", "spec/spam_corpus/558.txt.gz", "spec/spam_corpus/56.txt.gz", "spec/spam_corpus/560.txt.gz", "spec/spam_corpus/562.txt.gz", "spec/spam_corpus/564.txt.gz", "spec/spam_corpus/566.txt.gz", "spec/spam_corpus/568.txt.gz", "spec/spam_corpus/570.txt.gz", "spec/spam_corpus/572.txt.gz", "spec/spam_corpus/574.txt.gz", "spec/spam_corpus/576.txt.gz", "spec/spam_corpus/578.txt.gz", "spec/spam_corpus/58.txt.gz", "spec/spam_corpus/580.txt.gz", "spec/spam_corpus/582.txt.gz", "spec/spam_corpus/584.txt.gz", "spec/spam_corpus/586.txt.gz", "spec/spam_corpus/588.txt.gz", "spec/spam_corpus/590.txt.gz", "spec/spam_corpus/592.txt.gz", "spec/spam_corpus/594.txt.gz", "spec/spam_corpus/596.txt.gz", "spec/spam_corpus/598.txt.gz", "spec/spam_corpus/6.txt.gz", "spec/spam_corpus/60.txt.gz", "spec/spam_corpus/600.txt.gz", "spec/spam_corpus/602.txt.gz", "spec/spam_corpus/604.txt.gz", "spec/spam_corpus/606.txt.gz", "spec/spam_corpus/608.txt.gz", "spec/spam_corpus/610.txt.gz", "spec/spam_corpus/612.txt.gz", "spec/spam_corpus/614.txt.gz", "spec/spam_corpus/616.txt.gz", "spec/spam_corpus/618.txt.gz", "spec/spam_corpus/62.txt.gz", "spec/spam_corpus/620.txt.gz", "spec/spam_corpus/622.txt.gz", "spec/spam_corpus/624.txt.gz", "spec/spam_corpus/626.txt.gz", "spec/spam_corpus/628.txt.gz", "spec/spam_corpus/630.txt.gz", "spec/spam_corpus/632.txt.gz", "spec/spam_corpus/634.txt.gz", "spec/spam_corpus/636.txt.gz", 
"spec/spam_corpus/638.txt.gz", "spec/spam_corpus/64.txt.gz", "spec/spam_corpus/640.txt.gz", "spec/spam_corpus/642.txt.gz", "spec/spam_corpus/644.txt.gz", "spec/spam_corpus/646.txt.gz", "spec/spam_corpus/648.txt.gz", "spec/spam_corpus/650.txt.gz", "spec/spam_corpus/652.txt.gz", "spec/spam_corpus/654.txt.gz", "spec/spam_corpus/656.txt.gz", "spec/spam_corpus/658.txt.gz", "spec/spam_corpus/66.txt.gz", "spec/spam_corpus/660.txt.gz", "spec/spam_corpus/662.txt.gz", "spec/spam_corpus/664.txt.gz", "spec/spam_corpus/666.txt.gz", "spec/spam_corpus/668.txt.gz", "spec/spam_corpus/670.txt.gz", "spec/spam_corpus/672.txt.gz", "spec/spam_corpus/674.txt.gz", "spec/spam_corpus/676.txt.gz", "spec/spam_corpus/678.txt.gz", "spec/spam_corpus/68.txt.gz", "spec/spam_corpus/680.txt.gz", "spec/spam_corpus/682.txt.gz", "spec/spam_corpus/684.txt.gz", "spec/spam_corpus/686.txt.gz", "spec/spam_corpus/688.txt.gz", "spec/spam_corpus/690.txt.gz", "spec/spam_corpus/692.txt.gz", "spec/spam_corpus/694.txt.gz", "spec/spam_corpus/696.txt.gz", "spec/spam_corpus/698.txt.gz", "spec/spam_corpus/7.txt.gz", "spec/spam_corpus/70.txt.gz", "spec/spam_corpus/700.txt.gz", "spec/spam_corpus/702.txt.gz", "spec/spam_corpus/704.txt.gz", "spec/spam_corpus/706.txt.gz", "spec/spam_corpus/708.txt.gz", "spec/spam_corpus/710.txt.gz", "spec/spam_corpus/712.txt.gz", "spec/spam_corpus/714.txt.gz", "spec/spam_corpus/716.txt.gz", "spec/spam_corpus/718.txt.gz", "spec/spam_corpus/72.txt.gz", "spec/spam_corpus/720.txt.gz", "spec/spam_corpus/722.txt.gz", "spec/spam_corpus/724.txt.gz", "spec/spam_corpus/726.txt.gz", "spec/spam_corpus/728.txt.gz", "spec/spam_corpus/730.txt.gz", "spec/spam_corpus/732.txt.gz", "spec/spam_corpus/734.txt.gz", "spec/spam_corpus/736.txt.gz", "spec/spam_corpus/738.txt.gz", "spec/spam_corpus/74.txt.gz", "spec/spam_corpus/740.txt.gz", "spec/spam_corpus/742.txt.gz", "spec/spam_corpus/744.txt.gz", "spec/spam_corpus/746.txt.gz", "spec/spam_corpus/748.txt.gz", "spec/spam_corpus/750.txt.gz", 
"spec/spam_corpus/752.txt.gz", "spec/spam_corpus/754.txt.gz", "spec/spam_corpus/756.txt.gz", "spec/spam_corpus/758.txt.gz", "spec/spam_corpus/76.txt.gz", "spec/spam_corpus/760.txt.gz", "spec/spam_corpus/762.txt.gz", "spec/spam_corpus/764.txt.gz", "spec/spam_corpus/766.txt.gz", "spec/spam_corpus/768.txt.gz", "spec/spam_corpus/770.txt.gz", "spec/spam_corpus/772.txt.gz", "spec/spam_corpus/774.txt.gz", "spec/spam_corpus/776.txt.gz", "spec/spam_corpus/778.txt.gz", "spec/spam_corpus/78.txt.gz", "spec/spam_corpus/780.txt.gz", "spec/spam_corpus/782.txt.gz", "spec/spam_corpus/784.txt.gz", "spec/spam_corpus/786.txt.gz", "spec/spam_corpus/788.txt.gz", "spec/spam_corpus/790.txt.gz", "spec/spam_corpus/792.txt.gz", "spec/spam_corpus/794.txt.gz", "spec/spam_corpus/796.txt.gz", "spec/spam_corpus/798.txt.gz", "spec/spam_corpus/8.txt.gz", "spec/spam_corpus/80.txt.gz", "spec/spam_corpus/800.txt.gz", "spec/spam_corpus/802.txt.gz", "spec/spam_corpus/804.txt.gz", "spec/spam_corpus/806.txt.gz", "spec/spam_corpus/808.txt.gz", "spec/spam_corpus/810.txt.gz", "spec/spam_corpus/812.txt.gz", "spec/spam_corpus/814.txt.gz", "spec/spam_corpus/816.txt.gz", "spec/spam_corpus/818.txt.gz", "spec/spam_corpus/82.txt.gz", "spec/spam_corpus/820.txt.gz", "spec/spam_corpus/822.txt.gz", "spec/spam_corpus/824.txt.gz", "spec/spam_corpus/826.txt.gz", "spec/spam_corpus/828.txt.gz", "spec/spam_corpus/830.txt.gz", "spec/spam_corpus/832.txt.gz", "spec/spam_corpus/834.txt.gz", "spec/spam_corpus/836.txt.gz", "spec/spam_corpus/838.txt.gz", "spec/spam_corpus/84.txt.gz", "spec/spam_corpus/840.txt.gz", "spec/spam_corpus/842.txt.gz", "spec/spam_corpus/844.txt.gz", "spec/spam_corpus/846.txt.gz", "spec/spam_corpus/848.txt.gz", "spec/spam_corpus/850.txt.gz", "spec/spam_corpus/852.txt.gz", "spec/spam_corpus/854.txt.gz", "spec/spam_corpus/856.txt.gz", "spec/spam_corpus/858.txt.gz", "spec/spam_corpus/86.txt.gz", "spec/spam_corpus/860.txt.gz", "spec/spam_corpus/862.txt.gz", "spec/spam_corpus/864.txt.gz", 
"spec/spam_corpus/866.txt.gz", "spec/spam_corpus/868.txt.gz", "spec/spam_corpus/870.txt.gz", "spec/spam_corpus/872.txt.gz", "spec/spam_corpus/874.txt.gz", "spec/spam_corpus/876.txt.gz", "spec/spam_corpus/878.txt.gz", "spec/spam_corpus/88.txt.gz", "spec/spam_corpus/880.txt.gz", "spec/spam_corpus/882.txt.gz", "spec/spam_corpus/884.txt.gz", "spec/spam_corpus/886.txt.gz", "spec/spam_corpus/888.txt.gz", "spec/spam_corpus/890.txt.gz", "spec/spam_corpus/892.txt.gz", "spec/spam_corpus/894.txt.gz", "spec/spam_corpus/896.txt.gz", "spec/spam_corpus/898.txt.gz", "spec/spam_corpus/9.txt.gz", "spec/spam_corpus/90.txt.gz", "spec/spam_corpus/900.txt.gz", "spec/spam_corpus/902.txt.gz", "spec/spam_corpus/904.txt.gz", "spec/spam_corpus/906.txt.gz", "spec/spam_corpus/908.txt.gz", "spec/spam_corpus/910.txt.gz", "spec/spam_corpus/912.txt.gz", "spec/spam_corpus/914.txt.gz", "spec/spam_corpus/916.txt.gz", "spec/spam_corpus/918.txt.gz", "spec/spam_corpus/92.txt.gz", "spec/spam_corpus/920.txt.gz", "spec/spam_corpus/922.txt.gz", "spec/spam_corpus/924.txt.gz", "spec/spam_corpus/926.txt.gz", "spec/spam_corpus/928.txt.gz", "spec/spam_corpus/930.txt.gz", "spec/spam_corpus/932.txt.gz", "spec/spam_corpus/934.txt.gz", "spec/spam_corpus/936.txt.gz", "spec/spam_corpus/938.txt.gz", "spec/spam_corpus/94.txt.gz", "spec/spam_corpus/940.txt.gz", "spec/spam_corpus/942.txt.gz", "spec/spam_corpus/944.txt.gz", "spec/spam_corpus/946.txt.gz", "spec/spam_corpus/948.txt.gz", "spec/spam_corpus/950.txt.gz", "spec/spam_corpus/952.txt.gz", "spec/spam_corpus/954.txt.gz", "spec/spam_corpus/956.txt.gz", "spec/spam_corpus/958.txt.gz", "spec/spam_corpus/96.txt.gz", "spec/spam_corpus/960.txt.gz", "spec/spam_corpus/962.txt.gz", "spec/spam_corpus/964.txt.gz", "spec/spam_corpus/966.txt.gz", "spec/spam_corpus/968.txt.gz", "spec/spam_corpus/970.txt.gz", "spec/spam_corpus/972.txt.gz", "spec/spam_corpus/974.txt.gz", "spec/spam_corpus/98.txt.gz", "spec/spam_corpus/debugyouradd.com.txt.gz", 
"spec/spam_corpus/humandesignconsulting.comm.txt.gz", "spec/spam_corpus_spec.rb", "spec/spec.opts", "spec/spec_helper.rb", "tasks/rspec.rake"]
|
15
|
+
s.files = [".rspec", ".rvmrc", "Gemfile", "Gemfile.lock", "History.txt", "Manifest.txt", "PostInstall.txt", "README.rdoc", "Rakefile", "despamilator.gemspec", "lib/despamilator.rb", "lib/despamilator/filter.rb", "lib/despamilator/filter/funky_consonant.rb", "lib/despamilator/filter/html_tags.rb", "lib/despamilator/filter/ip_address_url.rb", "lib/despamilator/filter/long_words.rb", "lib/despamilator/filter/naughty_q.rb", "lib/despamilator/filter/naughty_words.rb", "lib/despamilator/filter/numbers_and_words.rb", "lib/despamilator/filter/script_tag.rb", "lib/despamilator/filter/shouting.rb", "lib/despamilator/filter/square_brackets.rb", "lib/despamilator/filter/urls.rb", "lib/despamilator/filter_base.rb", "scripts/despamilator_score.rb", "spec/despamilator_spec.rb", "spec/filter_base_spec.rb", "spec/filters/funky_consonant_spec.rb", "spec/filters/html_tags_spec.rb", "spec/filters/ip_address_url_spec.rb", "spec/filters/long_words_spec.rb", "spec/filters/naughty_q_spec.rb", "spec/filters/naughty_words_spec.rb", "spec/filters/numbers_and_words_spec.rb", "spec/filters/script_tag_spec.rb", "spec/filters/shouting_spec.rb", "spec/filters/square_brackets_spec.rb", "spec/filters/urls_spec.rb", "spec/helpers/corpus_helper.rb", "spec/helpers/filter_helper.rb", "spec/helpers/spec_helper.rb", "tasks/test.rake"]
|
16
16
|
s.homepage = %q{http://github.com/moowahaha/despamilator}
|
17
17
|
s.post_install_message = %q{PostInstall.txt}
|
18
18
|
s.rdoc_options = ["--main", "README.rdoc"]
|
19
19
|
s.require_paths = ["lib"]
|
20
20
|
s.rubyforge_project = %q{despamilator}
|
21
|
-
s.rubygems_version = %q{1.3.
|
21
|
+
s.rubygems_version = %q{1.3.7}
|
22
22
|
s.summary = %q{Despamilator is a plugin based spam detector designed for use on your web forms borne out of two annoyances: Spam being submitted in my web forms and CAPTCHAS being intrusive}
|
23
23
|
|
24
24
|
if s.respond_to? :specification_version then
|
25
25
|
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
26
26
|
s.specification_version = 3
|
27
27
|
|
28
|
-
if Gem::Version.new(Gem::
|
29
|
-
s.add_development_dependency(%q<
|
30
|
-
s.add_development_dependency(%q<hoe>, [">= 2.6.0"])
|
28
|
+
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
29
|
+
s.add_development_dependency(%q<hoe>, [">= 2.7.0"])
|
31
30
|
else
|
32
|
-
s.add_dependency(%q<
|
33
|
-
s.add_dependency(%q<hoe>, [">= 2.6.0"])
|
31
|
+
s.add_dependency(%q<hoe>, [">= 2.7.0"])
|
34
32
|
end
|
35
33
|
else
|
36
|
-
s.add_dependency(%q<
|
37
|
-
s.add_dependency(%q<hoe>, [">= 2.6.0"])
|
34
|
+
s.add_dependency(%q<hoe>, [">= 2.7.0"])
|
38
35
|
end
|
39
36
|
end
|
data/lib/despamilator.rb
CHANGED
@@ -2,18 +2,43 @@ $:.unshift(File.dirname(__FILE__)) unless $:.include?(File.dirname(__FILE__)) ||
|
|
2
2
|
|
3
3
|
require 'despamilator/filter'
|
4
4
|
|
5
|
+
#== SYNOPSIS:
|
6
|
+
#
|
7
|
+
# require 'despamilator'
|
8
|
+
#
|
9
|
+
# # some time later...
|
10
|
+
#
|
11
|
+
# dspam = Despamilator.new('some text with an <h2> tag qthhg')
|
12
|
+
#
|
13
|
+
# dspam.score #=> the total score for this string (1 is normally my threshold)
|
14
|
+
# dspam.matched_by #=> array of matching filters
|
15
|
+
|
5
16
|
class Despamilator
|
6
|
-
VERSION = "0
|
17
|
+
VERSION = "1.0"
|
18
|
+
|
19
|
+
# Constructor. Takes the text you wish to parse and score.
|
7
20
|
|
8
21
|
def initialize text
|
9
22
|
@filters = Despamilator::Filter.new text
|
10
23
|
end
|
11
24
|
|
25
|
+
# Returns the total score as a Float.
|
26
|
+
|
12
27
|
def score
|
13
28
|
@filters.score
|
14
29
|
end
|
15
30
|
|
31
|
+
# Returns an array of filters that have matched and contributed to the score.
|
32
|
+
# Each element is an instance of a subclass of the Despamilator::FilterBase class.
|
33
|
+
|
16
34
|
def matched_by
|
17
35
|
@filters.matches
|
18
36
|
end
|
37
|
+
|
38
|
+
# Generic Test for Unsolicited Bulk Submissions. Similar to SpamAssassin's GTUBE.
|
39
|
+
# A string that will result in a spam score of at least 100. Handy for testing.
|
40
|
+
|
41
|
+
def self.gtubs_test_string
|
42
|
+
'89913b8a065b7092721fe995877e097681683af9d3ab767146d5d6fd050fc0bda7ab99f4232d94a1'
|
43
|
+
end
|
19
44
|
end
|
data/lib/despamilator/filter.rb
CHANGED
@@ -3,46 +3,35 @@ class Despamilator
|
|
3
3
|
attr_accessor :matches, :score
|
4
4
|
|
5
5
|
def initialize text
|
6
|
-
@filters ||= []
|
7
6
|
@matches ||= []
|
8
7
|
@score ||= 0
|
9
|
-
|
10
|
-
run_filters
|
8
|
+
run_filters text
|
11
9
|
end
|
12
10
|
|
13
11
|
private
|
14
12
|
|
15
|
-
def
|
16
|
-
|
17
|
-
filter_name = classify_filename filter_file
|
13
|
+
def run_filters text
|
14
|
+
filter_namespace = Object.const_get('DespamilatorFilter')
|
18
15
|
|
19
|
-
|
20
|
-
|
21
|
-
filter.class_eval(
|
22
|
-
"require 'despamilator/filter_base'\nclass #{filter_name} < Despamilator::FilterBase\n#{filter_code}\nend"
|
23
|
-
)
|
24
|
-
|
25
|
-
@filters.push(filter.const_get(filter_name).new(text.to_s.dup, File.basename(filter_file)))
|
16
|
+
filter_namespace.constants.each do |filter_class|
|
17
|
+
execute_filter(filter_namespace.const_get(filter_class).new, text)
|
26
18
|
end
|
27
19
|
end
|
28
20
|
|
29
|
-
|
30
|
-
|
31
|
-
|
21
|
+
private
|
22
|
+
|
23
|
+
def execute_filter filter, text
|
24
|
+
filter.parse text.dup
|
32
25
|
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
end
|
26
|
+
if filter.matched?
|
27
|
+
@matches.push(filter)
|
28
|
+
@score += filter.score
|
37
29
|
end
|
38
30
|
end
|
39
31
|
|
40
|
-
|
41
|
-
|
42
|
-
File.basename(filename).gsub(/\.rb$/, '').split('_').each do |filename_part|
|
43
|
-
classname += filename_part.capitalize
|
44
|
-
end
|
45
|
-
classname || filename.capitalize
|
32
|
+
Dir.glob(File.join(File.dirname(__FILE__), 'filter', '*.rb')).each do |filter_file|
|
33
|
+
require filter_file
|
46
34
|
end
|
35
|
+
|
47
36
|
end
|
48
37
|
end
|
@@ -1,21 +1,31 @@
|
|
1
|
-
|
2
|
-
'Funky Consonant'
|
3
|
-
end
|
1
|
+
require 'despamilator/filter_base'
|
4
2
|
|
5
|
-
|
6
|
-
|
7
|
-
|
3
|
+
module DespamilatorFilter
|
4
|
+
|
5
|
+
class FunkyConsonant < Despamilator::FilterBase
|
8
6
|
|
9
|
-
def
|
10
|
-
|
7
|
+
def name
|
8
|
+
'Funky Consonant'
|
9
|
+
end
|
11
10
|
|
12
|
-
|
13
|
-
|
14
|
-
self.append_score = 0.05 unless text.scan(/#{combo_pair}/).empty?
|
11
|
+
def description
|
12
|
+
'Detects and scores each occurrence of a consonant next to an unlikely character'
|
15
13
|
end
|
14
|
+
|
15
|
+
def parse text
|
16
|
+
text.downcase!
|
17
|
+
|
18
|
+
consonant_pairs.each do |pair|
|
19
|
+
[pair, pair.reverse].each do |combo_pair|
|
20
|
+
self.append_score = 0.05 unless text.scan(/#{combo_pair}/).empty?
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
def consonant_pairs
|
26
|
+
%w{ zt gb vk vt jk mj dm jm xz bn }
|
27
|
+
end
|
28
|
+
|
16
29
|
end
|
17
|
-
end
|
18
30
|
|
19
|
-
|
20
|
-
%w{ zt gb vk vt jk mj dm jm }
|
21
|
-
end
|
31
|
+
end
|
@@ -1,116 +1,127 @@
|
|
1
|
-
|
2
|
-
html = self.text.downcase
|
1
|
+
require 'despamilator/filter_base'
|
3
2
|
|
4
|
-
|
5
|
-
|
6
|
-
|
3
|
+
module DespamilatorFilter
|
4
|
+
|
5
|
+
class HtmlTags < Despamilator::FilterBase
|
6
|
+
|
7
|
+
def parse text
|
8
|
+
text.downcase!
|
9
|
+
|
10
|
+
html_tags.each do |tag|
|
11
|
+
if text.match(/<\s*#{tag}\W/) || text.match(/<\n*#{tag}\W/) || text.match(/\W#{tag}\s*\//) || text.match(/\W#{tag}\n*\//)
|
12
|
+
self.append_score = 0.3
|
13
|
+
end
|
14
|
+
end
|
7
15
|
end
|
8
|
-
end
|
9
|
-
end
|
10
16
|
|
11
|
-
def name
|
12
|
-
|
13
|
-
end
|
17
|
+
def name
|
18
|
+
'HTML tags'
|
19
|
+
end
|
14
20
|
|
15
|
-
def description
|
16
|
-
|
17
|
-
end
|
21
|
+
def description
|
22
|
+
'Detects HTML tags in text'
|
23
|
+
end
|
24
|
+
|
25
|
+
def html_tags
|
26
|
+
# make sure these are lowercase, in order to save processing
|
27
|
+
[
|
28
|
+
'!--',
|
29
|
+
'!doctype',
|
30
|
+
'a',
|
31
|
+
'abbr',
|
32
|
+
'acronym',
|
33
|
+
'address',
|
34
|
+
'applet',
|
35
|
+
'area',
|
36
|
+
'b',
|
37
|
+
'base',
|
38
|
+
'basefont',
|
39
|
+
'bdo',
|
40
|
+
'big',
|
41
|
+
'blockquote',
|
42
|
+
'body',
|
43
|
+
'br',
|
44
|
+
'button',
|
45
|
+
'caption',
|
46
|
+
'center',
|
47
|
+
'cite',
|
48
|
+
'code',
|
49
|
+
'col',
|
50
|
+
'colgroup',
|
51
|
+
'dd',
|
52
|
+
'del',
|
53
|
+
'dfn',
|
54
|
+
'dir',
|
55
|
+
'div',
|
56
|
+
'dl',
|
57
|
+
'dt',
|
58
|
+
'em',
|
59
|
+
'fieldset',
|
60
|
+
'font',
|
61
|
+
'form',
|
62
|
+
'frame',
|
63
|
+
'frameset',
|
64
|
+
'h1',
|
65
|
+
'h2',
|
66
|
+
'h3',
|
67
|
+
'h4',
|
68
|
+
'h5',
|
69
|
+
'h6',
|
70
|
+
'head',
|
71
|
+
'hr',
|
72
|
+
'html',
|
73
|
+
'i',
|
74
|
+
'iframe',
|
75
|
+
'img',
|
76
|
+
'input',
|
77
|
+
'ins',
|
78
|
+
'isindex',
|
79
|
+
'kbd',
|
80
|
+
'label',
|
81
|
+
'legend',
|
82
|
+
'li',
|
83
|
+
'link',
|
84
|
+
'map',
|
85
|
+
'menu',
|
86
|
+
'meta',
|
87
|
+
'noframes',
|
88
|
+
'noscript',
|
89
|
+
'object',
|
90
|
+
'ol',
|
91
|
+
'optgroup',
|
92
|
+
'option',
|
93
|
+
'p',
|
94
|
+
'param',
|
95
|
+
'pre',
|
96
|
+
'q',
|
97
|
+
's',
|
98
|
+
'samp',
|
99
|
+
'select',
|
100
|
+
'small',
|
101
|
+
'span',
|
102
|
+
'strike',
|
103
|
+
'strong',
|
104
|
+
'style',
|
105
|
+
'sub',
|
106
|
+
'sup',
|
107
|
+
'table',
|
108
|
+
'tbody',
|
109
|
+
'td',
|
110
|
+
'textarea',
|
111
|
+
'tfoot',
|
112
|
+
'th',
|
113
|
+
'thead',
|
114
|
+
'title',
|
115
|
+
'tr',
|
116
|
+
'tt',
|
117
|
+
'u',
|
118
|
+
'ul',
|
119
|
+
'var',
|
120
|
+
'xmp'
|
121
|
+
]
|
122
|
+
|
123
|
+
end
|
18
124
|
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
'!--',
|
23
|
-
'!doctype',
|
24
|
-
'a',
|
25
|
-
'abbr',
|
26
|
-
'acronym',
|
27
|
-
'address',
|
28
|
-
'applet',
|
29
|
-
'area',
|
30
|
-
'b',
|
31
|
-
'base',
|
32
|
-
'basefont',
|
33
|
-
'bdo',
|
34
|
-
'big',
|
35
|
-
'blockquote',
|
36
|
-
'body',
|
37
|
-
'br',
|
38
|
-
'button',
|
39
|
-
'caption',
|
40
|
-
'center',
|
41
|
-
'cite',
|
42
|
-
'code',
|
43
|
-
'col',
|
44
|
-
'colgroup',
|
45
|
-
'dd',
|
46
|
-
'del',
|
47
|
-
'dfn',
|
48
|
-
'dir',
|
49
|
-
'div',
|
50
|
-
'dl',
|
51
|
-
'dt',
|
52
|
-
'em',
|
53
|
-
'fieldset',
|
54
|
-
'font',
|
55
|
-
'form',
|
56
|
-
'frame',
|
57
|
-
'frameset',
|
58
|
-
'h1',
|
59
|
-
'h2',
|
60
|
-
'h3',
|
61
|
-
'h4',
|
62
|
-
'h5',
|
63
|
-
'h6',
|
64
|
-
'head',
|
65
|
-
'hr',
|
66
|
-
'html',
|
67
|
-
'i',
|
68
|
-
'iframe',
|
69
|
-
'img',
|
70
|
-
'input',
|
71
|
-
'ins',
|
72
|
-
'isindex',
|
73
|
-
'kbd',
|
74
|
-
'label',
|
75
|
-
'legend',
|
76
|
-
'li',
|
77
|
-
'link',
|
78
|
-
'map',
|
79
|
-
'menu',
|
80
|
-
'meta',
|
81
|
-
'noframes',
|
82
|
-
'noscript',
|
83
|
-
'object',
|
84
|
-
'ol',
|
85
|
-
'optgroup',
|
86
|
-
'option',
|
87
|
-
'p',
|
88
|
-
'param',
|
89
|
-
'pre',
|
90
|
-
'q',
|
91
|
-
's',
|
92
|
-
'samp',
|
93
|
-
'select',
|
94
|
-
'small',
|
95
|
-
'span',
|
96
|
-
'strike',
|
97
|
-
'strong',
|
98
|
-
'style',
|
99
|
-
'sub',
|
100
|
-
'sup',
|
101
|
-
'table',
|
102
|
-
'tbody',
|
103
|
-
'td',
|
104
|
-
'textarea',
|
105
|
-
'tfoot',
|
106
|
-
'th',
|
107
|
-
'thead',
|
108
|
-
'title',
|
109
|
-
'tr',
|
110
|
-
'tt',
|
111
|
-
'u',
|
112
|
-
'ul',
|
113
|
-
'var',
|
114
|
-
'xmp'
|
115
|
-
]
|
116
|
-
end
|
125
|
+
end
|
126
|
+
|
127
|
+
end
|