despamilator 0.8 → 1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (646) hide show
  1. data/.rspec +3 -0
  2. data/.rvmrc +1 -0
  3. data/Gemfile +12 -0
  4. data/Gemfile.lock +47 -0
  5. data/History.txt +14 -0
  6. data/Manifest.txt +9 -605
  7. data/README.rdoc +37 -37
  8. data/Rakefile +10 -3
  9. data/despamilator.gemspec +8 -11
  10. data/lib/despamilator.rb +26 -1
  11. data/lib/despamilator/filter.rb +15 -26
  12. data/lib/despamilator/filter/funky_consonant.rb +25 -15
  13. data/lib/despamilator/filter/html_tags.rb +122 -111
  14. data/lib/despamilator/filter/ip_address_url.rb +18 -8
  15. data/lib/despamilator/filter/long_words.rb +20 -10
  16. data/lib/despamilator/filter/naughty_q.rb +24 -14
  17. data/lib/despamilator/filter/naughty_words.rb +25 -16
  18. data/lib/despamilator/filter/numbers_and_words.rb +39 -29
  19. data/lib/despamilator/filter/script_tag.rb +18 -10
  20. data/lib/despamilator/filter/shouting.rb +25 -15
  21. data/lib/despamilator/filter/square_brackets.rb +19 -9
  22. data/lib/despamilator/filter/urls.rb +24 -8
  23. data/lib/despamilator/filter_base.rb +54 -9
  24. data/spec/despamilator_spec.rb +0 -2
  25. data/spec/filter_base_spec.rb +30 -0
  26. data/spec/filters/funky_consonant_spec.rb +6 -36
  27. data/spec/filters/html_tags_spec.rb +120 -138
  28. data/spec/filters/ip_address_url_spec.rb +6 -24
  29. data/spec/filters/long_words_spec.rb +6 -29
  30. data/spec/filters/naughty_q_spec.rb +6 -34
  31. data/spec/filters/naughty_words_spec.rb +6 -34
  32. data/spec/filters/numbers_and_words_spec.rb +21 -46
  33. data/spec/filters/script_tag_spec.rb +10 -20
  34. data/spec/filters/shouting_spec.rb +28 -33
  35. data/spec/filters/square_brackets_spec.rb +6 -30
  36. data/spec/filters/urls_spec.rb +6 -34
  37. data/spec/helpers/corpus_helper.rb +5 -0
  38. data/spec/helpers/filter_helper.rb +59 -0
  39. data/spec/helpers/spec_helper.rb +6 -0
  40. data/tasks/test.rake +6 -0
  41. metadata +19 -611
  42. data/lib/despamilator/validation.rb +0 -12
  43. data/spec/clean_corpus/101.txt.gz +0 -0
  44. data/spec/clean_corpus/103.txt.gz +0 -0
  45. data/spec/clean_corpus/105.txt.gz +0 -0
  46. data/spec/clean_corpus/107.txt.gz +0 -0
  47. data/spec/clean_corpus/109.txt.gz +0 -0
  48. data/spec/clean_corpus/111.txt.gz +0 -0
  49. data/spec/clean_corpus/113.txt.gz +0 -0
  50. data/spec/clean_corpus/115.txt.gz +0 -0
  51. data/spec/clean_corpus/117.txt.gz +0 -0
  52. data/spec/clean_corpus/119.txt.gz +0 -0
  53. data/spec/clean_corpus/121.txt.gz +0 -0
  54. data/spec/clean_corpus/123.txt.gz +0 -0
  55. data/spec/clean_corpus/125.txt.gz +0 -0
  56. data/spec/clean_corpus/127.txt.gz +0 -0
  57. data/spec/clean_corpus/129.txt.gz +0 -0
  58. data/spec/clean_corpus/131.txt.gz +0 -0
  59. data/spec/clean_corpus/133.txt.gz +0 -0
  60. data/spec/clean_corpus/135.txt.gz +0 -0
  61. data/spec/clean_corpus/137.txt.gz +0 -0
  62. data/spec/clean_corpus/139.txt.gz +0 -0
  63. data/spec/clean_corpus/141.txt.gz +0 -0
  64. data/spec/clean_corpus/143.txt.gz +0 -0
  65. data/spec/clean_corpus/145.txt.gz +0 -0
  66. data/spec/clean_corpus/147.txt.gz +0 -0
  67. data/spec/clean_corpus/149.txt.gz +0 -0
  68. data/spec/clean_corpus/151.txt.gz +0 -0
  69. data/spec/clean_corpus/153.txt.gz +0 -0
  70. data/spec/clean_corpus/155.txt.gz +0 -0
  71. data/spec/clean_corpus/157.txt.gz +0 -0
  72. data/spec/clean_corpus/159.txt.gz +0 -0
  73. data/spec/clean_corpus/161.txt.gz +0 -0
  74. data/spec/clean_corpus/163.txt.gz +0 -0
  75. data/spec/clean_corpus/165.txt.gz +0 -0
  76. data/spec/clean_corpus/167.txt.gz +0 -0
  77. data/spec/clean_corpus/169.txt.gz +0 -0
  78. data/spec/clean_corpus/171.txt.gz +0 -0
  79. data/spec/clean_corpus/173.txt.gz +0 -0
  80. data/spec/clean_corpus/175.txt.gz +0 -0
  81. data/spec/clean_corpus/177.txt.gz +0 -0
  82. data/spec/clean_corpus/179.txt.gz +0 -0
  83. data/spec/clean_corpus/18.txt.gz +0 -0
  84. data/spec/clean_corpus/181.txt.gz +0 -0
  85. data/spec/clean_corpus/183.txt.gz +0 -0
  86. data/spec/clean_corpus/185.txt.gz +0 -0
  87. data/spec/clean_corpus/187.txt.gz +0 -0
  88. data/spec/clean_corpus/189.txt.gz +0 -0
  89. data/spec/clean_corpus/191.txt.gz +0 -0
  90. data/spec/clean_corpus/193.txt.gz +0 -0
  91. data/spec/clean_corpus/195.txt.gz +0 -0
  92. data/spec/clean_corpus/197.txt.gz +0 -0
  93. data/spec/clean_corpus/199.txt.gz +0 -0
  94. data/spec/clean_corpus/20.txt.gz +0 -0
  95. data/spec/clean_corpus/201.txt.gz +0 -0
  96. data/spec/clean_corpus/203.txt.gz +0 -0
  97. data/spec/clean_corpus/205.txt.gz +0 -0
  98. data/spec/clean_corpus/207.txt.gz +0 -0
  99. data/spec/clean_corpus/209.txt.gz +0 -0
  100. data/spec/clean_corpus/211.txt.gz +0 -0
  101. data/spec/clean_corpus/213.txt.gz +0 -0
  102. data/spec/clean_corpus/215.txt.gz +0 -0
  103. data/spec/clean_corpus/217.txt.gz +0 -0
  104. data/spec/clean_corpus/219.txt.gz +0 -0
  105. data/spec/clean_corpus/22.txt.gz +0 -0
  106. data/spec/clean_corpus/221.txt.gz +0 -0
  107. data/spec/clean_corpus/223.txt.gz +0 -0
  108. data/spec/clean_corpus/225.txt.gz +0 -0
  109. data/spec/clean_corpus/24.txt.gz +0 -0
  110. data/spec/clean_corpus/26.txt.gz +0 -0
  111. data/spec/clean_corpus/27.txt.gz +0 -0
  112. data/spec/clean_corpus/29.txt.gz +0 -0
  113. data/spec/clean_corpus/31.txt.gz +0 -0
  114. data/spec/clean_corpus/33.txt.gz +0 -0
  115. data/spec/clean_corpus/35.txt.gz +0 -0
  116. data/spec/clean_corpus/37.txt.gz +0 -0
  117. data/spec/clean_corpus/39.txt.gz +0 -0
  118. data/spec/clean_corpus/41.txt.gz +0 -0
  119. data/spec/clean_corpus/43.txt.gz +0 -0
  120. data/spec/clean_corpus/45.txt.gz +0 -0
  121. data/spec/clean_corpus/47.txt.gz +0 -0
  122. data/spec/clean_corpus/49.txt.gz +0 -0
  123. data/spec/clean_corpus/51.txt.gz +0 -0
  124. data/spec/clean_corpus/53.txt.gz +0 -0
  125. data/spec/clean_corpus/55.txt.gz +0 -0
  126. data/spec/clean_corpus/57.txt.gz +0 -0
  127. data/spec/clean_corpus/59.txt.gz +0 -0
  128. data/spec/clean_corpus/61.txt.gz +0 -0
  129. data/spec/clean_corpus/63.txt.gz +0 -0
  130. data/spec/clean_corpus/65.txt.gz +0 -0
  131. data/spec/clean_corpus/67.txt.gz +0 -0
  132. data/spec/clean_corpus/69.txt.gz +0 -0
  133. data/spec/clean_corpus/71.txt.gz +0 -0
  134. data/spec/clean_corpus/73.txt.gz +0 -0
  135. data/spec/clean_corpus/75.txt.gz +0 -0
  136. data/spec/clean_corpus/77.txt.gz +0 -0
  137. data/spec/clean_corpus/79.txt.gz +0 -0
  138. data/spec/clean_corpus/81.txt.gz +0 -0
  139. data/spec/clean_corpus/83.txt.gz +0 -0
  140. data/spec/clean_corpus/85.txt.gz +0 -0
  141. data/spec/clean_corpus/87.txt.gz +0 -0
  142. data/spec/clean_corpus/89.txt.gz +0 -0
  143. data/spec/clean_corpus/91.txt.gz +0 -0
  144. data/spec/clean_corpus/93.txt.gz +0 -0
  145. data/spec/clean_corpus/95.txt.gz +0 -0
  146. data/spec/clean_corpus/97.txt.gz +0 -0
  147. data/spec/clean_corpus/99.txt.gz +0 -0
  148. data/spec/clean_corpus_spec.rb +0 -11
  149. data/spec/despamilator_validation_spec.rb +0 -27
  150. data/spec/spam_corpus/0.txt.gz +0 -0
  151. data/spec/spam_corpus/1.txt.gz +0 -0
  152. data/spec/spam_corpus/10.txt.gz +0 -0
  153. data/spec/spam_corpus/100.txt.gz +0 -0
  154. data/spec/spam_corpus/102.txt.gz +0 -0
  155. data/spec/spam_corpus/104.txt.gz +0 -0
  156. data/spec/spam_corpus/106.txt.gz +0 -0
  157. data/spec/spam_corpus/108.txt.gz +0 -0
  158. data/spec/spam_corpus/11.txt.gz +0 -0
  159. data/spec/spam_corpus/110.txt.gz +0 -0
  160. data/spec/spam_corpus/112.txt.gz +0 -0
  161. data/spec/spam_corpus/114.txt.gz +0 -0
  162. data/spec/spam_corpus/116.txt.gz +0 -0
  163. data/spec/spam_corpus/118.txt.gz +0 -0
  164. data/spec/spam_corpus/12.txt.gz +0 -0
  165. data/spec/spam_corpus/120.txt.gz +0 -0
  166. data/spec/spam_corpus/122.txt.gz +0 -0
  167. data/spec/spam_corpus/124.txt.gz +0 -0
  168. data/spec/spam_corpus/126.txt.gz +0 -0
  169. data/spec/spam_corpus/128.txt.gz +0 -0
  170. data/spec/spam_corpus/13.txt.gz +0 -0
  171. data/spec/spam_corpus/130.txt.gz +0 -0
  172. data/spec/spam_corpus/132.txt.gz +0 -0
  173. data/spec/spam_corpus/134.txt.gz +0 -0
  174. data/spec/spam_corpus/136.txt.gz +0 -0
  175. data/spec/spam_corpus/138.txt.gz +0 -0
  176. data/spec/spam_corpus/14.txt.gz +0 -0
  177. data/spec/spam_corpus/140.txt.gz +0 -0
  178. data/spec/spam_corpus/142.txt.gz +0 -0
  179. data/spec/spam_corpus/144.txt.gz +0 -0
  180. data/spec/spam_corpus/146.txt.gz +0 -0
  181. data/spec/spam_corpus/148.txt.gz +0 -0
  182. data/spec/spam_corpus/15.txt.gz +0 -0
  183. data/spec/spam_corpus/150.txt.gz +0 -0
  184. data/spec/spam_corpus/152.txt.gz +0 -0
  185. data/spec/spam_corpus/154.txt.gz +0 -0
  186. data/spec/spam_corpus/156.txt.gz +0 -0
  187. data/spec/spam_corpus/158.txt.gz +0 -0
  188. data/spec/spam_corpus/16.txt.gz +0 -0
  189. data/spec/spam_corpus/160.txt.gz +0 -0
  190. data/spec/spam_corpus/162.txt.gz +0 -0
  191. data/spec/spam_corpus/164.txt.gz +0 -0
  192. data/spec/spam_corpus/166.txt.gz +0 -0
  193. data/spec/spam_corpus/168.txt.gz +0 -0
  194. data/spec/spam_corpus/170.txt.gz +0 -0
  195. data/spec/spam_corpus/172.txt.gz +0 -0
  196. data/spec/spam_corpus/174.txt.gz +0 -0
  197. data/spec/spam_corpus/176.txt.gz +0 -0
  198. data/spec/spam_corpus/178.txt.gz +0 -0
  199. data/spec/spam_corpus/180.txt.gz +0 -0
  200. data/spec/spam_corpus/182.txt.gz +0 -0
  201. data/spec/spam_corpus/184.txt.gz +0 -0
  202. data/spec/spam_corpus/186.txt.gz +0 -0
  203. data/spec/spam_corpus/188.txt.gz +0 -0
  204. data/spec/spam_corpus/190.txt.gz +0 -0
  205. data/spec/spam_corpus/192.txt.gz +0 -0
  206. data/spec/spam_corpus/194.txt.gz +0 -0
  207. data/spec/spam_corpus/196.txt.gz +0 -0
  208. data/spec/spam_corpus/198.txt.gz +0 -0
  209. data/spec/spam_corpus/2.txt.gz +0 -0
  210. data/spec/spam_corpus/200.txt.gz +0 -0
  211. data/spec/spam_corpus/202.txt.gz +0 -0
  212. data/spec/spam_corpus/204.txt.gz +0 -0
  213. data/spec/spam_corpus/206.txt.gz +0 -0
  214. data/spec/spam_corpus/208.txt.gz +0 -0
  215. data/spec/spam_corpus/210.txt.gz +0 -0
  216. data/spec/spam_corpus/212.txt.gz +0 -0
  217. data/spec/spam_corpus/214.txt.gz +0 -0
  218. data/spec/spam_corpus/216.txt.gz +0 -0
  219. data/spec/spam_corpus/218.txt.gz +0 -0
  220. data/spec/spam_corpus/220.txt.gz +0 -0
  221. data/spec/spam_corpus/222.txt.gz +0 -0
  222. data/spec/spam_corpus/224.txt.gz +0 -0
  223. data/spec/spam_corpus/226.txt.gz +0 -0
  224. data/spec/spam_corpus/228.txt.gz +0 -0
  225. data/spec/spam_corpus/230.txt.gz +0 -0
  226. data/spec/spam_corpus/232.txt.gz +0 -0
  227. data/spec/spam_corpus/234.txt.gz +0 -0
  228. data/spec/spam_corpus/236.txt.gz +0 -0
  229. data/spec/spam_corpus/238.txt.gz +0 -0
  230. data/spec/spam_corpus/240.txt.gz +0 -0
  231. data/spec/spam_corpus/242.txt.gz +0 -0
  232. data/spec/spam_corpus/244.txt.gz +0 -0
  233. data/spec/spam_corpus/246.txt.gz +0 -0
  234. data/spec/spam_corpus/248.txt.gz +0 -0
  235. data/spec/spam_corpus/250.txt.gz +0 -0
  236. data/spec/spam_corpus/252.txt.gz +0 -0
  237. data/spec/spam_corpus/254.txt.gz +0 -0
  238. data/spec/spam_corpus/256.txt.gz +0 -0
  239. data/spec/spam_corpus/258.txt.gz +0 -0
  240. data/spec/spam_corpus/260.txt.gz +0 -0
  241. data/spec/spam_corpus/262.txt.gz +0 -0
  242. data/spec/spam_corpus/264.txt.gz +0 -0
  243. data/spec/spam_corpus/266.txt.gz +0 -0
  244. data/spec/spam_corpus/268.txt.gz +0 -0
  245. data/spec/spam_corpus/270.txt.gz +0 -0
  246. data/spec/spam_corpus/272.txt.gz +0 -0
  247. data/spec/spam_corpus/274.txt.gz +0 -0
  248. data/spec/spam_corpus/276.txt.gz +0 -0
  249. data/spec/spam_corpus/278.txt.gz +0 -0
  250. data/spec/spam_corpus/28.txt.gz +0 -0
  251. data/spec/spam_corpus/280.txt.gz +0 -0
  252. data/spec/spam_corpus/282.txt.gz +0 -0
  253. data/spec/spam_corpus/284.txt.gz +0 -0
  254. data/spec/spam_corpus/286.txt.gz +0 -0
  255. data/spec/spam_corpus/288.txt.gz +0 -0
  256. data/spec/spam_corpus/290.txt.gz +0 -0
  257. data/spec/spam_corpus/292.txt.gz +0 -0
  258. data/spec/spam_corpus/294.txt.gz +0 -0
  259. data/spec/spam_corpus/296.txt.gz +0 -0
  260. data/spec/spam_corpus/298.txt.gz +0 -0
  261. data/spec/spam_corpus/3.txt.gz +0 -0
  262. data/spec/spam_corpus/30.txt.gz +0 -0
  263. data/spec/spam_corpus/300.txt.gz +0 -0
  264. data/spec/spam_corpus/302.txt.gz +0 -0
  265. data/spec/spam_corpus/304.txt.gz +0 -0
  266. data/spec/spam_corpus/306.txt.gz +0 -0
  267. data/spec/spam_corpus/308.txt.gz +0 -0
  268. data/spec/spam_corpus/310.txt.gz +0 -0
  269. data/spec/spam_corpus/312.txt.gz +0 -0
  270. data/spec/spam_corpus/314.txt.gz +0 -0
  271. data/spec/spam_corpus/316.txt.gz +0 -0
  272. data/spec/spam_corpus/318.txt.gz +0 -0
  273. data/spec/spam_corpus/32.txt.gz +0 -0
  274. data/spec/spam_corpus/320.txt.gz +0 -0
  275. data/spec/spam_corpus/322.txt.gz +0 -0
  276. data/spec/spam_corpus/324.txt.gz +0 -0
  277. data/spec/spam_corpus/326.txt.gz +0 -0
  278. data/spec/spam_corpus/328.txt.gz +0 -0
  279. data/spec/spam_corpus/330.txt.gz +0 -0
  280. data/spec/spam_corpus/332.txt.gz +0 -0
  281. data/spec/spam_corpus/334.txt.gz +0 -0
  282. data/spec/spam_corpus/336.txt.gz +0 -0
  283. data/spec/spam_corpus/338.txt.gz +0 -0
  284. data/spec/spam_corpus/34.txt.gz +0 -0
  285. data/spec/spam_corpus/340.txt.gz +0 -0
  286. data/spec/spam_corpus/342.txt.gz +0 -0
  287. data/spec/spam_corpus/344.txt.gz +0 -0
  288. data/spec/spam_corpus/346.txt.gz +0 -0
  289. data/spec/spam_corpus/348.txt.gz +0 -0
  290. data/spec/spam_corpus/350.txt.gz +0 -0
  291. data/spec/spam_corpus/352.txt.gz +0 -0
  292. data/spec/spam_corpus/354.txt.gz +0 -0
  293. data/spec/spam_corpus/356.txt.gz +0 -0
  294. data/spec/spam_corpus/358.txt.gz +0 -0
  295. data/spec/spam_corpus/36.txt.gz +0 -0
  296. data/spec/spam_corpus/360.txt.gz +0 -0
  297. data/spec/spam_corpus/362.txt.gz +0 -0
  298. data/spec/spam_corpus/364.txt.gz +0 -0
  299. data/spec/spam_corpus/366.txt.gz +0 -0
  300. data/spec/spam_corpus/368.txt.gz +0 -0
  301. data/spec/spam_corpus/370.txt.gz +0 -0
  302. data/spec/spam_corpus/372.txt.gz +0 -0
  303. data/spec/spam_corpus/374.txt.gz +0 -0
  304. data/spec/spam_corpus/376.txt.gz +0 -0
  305. data/spec/spam_corpus/378.txt.gz +0 -0
  306. data/spec/spam_corpus/38.txt.gz +0 -0
  307. data/spec/spam_corpus/380.txt.gz +0 -0
  308. data/spec/spam_corpus/382.txt.gz +0 -0
  309. data/spec/spam_corpus/384.txt.gz +0 -0
  310. data/spec/spam_corpus/386.txt.gz +0 -0
  311. data/spec/spam_corpus/388.txt.gz +0 -0
  312. data/spec/spam_corpus/390.txt.gz +0 -0
  313. data/spec/spam_corpus/392.txt.gz +0 -0
  314. data/spec/spam_corpus/394.txt.gz +0 -0
  315. data/spec/spam_corpus/396.txt.gz +0 -0
  316. data/spec/spam_corpus/398.txt.gz +0 -0
  317. data/spec/spam_corpus/4.txt.gz +0 -0
  318. data/spec/spam_corpus/40.txt.gz +0 -0
  319. data/spec/spam_corpus/400.txt.gz +0 -0
  320. data/spec/spam_corpus/402.txt.gz +0 -0
  321. data/spec/spam_corpus/404.txt.gz +0 -0
  322. data/spec/spam_corpus/406.txt.gz +0 -0
  323. data/spec/spam_corpus/408.txt.gz +0 -0
  324. data/spec/spam_corpus/410.txt.gz +0 -0
  325. data/spec/spam_corpus/412.txt.gz +0 -0
  326. data/spec/spam_corpus/414.txt.gz +0 -0
  327. data/spec/spam_corpus/416.txt.gz +0 -0
  328. data/spec/spam_corpus/418.txt.gz +0 -0
  329. data/spec/spam_corpus/42.txt.gz +0 -0
  330. data/spec/spam_corpus/420.txt.gz +0 -0
  331. data/spec/spam_corpus/422.txt.gz +0 -0
  332. data/spec/spam_corpus/424.txt.gz +0 -0
  333. data/spec/spam_corpus/426.txt.gz +0 -0
  334. data/spec/spam_corpus/428.txt.gz +0 -0
  335. data/spec/spam_corpus/430.txt.gz +0 -0
  336. data/spec/spam_corpus/432.txt.gz +0 -0
  337. data/spec/spam_corpus/434.txt.gz +0 -0
  338. data/spec/spam_corpus/436.txt.gz +0 -0
  339. data/spec/spam_corpus/438.txt.gz +0 -0
  340. data/spec/spam_corpus/44.txt.gz +0 -0
  341. data/spec/spam_corpus/440.txt.gz +0 -0
  342. data/spec/spam_corpus/442.txt.gz +0 -0
  343. data/spec/spam_corpus/444.txt.gz +0 -0
  344. data/spec/spam_corpus/446.txt.gz +0 -0
  345. data/spec/spam_corpus/448.txt.gz +0 -0
  346. data/spec/spam_corpus/450.txt.gz +0 -0
  347. data/spec/spam_corpus/452.txt.gz +0 -0
  348. data/spec/spam_corpus/454.txt.gz +0 -0
  349. data/spec/spam_corpus/456.txt.gz +0 -0
  350. data/spec/spam_corpus/458.txt.gz +0 -0
  351. data/spec/spam_corpus/46.txt.gz +0 -0
  352. data/spec/spam_corpus/460.txt.gz +0 -0
  353. data/spec/spam_corpus/462.txt.gz +0 -0
  354. data/spec/spam_corpus/464.txt.gz +0 -0
  355. data/spec/spam_corpus/466.txt.gz +0 -0
  356. data/spec/spam_corpus/468.txt.gz +0 -0
  357. data/spec/spam_corpus/470.txt.gz +0 -0
  358. data/spec/spam_corpus/472.txt.gz +0 -0
  359. data/spec/spam_corpus/474.txt.gz +0 -0
  360. data/spec/spam_corpus/476.txt.gz +0 -0
  361. data/spec/spam_corpus/478.txt.gz +0 -0
  362. data/spec/spam_corpus/48.txt.gz +0 -0
  363. data/spec/spam_corpus/480.txt.gz +0 -0
  364. data/spec/spam_corpus/482.txt.gz +0 -0
  365. data/spec/spam_corpus/484.txt.gz +0 -0
  366. data/spec/spam_corpus/486.txt.gz +0 -0
  367. data/spec/spam_corpus/488.txt.gz +0 -0
  368. data/spec/spam_corpus/490.txt.gz +0 -0
  369. data/spec/spam_corpus/492.txt.gz +0 -0
  370. data/spec/spam_corpus/494.txt.gz +0 -0
  371. data/spec/spam_corpus/496.txt.gz +0 -0
  372. data/spec/spam_corpus/498.txt.gz +0 -0
  373. data/spec/spam_corpus/5.txt.gz +0 -0
  374. data/spec/spam_corpus/50.txt.gz +0 -0
  375. data/spec/spam_corpus/500.txt.gz +0 -0
  376. data/spec/spam_corpus/502.txt.gz +0 -0
  377. data/spec/spam_corpus/504.txt.gz +0 -0
  378. data/spec/spam_corpus/506.txt.gz +0 -0
  379. data/spec/spam_corpus/508.txt.gz +0 -0
  380. data/spec/spam_corpus/510.txt.gz +0 -0
  381. data/spec/spam_corpus/512.txt.gz +0 -0
  382. data/spec/spam_corpus/514.txt.gz +0 -0
  383. data/spec/spam_corpus/516.txt.gz +0 -0
  384. data/spec/spam_corpus/518.txt.gz +0 -0
  385. data/spec/spam_corpus/52.txt.gz +0 -0
  386. data/spec/spam_corpus/520.txt.gz +0 -0
  387. data/spec/spam_corpus/522.txt.gz +0 -0
  388. data/spec/spam_corpus/524.txt.gz +0 -0
  389. data/spec/spam_corpus/526.txt.gz +0 -0
  390. data/spec/spam_corpus/528.txt.gz +0 -0
  391. data/spec/spam_corpus/530.txt.gz +0 -0
  392. data/spec/spam_corpus/532.txt.gz +0 -0
  393. data/spec/spam_corpus/534.txt.gz +0 -0
  394. data/spec/spam_corpus/536.txt.gz +0 -0
  395. data/spec/spam_corpus/538.txt.gz +0 -0
  396. data/spec/spam_corpus/54.txt.gz +0 -0
  397. data/spec/spam_corpus/540.txt.gz +0 -0
  398. data/spec/spam_corpus/542.txt.gz +0 -0
  399. data/spec/spam_corpus/544.txt.gz +0 -0
  400. data/spec/spam_corpus/546.txt.gz +0 -0
  401. data/spec/spam_corpus/548.txt.gz +0 -0
  402. data/spec/spam_corpus/550.txt.gz +0 -0
  403. data/spec/spam_corpus/552.txt.gz +0 -0
  404. data/spec/spam_corpus/554.txt.gz +0 -0
  405. data/spec/spam_corpus/556.txt.gz +0 -0
  406. data/spec/spam_corpus/558.txt.gz +0 -0
  407. data/spec/spam_corpus/56.txt.gz +0 -0
  408. data/spec/spam_corpus/560.txt.gz +0 -0
  409. data/spec/spam_corpus/562.txt.gz +0 -0
  410. data/spec/spam_corpus/564.txt.gz +0 -0
  411. data/spec/spam_corpus/566.txt.gz +0 -0
  412. data/spec/spam_corpus/568.txt.gz +0 -0
  413. data/spec/spam_corpus/570.txt.gz +0 -0
  414. data/spec/spam_corpus/572.txt.gz +0 -0
  415. data/spec/spam_corpus/574.txt.gz +0 -0
  416. data/spec/spam_corpus/576.txt.gz +0 -0
  417. data/spec/spam_corpus/578.txt.gz +0 -0
  418. data/spec/spam_corpus/58.txt.gz +0 -0
  419. data/spec/spam_corpus/580.txt.gz +0 -0
  420. data/spec/spam_corpus/582.txt.gz +0 -0
  421. data/spec/spam_corpus/584.txt.gz +0 -0
  422. data/spec/spam_corpus/586.txt.gz +0 -0
  423. data/spec/spam_corpus/588.txt.gz +0 -0
  424. data/spec/spam_corpus/590.txt.gz +0 -0
  425. data/spec/spam_corpus/592.txt.gz +0 -0
  426. data/spec/spam_corpus/594.txt.gz +0 -0
  427. data/spec/spam_corpus/596.txt.gz +0 -0
  428. data/spec/spam_corpus/598.txt.gz +0 -0
  429. data/spec/spam_corpus/6.txt.gz +0 -0
  430. data/spec/spam_corpus/60.txt.gz +0 -0
  431. data/spec/spam_corpus/600.txt.gz +0 -0
  432. data/spec/spam_corpus/602.txt.gz +0 -0
  433. data/spec/spam_corpus/604.txt.gz +0 -0
  434. data/spec/spam_corpus/606.txt.gz +0 -0
  435. data/spec/spam_corpus/608.txt.gz +0 -0
  436. data/spec/spam_corpus/610.txt.gz +0 -0
  437. data/spec/spam_corpus/612.txt.gz +0 -0
  438. data/spec/spam_corpus/614.txt.gz +0 -0
  439. data/spec/spam_corpus/616.txt.gz +0 -0
  440. data/spec/spam_corpus/618.txt.gz +0 -0
  441. data/spec/spam_corpus/62.txt.gz +0 -0
  442. data/spec/spam_corpus/620.txt.gz +0 -0
  443. data/spec/spam_corpus/622.txt.gz +0 -0
  444. data/spec/spam_corpus/624.txt.gz +0 -0
  445. data/spec/spam_corpus/626.txt.gz +0 -0
  446. data/spec/spam_corpus/628.txt.gz +0 -0
  447. data/spec/spam_corpus/630.txt.gz +0 -0
  448. data/spec/spam_corpus/632.txt.gz +0 -0
  449. data/spec/spam_corpus/634.txt.gz +0 -0
  450. data/spec/spam_corpus/636.txt.gz +0 -0
  451. data/spec/spam_corpus/638.txt.gz +0 -0
  452. data/spec/spam_corpus/64.txt.gz +0 -0
  453. data/spec/spam_corpus/640.txt.gz +0 -0
  454. data/spec/spam_corpus/642.txt.gz +0 -0
  455. data/spec/spam_corpus/644.txt.gz +0 -0
  456. data/spec/spam_corpus/646.txt.gz +0 -0
  457. data/spec/spam_corpus/648.txt.gz +0 -0
  458. data/spec/spam_corpus/650.txt.gz +0 -0
  459. data/spec/spam_corpus/652.txt.gz +0 -0
  460. data/spec/spam_corpus/654.txt.gz +0 -0
  461. data/spec/spam_corpus/656.txt.gz +0 -0
  462. data/spec/spam_corpus/658.txt.gz +0 -0
  463. data/spec/spam_corpus/66.txt.gz +0 -0
  464. data/spec/spam_corpus/660.txt.gz +0 -0
  465. data/spec/spam_corpus/662.txt.gz +0 -0
  466. data/spec/spam_corpus/664.txt.gz +0 -0
  467. data/spec/spam_corpus/666.txt.gz +0 -0
  468. data/spec/spam_corpus/668.txt.gz +0 -0
  469. data/spec/spam_corpus/670.txt.gz +0 -0
  470. data/spec/spam_corpus/672.txt.gz +0 -0
  471. data/spec/spam_corpus/674.txt.gz +0 -0
  472. data/spec/spam_corpus/676.txt.gz +0 -0
  473. data/spec/spam_corpus/678.txt.gz +0 -0
  474. data/spec/spam_corpus/68.txt.gz +0 -0
  475. data/spec/spam_corpus/680.txt.gz +0 -0
  476. data/spec/spam_corpus/682.txt.gz +0 -0
  477. data/spec/spam_corpus/684.txt.gz +0 -0
  478. data/spec/spam_corpus/686.txt.gz +0 -0
  479. data/spec/spam_corpus/688.txt.gz +0 -0
  480. data/spec/spam_corpus/690.txt.gz +0 -0
  481. data/spec/spam_corpus/692.txt.gz +0 -0
  482. data/spec/spam_corpus/694.txt.gz +0 -0
  483. data/spec/spam_corpus/696.txt.gz +0 -0
  484. data/spec/spam_corpus/698.txt.gz +0 -0
  485. data/spec/spam_corpus/7.txt.gz +0 -0
  486. data/spec/spam_corpus/70.txt.gz +0 -0
  487. data/spec/spam_corpus/700.txt.gz +0 -0
  488. data/spec/spam_corpus/702.txt.gz +0 -0
  489. data/spec/spam_corpus/704.txt.gz +0 -0
  490. data/spec/spam_corpus/706.txt.gz +0 -0
  491. data/spec/spam_corpus/708.txt.gz +0 -0
  492. data/spec/spam_corpus/710.txt.gz +0 -0
  493. data/spec/spam_corpus/712.txt.gz +0 -0
  494. data/spec/spam_corpus/714.txt.gz +0 -0
  495. data/spec/spam_corpus/716.txt.gz +0 -0
  496. data/spec/spam_corpus/718.txt.gz +0 -0
  497. data/spec/spam_corpus/72.txt.gz +0 -0
  498. data/spec/spam_corpus/720.txt.gz +0 -0
  499. data/spec/spam_corpus/722.txt.gz +0 -0
  500. data/spec/spam_corpus/724.txt.gz +0 -0
  501. data/spec/spam_corpus/726.txt.gz +0 -0
  502. data/spec/spam_corpus/728.txt.gz +0 -0
  503. data/spec/spam_corpus/730.txt.gz +0 -0
  504. data/spec/spam_corpus/732.txt.gz +0 -0
  505. data/spec/spam_corpus/734.txt.gz +0 -0
  506. data/spec/spam_corpus/736.txt.gz +0 -0
  507. data/spec/spam_corpus/738.txt.gz +0 -0
  508. data/spec/spam_corpus/74.txt.gz +0 -0
  509. data/spec/spam_corpus/740.txt.gz +0 -0
  510. data/spec/spam_corpus/742.txt.gz +0 -0
  511. data/spec/spam_corpus/744.txt.gz +0 -0
  512. data/spec/spam_corpus/746.txt.gz +0 -0
  513. data/spec/spam_corpus/748.txt.gz +0 -0
  514. data/spec/spam_corpus/750.txt.gz +0 -0
  515. data/spec/spam_corpus/752.txt.gz +0 -0
  516. data/spec/spam_corpus/754.txt.gz +0 -0
  517. data/spec/spam_corpus/756.txt.gz +0 -0
  518. data/spec/spam_corpus/758.txt.gz +0 -0
  519. data/spec/spam_corpus/76.txt.gz +0 -0
  520. data/spec/spam_corpus/760.txt.gz +0 -0
  521. data/spec/spam_corpus/762.txt.gz +0 -0
  522. data/spec/spam_corpus/764.txt.gz +0 -0
  523. data/spec/spam_corpus/766.txt.gz +0 -0
  524. data/spec/spam_corpus/768.txt.gz +0 -0
  525. data/spec/spam_corpus/770.txt.gz +0 -0
  526. data/spec/spam_corpus/772.txt.gz +0 -0
  527. data/spec/spam_corpus/774.txt.gz +0 -0
  528. data/spec/spam_corpus/776.txt.gz +0 -0
  529. data/spec/spam_corpus/778.txt.gz +0 -0
  530. data/spec/spam_corpus/78.txt.gz +0 -0
  531. data/spec/spam_corpus/780.txt.gz +0 -0
  532. data/spec/spam_corpus/782.txt.gz +0 -0
  533. data/spec/spam_corpus/784.txt.gz +0 -0
  534. data/spec/spam_corpus/786.txt.gz +0 -0
  535. data/spec/spam_corpus/788.txt.gz +0 -0
  536. data/spec/spam_corpus/790.txt.gz +0 -0
  537. data/spec/spam_corpus/792.txt.gz +0 -0
  538. data/spec/spam_corpus/794.txt.gz +0 -0
  539. data/spec/spam_corpus/796.txt.gz +0 -0
  540. data/spec/spam_corpus/798.txt.gz +0 -0
  541. data/spec/spam_corpus/8.txt.gz +0 -0
  542. data/spec/spam_corpus/80.txt.gz +0 -0
  543. data/spec/spam_corpus/800.txt.gz +0 -0
  544. data/spec/spam_corpus/802.txt.gz +0 -0
  545. data/spec/spam_corpus/804.txt.gz +0 -0
  546. data/spec/spam_corpus/806.txt.gz +0 -0
  547. data/spec/spam_corpus/808.txt.gz +0 -0
  548. data/spec/spam_corpus/810.txt.gz +0 -0
  549. data/spec/spam_corpus/812.txt.gz +0 -0
  550. data/spec/spam_corpus/814.txt.gz +0 -0
  551. data/spec/spam_corpus/816.txt.gz +0 -0
  552. data/spec/spam_corpus/818.txt.gz +0 -0
  553. data/spec/spam_corpus/82.txt.gz +0 -0
  554. data/spec/spam_corpus/820.txt.gz +0 -0
  555. data/spec/spam_corpus/822.txt.gz +0 -0
  556. data/spec/spam_corpus/824.txt.gz +0 -0
  557. data/spec/spam_corpus/826.txt.gz +0 -0
  558. data/spec/spam_corpus/828.txt.gz +0 -0
  559. data/spec/spam_corpus/830.txt.gz +0 -0
  560. data/spec/spam_corpus/832.txt.gz +0 -0
  561. data/spec/spam_corpus/834.txt.gz +0 -0
  562. data/spec/spam_corpus/836.txt.gz +0 -0
  563. data/spec/spam_corpus/838.txt.gz +0 -0
  564. data/spec/spam_corpus/84.txt.gz +0 -0
  565. data/spec/spam_corpus/840.txt.gz +0 -0
  566. data/spec/spam_corpus/842.txt.gz +0 -0
  567. data/spec/spam_corpus/844.txt.gz +0 -0
  568. data/spec/spam_corpus/846.txt.gz +0 -0
  569. data/spec/spam_corpus/848.txt.gz +0 -0
  570. data/spec/spam_corpus/850.txt.gz +0 -0
  571. data/spec/spam_corpus/852.txt.gz +0 -0
  572. data/spec/spam_corpus/854.txt.gz +0 -0
  573. data/spec/spam_corpus/856.txt.gz +0 -0
  574. data/spec/spam_corpus/858.txt.gz +0 -0
  575. data/spec/spam_corpus/86.txt.gz +0 -0
  576. data/spec/spam_corpus/860.txt.gz +0 -0
  577. data/spec/spam_corpus/862.txt.gz +0 -0
  578. data/spec/spam_corpus/864.txt.gz +0 -0
  579. data/spec/spam_corpus/866.txt.gz +0 -0
  580. data/spec/spam_corpus/868.txt.gz +0 -0
  581. data/spec/spam_corpus/870.txt.gz +0 -0
  582. data/spec/spam_corpus/872.txt.gz +0 -0
  583. data/spec/spam_corpus/874.txt.gz +0 -0
  584. data/spec/spam_corpus/876.txt.gz +0 -0
  585. data/spec/spam_corpus/878.txt.gz +0 -0
  586. data/spec/spam_corpus/88.txt.gz +0 -0
  587. data/spec/spam_corpus/880.txt.gz +0 -0
  588. data/spec/spam_corpus/882.txt.gz +0 -0
  589. data/spec/spam_corpus/884.txt.gz +0 -0
  590. data/spec/spam_corpus/886.txt.gz +0 -0
  591. data/spec/spam_corpus/888.txt.gz +0 -0
  592. data/spec/spam_corpus/890.txt.gz +0 -0
  593. data/spec/spam_corpus/892.txt.gz +0 -0
  594. data/spec/spam_corpus/894.txt.gz +0 -0
  595. data/spec/spam_corpus/896.txt.gz +0 -0
  596. data/spec/spam_corpus/898.txt.gz +0 -0
  597. data/spec/spam_corpus/9.txt.gz +0 -0
  598. data/spec/spam_corpus/90.txt.gz +0 -0
  599. data/spec/spam_corpus/900.txt.gz +0 -0
  600. data/spec/spam_corpus/902.txt.gz +0 -0
  601. data/spec/spam_corpus/904.txt.gz +0 -0
  602. data/spec/spam_corpus/906.txt.gz +0 -0
  603. data/spec/spam_corpus/908.txt.gz +0 -0
  604. data/spec/spam_corpus/910.txt.gz +0 -0
  605. data/spec/spam_corpus/912.txt.gz +0 -0
  606. data/spec/spam_corpus/914.txt.gz +0 -0
  607. data/spec/spam_corpus/916.txt.gz +0 -0
  608. data/spec/spam_corpus/918.txt.gz +0 -0
  609. data/spec/spam_corpus/92.txt.gz +0 -0
  610. data/spec/spam_corpus/920.txt.gz +0 -0
  611. data/spec/spam_corpus/922.txt.gz +0 -0
  612. data/spec/spam_corpus/924.txt.gz +0 -0
  613. data/spec/spam_corpus/926.txt.gz +0 -0
  614. data/spec/spam_corpus/928.txt.gz +0 -0
  615. data/spec/spam_corpus/930.txt.gz +0 -0
  616. data/spec/spam_corpus/932.txt.gz +0 -0
  617. data/spec/spam_corpus/934.txt.gz +0 -0
  618. data/spec/spam_corpus/936.txt.gz +0 -0
  619. data/spec/spam_corpus/938.txt.gz +0 -0
  620. data/spec/spam_corpus/94.txt.gz +0 -0
  621. data/spec/spam_corpus/940.txt.gz +0 -0
  622. data/spec/spam_corpus/942.txt.gz +0 -0
  623. data/spec/spam_corpus/944.txt.gz +0 -0
  624. data/spec/spam_corpus/946.txt.gz +0 -0
  625. data/spec/spam_corpus/948.txt.gz +0 -0
  626. data/spec/spam_corpus/950.txt.gz +0 -0
  627. data/spec/spam_corpus/952.txt.gz +0 -0
  628. data/spec/spam_corpus/954.txt.gz +0 -0
  629. data/spec/spam_corpus/956.txt.gz +0 -0
  630. data/spec/spam_corpus/958.txt.gz +0 -0
  631. data/spec/spam_corpus/96.txt.gz +0 -0
  632. data/spec/spam_corpus/960.txt.gz +0 -0
  633. data/spec/spam_corpus/962.txt.gz +0 -0
  634. data/spec/spam_corpus/964.txt.gz +0 -0
  635. data/spec/spam_corpus/966.txt.gz +0 -0
  636. data/spec/spam_corpus/968.txt.gz +0 -0
  637. data/spec/spam_corpus/970.txt.gz +0 -0
  638. data/spec/spam_corpus/972.txt.gz +0 -0
  639. data/spec/spam_corpus/974.txt.gz +0 -0
  640. data/spec/spam_corpus/98.txt.gz +0 -0
  641. data/spec/spam_corpus/debugyouradd.com.txt.gz +0 -0
  642. data/spec/spam_corpus/humandesignconsulting.comm.txt.gz +0 -0
  643. data/spec/spam_corpus_spec.rb +0 -11
  644. data/spec/spec.opts +0 -1
  645. data/spec/spec_helper.rb +0 -16
  646. data/tasks/rspec.rake +0 -21
data/README.rdoc CHANGED
@@ -10,13 +10,15 @@ some commonly used heuristics from the world of anti-spam to help you decide whe
10
10
 
11
11
  == FEATURES/PROBLEMS:
12
12
 
13
- * Added sexy rails validation!! :D
13
+ * Moved the Rails-esque validation to the despamilator-rails gem.
14
14
 
15
15
  == SYNOPSIS:
16
16
 
17
- # using Despamilator
18
- require 'rubygems'
19
- require 'desplamilator'
17
+ Using Despamilator:
18
+
19
+ require 'despamilator'
20
+
21
+ # some time later...
20
22
 
21
23
  dspam = Despamilator.new('some text with an <h2> tag qthhg')
22
24
 
@@ -27,35 +29,6 @@ some commonly used heuristics from the world of anti-spam to help you decide whe
27
29
  first_match.description #=> some string to describe
28
30
  first_match.score #=> the individual score assigned by this filter
29
31
 
30
- # adding a new filter! example: detecting the letter "a"
31
- # put the following code in lib/despamilator/filter/detect_letter_a.rb
32
- def name
33
- 'Detecting the letter A'
34
- end
35
-
36
- def description
37
- 'Detects the letter "a" in a string for no reason other than a demo'
38
- end
39
-
40
- def parse
41
- if self.text.downcase.scan(/a/)
42
- # add 0.1 to the score of the text
43
- self.append_score = 0.1
44
- end
45
- end
46
-
47
- == RAILS SYNOPSIS:
48
-
49
- # in your environment.rb
50
- require 'despamilator/validation'
51
-
52
- # in your model
53
- include DespamilatorValidation
54
-
55
- def validate
56
- validates_despamilation_of :text, :threshold => 1
57
- end
58
-
59
32
  == FILTERING:
60
33
 
61
34
  As stated, this is a heuristic scanner so it's up to the user to decide the thresholds of the scanner. I usually
@@ -76,17 +49,44 @@ They should always supply the following methods:
76
49
 
77
50
  * name #=> the name of your filter.
78
51
  * description #=> what your filter will look for.
79
- * parse #=> the method that will be called when parsing.
52
+ * parse(text) #=> the method that will be called when parsing. A copy of the message is passed in.
80
53
 
81
54
  Alongside the above, the following methods are made available to each filter:
82
55
 
83
- * text #=> a copy of the text your parser will parse
84
56
  * append_score= #=> method to append a score to the text if there are matches in your parser.
85
57
  * matched? #=> whether or not any filter has so far detected something suspect
86
58
  * score #=> the current score assigned to the text
87
59
 
88
- spec tests are an absolute must!
60
+ Take a look at the "naughty_q" code and tests in "spec/filters/naughty_q_spec.rb".
61
+
62
+ ==== Example Filter:
63
+
64
+ This example is to detect the letter "a". Put the code in
65
+ lib/despamilator/filter/detect_letter_a.rb:
66
+
67
+ require 'despamilator/filter_base'
68
+
69
+ module DespamilatorFilter
70
+
71
+ class DetectLetterA < Despamilator::FilterBase
72
+
73
+ def name
74
+ 'Detecting the letter A'
75
+ end
76
+
77
+ def description
78
+ 'Detects the letter "a" in a string for no reason other than a demo'
79
+ end
80
+
81
+ def parse text
82
+ if text.downcase.scan(/a/)
83
+ # add 0.1 to the score of the text
84
+ self.append_score = 0.1
85
+ end
86
+ end
87
+ end
89
88
 
89
+ As previously stated, ensure you put a spec test together as well!
90
90
 
91
91
  == REQUIREMENTS:
92
92
 
@@ -100,7 +100,7 @@ spec tests are an absolute must!
100
100
 
101
101
  == LICENSE:
102
102
 
103
- Copyright (c) 2010 Stephen Hardisty
103
+ Copyright (c) 2011 Stephen Hardisty
104
104
 
105
105
  Permission is hereby granted, free of charge, to any person obtaining
106
106
  a copy of this software and associated documentation files (the
data/Rakefile CHANGED
@@ -27,7 +27,14 @@ task :test => [:spec]
27
27
  task :default => [:test]
28
28
  task :install => [:install_gem]
29
29
 
30
+ desc 'Generate relevant documentation.'
31
+ task :rdoc do
32
+ sh 'rdoc lib/despamilator.rb lib/despamilator/filter_base.rb'
33
+ end
34
+
30
35
  task :cultivate do
31
- system "touch Manifest.txt; rake check_manifest | grep -v \"(in \" | patch"
32
- system "rake debug_gem | grep -v \"(in \" > `basename \\`pwd\\``.gemspec |grep -v _spec.rb |grep -v _corpus"
33
- end
36
+ sh "touch Manifest.txt; rake check_manifest |grep -v \"(in \" | patch"
37
+ sh "cat Manifest.txt | grep -v 'bundle/config' | grep -v '_corpus' > Manifest.txt2"
38
+ sh "mv Manifest.txt2 Manifest.txt"
39
+ sh "rake debug_gem | grep -v \"(in \" > `basename \\`pwd\\``.gemspec"
40
+ end
data/despamilator.gemspec CHANGED
@@ -2,38 +2,35 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = %q{despamilator}
5
- s.version = "0.7"
5
+ s.version = "1.0"
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = ["Stephen Hardisty"]
9
- s.date = %q{2010-09-01}
9
+ s.date = %q{2010-12-11}
10
10
  s.description = %q{Despamilator is a plugin based spam detector designed for use on your web forms borne out of two annoyances:
11
11
  Spam being submitted in my web forms and CAPTCHAS being intrusive. Despamilator will apply
12
12
  some commonly used heuristics from the world of anti-spam to help you decide whether your users are human or machine.}
13
13
  s.email = ["moowahaha@hotmail.com"]
14
14
  s.extra_rdoc_files = ["History.txt", "Manifest.txt", "PostInstall.txt"]
15
- s.files = ["History.txt", "Manifest.txt", "PostInstall.txt", "README.rdoc", "Rakefile", "despamilator.gemspec", "lib/despamilator.rb", "lib/despamilator/filter.rb", "lib/despamilator/filter/funky_consonant.rb", "lib/despamilator/filter/html_tags.rb", "lib/despamilator/filter/ip_address_url.rb", "lib/despamilator/filter/long_words.rb", "lib/despamilator/filter/naughty_q.rb", "lib/despamilator/filter/naughty_words.rb", "lib/despamilator/filter/numbers_and_words.rb", "lib/despamilator/filter/script_tag.rb", "lib/despamilator/filter/shouting.rb", "lib/despamilator/filter/square_brackets.rb", "lib/despamilator/filter/urls.rb", "lib/despamilator/filter_base.rb", "lib/despamilator/validation.rb", "scripts/despamilator_score.rb", "spec/clean_corpus/101.txt.gz", "spec/clean_corpus/103.txt.gz", "spec/clean_corpus/105.txt.gz", "spec/clean_corpus/107.txt.gz", "spec/clean_corpus/109.txt.gz", "spec/clean_corpus/111.txt.gz", "spec/clean_corpus/113.txt.gz", "spec/clean_corpus/115.txt.gz", "spec/clean_corpus/117.txt.gz", "spec/clean_corpus/119.txt.gz", "spec/clean_corpus/121.txt.gz", "spec/clean_corpus/123.txt.gz", "spec/clean_corpus/125.txt.gz", "spec/clean_corpus/127.txt.gz", "spec/clean_corpus/129.txt.gz", "spec/clean_corpus/131.txt.gz", "spec/clean_corpus/133.txt.gz", "spec/clean_corpus/135.txt.gz", "spec/clean_corpus/137.txt.gz", "spec/clean_corpus/139.txt.gz", "spec/clean_corpus/141.txt.gz", "spec/clean_corpus/143.txt.gz", "spec/clean_corpus/145.txt.gz", "spec/clean_corpus/147.txt.gz", "spec/clean_corpus/149.txt.gz", "spec/clean_corpus/151.txt.gz", "spec/clean_corpus/153.txt.gz", "spec/clean_corpus/155.txt.gz", "spec/clean_corpus/157.txt.gz", "spec/clean_corpus/159.txt.gz", "spec/clean_corpus/161.txt.gz", "spec/clean_corpus/163.txt.gz", "spec/clean_corpus/165.txt.gz", "spec/clean_corpus/167.txt.gz", "spec/clean_corpus/169.txt.gz", "spec/clean_corpus/171.txt.gz", "spec/clean_corpus/173.txt.gz", "spec/clean_corpus/175.txt.gz", "spec/clean_corpus/177.txt.gz", 
"spec/clean_corpus/179.txt.gz", "spec/clean_corpus/18.txt.gz", "spec/clean_corpus/181.txt.gz", "spec/clean_corpus/183.txt.gz", "spec/clean_corpus/185.txt.gz", "spec/clean_corpus/187.txt.gz", "spec/clean_corpus/189.txt.gz", "spec/clean_corpus/191.txt.gz", "spec/clean_corpus/193.txt.gz", "spec/clean_corpus/195.txt.gz", "spec/clean_corpus/197.txt.gz", "spec/clean_corpus/199.txt.gz", "spec/clean_corpus/20.txt.gz", "spec/clean_corpus/201.txt.gz", "spec/clean_corpus/203.txt.gz", "spec/clean_corpus/205.txt.gz", "spec/clean_corpus/207.txt.gz", "spec/clean_corpus/209.txt.gz", "spec/clean_corpus/211.txt.gz", "spec/clean_corpus/213.txt.gz", "spec/clean_corpus/215.txt.gz", "spec/clean_corpus/217.txt.gz", "spec/clean_corpus/219.txt.gz", "spec/clean_corpus/22.txt.gz", "spec/clean_corpus/221.txt.gz", "spec/clean_corpus/223.txt.gz", "spec/clean_corpus/225.txt.gz", "spec/clean_corpus/24.txt.gz", "spec/clean_corpus/26.txt.gz", "spec/clean_corpus/27.txt.gz", "spec/clean_corpus/29.txt.gz", "spec/clean_corpus/31.txt.gz", "spec/clean_corpus/33.txt.gz", "spec/clean_corpus/35.txt.gz", "spec/clean_corpus/37.txt.gz", "spec/clean_corpus/39.txt.gz", "spec/clean_corpus/41.txt.gz", "spec/clean_corpus/43.txt.gz", "spec/clean_corpus/45.txt.gz", "spec/clean_corpus/47.txt.gz", "spec/clean_corpus/49.txt.gz", "spec/clean_corpus/51.txt.gz", "spec/clean_corpus/53.txt.gz", "spec/clean_corpus/55.txt.gz", "spec/clean_corpus/57.txt.gz", "spec/clean_corpus/59.txt.gz", "spec/clean_corpus/61.txt.gz", "spec/clean_corpus/63.txt.gz", "spec/clean_corpus/65.txt.gz", "spec/clean_corpus/67.txt.gz", "spec/clean_corpus/69.txt.gz", "spec/clean_corpus/71.txt.gz", "spec/clean_corpus/73.txt.gz", "spec/clean_corpus/75.txt.gz", "spec/clean_corpus/77.txt.gz", "spec/clean_corpus/79.txt.gz", "spec/clean_corpus/81.txt.gz", "spec/clean_corpus/83.txt.gz", "spec/clean_corpus/85.txt.gz", "spec/clean_corpus/87.txt.gz", "spec/clean_corpus/89.txt.gz", "spec/clean_corpus/91.txt.gz", "spec/clean_corpus/93.txt.gz", 
"spec/clean_corpus/95.txt.gz", "spec/clean_corpus/97.txt.gz", "spec/clean_corpus/99.txt.gz", "spec/clean_corpus_spec.rb", "spec/despamilator_spec.rb", "spec/despamilator_validation_spec.rb", "spec/filters/funky_consonant_spec.rb", "spec/filters/html_tags_spec.rb", "spec/filters/ip_address_url_spec.rb", "spec/filters/long_words_spec.rb", "spec/filters/naughty_q_spec.rb", "spec/filters/naughty_words_spec.rb", "spec/filters/numbers_and_words_spec.rb", "spec/filters/script_tag_spec.rb", "spec/filters/shouting_spec.rb", "spec/filters/square_brackets_spec.rb", "spec/filters/urls_spec.rb", "spec/spam_corpus/0.txt.gz", "spec/spam_corpus/1.txt.gz", "spec/spam_corpus/10.txt.gz", "spec/spam_corpus/100.txt.gz", "spec/spam_corpus/102.txt.gz", "spec/spam_corpus/104.txt.gz", "spec/spam_corpus/106.txt.gz", "spec/spam_corpus/108.txt.gz", "spec/spam_corpus/11.txt.gz", "spec/spam_corpus/110.txt.gz", "spec/spam_corpus/112.txt.gz", "spec/spam_corpus/114.txt.gz", "spec/spam_corpus/116.txt.gz", "spec/spam_corpus/118.txt.gz", "spec/spam_corpus/12.txt.gz", "spec/spam_corpus/120.txt.gz", "spec/spam_corpus/122.txt.gz", "spec/spam_corpus/124.txt.gz", "spec/spam_corpus/126.txt.gz", "spec/spam_corpus/128.txt.gz", "spec/spam_corpus/13.txt.gz", "spec/spam_corpus/130.txt.gz", "spec/spam_corpus/132.txt.gz", "spec/spam_corpus/134.txt.gz", "spec/spam_corpus/136.txt.gz", "spec/spam_corpus/138.txt.gz", "spec/spam_corpus/14.txt.gz", "spec/spam_corpus/140.txt.gz", "spec/spam_corpus/142.txt.gz", "spec/spam_corpus/144.txt.gz", "spec/spam_corpus/146.txt.gz", "spec/spam_corpus/148.txt.gz", "spec/spam_corpus/15.txt.gz", "spec/spam_corpus/150.txt.gz", "spec/spam_corpus/152.txt.gz", "spec/spam_corpus/154.txt.gz", "spec/spam_corpus/156.txt.gz", "spec/spam_corpus/158.txt.gz", "spec/spam_corpus/16.txt.gz", "spec/spam_corpus/160.txt.gz", "spec/spam_corpus/162.txt.gz", "spec/spam_corpus/164.txt.gz", "spec/spam_corpus/166.txt.gz", "spec/spam_corpus/168.txt.gz", "spec/spam_corpus/170.txt.gz", 
"spec/spam_corpus/172.txt.gz", "spec/spam_corpus/174.txt.gz", "spec/spam_corpus/176.txt.gz", "spec/spam_corpus/178.txt.gz", "spec/spam_corpus/180.txt.gz", "spec/spam_corpus/182.txt.gz", "spec/spam_corpus/184.txt.gz", "spec/spam_corpus/186.txt.gz", "spec/spam_corpus/188.txt.gz", "spec/spam_corpus/190.txt.gz", "spec/spam_corpus/192.txt.gz", "spec/spam_corpus/194.txt.gz", "spec/spam_corpus/196.txt.gz", "spec/spam_corpus/198.txt.gz", "spec/spam_corpus/2.txt.gz", "spec/spam_corpus/200.txt.gz", "spec/spam_corpus/202.txt.gz", "spec/spam_corpus/204.txt.gz", "spec/spam_corpus/206.txt.gz", "spec/spam_corpus/208.txt.gz", "spec/spam_corpus/210.txt.gz", "spec/spam_corpus/212.txt.gz", "spec/spam_corpus/214.txt.gz", "spec/spam_corpus/216.txt.gz", "spec/spam_corpus/218.txt.gz", "spec/spam_corpus/220.txt.gz", "spec/spam_corpus/222.txt.gz", "spec/spam_corpus/224.txt.gz", "spec/spam_corpus/226.txt.gz", "spec/spam_corpus/228.txt.gz", "spec/spam_corpus/230.txt.gz", "spec/spam_corpus/232.txt.gz", "spec/spam_corpus/234.txt.gz", "spec/spam_corpus/236.txt.gz", "spec/spam_corpus/238.txt.gz", "spec/spam_corpus/240.txt.gz", "spec/spam_corpus/242.txt.gz", "spec/spam_corpus/244.txt.gz", "spec/spam_corpus/246.txt.gz", "spec/spam_corpus/248.txt.gz", "spec/spam_corpus/250.txt.gz", "spec/spam_corpus/252.txt.gz", "spec/spam_corpus/254.txt.gz", "spec/spam_corpus/256.txt.gz", "spec/spam_corpus/258.txt.gz", "spec/spam_corpus/260.txt.gz", "spec/spam_corpus/262.txt.gz", "spec/spam_corpus/264.txt.gz", "spec/spam_corpus/266.txt.gz", "spec/spam_corpus/268.txt.gz", "spec/spam_corpus/270.txt.gz", "spec/spam_corpus/272.txt.gz", "spec/spam_corpus/274.txt.gz", "spec/spam_corpus/276.txt.gz", "spec/spam_corpus/278.txt.gz", "spec/spam_corpus/28.txt.gz", "spec/spam_corpus/280.txt.gz", "spec/spam_corpus/282.txt.gz", "spec/spam_corpus/284.txt.gz", "spec/spam_corpus/286.txt.gz", "spec/spam_corpus/288.txt.gz", "spec/spam_corpus/290.txt.gz", "spec/spam_corpus/292.txt.gz", "spec/spam_corpus/294.txt.gz", 
"spec/spam_corpus/296.txt.gz", "spec/spam_corpus/298.txt.gz", "spec/spam_corpus/3.txt.gz", "spec/spam_corpus/30.txt.gz", "spec/spam_corpus/300.txt.gz", "spec/spam_corpus/302.txt.gz", "spec/spam_corpus/304.txt.gz", "spec/spam_corpus/306.txt.gz", "spec/spam_corpus/308.txt.gz", "spec/spam_corpus/310.txt.gz", "spec/spam_corpus/312.txt.gz", "spec/spam_corpus/314.txt.gz", "spec/spam_corpus/316.txt.gz", "spec/spam_corpus/318.txt.gz", "spec/spam_corpus/32.txt.gz", "spec/spam_corpus/320.txt.gz", "spec/spam_corpus/322.txt.gz", "spec/spam_corpus/324.txt.gz", "spec/spam_corpus/326.txt.gz", "spec/spam_corpus/328.txt.gz", "spec/spam_corpus/330.txt.gz", "spec/spam_corpus/332.txt.gz", "spec/spam_corpus/334.txt.gz", "spec/spam_corpus/336.txt.gz", "spec/spam_corpus/338.txt.gz", "spec/spam_corpus/34.txt.gz", "spec/spam_corpus/340.txt.gz", "spec/spam_corpus/342.txt.gz", "spec/spam_corpus/344.txt.gz", "spec/spam_corpus/346.txt.gz", "spec/spam_corpus/348.txt.gz", "spec/spam_corpus/350.txt.gz", "spec/spam_corpus/352.txt.gz", "spec/spam_corpus/354.txt.gz", "spec/spam_corpus/356.txt.gz", "spec/spam_corpus/358.txt.gz", "spec/spam_corpus/36.txt.gz", "spec/spam_corpus/360.txt.gz", "spec/spam_corpus/362.txt.gz", "spec/spam_corpus/364.txt.gz", "spec/spam_corpus/366.txt.gz", "spec/spam_corpus/368.txt.gz", "spec/spam_corpus/370.txt.gz", "spec/spam_corpus/372.txt.gz", "spec/spam_corpus/374.txt.gz", "spec/spam_corpus/376.txt.gz", "spec/spam_corpus/378.txt.gz", "spec/spam_corpus/38.txt.gz", "spec/spam_corpus/380.txt.gz", "spec/spam_corpus/382.txt.gz", "spec/spam_corpus/384.txt.gz", "spec/spam_corpus/386.txt.gz", "spec/spam_corpus/388.txt.gz", "spec/spam_corpus/390.txt.gz", "spec/spam_corpus/392.txt.gz", "spec/spam_corpus/394.txt.gz", "spec/spam_corpus/396.txt.gz", "spec/spam_corpus/398.txt.gz", "spec/spam_corpus/4.txt.gz", "spec/spam_corpus/40.txt.gz", "spec/spam_corpus/400.txt.gz", "spec/spam_corpus/402.txt.gz", "spec/spam_corpus/404.txt.gz", "spec/spam_corpus/406.txt.gz", 
"spec/spam_corpus/408.txt.gz", "spec/spam_corpus/410.txt.gz", "spec/spam_corpus/412.txt.gz", "spec/spam_corpus/414.txt.gz", "spec/spam_corpus/416.txt.gz", "spec/spam_corpus/418.txt.gz", "spec/spam_corpus/42.txt.gz", "spec/spam_corpus/420.txt.gz", "spec/spam_corpus/422.txt.gz", "spec/spam_corpus/424.txt.gz", "spec/spam_corpus/426.txt.gz", "spec/spam_corpus/428.txt.gz", "spec/spam_corpus/430.txt.gz", "spec/spam_corpus/432.txt.gz", "spec/spam_corpus/434.txt.gz", "spec/spam_corpus/436.txt.gz", "spec/spam_corpus/438.txt.gz", "spec/spam_corpus/44.txt.gz", "spec/spam_corpus/440.txt.gz", "spec/spam_corpus/442.txt.gz", "spec/spam_corpus/444.txt.gz", "spec/spam_corpus/446.txt.gz", "spec/spam_corpus/448.txt.gz", "spec/spam_corpus/450.txt.gz", "spec/spam_corpus/452.txt.gz", "spec/spam_corpus/454.txt.gz", "spec/spam_corpus/456.txt.gz", "spec/spam_corpus/458.txt.gz", "spec/spam_corpus/46.txt.gz", "spec/spam_corpus/460.txt.gz", "spec/spam_corpus/462.txt.gz", "spec/spam_corpus/464.txt.gz", "spec/spam_corpus/466.txt.gz", "spec/spam_corpus/468.txt.gz", "spec/spam_corpus/470.txt.gz", "spec/spam_corpus/472.txt.gz", "spec/spam_corpus/474.txt.gz", "spec/spam_corpus/476.txt.gz", "spec/spam_corpus/478.txt.gz", "spec/spam_corpus/48.txt.gz", "spec/spam_corpus/480.txt.gz", "spec/spam_corpus/482.txt.gz", "spec/spam_corpus/484.txt.gz", "spec/spam_corpus/486.txt.gz", "spec/spam_corpus/488.txt.gz", "spec/spam_corpus/490.txt.gz", "spec/spam_corpus/492.txt.gz", "spec/spam_corpus/494.txt.gz", "spec/spam_corpus/496.txt.gz", "spec/spam_corpus/498.txt.gz", "spec/spam_corpus/5.txt.gz", "spec/spam_corpus/50.txt.gz", "spec/spam_corpus/500.txt.gz", "spec/spam_corpus/502.txt.gz", "spec/spam_corpus/504.txt.gz", "spec/spam_corpus/506.txt.gz", "spec/spam_corpus/508.txt.gz", "spec/spam_corpus/510.txt.gz", "spec/spam_corpus/512.txt.gz", "spec/spam_corpus/514.txt.gz", "spec/spam_corpus/516.txt.gz", "spec/spam_corpus/518.txt.gz", "spec/spam_corpus/52.txt.gz", "spec/spam_corpus/520.txt.gz", 
"spec/spam_corpus/522.txt.gz", "spec/spam_corpus/524.txt.gz", "spec/spam_corpus/526.txt.gz", "spec/spam_corpus/528.txt.gz", "spec/spam_corpus/530.txt.gz", "spec/spam_corpus/532.txt.gz", "spec/spam_corpus/534.txt.gz", "spec/spam_corpus/536.txt.gz", "spec/spam_corpus/538.txt.gz", "spec/spam_corpus/54.txt.gz", "spec/spam_corpus/540.txt.gz", "spec/spam_corpus/542.txt.gz", "spec/spam_corpus/544.txt.gz", "spec/spam_corpus/546.txt.gz", "spec/spam_corpus/548.txt.gz", "spec/spam_corpus/550.txt.gz", "spec/spam_corpus/552.txt.gz", "spec/spam_corpus/554.txt.gz", "spec/spam_corpus/556.txt.gz", "spec/spam_corpus/558.txt.gz", "spec/spam_corpus/56.txt.gz", "spec/spam_corpus/560.txt.gz", "spec/spam_corpus/562.txt.gz", "spec/spam_corpus/564.txt.gz", "spec/spam_corpus/566.txt.gz", "spec/spam_corpus/568.txt.gz", "spec/spam_corpus/570.txt.gz", "spec/spam_corpus/572.txt.gz", "spec/spam_corpus/574.txt.gz", "spec/spam_corpus/576.txt.gz", "spec/spam_corpus/578.txt.gz", "spec/spam_corpus/58.txt.gz", "spec/spam_corpus/580.txt.gz", "spec/spam_corpus/582.txt.gz", "spec/spam_corpus/584.txt.gz", "spec/spam_corpus/586.txt.gz", "spec/spam_corpus/588.txt.gz", "spec/spam_corpus/590.txt.gz", "spec/spam_corpus/592.txt.gz", "spec/spam_corpus/594.txt.gz", "spec/spam_corpus/596.txt.gz", "spec/spam_corpus/598.txt.gz", "spec/spam_corpus/6.txt.gz", "spec/spam_corpus/60.txt.gz", "spec/spam_corpus/600.txt.gz", "spec/spam_corpus/602.txt.gz", "spec/spam_corpus/604.txt.gz", "spec/spam_corpus/606.txt.gz", "spec/spam_corpus/608.txt.gz", "spec/spam_corpus/610.txt.gz", "spec/spam_corpus/612.txt.gz", "spec/spam_corpus/614.txt.gz", "spec/spam_corpus/616.txt.gz", "spec/spam_corpus/618.txt.gz", "spec/spam_corpus/62.txt.gz", "spec/spam_corpus/620.txt.gz", "spec/spam_corpus/622.txt.gz", "spec/spam_corpus/624.txt.gz", "spec/spam_corpus/626.txt.gz", "spec/spam_corpus/628.txt.gz", "spec/spam_corpus/630.txt.gz", "spec/spam_corpus/632.txt.gz", "spec/spam_corpus/634.txt.gz", "spec/spam_corpus/636.txt.gz", 
"spec/spam_corpus/638.txt.gz", "spec/spam_corpus/64.txt.gz", "spec/spam_corpus/640.txt.gz", "spec/spam_corpus/642.txt.gz", "spec/spam_corpus/644.txt.gz", "spec/spam_corpus/646.txt.gz", "spec/spam_corpus/648.txt.gz", "spec/spam_corpus/650.txt.gz", "spec/spam_corpus/652.txt.gz", "spec/spam_corpus/654.txt.gz", "spec/spam_corpus/656.txt.gz", "spec/spam_corpus/658.txt.gz", "spec/spam_corpus/66.txt.gz", "spec/spam_corpus/660.txt.gz", "spec/spam_corpus/662.txt.gz", "spec/spam_corpus/664.txt.gz", "spec/spam_corpus/666.txt.gz", "spec/spam_corpus/668.txt.gz", "spec/spam_corpus/670.txt.gz", "spec/spam_corpus/672.txt.gz", "spec/spam_corpus/674.txt.gz", "spec/spam_corpus/676.txt.gz", "spec/spam_corpus/678.txt.gz", "spec/spam_corpus/68.txt.gz", "spec/spam_corpus/680.txt.gz", "spec/spam_corpus/682.txt.gz", "spec/spam_corpus/684.txt.gz", "spec/spam_corpus/686.txt.gz", "spec/spam_corpus/688.txt.gz", "spec/spam_corpus/690.txt.gz", "spec/spam_corpus/692.txt.gz", "spec/spam_corpus/694.txt.gz", "spec/spam_corpus/696.txt.gz", "spec/spam_corpus/698.txt.gz", "spec/spam_corpus/7.txt.gz", "spec/spam_corpus/70.txt.gz", "spec/spam_corpus/700.txt.gz", "spec/spam_corpus/702.txt.gz", "spec/spam_corpus/704.txt.gz", "spec/spam_corpus/706.txt.gz", "spec/spam_corpus/708.txt.gz", "spec/spam_corpus/710.txt.gz", "spec/spam_corpus/712.txt.gz", "spec/spam_corpus/714.txt.gz", "spec/spam_corpus/716.txt.gz", "spec/spam_corpus/718.txt.gz", "spec/spam_corpus/72.txt.gz", "spec/spam_corpus/720.txt.gz", "spec/spam_corpus/722.txt.gz", "spec/spam_corpus/724.txt.gz", "spec/spam_corpus/726.txt.gz", "spec/spam_corpus/728.txt.gz", "spec/spam_corpus/730.txt.gz", "spec/spam_corpus/732.txt.gz", "spec/spam_corpus/734.txt.gz", "spec/spam_corpus/736.txt.gz", "spec/spam_corpus/738.txt.gz", "spec/spam_corpus/74.txt.gz", "spec/spam_corpus/740.txt.gz", "spec/spam_corpus/742.txt.gz", "spec/spam_corpus/744.txt.gz", "spec/spam_corpus/746.txt.gz", "spec/spam_corpus/748.txt.gz", "spec/spam_corpus/750.txt.gz", 
"spec/spam_corpus/752.txt.gz", "spec/spam_corpus/754.txt.gz", "spec/spam_corpus/756.txt.gz", "spec/spam_corpus/758.txt.gz", "spec/spam_corpus/76.txt.gz", "spec/spam_corpus/760.txt.gz", "spec/spam_corpus/762.txt.gz", "spec/spam_corpus/764.txt.gz", "spec/spam_corpus/766.txt.gz", "spec/spam_corpus/768.txt.gz", "spec/spam_corpus/770.txt.gz", "spec/spam_corpus/772.txt.gz", "spec/spam_corpus/774.txt.gz", "spec/spam_corpus/776.txt.gz", "spec/spam_corpus/778.txt.gz", "spec/spam_corpus/78.txt.gz", "spec/spam_corpus/780.txt.gz", "spec/spam_corpus/782.txt.gz", "spec/spam_corpus/784.txt.gz", "spec/spam_corpus/786.txt.gz", "spec/spam_corpus/788.txt.gz", "spec/spam_corpus/790.txt.gz", "spec/spam_corpus/792.txt.gz", "spec/spam_corpus/794.txt.gz", "spec/spam_corpus/796.txt.gz", "spec/spam_corpus/798.txt.gz", "spec/spam_corpus/8.txt.gz", "spec/spam_corpus/80.txt.gz", "spec/spam_corpus/800.txt.gz", "spec/spam_corpus/802.txt.gz", "spec/spam_corpus/804.txt.gz", "spec/spam_corpus/806.txt.gz", "spec/spam_corpus/808.txt.gz", "spec/spam_corpus/810.txt.gz", "spec/spam_corpus/812.txt.gz", "spec/spam_corpus/814.txt.gz", "spec/spam_corpus/816.txt.gz", "spec/spam_corpus/818.txt.gz", "spec/spam_corpus/82.txt.gz", "spec/spam_corpus/820.txt.gz", "spec/spam_corpus/822.txt.gz", "spec/spam_corpus/824.txt.gz", "spec/spam_corpus/826.txt.gz", "spec/spam_corpus/828.txt.gz", "spec/spam_corpus/830.txt.gz", "spec/spam_corpus/832.txt.gz", "spec/spam_corpus/834.txt.gz", "spec/spam_corpus/836.txt.gz", "spec/spam_corpus/838.txt.gz", "spec/spam_corpus/84.txt.gz", "spec/spam_corpus/840.txt.gz", "spec/spam_corpus/842.txt.gz", "spec/spam_corpus/844.txt.gz", "spec/spam_corpus/846.txt.gz", "spec/spam_corpus/848.txt.gz", "spec/spam_corpus/850.txt.gz", "spec/spam_corpus/852.txt.gz", "spec/spam_corpus/854.txt.gz", "spec/spam_corpus/856.txt.gz", "spec/spam_corpus/858.txt.gz", "spec/spam_corpus/86.txt.gz", "spec/spam_corpus/860.txt.gz", "spec/spam_corpus/862.txt.gz", "spec/spam_corpus/864.txt.gz", 
"spec/spam_corpus/866.txt.gz", "spec/spam_corpus/868.txt.gz", "spec/spam_corpus/870.txt.gz", "spec/spam_corpus/872.txt.gz", "spec/spam_corpus/874.txt.gz", "spec/spam_corpus/876.txt.gz", "spec/spam_corpus/878.txt.gz", "spec/spam_corpus/88.txt.gz", "spec/spam_corpus/880.txt.gz", "spec/spam_corpus/882.txt.gz", "spec/spam_corpus/884.txt.gz", "spec/spam_corpus/886.txt.gz", "spec/spam_corpus/888.txt.gz", "spec/spam_corpus/890.txt.gz", "spec/spam_corpus/892.txt.gz", "spec/spam_corpus/894.txt.gz", "spec/spam_corpus/896.txt.gz", "spec/spam_corpus/898.txt.gz", "spec/spam_corpus/9.txt.gz", "spec/spam_corpus/90.txt.gz", "spec/spam_corpus/900.txt.gz", "spec/spam_corpus/902.txt.gz", "spec/spam_corpus/904.txt.gz", "spec/spam_corpus/906.txt.gz", "spec/spam_corpus/908.txt.gz", "spec/spam_corpus/910.txt.gz", "spec/spam_corpus/912.txt.gz", "spec/spam_corpus/914.txt.gz", "spec/spam_corpus/916.txt.gz", "spec/spam_corpus/918.txt.gz", "spec/spam_corpus/92.txt.gz", "spec/spam_corpus/920.txt.gz", "spec/spam_corpus/922.txt.gz", "spec/spam_corpus/924.txt.gz", "spec/spam_corpus/926.txt.gz", "spec/spam_corpus/928.txt.gz", "spec/spam_corpus/930.txt.gz", "spec/spam_corpus/932.txt.gz", "spec/spam_corpus/934.txt.gz", "spec/spam_corpus/936.txt.gz", "spec/spam_corpus/938.txt.gz", "spec/spam_corpus/94.txt.gz", "spec/spam_corpus/940.txt.gz", "spec/spam_corpus/942.txt.gz", "spec/spam_corpus/944.txt.gz", "spec/spam_corpus/946.txt.gz", "spec/spam_corpus/948.txt.gz", "spec/spam_corpus/950.txt.gz", "spec/spam_corpus/952.txt.gz", "spec/spam_corpus/954.txt.gz", "spec/spam_corpus/956.txt.gz", "spec/spam_corpus/958.txt.gz", "spec/spam_corpus/96.txt.gz", "spec/spam_corpus/960.txt.gz", "spec/spam_corpus/962.txt.gz", "spec/spam_corpus/964.txt.gz", "spec/spam_corpus/966.txt.gz", "spec/spam_corpus/968.txt.gz", "spec/spam_corpus/970.txt.gz", "spec/spam_corpus/972.txt.gz", "spec/spam_corpus/974.txt.gz", "spec/spam_corpus/98.txt.gz", "spec/spam_corpus/debugyouradd.com.txt.gz", 
"spec/spam_corpus/humandesignconsulting.comm.txt.gz", "spec/spam_corpus_spec.rb", "spec/spec.opts", "spec/spec_helper.rb", "tasks/rspec.rake"]
15
+ s.files = [".rspec", ".rvmrc", "Gemfile", "Gemfile.lock", "History.txt", "Manifest.txt", "PostInstall.txt", "README.rdoc", "Rakefile", "despamilator.gemspec", "lib/despamilator.rb", "lib/despamilator/filter.rb", "lib/despamilator/filter/funky_consonant.rb", "lib/despamilator/filter/html_tags.rb", "lib/despamilator/filter/ip_address_url.rb", "lib/despamilator/filter/long_words.rb", "lib/despamilator/filter/naughty_q.rb", "lib/despamilator/filter/naughty_words.rb", "lib/despamilator/filter/numbers_and_words.rb", "lib/despamilator/filter/script_tag.rb", "lib/despamilator/filter/shouting.rb", "lib/despamilator/filter/square_brackets.rb", "lib/despamilator/filter/urls.rb", "lib/despamilator/filter_base.rb", "scripts/despamilator_score.rb", "spec/despamilator_spec.rb", "spec/filter_base_spec.rb", "spec/filters/funky_consonant_spec.rb", "spec/filters/html_tags_spec.rb", "spec/filters/ip_address_url_spec.rb", "spec/filters/long_words_spec.rb", "spec/filters/naughty_q_spec.rb", "spec/filters/naughty_words_spec.rb", "spec/filters/numbers_and_words_spec.rb", "spec/filters/script_tag_spec.rb", "spec/filters/shouting_spec.rb", "spec/filters/square_brackets_spec.rb", "spec/filters/urls_spec.rb", "spec/helpers/corpus_helper.rb", "spec/helpers/filter_helper.rb", "spec/helpers/spec_helper.rb", "tasks/test.rake"]
16
16
  s.homepage = %q{http://github.com/moowahaha/despamilator}
17
17
  s.post_install_message = %q{PostInstall.txt}
18
18
  s.rdoc_options = ["--main", "README.rdoc"]
19
19
  s.require_paths = ["lib"]
20
20
  s.rubyforge_project = %q{despamilator}
21
- s.rubygems_version = %q{1.3.6}
21
+ s.rubygems_version = %q{1.3.7}
22
22
  s.summary = %q{Despamilator is a plugin based spam detector designed for use on your web forms borne out of two annoyances: Spam being submitted in my web forms and CAPTCHAS being intrusive}
23
23
 
24
24
  if s.respond_to? :specification_version then
25
25
  current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
26
26
  s.specification_version = 3
27
27
 
28
- if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
29
- s.add_development_dependency(%q<rubyforge>, [">= 2.0.4"])
30
- s.add_development_dependency(%q<hoe>, [">= 2.6.0"])
28
+ if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
29
+ s.add_development_dependency(%q<hoe>, [">= 2.7.0"])
31
30
  else
32
- s.add_dependency(%q<rubyforge>, [">= 2.0.4"])
33
- s.add_dependency(%q<hoe>, [">= 2.6.0"])
31
+ s.add_dependency(%q<hoe>, [">= 2.7.0"])
34
32
  end
35
33
  else
36
- s.add_dependency(%q<rubyforge>, [">= 2.0.4"])
37
- s.add_dependency(%q<hoe>, [">= 2.6.0"])
34
+ s.add_dependency(%q<hoe>, [">= 2.7.0"])
38
35
  end
39
36
  end
data/lib/despamilator.rb CHANGED
@@ -2,18 +2,43 @@ $:.unshift(File.dirname(__FILE__)) unless $:.include?(File.dirname(__FILE__)) ||
2
2
 
3
3
  require 'despamilator/filter'
4
4
 
5
+ #== SYNOPSIS:
6
+ #
7
+ # require 'despamilator'
8
+ #
9
+ # # some time later...
10
+ #
11
+ # dspam = Despamilator.new('some text with an <h2> tag qthhg')
12
+ #
13
+ # dspam.score #=> the total score for this string (1 is normally my threshold)
14
+ # dspam.matched_by #=> array of matching filters
15
+
5
16
  class Despamilator
6
- VERSION = "0.8"
17
+ VERSION = "1.0"
18
+
19
+ # Constructor. Takes the text you wish to parse and score.
7
20
 
8
21
  def initialize text
9
22
  @filters = Despamilator::Filter.new text
10
23
  end
11
24
 
25
+ # Returns the total score as a Float.
26
+
12
27
  def score
13
28
  @filters.score
14
29
  end
15
30
 
31
+ # Returns an array of filters that have matched and contributed to the score.
32
+ # Each element is a a child of the Despamilator::FilterBase class.
33
+
16
34
  def matched_by
17
35
  @filters.matches
18
36
  end
37
+
38
+ # Generic Test for Unsolicited Bulk Submissions. Similar to SpamAssassin's GTUBE.
39
+ # A string that will result in a spam score of at least 100. Handy for testing.
40
+
41
+ def self.gtubs_test_string
42
+ '89913b8a065b7092721fe995877e097681683af9d3ab767146d5d6fd050fc0bda7ab99f4232d94a1'
43
+ end
19
44
  end
@@ -3,46 +3,35 @@ class Despamilator
3
3
  attr_accessor :matches, :score
4
4
 
5
5
  def initialize text
6
- @filters ||= []
7
6
  @matches ||= []
8
7
  @score ||= 0
9
- load_filters text
10
- run_filters
8
+ run_filters text
11
9
  end
12
10
 
13
11
  private
14
12
 
15
- def load_filters text
16
- Dir.glob(File.dirname(__FILE__) + "/filter/*.rb").each do |filter_file|
17
- filter_name = classify_filename filter_file
13
+ def run_filters text
14
+ filter_namespace = Object.const_get('DespamilatorFilter')
18
15
 
19
- filter_code = File.open(filter_file, File::RDWR).read
20
- filter = Class.new
21
- filter.class_eval(
22
- "require 'despamilator/filter_base'\nclass #{filter_name} < Despamilator::FilterBase\n#{filter_code}\nend"
23
- )
24
-
25
- @filters.push(filter.const_get(filter_name).new(text.to_s.dup, File.basename(filter_file)))
16
+ filter_namespace.constants.each do |filter_class|
17
+ execute_filter(filter_namespace.const_get(filter_class).new, text)
26
18
  end
27
19
  end
28
20
 
29
- def run_filters
30
- @filters.each do |filter|
31
- filter.parse
21
+ private
22
+
23
+ def execute_filter filter, text
24
+ filter.parse text.dup
32
25
 
33
- if filter.matched?
34
- @matches.push(filter)
35
- @score += filter.score
36
- end
26
+ if filter.matched?
27
+ @matches.push(filter)
28
+ @score += filter.score
37
29
  end
38
30
  end
39
31
 
40
- def classify_filename filename
41
- classname = ''
42
- File.basename(filename).gsub(/\.rb$/, '').split('_').each do |filename_part|
43
- classname += filename_part.capitalize
44
- end
45
- classname || filename.capitalize
32
+ Dir.glob(File.join(File.dirname(__FILE__), 'filter', '*.rb')).each do |filter_file|
33
+ require filter_file
46
34
  end
35
+
47
36
  end
48
37
  end
@@ -1,21 +1,31 @@
1
- def name
2
- 'Funky Consonant'
3
- end
1
+ require 'despamilator/filter_base'
4
2
 
5
- def description
6
- 'Detects and scores each occurrence of a consonant next to an unlikely character'
7
- end
3
+ module DespamilatorFilter
4
+
5
+ class FunkyConsonant < Despamilator::FilterBase
8
6
 
9
- def parse
10
- text = self.text.downcase
7
+ def name
8
+ 'Funky Consonant'
9
+ end
11
10
 
12
- consonant_pairs.each do |pair|
13
- [pair, pair.reverse].each do |combo_pair|
14
- self.append_score = 0.05 unless text.scan(/#{combo_pair}/).empty?
11
+ def description
12
+ 'Detects and scores each occurrence of a consonant next to an unlikely character'
15
13
  end
14
+
15
+ def parse text
16
+ text.downcase!
17
+
18
+ consonant_pairs.each do |pair|
19
+ [pair, pair.reverse].each do |combo_pair|
20
+ self.append_score = 0.05 unless text.scan(/#{combo_pair}/).empty?
21
+ end
22
+ end
23
+ end
24
+
25
+ def consonant_pairs
26
+ %w{ zt gb vk vt jk mj dm jm xz bn }
27
+ end
28
+
16
29
  end
17
- end
18
30
 
19
- def consonant_pairs
20
- %w{ zt gb vk vt jk mj dm jm }
21
- end
31
+ end
@@ -1,116 +1,127 @@
1
- def parse
2
- html = self.text.downcase
1
+ require 'despamilator/filter_base'
3
2
 
4
- html_tags.each do |tag|
5
- if html.match(/<\s*#{tag}\W/) || html.match(/<\n*#{tag}\W/) || html.match(/\W#{tag}\s*\//) || html.match(/\W#{tag}\n*\//)
6
- self.append_score = 0.3
3
+ module DespamilatorFilter
4
+
5
+ class HtmlTags < Despamilator::FilterBase
6
+
7
+ def parse text
8
+ text.downcase!
9
+
10
+ html_tags.each do |tag|
11
+ if text.match(/<\s*#{tag}\W/) || text.match(/<\n*#{tag}\W/) || text.match(/\W#{tag}\s*\//) || text.match(/\W#{tag}\n*\//)
12
+ self.append_score = 0.3
13
+ end
14
+ end
7
15
  end
8
- end
9
- end
10
16
 
11
- def name
12
- 'Detects HTML tags in text'
13
- end
17
+ def name
18
+ 'HTML tags'
19
+ end
14
20
 
15
- def description
16
- 'Searches for various HTML tags'
17
- end
21
+ def description
22
+ 'Detects HTML tags in text'
23
+ end
24
+
25
+ def html_tags
26
+ # make sure these are lowercase, in order to save processing
27
+ [
28
+ '!--',
29
+ '!doctype',
30
+ 'a',
31
+ 'abbr',
32
+ 'acronym',
33
+ 'address',
34
+ 'applet',
35
+ 'area',
36
+ 'b',
37
+ 'base',
38
+ 'basefont',
39
+ 'bdo',
40
+ 'big',
41
+ 'blockquote',
42
+ 'body',
43
+ 'br',
44
+ 'button',
45
+ 'caption',
46
+ 'center',
47
+ 'cite',
48
+ 'code',
49
+ 'col',
50
+ 'colgroup',
51
+ 'dd',
52
+ 'del',
53
+ 'dfn',
54
+ 'dir',
55
+ 'div',
56
+ 'dl',
57
+ 'dt',
58
+ 'em',
59
+ 'fieldset',
60
+ 'font',
61
+ 'form',
62
+ 'frame',
63
+ 'frameset',
64
+ 'h1',
65
+ 'h2',
66
+ 'h3',
67
+ 'h4',
68
+ 'h5',
69
+ 'h6',
70
+ 'head',
71
+ 'hr',
72
+ 'html',
73
+ 'i',
74
+ 'iframe',
75
+ 'img',
76
+ 'input',
77
+ 'ins',
78
+ 'isindex',
79
+ 'kbd',
80
+ 'label',
81
+ 'legend',
82
+ 'li',
83
+ 'link',
84
+ 'map',
85
+ 'menu',
86
+ 'meta',
87
+ 'noframes',
88
+ 'noscript',
89
+ 'object',
90
+ 'ol',
91
+ 'optgroup',
92
+ 'option',
93
+ 'p',
94
+ 'param',
95
+ 'pre',
96
+ 'q',
97
+ 's',
98
+ 'samp',
99
+ 'select',
100
+ 'small',
101
+ 'span',
102
+ 'strike',
103
+ 'strong',
104
+ 'style',
105
+ 'sub',
106
+ 'sup',
107
+ 'table',
108
+ 'tbody',
109
+ 'td',
110
+ 'textarea',
111
+ 'tfoot',
112
+ 'th',
113
+ 'thead',
114
+ 'title',
115
+ 'tr',
116
+ 'tt',
117
+ 'u',
118
+ 'ul',
119
+ 'var',
120
+ 'xmp'
121
+ ]
122
+
123
+ end
18
124
 
19
- def html_tags
20
- # make sure these are lowercase, in order to save processing
21
- [
22
- '!--',
23
- '!doctype',
24
- 'a',
25
- 'abbr',
26
- 'acronym',
27
- 'address',
28
- 'applet',
29
- 'area',
30
- 'b',
31
- 'base',
32
- 'basefont',
33
- 'bdo',
34
- 'big',
35
- 'blockquote',
36
- 'body',
37
- 'br',
38
- 'button',
39
- 'caption',
40
- 'center',
41
- 'cite',
42
- 'code',
43
- 'col',
44
- 'colgroup',
45
- 'dd',
46
- 'del',
47
- 'dfn',
48
- 'dir',
49
- 'div',
50
- 'dl',
51
- 'dt',
52
- 'em',
53
- 'fieldset',
54
- 'font',
55
- 'form',
56
- 'frame',
57
- 'frameset',
58
- 'h1',
59
- 'h2',
60
- 'h3',
61
- 'h4',
62
- 'h5',
63
- 'h6',
64
- 'head',
65
- 'hr',
66
- 'html',
67
- 'i',
68
- 'iframe',
69
- 'img',
70
- 'input',
71
- 'ins',
72
- 'isindex',
73
- 'kbd',
74
- 'label',
75
- 'legend',
76
- 'li',
77
- 'link',
78
- 'map',
79
- 'menu',
80
- 'meta',
81
- 'noframes',
82
- 'noscript',
83
- 'object',
84
- 'ol',
85
- 'optgroup',
86
- 'option',
87
- 'p',
88
- 'param',
89
- 'pre',
90
- 'q',
91
- 's',
92
- 'samp',
93
- 'select',
94
- 'small',
95
- 'span',
96
- 'strike',
97
- 'strong',
98
- 'style',
99
- 'sub',
100
- 'sup',
101
- 'table',
102
- 'tbody',
103
- 'td',
104
- 'textarea',
105
- 'tfoot',
106
- 'th',
107
- 'thead',
108
- 'title',
109
- 'tr',
110
- 'tt',
111
- 'u',
112
- 'ul',
113
- 'var',
114
- 'xmp'
115
- ]
116
- end
125
+ end
126
+
127
+ end