classifier 1.1.1 → 1.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (77)
  1. data/LICENSE +341 -0
  2. data/README +59 -6
  3. data/Rakefile +16 -4
  4. data/bin/bayes.rb +8 -2
  5. data/doc/classes/Classifier.html +15 -10
  6. data/doc/classes/Classifier/Bayes.html +68 -38
  7. data/doc/classes/Classifier/Bayes.src/{M000005.html → M000023.html} +1 -1
  8. data/doc/classes/Classifier/Bayes.src/{M000006.html → M000024.html} +1 -1
  9. data/doc/classes/Classifier/Bayes.src/M000025.html +30 -0
  10. data/doc/classes/Classifier/Bayes.src/{M000007.html → M000026.html} +1 -1
  11. data/doc/classes/Classifier/Bayes.src/{M000008.html → M000027.html} +1 -1
  12. data/doc/classes/Classifier/Bayes.src/{M000009.html → M000028.html} +4 -4
  13. data/doc/classes/Classifier/Bayes.src/{M000010.html → M000029.html} +2 -2
  14. data/doc/classes/Classifier/ContentNode.html +252 -0
  15. data/doc/classes/Classifier/ContentNode.src/M000031.html +21 -0
  16. data/doc/classes/Classifier/ContentNode.src/M000032.html +18 -0
  17. data/doc/classes/Classifier/ContentNode.src/M000033.html +18 -0
  18. data/doc/classes/Classifier/ContentNode.src/M000034.html +41 -0
  19. data/doc/classes/Classifier/LSI.html +449 -0
  20. data/doc/classes/Classifier/LSI.src/M000011.html +20 -0
  21. data/doc/classes/Classifier/LSI.src/M000012.html +18 -0
  22. data/doc/classes/Classifier/LSI.src/M000013.html +20 -0
  23. data/doc/classes/Classifier/LSI.src/M000014.html +18 -0
  24. data/doc/classes/Classifier/LSI.src/M000015.html +21 -0
  25. data/doc/classes/Classifier/LSI.src/M000016.html +18 -0
  26. data/doc/classes/Classifier/LSI.src/M000017.html +32 -0
  27. data/doc/classes/Classifier/LSI.src/M000018.html +26 -0
  28. data/doc/classes/Classifier/LSI.src/M000019.html +26 -0
  29. data/doc/classes/Classifier/LSI.src/M000020.html +23 -0
  30. data/doc/classes/Classifier/LSI.src/M000021.html +21 -0
  31. data/doc/classes/Classifier/LSI.src/M000022.html +31 -0
  32. data/doc/classes/Classifier/WordList.html +202 -0
  33. data/doc/classes/Classifier/WordList.src/M000007.html +18 -0
  34. data/doc/classes/Classifier/WordList.src/M000008.html +19 -0
  35. data/doc/classes/Classifier/WordList.src/M000009.html +19 -0
  36. data/doc/classes/Classifier/WordList.src/M000010.html +18 -0
  37. data/doc/classes/GSL.html +111 -0
  38. data/doc/classes/GSL/Vector.html +156 -0
  39. data/doc/classes/GSL/Vector.src/M000005.html +18 -0
  40. data/doc/classes/GSL/Vector.src/M000006.html +19 -0
  41. data/doc/classes/Object.html +139 -0
  42. data/doc/classes/Object.src/M000001.html +16 -0
  43. data/doc/classes/String.html +95 -9
  44. data/doc/classes/{Classifier/WordHash.src/M000001.html → String.src/M000002.html} +3 -3
  45. data/doc/classes/String.src/M000003.html +18 -0
  46. data/doc/classes/String.src/M000004.html +18 -0
  47. data/doc/created.rid +1 -1
  48. data/doc/files/README.html +102 -12
  49. data/doc/files/lib/classifier/bayes_rb.html +1 -1
  50. data/doc/files/lib/classifier/{string_extensions/porter_stemmer_rb.html → extensions/vector_serialize_rb.html} +4 -15
  51. data/doc/files/lib/classifier/{string_extensions → extensions}/word_hash_rb.html +2 -2
  52. data/doc/files/lib/classifier/extensions/word_list_rb.html +115 -0
  53. data/doc/files/lib/classifier/lsi/content_node_rb.html +115 -0
  54. data/doc/files/lib/classifier/lsi_rb.html +125 -0
  55. data/doc/files/lib/classifier/string_extensions_rb.html +2 -3
  56. data/doc/files/lib/classifier_rb.html +3 -1
  57. data/doc/fr_class_index.html +6 -2
  58. data/doc/fr_file_index.html +5 -2
  59. data/doc/fr_method_index.html +34 -11
  60. data/lib/classifier.rb +3 -1
  61. data/lib/classifier/bayes.rb +34 -9
  62. data/lib/classifier/extensions/vector_serialize.rb +14 -0
  63. data/lib/classifier/extensions/word_hash.rb +125 -0
  64. data/lib/classifier/extensions/word_list.rb +31 -0
  65. data/lib/classifier/lsi.rb +248 -0
  66. data/lib/classifier/lsi/content_node.rb +67 -0
  67. data/lib/classifier/string_extensions.rb +10 -5
  68. data/test/bayes/bayesian_test.rb +2 -2
  69. data/test/lsi/lsi_test.rb +88 -0
  70. data/test/string_extensions/word_hash_test.rb +7 -5
  71. metadata +79 -24
  72. data/doc/classes/Classifier/Stemmable.html +0 -243
  73. data/doc/classes/Classifier/Stemmable.src/M000003.html +0 -102
  74. data/doc/classes/Classifier/WordHash.html +0 -178
  75. data/doc/classes/Classifier/WordHash.src/M000002.html +0 -28
  76. data/lib/classifier/string_extensions/porter_stemmer.rb +0 -199
  77. data/lib/classifier/string_extensions/word_hash.rb +0 -119
@@ -1,119 +0,0 @@
1
- # Author:: Lucas Carlson (mailto:lucas@rufy.com)
2
- # Copyright:: Copyright (c) 2005 Lucas Carlson
3
- # License:: LGPL
4
-
5
- module Classifier
6
-
7
- # This module is mixed into String to provide convenience
8
- # methods for the Classifier package.
9
- module WordHash
10
-
11
- # Removes common punctuation symbols, returning a new string. E.g.,
12
- # "Hello (greeting's), with {braces} < >...?".without_punctuation
13
- # => "Hello greetings with braces "
14
- def without_punctuation
15
- tr( ',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', " " ) .tr( "'\-", "")
16
- end
17
-
18
- # Return a Hash of strings => ints. Each word in the string is stemmed,
19
- # interned, and indexes to its frequency in the document.
20
- def word_hash
21
- d = Hash.new
22
- corpus = without_punctuation
23
- (corpus.split + gsub(/[\w+]/,"").split).each do |word|
24
- item = word.downcase
25
- key = item.stem.intern
26
- if !(word =~ /[\w+]/) || word.length > 2
27
- d[key] ||= 0
28
- d[key] += 1
29
- end unless CORPUS_SKIP_WORDS[item]
30
- end
31
- return d
32
- end
33
-
34
- private
35
- CORPUS_SKIP_WORDS = {
36
- "a" => 1,
37
- "again" => 1,
38
- "all" => 1,
39
- "along" => 1,
40
- "are" => 1,
41
- "also" => 1,
42
- "an" => 1,
43
- "and" => 1,
44
- "as" => 1,
45
- "at" => 1,
46
- "but" => 1,
47
- "by" => 1,
48
- "came" => 1,
49
- "can" => 1,
50
- "cant" => 1,
51
- "couldnt" => 1,
52
- "did" => 1,
53
- "didn" => 1,
54
- "didnt" => 1,
55
- "do" => 1,
56
- "doesnt" => 1,
57
- "dont" => 1,
58
- "ever" => 1,
59
- "first" => 1,
60
- "from" => 1,
61
- "have" => 1,
62
- "her" => 1,
63
- "here" => 1,
64
- "him" => 1,
65
- "how" => 1,
66
- "i" => 1,
67
- "if" => 1,
68
- "in" => 1,
69
- "into" => 1,
70
- "is" => 1,
71
- "isnt" => 1,
72
- "it" => 1,
73
- "itll" => 1,
74
- "just" => 1,
75
- "last" => 1,
76
- "least" => 1,
77
- "like" => 1,
78
- "most" => 1,
79
- "my" => 1,
80
- "new" => 1,
81
- "no" => 1,
82
- "not" => 1,
83
- "now" => 1,
84
- "of" => 1,
85
- "on" => 1,
86
- "or" => 1,
87
- "should" => 1,
88
- "sinc" => 1,
89
- "so" => 1,
90
- "some" => 1,
91
- "th" => 1,
92
- "than" => 1,
93
- "this" => 1,
94
- "that" => 1,
95
- "the" => 1,
96
- "their" => 1,
97
- "then" => 1,
98
- "those" => 1,
99
- "to" => 1,
100
- "told" => 1,
101
- "too" => 1,
102
- "true" => 1,
103
- "try" => 1,
104
- "until" => 1,
105
- "url" => 1,
106
- "us" => 1,
107
- "were" => 1,
108
- "when" => 1,
109
- "whether" => 1,
110
- "while" => 1,
111
- "with" => 1,
112
- "within" => 1,
113
- "yes" => 1,
114
- "you" => 1,
115
- "youll" => 1,
116
- }
117
- end
118
-
119
- end