classifier 1.1.1 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. data/LICENSE +341 -0
  2. data/README +59 -6
  3. data/Rakefile +16 -4
  4. data/bin/bayes.rb +8 -2
  5. data/doc/classes/Classifier.html +15 -10
  6. data/doc/classes/Classifier/Bayes.html +68 -38
  7. data/doc/classes/Classifier/Bayes.src/{M000005.html → M000023.html} +1 -1
  8. data/doc/classes/Classifier/Bayes.src/{M000006.html → M000024.html} +1 -1
  9. data/doc/classes/Classifier/Bayes.src/M000025.html +30 -0
  10. data/doc/classes/Classifier/Bayes.src/{M000007.html → M000026.html} +1 -1
  11. data/doc/classes/Classifier/Bayes.src/{M000008.html → M000027.html} +1 -1
  12. data/doc/classes/Classifier/Bayes.src/{M000009.html → M000028.html} +4 -4
  13. data/doc/classes/Classifier/Bayes.src/{M000010.html → M000029.html} +2 -2
  14. data/doc/classes/Classifier/ContentNode.html +252 -0
  15. data/doc/classes/Classifier/ContentNode.src/M000031.html +21 -0
  16. data/doc/classes/Classifier/ContentNode.src/M000032.html +18 -0
  17. data/doc/classes/Classifier/ContentNode.src/M000033.html +18 -0
  18. data/doc/classes/Classifier/ContentNode.src/M000034.html +41 -0
  19. data/doc/classes/Classifier/LSI.html +449 -0
  20. data/doc/classes/Classifier/LSI.src/M000011.html +20 -0
  21. data/doc/classes/Classifier/LSI.src/M000012.html +18 -0
  22. data/doc/classes/Classifier/LSI.src/M000013.html +20 -0
  23. data/doc/classes/Classifier/LSI.src/M000014.html +18 -0
  24. data/doc/classes/Classifier/LSI.src/M000015.html +21 -0
  25. data/doc/classes/Classifier/LSI.src/M000016.html +18 -0
  26. data/doc/classes/Classifier/LSI.src/M000017.html +32 -0
  27. data/doc/classes/Classifier/LSI.src/M000018.html +26 -0
  28. data/doc/classes/Classifier/LSI.src/M000019.html +26 -0
  29. data/doc/classes/Classifier/LSI.src/M000020.html +23 -0
  30. data/doc/classes/Classifier/LSI.src/M000021.html +21 -0
  31. data/doc/classes/Classifier/LSI.src/M000022.html +31 -0
  32. data/doc/classes/Classifier/WordList.html +202 -0
  33. data/doc/classes/Classifier/WordList.src/M000007.html +18 -0
  34. data/doc/classes/Classifier/WordList.src/M000008.html +19 -0
  35. data/doc/classes/Classifier/WordList.src/M000009.html +19 -0
  36. data/doc/classes/Classifier/WordList.src/M000010.html +18 -0
  37. data/doc/classes/GSL.html +111 -0
  38. data/doc/classes/GSL/Vector.html +156 -0
  39. data/doc/classes/GSL/Vector.src/M000005.html +18 -0
  40. data/doc/classes/GSL/Vector.src/M000006.html +19 -0
  41. data/doc/classes/Object.html +139 -0
  42. data/doc/classes/Object.src/M000001.html +16 -0
  43. data/doc/classes/String.html +95 -9
  44. data/doc/classes/{Classifier/WordHash.src/M000001.html → String.src/M000002.html} +3 -3
  45. data/doc/classes/String.src/M000003.html +18 -0
  46. data/doc/classes/String.src/M000004.html +18 -0
  47. data/doc/created.rid +1 -1
  48. data/doc/files/README.html +102 -12
  49. data/doc/files/lib/classifier/bayes_rb.html +1 -1
  50. data/doc/files/lib/classifier/{string_extensions/porter_stemmer_rb.html → extensions/vector_serialize_rb.html} +4 -15
  51. data/doc/files/lib/classifier/{string_extensions → extensions}/word_hash_rb.html +2 -2
  52. data/doc/files/lib/classifier/extensions/word_list_rb.html +115 -0
  53. data/doc/files/lib/classifier/lsi/content_node_rb.html +115 -0
  54. data/doc/files/lib/classifier/lsi_rb.html +125 -0
  55. data/doc/files/lib/classifier/string_extensions_rb.html +2 -3
  56. data/doc/files/lib/classifier_rb.html +3 -1
  57. data/doc/fr_class_index.html +6 -2
  58. data/doc/fr_file_index.html +5 -2
  59. data/doc/fr_method_index.html +34 -11
  60. data/lib/classifier.rb +3 -1
  61. data/lib/classifier/bayes.rb +34 -9
  62. data/lib/classifier/extensions/vector_serialize.rb +14 -0
  63. data/lib/classifier/extensions/word_hash.rb +125 -0
  64. data/lib/classifier/extensions/word_list.rb +31 -0
  65. data/lib/classifier/lsi.rb +248 -0
  66. data/lib/classifier/lsi/content_node.rb +67 -0
  67. data/lib/classifier/string_extensions.rb +10 -5
  68. data/test/bayes/bayesian_test.rb +2 -2
  69. data/test/lsi/lsi_test.rb +88 -0
  70. data/test/string_extensions/word_hash_test.rb +7 -5
  71. metadata +79 -24
  72. data/doc/classes/Classifier/Stemmable.html +0 -243
  73. data/doc/classes/Classifier/Stemmable.src/M000003.html +0 -102
  74. data/doc/classes/Classifier/WordHash.html +0 -178
  75. data/doc/classes/Classifier/WordHash.src/M000002.html +0 -28
  76. data/lib/classifier/string_extensions/porter_stemmer.rb +0 -199
  77. data/lib/classifier/string_extensions/word_hash.rb +0 -119
@@ -1,119 +0,0 @@
1
- # Author:: Lucas Carlson (mailto:lucas@rufy.com)
2
- # Copyright:: Copyright (c) 2005 Lucas Carlson
3
- # License:: LGPL
4
-
5
- module Classifier
6
-
7
- # This module is mixed into String to provide convenience
8
- # methods for the Classifier package.
9
- module WordHash
10
-
11
- # Removes common punctuation symbols, returning a new string. E.g.,
12
- # "Hello (greeting's), with {braces} < >...?".without_punctuation
13
- # => "Hello greetings with braces "
14
- def without_punctuation
15
- tr( ',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', " " ) .tr( "'\-", "")
16
- end
17
-
18
- # Return a Hash of strings => ints. Each word in the string is stemmed,
19
- # interned, and indexes to its frequency in the document.
20
- def word_hash
21
- d = Hash.new
22
- corpus = without_punctuation
23
- (corpus.split + gsub(/[\w+]/,"").split).each do |word|
24
- item = word.downcase
25
- key = item.stem.intern
26
- if !(word =~ /[\w+]/) || word.length > 2
27
- d[key] ||= 0
28
- d[key] += 1
29
- end unless CORPUS_SKIP_WORDS[item]
30
- end
31
- return d
32
- end
33
-
34
- private
35
- CORPUS_SKIP_WORDS = {
36
- "a" => 1,
37
- "again" => 1,
38
- "all" => 1,
39
- "along" => 1,
40
- "are" => 1,
41
- "also" => 1,
42
- "an" => 1,
43
- "and" => 1,
44
- "as" => 1,
45
- "at" => 1,
46
- "but" => 1,
47
- "by" => 1,
48
- "came" => 1,
49
- "can" => 1,
50
- "cant" => 1,
51
- "couldnt" => 1,
52
- "did" => 1,
53
- "didn" => 1,
54
- "didnt" => 1,
55
- "do" => 1,
56
- "doesnt" => 1,
57
- "dont" => 1,
58
- "ever" => 1,
59
- "first" => 1,
60
- "from" => 1,
61
- "have" => 1,
62
- "her" => 1,
63
- "here" => 1,
64
- "him" => 1,
65
- "how" => 1,
66
- "i" => 1,
67
- "if" => 1,
68
- "in" => 1,
69
- "into" => 1,
70
- "is" => 1,
71
- "isnt" => 1,
72
- "it" => 1,
73
- "itll" => 1,
74
- "just" => 1,
75
- "last" => 1,
76
- "least" => 1,
77
- "like" => 1,
78
- "most" => 1,
79
- "my" => 1,
80
- "new" => 1,
81
- "no" => 1,
82
- "not" => 1,
83
- "now" => 1,
84
- "of" => 1,
85
- "on" => 1,
86
- "or" => 1,
87
- "should" => 1,
88
- "sinc" => 1,
89
- "so" => 1,
90
- "some" => 1,
91
- "th" => 1,
92
- "than" => 1,
93
- "this" => 1,
94
- "that" => 1,
95
- "the" => 1,
96
- "their" => 1,
97
- "then" => 1,
98
- "those" => 1,
99
- "to" => 1,
100
- "told" => 1,
101
- "too" => 1,
102
- "true" => 1,
103
- "try" => 1,
104
- "until" => 1,
105
- "url" => 1,
106
- "us" => 1,
107
- "were" => 1,
108
- "when" => 1,
109
- "whether" => 1,
110
- "while" => 1,
111
- "with" => 1,
112
- "within" => 1,
113
- "yes" => 1,
114
- "you" => 1,
115
- "youll" => 1,
116
- }
117
- end
118
-
119
- end