classifier 1.2.0 → 1.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (96)
  1. data/LICENSE +361 -273
  2. data/README +6 -5
  3. data/Rakefile +12 -2
  4. data/bin/summarize.rb +11 -0
  5. data/doc/classes/Array.html +139 -0
  6. data/doc/classes/Array.src/M000003.html +18 -0
  7. data/doc/classes/Classifier.html +5 -5
  8. data/doc/classes/Classifier/Bayes.html +43 -43
  9. data/doc/classes/Classifier/Bayes.src/{M000023.html → M000038.html} +0 -0
  10. data/doc/classes/Classifier/Bayes.src/{M000024.html → M000039.html} +0 -0
  11. data/doc/classes/Classifier/Bayes.src/{M000025.html → M000040.html} +0 -0
  12. data/doc/classes/Classifier/Bayes.src/{M000026.html → M000041.html} +0 -0
  13. data/doc/classes/Classifier/Bayes.src/{M000027.html → M000042.html} +0 -0
  14. data/doc/classes/Classifier/Bayes.src/{M000028.html → M000043.html} +0 -0
  15. data/doc/classes/Classifier/Bayes.src/{M000029.html → M000044.html} +0 -0
  16. data/doc/classes/Classifier/ContentNode.html +23 -28
  17. data/doc/classes/Classifier/ContentNode.src/M000046.html +19 -0
  18. data/doc/classes/Classifier/ContentNode.src/{M000032.html → M000047.html} +1 -1
  19. data/doc/classes/Classifier/ContentNode.src/{M000033.html → M000048.html} +1 -1
  20. data/doc/classes/Classifier/ContentNode.src/M000049.html +49 -0
  21. data/doc/classes/Classifier/LSI.html +158 -68
  22. data/doc/classes/Classifier/LSI.src/M000022.html +6 -17
  23. data/doc/classes/Classifier/LSI.src/{M000012.html → M000023.html} +2 -2
  24. data/doc/classes/Classifier/LSI.src/{M000013.html → M000024.html} +3 -2
  25. data/doc/classes/Classifier/LSI.src/{M000014.html → M000025.html} +1 -1
  26. data/doc/classes/Classifier/LSI.src/M000026.html +19 -0
  27. data/doc/classes/Classifier/LSI.src/{M000015.html → M000027.html} +1 -1
  28. data/doc/classes/Classifier/LSI.src/{M000016.html → M000028.html} +1 -1
  29. data/doc/classes/Classifier/LSI.src/M000029.html +19 -0
  30. data/doc/classes/Classifier/LSI.src/M000030.html +43 -0
  31. data/doc/classes/Classifier/LSI.src/M000031.html +23 -0
  32. data/doc/classes/Classifier/LSI.src/{M000018.html → M000032.html} +7 -3
  33. data/doc/classes/Classifier/LSI.src/{M000019.html → M000033.html} +6 -2
  34. data/doc/classes/Classifier/LSI.src/{M000020.html → M000034.html} +2 -4
  35. data/doc/classes/Classifier/LSI.src/{M000021.html → M000035.html} +1 -1
  36. data/doc/classes/Classifier/LSI.src/M000036.html +31 -0
  37. data/doc/classes/Classifier/LSI.src/M000037.html +21 -0
  38. data/doc/classes/Classifier/WordList.html +37 -22
  39. data/doc/classes/Classifier/WordList.src/{M000007.html → M000017.html} +2 -2
  40. data/doc/classes/Classifier/WordList.src/{M000008.html → M000018.html} +1 -1
  41. data/doc/classes/Classifier/WordList.src/{M000009.html → M000019.html} +1 -1
  42. data/doc/classes/Classifier/WordList.src/M000020.html +18 -0
  43. data/doc/classes/Classifier/WordList.src/{M000010.html → M000021.html} +1 -1
  44. data/doc/classes/GSL.html +2 -1
  45. data/doc/classes/GSL/Matrix.html +126 -0
  46. data/doc/classes/GSL/Vector.html +10 -10
  47. data/doc/classes/GSL/Vector.src/{M000005.html → M000015.html} +0 -0
  48. data/doc/classes/GSL/Vector.src/{M000006.html → M000016.html} +0 -0
  49. data/doc/classes/Matrix.html +184 -0
  50. data/doc/classes/Matrix.src/M000004.html +18 -0
  51. data/doc/classes/Matrix.src/M000005.html +76 -0
  52. data/doc/classes/Matrix.src/M000006.html +18 -0
  53. data/doc/classes/Object.html +7 -7
  54. data/doc/classes/Object.src/{M000001.html → M000007.html} +1 -1
  55. data/doc/classes/String.html +90 -20
  56. data/doc/classes/String.src/{M000002.html → M000008.html} +0 -0
  57. data/doc/classes/String.src/{M000003.html → M000009.html} +0 -0
  58. data/doc/classes/String.src/{M000004.html → M000010.html} +0 -0
  59. data/doc/classes/String.src/M000011.html +18 -0
  60. data/doc/classes/String.src/M000012.html +18 -0
  61. data/doc/classes/String.src/M000013.html +18 -0
  62. data/doc/classes/String.src/M000014.html +18 -0
  63. data/doc/classes/Vector.html +154 -0
  64. data/doc/classes/Vector.src/M000001.html +22 -0
  65. data/doc/classes/Vector.src/M000002.html +25 -0
  66. data/doc/created.rid +1 -1
  67. data/doc/files/README.html +14 -8
  68. data/doc/files/lib/classifier/bayes_rb.html +1 -1
  69. data/doc/files/lib/classifier/{string_extensions_rb.html → extensions/string_rb.html} +5 -5
  70. data/doc/files/lib/classifier/extensions/vector_rb.html +120 -0
  71. data/doc/files/lib/classifier/extensions/vector_serialize_rb.html +1 -1
  72. data/doc/files/lib/classifier/extensions/word_hash_rb.html +1 -1
  73. data/doc/files/lib/classifier/lsi/content_node_rb.html +2 -2
  74. data/doc/files/lib/classifier/lsi/summary_rb.html +115 -0
  75. data/doc/files/lib/classifier/{extensions → lsi}/word_list_rb.html +3 -3
  76. data/doc/files/lib/classifier/lsi_rb.html +5 -3
  77. data/doc/files/lib/classifier_rb.html +2 -2
  78. data/doc/fr_class_index.html +4 -0
  79. data/doc/fr_file_index.html +4 -2
  80. data/doc/fr_method_index.html +49 -34
  81. data/doc/index.html +2 -2
  82. data/lib/classifier.rb +1 -1
  83. data/lib/classifier/{string_extensions.rb → extensions/string.rb} +0 -0
  84. data/lib/classifier/extensions/vector.rb +106 -0
  85. data/lib/classifier/extensions/vector_serialize.rb +6 -0
  86. data/lib/classifier/lsi.rb +101 -31
  87. data/lib/classifier/lsi/content_node.rb +28 -23
  88. data/lib/classifier/lsi/summary.rb +31 -0
  89. data/lib/classifier/{extensions → lsi}/word_list.rb +7 -2
  90. data/test/{string_extensions → extensions}/word_hash_test.rb +0 -0
  91. data/test/lsi/lsi_test.rb +36 -1
  92. metadata +68 -41
  93. data/doc/classes/Classifier/ContentNode.src/M000031.html +0 -21
  94. data/doc/classes/Classifier/ContentNode.src/M000034.html +0 -41
  95. data/doc/classes/Classifier/LSI.src/M000011.html +0 -20
  96. data/doc/classes/Classifier/LSI.src/M000017.html +0 -32
@@ -1,26 +1,23 @@
1
1
  # Author:: David Fayram (mailto:dfayram@lensmen.net)
2
2
  # Copyright:: Copyright (c) 2005 David Fayram II
3
- # License:: GPL
3
+ # License:: LGPL
4
4
 
5
5
  module Classifier
6
6
 
7
-
8
7
  # This is an internal data structure class for the LSI node. Save for
9
8
  # raw_vector_with, it should be fairly straightforward to understand.
10
9
  # You should never have to use it directly.
11
10
  class ContentNode
12
- attr_accessor :word_hash, :raw_vector, :raw_norm,
13
- :lsi_vector, :lsi_norm,
14
- :categories
15
- attr_reader :source
16
-
11
+ attr_accessor :raw_vector, :raw_norm,
12
+ :lsi_vector, :lsi_norm,
13
+ :categories
14
+
15
+ attr_reader :word_hash
17
16
  # If text_proc is not specified, the source will be duck-typed
18
17
  # via source.to_s
19
- def initialize( source, categories=nil, text_proc=nil )
20
- text_proc = text_proc || (proc {|x| x.to_s})
18
+ def initialize( word_hash, *categories )
21
19
  @categories = categories || []
22
- @source = source
23
- @word_hash = text_proc.call( @source ).clean_word_hash
20
+ @word_hash = word_hash
24
21
  end
25
22
 
26
23
  # Use this to fetch the appropriate search vector.
@@ -36,32 +33,40 @@ module Classifier
36
33
  # Creates the raw vector out of word_hash using word_list as the
37
34
  # key for mapping the vector space.
38
35
  def raw_vector_with( word_list )
39
- vec = Array.new(word_list.size, 0)
36
+ if $GSL
37
+ vec = Vector.new(word_list.size)
38
+ else
39
+ vec = Array.new(word_list.size, 0)
40
+ end
40
41
 
41
42
  @word_hash.each_key do |word|
42
43
  vec[word_list[word]] = @word_hash[word] if word_list[word]
43
44
  end
44
45
 
45
46
  # Perform the scaling transform
46
- total_words = vec.inject(0) { |sum,term| sum += term }.to_f
47
+ total_words = vec.sum
47
48
 
48
49
  # Perform first-order association transform if this vector has more
49
50
  # than one word in it.
50
51
  if total_words > 1.0
51
- weighted_total = vec.inject(0.0) do |sum,term|
52
- if( term > 0 )
53
- sum += (( term / total_words ) * Math.log( term / total_words ))
54
- else
55
- sum
52
+ weighted_total = 0.0
53
+ vec.each do |term|
54
+ if ( term > 0 )
55
+ weighted_total += (( term / total_words ) * Math.log( term / total_words ))
56
56
  end
57
57
  end
58
- vec.map! { |val| Math.log( val + 1 ) / -weighted_total }
58
+ vec = vec.collect { |val| Math.log( val + 1 ) / -weighted_total }
59
+ end
60
+
61
+ if $GSL
62
+ @raw_norm = vec.normalize
63
+ @raw_vector = vec
64
+ else
65
+ @raw_norm = Vector[*vec].normalize
66
+ @raw_vector = Vector[*vec]
59
67
  end
60
-
61
- @raw_norm = GSL::Vector.new( vec ).normalize
62
- @raw_vector = GSL::Vector.new( vec )
63
68
  end
64
69
 
65
70
  end
66
71
 
67
- end
72
+ end
@@ -0,0 +1,31 @@
1
+ # Author:: Lucas Carlson (mailto:lucas@rufy.com)
2
+ # Copyright:: Copyright (c) 2005 Lucas Carlson
3
+ # License:: LGPL
4
+
5
+ class String
6
+ def summary( count=10, separator=" [...] " )
7
+ perform_lsi split_sentences, count, separator
8
+ end
9
+
10
+ def paragraph_summary( count=1, separator=" [...] " )
11
+ perform_lsi split_paragraphs, count, separator
12
+ end
13
+
14
+ def split_sentences
15
+ split /(\.|\!|\?)/ # TODO: make this less primitive
16
+ end
17
+
18
+ def split_paragraphs
19
+ split /(\n\n|\r\r|\r\n\r\n)/ # TODO: make this less primitive
20
+ end
21
+
22
+ private
23
+
24
+ def perform_lsi(chunks, count, separator)
25
+ lsi = Classifier::LSI.new :auto_rebuild => false
26
+ chunks.each { |chunk| lsi << chunk unless chunk.strip.empty? || chunk.strip.split.size == 1 }
27
+ lsi.build_index
28
+ summaries = lsi.highest_relative_content count
29
+ return summaries.reject { |chunk| !summaries.include? chunk }.map { |x| x.strip }.join(separator)
30
+ end
31
+ end
@@ -1,13 +1,14 @@
1
1
  # Author:: David Fayram (mailto:dfayram@lensmen.net)
2
2
  # Copyright:: Copyright (c) 2005 David Fayram II
3
- # License:: GPL
3
+ # License:: LGPL
4
4
 
5
5
  module Classifier
6
6
  # This class keeps a word => index mapping. It is used to map stemmed words
7
7
  # to dimensions of a vector.
8
+
8
9
  class WordList
9
10
  def initialize
10
- @location_table = {}
11
+ @location_table = Hash.new
11
12
  end
12
13
 
13
14
  # Adds a word (if it is new) and assigns it a unique dimension.
@@ -22,6 +23,10 @@ module Classifier
22
23
  @location_table[term]
23
24
  end
24
25
 
26
+ def word_for_index(ind)
27
+ @location_table.invert[ind]
28
+ end
29
+
25
30
  # Returns the number of words mapped.
26
31
  def size
27
32
  @location_table.size
data/test/lsi/lsi_test.rb CHANGED
@@ -3,7 +3,7 @@ class LSITest < Test::Unit::TestCase
3
3
  def setup
4
4
  # we repeat principle words to help weight them.
5
5
  # This test is rather delicate, since this system is mostly noise.
6
- @str1 = "This text deals with dogs. Dogs."
6
+ @str1 = "This text deals with dogs. Dogs."
7
7
  @str2 = "This text involves dogs too. Dogs! "
8
8
  @str3 = "This text revolves around cats. Cats."
9
9
  @str4 = "This text also involves cats. Cats!"
@@ -23,6 +23,7 @@ class LSITest < Test::Unit::TestCase
23
23
  def test_not_auto_rebuild
24
24
  lsi = Classifier::LSI.new :auto_rebuild => false
25
25
  lsi.add_item @str1, "Dog"
26
+ lsi.add_item @str2, "Dog"
26
27
  assert lsi.needs_rebuild?
27
28
  lsi.build_index
28
29
  assert ! lsi.needs_rebuild?
@@ -57,6 +58,25 @@ class LSITest < Test::Unit::TestCase
57
58
  assert_not_equal "Dog", bayes.classify( tricky_case )
58
59
  end
59
60
 
61
+ def test_recategorize_interface
62
+ lsi = Classifier::LSI.new
63
+ lsi.add_item @str1, "Dog"
64
+ lsi.add_item @str2, "Dog"
65
+ lsi.add_item @str3, "Cat"
66
+ lsi.add_item @str4, "Cat"
67
+ lsi.add_item @str5, "Bird"
68
+
69
+ tricky_case = "This text revolves around dogs."
70
+ assert_equal "Dog", lsi.classify( tricky_case )
71
+
72
+ # Recategorize as needed.
73
+ lsi.categories_for(@str1).clear.push "Cow"
74
+ lsi.categories_for(@str2).clear.push "Cow"
75
+
76
+ assert !lsi.needs_rebuild?
77
+ assert_equal "Cow", lsi.classify( tricky_case )
78
+ end
79
+
60
80
  def test_search
61
81
  lsi = Classifier::LSI.new
62
82
  [@str1, @str2, @str3, @str4, @str5].each { |x| lsi << x }
@@ -85,4 +105,19 @@ class LSITest < Test::Unit::TestCase
85
105
  assert_equal lsi_m.find_related(@str1, 3), lsi.find_related(@str1, 3)
86
106
  end
87
107
 
108
+ def test_keyword_search
109
+ lsi = Classifier::LSI.new
110
+ lsi.add_item @str1, "Dog"
111
+ lsi.add_item @str2, "Dog"
112
+ lsi.add_item @str3, "Cat"
113
+ lsi.add_item @str4, "Cat"
114
+ lsi.add_item @str5, "Bird"
115
+
116
+ assert_equal [:dog, :text, :deal], lsi.highest_ranked_stems(@str1)
117
+ end
118
+
119
+ def test_summary
120
+ assert_equal "This text involves dogs too [...] This text also involves cats", [@str1, @str2, @str3, @str4, @str5].join.summary(2)
121
+ end
122
+
88
123
  end
metadata CHANGED
@@ -1,10 +1,10 @@
1
1
  --- !ruby/object:Gem::Specification
2
- rubygems_version: 0.8.6
2
+ rubygems_version: 0.8.10
3
3
  specification_version: 1
4
4
  name: classifier
5
5
  version: !ruby/object:Gem::Version
6
- version: 1.2.0
7
- date: 2005-04-24
6
+ version: 1.3.0
7
+ date: 2005-05-05
8
8
  summary: A general classifier module to allow Bayesian and other types of classifications.
9
9
  require_paths:
10
10
  - lib
@@ -33,19 +33,22 @@ files:
33
33
  - lib/classifier/extensions
34
34
  - lib/classifier/lsi
35
35
  - lib/classifier/lsi.rb
36
- - lib/classifier/string_extensions.rb
36
+ - lib/classifier/extensions/string.rb
37
+ - lib/classifier/extensions/vector.rb
37
38
  - lib/classifier/extensions/vector_serialize.rb
38
39
  - lib/classifier/extensions/word_hash.rb
39
- - lib/classifier/extensions/word_list.rb
40
40
  - lib/classifier/lsi/content_node.rb
41
+ - lib/classifier/lsi/summary.rb
42
+ - lib/classifier/lsi/word_list.rb
41
43
  - bin/bayes.rb
44
+ - bin/summarize.rb
42
45
  - test/bayes
46
+ - test/extensions
43
47
  - test/lsi
44
- - test/string_extensions
45
48
  - test/test_helper.rb
46
49
  - test/bayes/bayesian_test.rb
50
+ - test/extensions/word_hash_test.rb
47
51
  - test/lsi/lsi_test.rb
48
- - test/string_extensions/word_hash_test.rb
49
52
  - LICENSE
50
53
  - Rakefile
51
54
  - README
@@ -57,14 +60,21 @@ files:
57
60
  - doc/fr_method_index.html
58
61
  - doc/index.html
59
62
  - doc/rdoc-style.css
63
+ - doc/classes/Array.html
64
+ - doc/classes/Array.src
60
65
  - doc/classes/Classifier
61
66
  - doc/classes/Classifier.html
62
67
  - doc/classes/GSL
63
68
  - doc/classes/GSL.html
69
+ - doc/classes/Matrix.html
70
+ - doc/classes/Matrix.src
64
71
  - doc/classes/Object.html
65
72
  - doc/classes/Object.src
66
73
  - doc/classes/String.html
67
74
  - doc/classes/String.src
75
+ - doc/classes/Vector.html
76
+ - doc/classes/Vector.src
77
+ - doc/classes/Array.src/M000003.html
68
78
  - doc/classes/Classifier/Bayes.html
69
79
  - doc/classes/Classifier/Bayes.src
70
80
  - doc/classes/Classifier/ContentNode.html
@@ -73,41 +83,56 @@ files:
73
83
  - doc/classes/Classifier/LSI.src
74
84
  - doc/classes/Classifier/WordList.html
75
85
  - doc/classes/Classifier/WordList.src
76
- - doc/classes/Classifier/Bayes.src/M000023.html
77
- - doc/classes/Classifier/Bayes.src/M000024.html
78
- - doc/classes/Classifier/Bayes.src/M000025.html
79
- - doc/classes/Classifier/Bayes.src/M000026.html
80
- - doc/classes/Classifier/Bayes.src/M000027.html
81
- - doc/classes/Classifier/Bayes.src/M000028.html
82
- - doc/classes/Classifier/Bayes.src/M000029.html
83
- - doc/classes/Classifier/ContentNode.src/M000031.html
84
- - doc/classes/Classifier/ContentNode.src/M000032.html
85
- - doc/classes/Classifier/ContentNode.src/M000033.html
86
- - doc/classes/Classifier/ContentNode.src/M000034.html
87
- - doc/classes/Classifier/LSI.src/M000011.html
88
- - doc/classes/Classifier/LSI.src/M000012.html
89
- - doc/classes/Classifier/LSI.src/M000013.html
90
- - doc/classes/Classifier/LSI.src/M000014.html
91
- - doc/classes/Classifier/LSI.src/M000015.html
92
- - doc/classes/Classifier/LSI.src/M000016.html
93
- - doc/classes/Classifier/LSI.src/M000017.html
94
- - doc/classes/Classifier/LSI.src/M000018.html
95
- - doc/classes/Classifier/LSI.src/M000019.html
96
- - doc/classes/Classifier/LSI.src/M000020.html
97
- - doc/classes/Classifier/LSI.src/M000021.html
86
+ - doc/classes/Classifier/Bayes.src/M000038.html
87
+ - doc/classes/Classifier/Bayes.src/M000039.html
88
+ - doc/classes/Classifier/Bayes.src/M000040.html
89
+ - doc/classes/Classifier/Bayes.src/M000041.html
90
+ - doc/classes/Classifier/Bayes.src/M000042.html
91
+ - doc/classes/Classifier/Bayes.src/M000043.html
92
+ - doc/classes/Classifier/Bayes.src/M000044.html
93
+ - doc/classes/Classifier/ContentNode.src/M000046.html
94
+ - doc/classes/Classifier/ContentNode.src/M000047.html
95
+ - doc/classes/Classifier/ContentNode.src/M000048.html
96
+ - doc/classes/Classifier/ContentNode.src/M000049.html
98
97
  - doc/classes/Classifier/LSI.src/M000022.html
99
- - doc/classes/Classifier/WordList.src/M000007.html
100
- - doc/classes/Classifier/WordList.src/M000008.html
101
- - doc/classes/Classifier/WordList.src/M000009.html
102
- - doc/classes/Classifier/WordList.src/M000010.html
98
+ - doc/classes/Classifier/LSI.src/M000023.html
99
+ - doc/classes/Classifier/LSI.src/M000024.html
100
+ - doc/classes/Classifier/LSI.src/M000025.html
101
+ - doc/classes/Classifier/LSI.src/M000026.html
102
+ - doc/classes/Classifier/LSI.src/M000027.html
103
+ - doc/classes/Classifier/LSI.src/M000028.html
104
+ - doc/classes/Classifier/LSI.src/M000029.html
105
+ - doc/classes/Classifier/LSI.src/M000030.html
106
+ - doc/classes/Classifier/LSI.src/M000031.html
107
+ - doc/classes/Classifier/LSI.src/M000032.html
108
+ - doc/classes/Classifier/LSI.src/M000033.html
109
+ - doc/classes/Classifier/LSI.src/M000034.html
110
+ - doc/classes/Classifier/LSI.src/M000035.html
111
+ - doc/classes/Classifier/LSI.src/M000036.html
112
+ - doc/classes/Classifier/LSI.src/M000037.html
113
+ - doc/classes/Classifier/WordList.src/M000017.html
114
+ - doc/classes/Classifier/WordList.src/M000018.html
115
+ - doc/classes/Classifier/WordList.src/M000019.html
116
+ - doc/classes/Classifier/WordList.src/M000020.html
117
+ - doc/classes/Classifier/WordList.src/M000021.html
118
+ - doc/classes/GSL/Matrix.html
103
119
  - doc/classes/GSL/Vector.html
104
120
  - doc/classes/GSL/Vector.src
105
- - doc/classes/GSL/Vector.src/M000005.html
106
- - doc/classes/GSL/Vector.src/M000006.html
107
- - doc/classes/Object.src/M000001.html
108
- - doc/classes/String.src/M000002.html
109
- - doc/classes/String.src/M000003.html
110
- - doc/classes/String.src/M000004.html
121
+ - doc/classes/GSL/Vector.src/M000015.html
122
+ - doc/classes/GSL/Vector.src/M000016.html
123
+ - doc/classes/Matrix.src/M000004.html
124
+ - doc/classes/Matrix.src/M000005.html
125
+ - doc/classes/Matrix.src/M000006.html
126
+ - doc/classes/Object.src/M000007.html
127
+ - doc/classes/String.src/M000008.html
128
+ - doc/classes/String.src/M000009.html
129
+ - doc/classes/String.src/M000010.html
130
+ - doc/classes/String.src/M000011.html
131
+ - doc/classes/String.src/M000012.html
132
+ - doc/classes/String.src/M000013.html
133
+ - doc/classes/String.src/M000014.html
134
+ - doc/classes/Vector.src/M000001.html
135
+ - doc/classes/Vector.src/M000002.html
111
136
  - doc/files/lib
112
137
  - doc/files/README.html
113
138
  - doc/files/lib/classifier
@@ -116,11 +141,13 @@ files:
116
141
  - doc/files/lib/classifier/extensions
117
142
  - doc/files/lib/classifier/lsi
118
143
  - doc/files/lib/classifier/lsi_rb.html
119
- - doc/files/lib/classifier/string_extensions_rb.html
144
+ - doc/files/lib/classifier/extensions/string_rb.html
145
+ - doc/files/lib/classifier/extensions/vector_rb.html
120
146
  - doc/files/lib/classifier/extensions/vector_serialize_rb.html
121
147
  - doc/files/lib/classifier/extensions/word_hash_rb.html
122
- - doc/files/lib/classifier/extensions/word_list_rb.html
123
148
  - doc/files/lib/classifier/lsi/content_node_rb.html
149
+ - doc/files/lib/classifier/lsi/summary_rb.html
150
+ - doc/files/lib/classifier/lsi/word_list_rb.html
124
151
  test_files: []
125
152
  rdoc_options: []
126
153
  extra_rdoc_files: []
@@ -1,21 +0,0 @@
1
- <?xml version="1.0" encoding="iso-8859-1"?>
2
- <!DOCTYPE html
3
- PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
4
- "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
5
-
6
- <html>
7
- <head>
8
- <title>new (Classifier::ContentNode)</title>
9
- <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
- <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
11
- </head>
12
- <body class="standalone-code">
13
- <pre><span class="ruby-comment cmt"># File lib/classifier/lsi/content_node.rb, line 19</span>
14
- <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">initialize</span>( <span class="ruby-identifier">source</span>, <span class="ruby-identifier">categories</span>=<span class="ruby-keyword kw">nil</span>, <span class="ruby-identifier">text_proc</span>=<span class="ruby-keyword kw">nil</span> )
15
- <span class="ruby-identifier">text_proc</span> = <span class="ruby-identifier">text_proc</span> <span class="ruby-operator">||</span> (<span class="ruby-identifier">proc</span> {<span class="ruby-operator">|</span><span class="ruby-identifier">x</span><span class="ruby-operator">|</span> <span class="ruby-identifier">x</span>.<span class="ruby-identifier">to_s</span>})
16
- <span class="ruby-ivar">@categories</span> = <span class="ruby-identifier">categories</span> <span class="ruby-operator">||</span> []
17
- <span class="ruby-ivar">@source</span> = <span class="ruby-identifier">source</span>
18
- <span class="ruby-ivar">@word_hash</span> = <span class="ruby-identifier">text_proc</span>.<span class="ruby-identifier">call</span>( <span class="ruby-ivar">@source</span> ).<span class="ruby-identifier">clean_word_hash</span>
19
- <span class="ruby-keyword kw">end</span></pre>
20
- </body>
21
- </html>
@@ -1,41 +0,0 @@
1
- <?xml version="1.0" encoding="iso-8859-1"?>
2
- <!DOCTYPE html
3
- PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
4
- "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
5
-
6
- <html>
7
- <head>
8
- <title>raw_vector_with (Classifier::ContentNode)</title>
9
- <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
- <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
11
- </head>
12
- <body class="standalone-code">
13
- <pre><span class="ruby-comment cmt"># File lib/classifier/lsi/content_node.rb, line 38</span>
14
- <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">raw_vector_with</span>( <span class="ruby-identifier">word_list</span> )
15
- <span class="ruby-identifier">vec</span> = <span class="ruby-constant">Array</span>.<span class="ruby-identifier">new</span>(<span class="ruby-identifier">word_list</span>.<span class="ruby-identifier">size</span>, <span class="ruby-value">0</span>)
16
-
17
- <span class="ruby-ivar">@word_hash</span>.<span class="ruby-identifier">each_key</span> <span class="ruby-keyword kw">do</span> <span class="ruby-operator">|</span><span class="ruby-identifier">word</span><span class="ruby-operator">|</span>
18
- <span class="ruby-identifier">vec</span>[<span class="ruby-identifier">word_list</span>[<span class="ruby-identifier">word</span>]] = <span class="ruby-ivar">@word_hash</span>[<span class="ruby-identifier">word</span>] <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">word_list</span>[<span class="ruby-identifier">word</span>]
19
- <span class="ruby-keyword kw">end</span>
20
-
21
- <span class="ruby-comment cmt"># Perform the scaling transform</span>
22
- <span class="ruby-identifier">total_words</span> = <span class="ruby-identifier">vec</span>.<span class="ruby-identifier">inject</span>(<span class="ruby-value">0</span>) { <span class="ruby-operator">|</span><span class="ruby-identifier">sum</span>,<span class="ruby-identifier">term</span><span class="ruby-operator">|</span> <span class="ruby-identifier">sum</span> <span class="ruby-operator">+=</span> <span class="ruby-identifier">term</span> }.<span class="ruby-identifier">to_f</span>
23
-
24
- <span class="ruby-comment cmt"># Perform first-order association transform if this vector has more</span>
25
- <span class="ruby-comment cmt"># than one word in it. </span>
26
- <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">total_words</span> <span class="ruby-operator">&gt;</span> <span class="ruby-value">1.0</span>
27
- <span class="ruby-identifier">weighted_total</span> = <span class="ruby-identifier">vec</span>.<span class="ruby-identifier">inject</span>(<span class="ruby-value">0</span><span class="ruby-value">.0</span>) <span class="ruby-keyword kw">do</span> <span class="ruby-operator">|</span><span class="ruby-identifier">sum</span>,<span class="ruby-identifier">term</span><span class="ruby-operator">|</span>
28
- <span class="ruby-keyword kw">if</span>( <span class="ruby-identifier">term</span> <span class="ruby-operator">&gt;</span> <span class="ruby-value">0</span> )
29
- <span class="ruby-identifier">sum</span> <span class="ruby-operator">+=</span> (( <span class="ruby-identifier">term</span> <span class="ruby-operator">/</span> <span class="ruby-identifier">total_words</span> ) <span class="ruby-operator">*</span> <span class="ruby-constant">Math</span>.<span class="ruby-identifier">log</span>( <span class="ruby-identifier">term</span> <span class="ruby-operator">/</span> <span class="ruby-identifier">total_words</span> ))
30
- <span class="ruby-keyword kw">else</span>
31
- <span class="ruby-identifier">sum</span>
32
- <span class="ruby-keyword kw">end</span>
33
- <span class="ruby-keyword kw">end</span>
34
- <span class="ruby-identifier">vec</span>.<span class="ruby-identifier">map!</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">val</span><span class="ruby-operator">|</span> <span class="ruby-constant">Math</span>.<span class="ruby-identifier">log</span>( <span class="ruby-identifier">val</span> <span class="ruby-operator">+</span> <span class="ruby-value">1</span> ) <span class="ruby-operator">/</span> <span class="ruby-operator">-</span><span class="ruby-identifier">weighted_total</span> }
35
- <span class="ruby-keyword kw">end</span>
36
-
37
- <span class="ruby-ivar">@raw_norm</span> = <span class="ruby-constant">GSL</span><span class="ruby-operator">::</span><span class="ruby-constant">Vector</span>.<span class="ruby-identifier">new</span>( <span class="ruby-identifier">vec</span> ).<span class="ruby-identifier">normalize</span>
38
- <span class="ruby-ivar">@raw_vector</span> = <span class="ruby-constant">GSL</span><span class="ruby-operator">::</span><span class="ruby-constant">Vector</span>.<span class="ruby-identifier">new</span>( <span class="ruby-identifier">vec</span> )
39
- <span class="ruby-keyword kw">end</span></pre>
40
- </body>
41
- </html>
@@ -1,20 +0,0 @@
1
- <?xml version="1.0" encoding="iso-8859-1"?>
2
- <!DOCTYPE html
3
- PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
4
- "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
5
-
6
- <html>
7
- <head>
8
- <title>new (Classifier::LSI)</title>
9
- <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
10
- <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
11
- </head>
12
- <body class="standalone-code">
13
- <pre><span class="ruby-comment cmt"># File lib/classifier/lsi.rb, line 26</span>
14
- <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">initialize</span>(<span class="ruby-identifier">options</span> = {})
15
- <span class="ruby-ivar">@auto_rebuild</span> = <span class="ruby-keyword kw">true</span> <span class="ruby-keyword kw">unless</span> <span class="ruby-identifier">options</span>[<span class="ruby-identifier">:auto_rebuild</span>] <span class="ruby-operator">==</span> <span class="ruby-keyword kw">false</span>
16
- <span class="ruby-ivar">@word_list</span>, <span class="ruby-ivar">@items</span> = <span class="ruby-constant">WordList</span>.<span class="ruby-identifier">new</span>, {}
17
- <span class="ruby-ivar">@version</span>, <span class="ruby-ivar">@built_at_version</span> = <span class="ruby-value">0</span>, <span class="ruby-value">-1</span>
18
- <span class="ruby-keyword kw">end</span></pre>
19
- </body>
20
- </html>