classifier 1.4.4 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,5 @@
1
+ # rbs_inline: enabled
2
+
1
3
  # Author:: David Fayram (mailto:dfayram@lensmen.net)
2
4
  # Copyright:: Copyright (c) 2005 David Fayram II
3
5
  # License:: LGPL
@@ -7,34 +9,49 @@ module Classifier
7
9
  # raw_vector_with, it should be fairly straightforward to understand.
8
10
  # You should never have to use it directly.
9
11
  class ContentNode
10
- attr_accessor :raw_vector, :raw_norm,
11
- :lsi_vector, :lsi_norm,
12
- :categories
12
+ # @rbs @word_hash: Hash[Symbol, Integer]
13
+
14
+ # @rbs @raw_vector: untyped
15
+ # @rbs @raw_norm: untyped
16
+ # @rbs @lsi_vector: untyped
17
+ # @rbs @lsi_norm: untyped
18
+ attr_accessor :raw_vector, :raw_norm, :lsi_vector, :lsi_norm
19
+
20
+ # @rbs @categories: Array[String | Symbol]
21
+ attr_accessor :categories
13
22
 
14
23
  attr_reader :word_hash
15
24
 
16
25
  # If text_proc is not specified, the source will be duck-typed
17
26
  # via source.to_s
27
+ #
28
+ # @rbs (Hash[Symbol, Integer], *String | Symbol) -> void
18
29
  def initialize(word_frequencies, *categories)
19
30
  @categories = categories || []
20
31
  @word_hash = word_frequencies
21
32
  end
22
33
 
23
34
  # Use this to fetch the appropriate search vector.
35
+ #
36
+ # @rbs () -> untyped
24
37
  def search_vector
25
38
  @lsi_vector || @raw_vector
26
39
  end
27
40
 
28
41
  # Use this to fetch the appropriate search vector in normalized form.
42
+ #
43
+ # @rbs () -> untyped
29
44
  def search_norm
30
45
  @lsi_norm || @raw_norm
31
46
  end
32
47
 
33
48
  # Creates the raw vector out of word_hash using word_list as the
34
49
  # key for mapping the vector space.
50
+ #
51
+ # @rbs (WordList) -> untyped
35
52
  def raw_vector_with(word_list)
36
- vec = if $GSL
37
- GSL::Vector.alloc(word_list.size)
53
+ vec = if Classifier::LSI.native_available?
54
+ Classifier::LSI.vector_class.alloc(word_list.size)
38
55
  else
39
56
  Array.new(word_list.size, 0)
40
57
  end
@@ -44,8 +61,9 @@ module Classifier
44
61
  end
45
62
 
46
63
  # Perform the scaling transform
47
- total_words = $GSL ? vec.sum : vec.sum_with_identity
48
- total_unique_words = vec.count { |word| word != 0 }
64
+ total_words = Classifier::LSI.native_available? ? vec.sum : vec.sum_with_identity
65
+ vec_array = Classifier::LSI.native_available? ? vec.to_a : vec
66
+ total_unique_words = vec_array.count { |word| word != 0 }
49
67
 
50
68
  # Perform first-order association transform if this vector has more
51
69
  # than one word in it.
@@ -60,10 +78,13 @@ module Classifier
60
78
  val = term_over_total * Math.log(term_over_total)
61
79
  weighted_total += val unless val.nan?
62
80
  end
63
- vec = vec.collect { |val| Math.log(val + 1) / -weighted_total }
81
+
82
+ sign = weighted_total.negative? ? 1.0 : -1.0
83
+ divisor = sign * [weighted_total.abs, Vector::EPSILON].max
84
+ vec = vec.collect { |val| Math.log(val + 1) / divisor }
64
85
  end
65
86
 
66
- if $GSL
87
+ if Classifier::LSI.native_available?
67
88
  @raw_norm = vec.normalize
68
89
  @raw_vector = vec
69
90
  else
@@ -1,3 +1,5 @@
1
+ # rbs_inline: enabled
2
+
1
3
  # Author:: David Fayram (mailto:dfayram@lensmen.net)
2
4
  # Copyright:: Copyright (c) 2005 David Fayram II
3
5
  # License:: LGPL
@@ -5,29 +7,38 @@
5
7
  module Classifier
6
8
  # This class keeps a word => index mapping. It is used to map stemmed words
7
9
  # to dimensions of a vector.
8
-
9
10
  class WordList
11
+ # @rbs @location_table: Hash[Symbol, Integer]
12
+
13
+ # @rbs () -> void
10
14
  def initialize
11
15
  @location_table = {}
12
16
  end
13
17
 
14
18
  # Adds a word (if it is new) and assigns it a unique dimension.
19
+ #
20
+ # @rbs (Symbol) -> Integer?
15
21
  def add_word(word)
16
22
  term = word
17
23
  @location_table[term] = @location_table.size unless @location_table[term]
18
24
  end
19
25
 
20
26
  # Returns the dimension of the word or nil if the word is not in the space.
27
+ #
28
+ # @rbs (Symbol) -> Integer?
21
29
  def [](lookup)
22
30
  term = lookup
23
31
  @location_table[term]
24
32
  end
25
33
 
34
+ # @rbs (Integer) -> Symbol?
26
35
  def word_for_index(ind)
27
36
  @location_table.invert[ind]
28
37
  end
29
38
 
30
39
  # Returns the number of words mapped.
40
+ #
41
+ # @rbs () -> Integer
31
42
  def size
32
43
  @location_table.size
33
44
  end