classifier 1.3.4 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,135 +2,129 @@
2
2
  # Copyright:: Copyright (c) 2005 Lucas Carlson
3
3
  # License:: LGPL
4
4
 
5
- require "set"
6
-
7
- # These are extensions to the String class to provide convenience
5
+ # These are extensions to the String class to provide convenience
8
6
  # methods for the Classifier package.
9
7
  class String
10
-
11
- # Removes common punctuation symbols, returning a new string.
8
+ # Removes common punctuation symbols, returning a new string.
12
9
  # E.g.,
13
10
  # "Hello (greeting's), with {braces} < >...?".without_punctuation
14
11
  # => "Hello greetings with braces "
15
12
  def without_punctuation
16
- tr( ',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', " " ) .tr( "'\-", "")
13
+ tr(',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', ' ').tr("'\-", '')
17
14
  end
18
-
15
+
19
16
  # Return a Hash of strings => ints. Each word in the string is stemmed,
20
- # interned, and indexes to its frequency in the document.
21
- def word_hash
22
- word_hash = clean_word_hash()
23
- symbol_hash = word_hash_for_symbols(gsub(/[\w]/," ").split)
24
- return word_hash.merge(symbol_hash)
25
- end
17
+ # interned, and indexes to its frequency in the document.
18
+ def word_hash
19
+ word_hash = clean_word_hash
20
+ symbol_hash = word_hash_for_symbols(gsub(/\w/, ' ').split)
21
+ word_hash.merge(symbol_hash)
22
+ end
23
+
24
+ # Return a word hash without extra punctuation or short symbols, just stemmed words
25
+ def clean_word_hash
26
+ word_hash_for_words gsub(/[^\w\s]/, '').split
27
+ end
28
+
29
+ private
26
30
 
27
- # Return a word hash without extra punctuation or short symbols, just stemmed words
28
- def clean_word_hash
29
- word_hash_for_words gsub(/[^\w\s]/,"").split
30
- end
31
-
32
- private
33
-
34
- def word_hash_for_words(words)
35
- d = Hash.new(0)
36
- words.each do |word|
37
- word.downcase!
38
- if ! CORPUS_SKIP_WORDS.include?(word) && word.length > 2
39
- d[word.stem.intern] += 1
40
- end
41
- end
42
- return d
43
- end
31
+ def word_hash_for_words(words)
32
+ d = Hash.new(0)
33
+ words.each do |word|
34
+ word.downcase!
35
+ d[word.stem.intern] += 1 if !CORPUS_SKIP_WORDS.include?(word) && word.length > 2
36
+ end
37
+ d
38
+ end
44
39
 
40
+ def word_hash_for_symbols(words)
41
+ d = Hash.new(0)
42
+ words.each do |word|
43
+ d[word.intern] += 1
44
+ end
45
+ d
46
+ end
45
47
 
46
- def word_hash_for_symbols(words)
47
- d = Hash.new(0)
48
- words.each do |word|
49
- d[word.intern] += 1
50
- end
51
- return d
52
- end
53
-
54
- CORPUS_SKIP_WORDS = Set.new([
55
- "a",
56
- "again",
57
- "all",
58
- "along",
59
- "are",
60
- "also",
61
- "an",
62
- "and",
63
- "as",
64
- "at",
65
- "but",
66
- "by",
67
- "came",
68
- "can",
69
- "cant",
70
- "couldnt",
71
- "did",
72
- "didn",
73
- "didnt",
74
- "do",
75
- "doesnt",
76
- "dont",
77
- "ever",
78
- "first",
79
- "from",
80
- "have",
81
- "her",
82
- "here",
83
- "him",
84
- "how",
85
- "i",
86
- "if",
87
- "in",
88
- "into",
89
- "is",
90
- "isnt",
91
- "it",
92
- "itll",
93
- "just",
94
- "last",
95
- "least",
96
- "like",
97
- "most",
98
- "my",
99
- "new",
100
- "no",
101
- "not",
102
- "now",
103
- "of",
104
- "on",
105
- "or",
106
- "should",
107
- "sinc",
108
- "so",
109
- "some",
110
- "th",
111
- "than",
112
- "this",
113
- "that",
114
- "the",
115
- "their",
116
- "then",
117
- "those",
118
- "to",
119
- "told",
120
- "too",
121
- "true",
122
- "try",
123
- "until",
124
- "url",
125
- "us",
126
- "were",
127
- "when",
128
- "whether",
129
- "while",
130
- "with",
131
- "within",
132
- "yes",
133
- "you",
134
- "youll",
135
- ])
48
+ CORPUS_SKIP_WORDS = Set.new(%w[
49
+ a
50
+ again
51
+ all
52
+ along
53
+ are
54
+ also
55
+ an
56
+ and
57
+ as
58
+ at
59
+ but
60
+ by
61
+ came
62
+ can
63
+ cant
64
+ couldnt
65
+ did
66
+ didn
67
+ didnt
68
+ do
69
+ doesnt
70
+ dont
71
+ ever
72
+ first
73
+ from
74
+ have
75
+ her
76
+ here
77
+ him
78
+ how
79
+ i
80
+ if
81
+ in
82
+ into
83
+ is
84
+ isnt
85
+ it
86
+ itll
87
+ just
88
+ last
89
+ least
90
+ like
91
+ most
92
+ my
93
+ new
94
+ no
95
+ not
96
+ now
97
+ of
98
+ on
99
+ or
100
+ should
101
+ sinc
102
+ so
103
+ some
104
+ th
105
+ than
106
+ this
107
+ that
108
+ the
109
+ their
110
+ then
111
+ those
112
+ to
113
+ told
114
+ too
115
+ true
116
+ try
117
+ until
118
+ url
119
+ us
120
+ were
121
+ when
122
+ whether
123
+ while
124
+ with
125
+ within
126
+ yes
127
+ you
128
+ youll
129
+ ])
136
130
  end
@@ -3,70 +3,72 @@
3
3
  # License:: LGPL
4
4
 
5
5
  module Classifier
6
-
7
- # This is an internal data structure class for the LSI node. Save for
8
- # raw_vector_with, it should be fairly straightforward to understand.
9
- # You should never have to use it directly.
6
+ # This is an internal data structure class for the LSI node. Save for
7
+ # raw_vector_with, it should be fairly straightforward to understand.
8
+ # You should never have to use it directly.
10
9
  class ContentNode
11
- attr_accessor :raw_vector, :raw_norm,
10
+ attr_accessor :raw_vector, :raw_norm,
12
11
  :lsi_vector, :lsi_norm,
13
- :categories
14
-
12
+ :categories
13
+
15
14
  attr_reader :word_hash
15
+
16
16
  # If text_proc is not specified, the source will be duck-typed
17
17
  # via source.to_s
18
- def initialize( word_hash, *categories )
18
+ def initialize(word_frequencies, *categories)
19
19
  @categories = categories || []
20
- @word_hash = word_hash
20
+ @word_hash = word_frequencies
21
21
  end
22
-
22
+
23
23
  # Use this to fetch the appropriate search vector.
24
24
  def search_vector
25
25
  @lsi_vector || @raw_vector
26
26
  end
27
-
27
+
28
28
  # Use this to fetch the appropriate search vector in normalized form.
29
29
  def search_norm
30
30
  @lsi_norm || @raw_norm
31
31
  end
32
-
32
+
33
33
  # Creates the raw vector out of word_hash using word_list as the
34
34
  # key for mapping the vector space.
35
- def raw_vector_with( word_list )
36
- if $GSL
37
- vec = GSL::Vector.alloc(word_list.size)
38
- else
39
- vec = Array.new(word_list.size, 0)
40
- end
35
+ def raw_vector_with(word_list)
36
+ vec = if $GSL
37
+ GSL::Vector.alloc(word_list.size)
38
+ else
39
+ Array.new(word_list.size, 0)
40
+ end
41
41
 
42
42
  @word_hash.each_key do |word|
43
43
  vec[word_list[word]] = @word_hash[word] if word_list[word]
44
44
  end
45
-
45
+
46
46
  # Perform the scaling transform
47
- total_words = vec.sum
48
-
47
+ total_words = $GSL ? vec.sum : vec.sum_with_identity
48
+
49
49
  # Perform first-order association transform if this vector has more
50
- # than one word in it.
51
- if total_words > 1.0
50
+ # than one word in it.
51
+ if total_words > 1.0
52
52
  weighted_total = 0.0
53
+
53
54
  vec.each do |term|
54
- if ( term > 0 )
55
- weighted_total += (( term / total_words ) * Math.log( term / total_words ))
56
- end
57
- end
58
- vec = vec.collect { |val| Math.log( val + 1 ) / -weighted_total }
55
+ next unless term.positive?
56
+ next if total_words.zero?
57
+
58
+ term_over_total = term / total_words
59
+ val = term_over_total * Math.log(term_over_total)
60
+ weighted_total += val unless val.nan?
61
+ end
62
+ vec = vec.collect { |val| Math.log(val + 1) / -weighted_total }
59
63
  end
60
-
64
+
61
65
  if $GSL
62
- @raw_norm = vec.normalize
63
- @raw_vector = vec
66
+ @raw_norm = vec.normalize
67
+ @raw_vector = vec
64
68
  else
65
- @raw_norm = Vector[*vec].normalize
66
- @raw_vector = Vector[*vec]
69
+ @raw_norm = Vector[*vec].normalize
70
+ @raw_vector = Vector[*vec]
67
71
  end
68
- end
69
-
70
- end
71
-
72
+ end
73
+ end
72
74
  end
@@ -3,29 +3,29 @@
3
3
  # License:: LGPL
4
4
 
5
5
  class String
6
- def summary( count=10, separator=" [...] " )
7
- perform_lsi split_sentences, count, separator
8
- end
6
+ def summary(count = 10, separator = ' [...] ')
7
+ perform_lsi split_sentences, count, separator
8
+ end
9
9
 
10
- def paragraph_summary( count=1, separator=" [...] " )
11
- perform_lsi split_paragraphs, count, separator
12
- end
10
+ def paragraph_summary(count = 1, separator = ' [...] ')
11
+ perform_lsi split_paragraphs, count, separator
12
+ end
13
13
 
14
- def split_sentences
15
- split /(\.|\!|\?)/ # TODO: make this less primitive
16
- end
17
-
18
- def split_paragraphs
19
- split /(\n\n|\r\r|\r\n\r\n)/ # TODO: make this less primitive
20
- end
21
-
22
- private
23
-
24
- def perform_lsi(chunks, count, separator)
25
- lsi = Classifier::LSI.new :auto_rebuild => false
26
- chunks.each { |chunk| lsi << chunk unless chunk.strip.empty? || chunk.strip.split.size == 1 }
27
- lsi.build_index
28
- summaries = lsi.highest_relative_content count
29
- return summaries.reject { |chunk| !summaries.include? chunk }.map { |x| x.strip }.join(separator)
30
- end
31
- end
14
+ def split_sentences
15
+ split(/(\.|!|\?)/) # TODO: make this less primitive
16
+ end
17
+
18
+ def split_paragraphs
19
+ split(/(\n\n|\r\r|\r\n\r\n)/) # TODO: make this less primitive
20
+ end
21
+
22
+ private
23
+
24
+ def perform_lsi(chunks, count, separator)
25
+ lsi = Classifier::LSI.new auto_rebuild: false
26
+ chunks.each { |chunk| lsi << chunk unless chunk.strip.empty? || chunk.strip.split.size == 1 }
27
+ lsi.build_index
28
+ summaries = lsi.highest_relative_content count
29
+ summaries.select { |chunk| summaries.include?(chunk) }.map(&:strip).join(separator)
30
+ end
31
+ end
@@ -2,35 +2,34 @@
2
2
  # Copyright:: Copyright (c) 2005 David Fayram II
3
3
  # License:: LGPL
4
4
 
5
- module Classifier
5
+ module Classifier
6
6
  # This class keeps a word => index mapping. It is used to map stemmed words
7
7
  # to dimensions of a vector.
8
-
8
+
9
9
  class WordList
10
10
  def initialize
11
- @location_table = Hash.new
11
+ @location_table = {}
12
12
  end
13
-
13
+
14
14
  # Adds a word (if it is new) and assigns it a unique dimension.
15
15
  def add_word(word)
16
16
  term = word
17
17
  @location_table[term] = @location_table.size unless @location_table[term]
18
18
  end
19
-
19
+
20
20
  # Returns the dimension of the word or nil if the word is not in the space.
21
21
  def [](lookup)
22
22
  term = lookup
23
23
  @location_table[term]
24
24
  end
25
-
25
+
26
26
  def word_for_index(ind)
27
27
  @location_table.invert[ind]
28
28
  end
29
-
29
+
30
30
  # Returns the number of words mapped.
31
31
  def size
32
32
  @location_table.size
33
33
  end
34
-
35
34
  end
36
35
  end