classifier 1.3.5 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,135 +2,129 @@
 # Copyright:: Copyright (c) 2005 Lucas Carlson
 # License:: LGPL
 
-require "set"
-
 # These are extensions to the String class to provide convenience
 # methods for the Classifier package.
 class String
-
   # Removes common punctuation symbols, returning a new string.
   # E.g.,
   # "Hello (greeting's), with {braces} < >...?".without_punctuation
   # => "Hello greetings with braces "
   def without_punctuation
-    tr( ',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', " " ) .tr( "'\-", "")
+    tr(',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', ' ').tr("'\-", '')
   end
 
   # Return a Hash of strings => ints. Each word in the string is stemmed,
   # interned, and indexes to its frequency in the document.
-  def word_hash
-    word_hash = clean_word_hash()
-    symbol_hash = word_hash_for_symbols(gsub(/[\w]/," ").split)
-    return word_hash.merge(symbol_hash)
-  end
-
-  # Return a word hash without extra punctuation or short symbols, just stemmed words
-  def clean_word_hash
-    word_hash_for_words gsub(/[^\w\s]/,"").split
-  end
+  def word_hash
+    word_hash = clean_word_hash
+    symbol_hash = word_hash_for_symbols(gsub(/\w/, ' ').split)
+    word_hash.merge(symbol_hash)
+  end
 
-  private
+  # Return a word hash without extra punctuation or short symbols, just stemmed words
+  def clean_word_hash
+    word_hash_for_words gsub(/[^\w\s]/, '').split
+  end
 
-  def word_hash_for_words(words)
-    d = Hash.new(0)
-    words.each do |word|
-      word.downcase!
-      if ! CORPUS_SKIP_WORDS.include?(word) && word.length > 2
-        d[word.stem.intern] += 1
-      end
-    end
-    return d
-  end
+  private
 
+  def word_hash_for_words(words)
+    d = Hash.new(0)
+    words.each do |word|
+      word.downcase!
+      d[word.stem.intern] += 1 if !CORPUS_SKIP_WORDS.include?(word) && word.length > 2
+    end
+    d
+  end
 
-  def word_hash_for_symbols(words)
-    d = Hash.new(0)
-    words.each do |word|
-      d[word.intern] += 1
-    end
-    return d
-  end
+  def word_hash_for_symbols(words)
+    d = Hash.new(0)
+    words.each do |word|
+      d[word.intern] += 1
+    end
+    d
+  end
 
-  CORPUS_SKIP_WORDS = Set.new([
-    "a",
-    "again",
-    "all",
-    "along",
-    "are",
-    "also",
-    "an",
-    "and",
-    "as",
-    "at",
-    "but",
-    "by",
-    "came",
-    "can",
-    "cant",
-    "couldnt",
-    "did",
-    "didn",
-    "didnt",
-    "do",
-    "doesnt",
-    "dont",
-    "ever",
-    "first",
-    "from",
-    "have",
-    "her",
-    "here",
-    "him",
-    "how",
-    "i",
-    "if",
-    "in",
-    "into",
-    "is",
-    "isnt",
-    "it",
-    "itll",
-    "just",
-    "last",
-    "least",
-    "like",
-    "most",
-    "my",
-    "new",
-    "no",
-    "not",
-    "now",
-    "of",
-    "on",
-    "or",
-    "should",
-    "sinc",
-    "so",
-    "some",
-    "th",
-    "than",
-    "this",
-    "that",
-    "the",
-    "their",
-    "then",
-    "those",
-    "to",
-    "told",
-    "too",
-    "true",
-    "try",
-    "until",
-    "url",
-    "us",
-    "were",
-    "when",
-    "whether",
-    "while",
-    "with",
-    "within",
-    "yes",
-    "you",
-    "youll",
-  ])
+  CORPUS_SKIP_WORDS = Set.new(%w[
+    a
+    again
+    all
+    along
+    are
+    also
+    an
+    and
+    as
+    at
+    but
+    by
+    came
+    can
+    cant
+    couldnt
+    did
+    didn
+    didnt
+    do
+    doesnt
+    dont
+    ever
+    first
+    from
+    have
+    her
+    here
+    him
+    how
+    i
+    if
+    in
+    into
+    is
+    isnt
+    it
+    itll
+    just
+    last
+    least
+    like
+    most
+    my
+    new
+    no
+    not
+    now
+    of
+    on
+    or
+    should
+    sinc
+    so
+    some
+    th
+    than
+    this
+    that
+    the
+    their
+    then
+    those
+    to
+    told
+    too
+    true
+    try
+    until
+    url
+    us
+    were
+    when
+    whether
+    while
+    with
+    within
+    yes
+    you
+    youll
+  ])
 end
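
For orientation, the refactored hashing keeps its old behaviour: clean_word_hash downcases, stems, and counts words longer than two characters that are not in CORPUS_SKIP_WORDS, and word_hash merges in counts of standalone symbol tokens. A rough usage sketch, assuming the gem is loaded with require 'classifier'; the stemmed keys and example outputs are illustrative and depend on the stemmer:

    require 'classifier'

    text = 'Dogs are loyal; dogs bark loudly!!'

    # Stemmed word counts only: punctuation stripped, skip words such as "are" dropped.
    text.clean_word_hash  # => e.g. { dog: 2, loyal: 1, bark: 1, loudli: 1 }

    # The same hash merged with counts of symbol-only tokens such as ";" and "!!".
    text.word_hash        # => the hash above plus e.g. { ';': 1, '!!': 1 }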
@@ -3,21 +3,21 @@
 # License:: LGPL
 
 module Classifier
-
-  # This is an internal data structure class for the LSI node. Save for
-  # raw_vector_with, it should be fairly straightforward to understand.
-  # You should never have to use it directly.
+  # This is an internal data structure class for the LSI node. Save for
+  # raw_vector_with, it should be fairly straightforward to understand.
+  # You should never have to use it directly.
   class ContentNode
     attr_accessor :raw_vector, :raw_norm,
                   :lsi_vector, :lsi_norm,
                   :categories
 
     attr_reader :word_hash
+
     # If text_proc is not specified, the source will be duck-typed
     # via source.to_s
-    def initialize( word_hash, *categories )
+    def initialize(word_frequencies, *categories)
       @categories = categories || []
-      @word_hash = word_hash
+      @word_hash = word_frequencies
     end
 
     # Use this to fetch the appropriate search vector.
@@ -32,41 +32,43 @@ module Classifier
 
     # Creates the raw vector out of word_hash using word_list as the
     # key for mapping the vector space.
-    def raw_vector_with( word_list )
-      if $GSL
-        vec = GSL::Vector.alloc(word_list.size)
-      else
-        vec = Array.new(word_list.size, 0)
-      end
+    def raw_vector_with(word_list)
+      vec = if $GSL
+              GSL::Vector.alloc(word_list.size)
+            else
+              Array.new(word_list.size, 0)
+            end
 
       @word_hash.each_key do |word|
         vec[word_list[word]] = @word_hash[word] if word_list[word]
       end
 
       # Perform the scaling transform
-      total_words = vec.sum
+      total_words = $GSL ? vec.sum : vec.sum_with_identity
 
       # Perform first-order association transform if this vector has more
       # than one word in it.
       if total_words > 1.0
         weighted_total = 0.0
+
         vec.each do |term|
-          if ( term > 0 )
-            weighted_total += (( term / total_words ) * Math.log( term / total_words ))
-          end
+          next unless term.positive?
+          next if total_words.zero?
+
+          term_over_total = term / total_words
+          val = term_over_total * Math.log(term_over_total)
+          weighted_total += val unless val.nan?
         end
-        vec = vec.collect { |val| Math.log( val + 1 ) / -weighted_total }
+        vec = vec.collect { |val| Math.log(val + 1) / -weighted_total }
       end
 
       if $GSL
-        @raw_norm = vec.normalize
-        @raw_vector = vec
+        @raw_norm = vec.normalize
+        @raw_vector = vec
       else
-        @raw_norm = Vector[*vec].normalize
-        @raw_vector = Vector[*vec]
+        @raw_norm = Vector[*vec].normalize
+        @raw_vector = Vector[*vec]
       end
     end
-
   end
-
 end
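
To make the scaling step above easier to follow outside the diff: total_words is the sum of the raw term counts (sum_with_identity is presumably an Array helper added elsewhere in this release so the plain-Ruby path sums as floats), weighted_total accumulates (t/T) * log(t/T) over the non-zero terms, and each component is then replaced by log(count + 1) / -weighted_total. A minimal sketch of the non-GSL arithmetic on a plain Array, not the gem's own code:

    # Illustrative re-implementation of the non-GSL scaling in raw_vector_with.
    def scale(vec)
      total_words = vec.sum(0.0) # float sum, standing in for sum_with_identity
      return vec unless total_words > 1.0

      # Entropy-like weight: sum of (t / T) * log(t / T) over positive terms.
      weighted_total = 0.0
      vec.each do |term|
        next unless term.positive?

        ratio = term / total_words
        weighted_total += ratio * Math.log(ratio)
      end

      # Each component becomes log(count + 1) divided by the negated weight.
      vec.map { |val| Math.log(val + 1) / -weighted_total }
    end

    scale([2, 1, 0, 1]) # => roughly [1.06, 0.67, 0.0, 0.67]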
@@ -3,29 +3,29 @@
 # License:: LGPL
 
 class String
-  def summary( count=10, separator=" [...] " )
-    perform_lsi split_sentences, count, separator
-  end
+  def summary(count = 10, separator = ' [...] ')
+    perform_lsi split_sentences, count, separator
+  end
 
-  def paragraph_summary( count=1, separator=" [...] " )
-    perform_lsi split_paragraphs, count, separator
-  end
+  def paragraph_summary(count = 1, separator = ' [...] ')
+    perform_lsi split_paragraphs, count, separator
+  end
 
-  def split_sentences
-    split /(\.|\!|\?)/ # TODO: make this less primitive
-  end
+  def split_sentences
+    split(/(\.|!|\?)/) # TODO: make this less primitive
+  end
 
-  def split_paragraphs
-    split /(\n\n|\r\r|\r\n\r\n)/ # TODO: make this less primitive
-  end
+  def split_paragraphs
+    split(/(\n\n|\r\r|\r\n\r\n)/) # TODO: make this less primitive
+  end
 
-  private
+  private
 
-  def perform_lsi(chunks, count, separator)
-    lsi = Classifier::LSI.new :auto_rebuild => false
-    chunks.each { |chunk| lsi << chunk unless chunk.strip.empty? || chunk.strip.split.size == 1 }
-    lsi.build_index
-    summaries = lsi.highest_relative_content count
-    return summaries.reject { |chunk| !summaries.include? chunk }.map { |x| x.strip }.join(separator)
-  end
+  def perform_lsi(chunks, count, separator)
+    lsi = Classifier::LSI.new auto_rebuild: false
+    chunks.each { |chunk| lsi << chunk unless chunk.strip.empty? || chunk.strip.split.size == 1 }
+    lsi.build_index
+    summaries = lsi.highest_relative_content count
+    summaries.select { |chunk| summaries.include?(chunk) }.map(&:strip).join(separator)
+  end
 end
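
The summarizer's public surface is unchanged by this cleanup: String#summary splits the text into sentence chunks, indexes them with Classifier::LSI, and joins the highest-ranked chunks with the separator, while paragraph_summary does the same on blank-line-separated paragraphs. A hedged usage sketch; which sentences come back depends on the LSI ranking, and GSL is optional but speeds up the index:

    require 'classifier'

    text = 'Ruby is a dynamic language. It was created by Yukihiro Matsumoto. ' \
           'Latent semantic indexing can rank sentences by how representative they are. ' \
           'This paragraph exists only as an example.'

    # The two most representative sentences, joined by the default " [...] " separator.
    puts text.summary(2)

    # Paragraph-level variant, here trivially the whole single paragraph.
    puts text.paragraph_summary(1)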
@@ -8,7 +8,7 @@ module Classifier
 
   class WordList
     def initialize
-      @location_table = Hash.new
+      @location_table = {}
     end
 
     # Adds a word (if it is new) and assigns it a unique dimension.
@@ -31,6 +31,5 @@ module Classifier
     def size
       @location_table.size
     end
-
   end
 end
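
For context, WordList is the word-to-dimension index that ContentNode#raw_vector_with consults via word_list[word]. A minimal illustrative stand-in built around the same @location_table idea (the class and method names below are hypothetical, not the gem's API):

    # Hypothetical stand-in: hands out one stable dimension index per distinct word.
    class TinyWordList
      def initialize
        @location_table = {}
      end

      # Adds a word (if it is new) and assigns it a unique dimension.
      def add_word(word)
        @location_table[word] ||= @location_table.size
      end

      # Returns the dimension for a known word, or nil -- which is why
      # ContentNode guards lookups with "if word_list[word]".
      def [](word)
        @location_table[word]
      end

      def size
        @location_table.size
      end
    end

    list = TinyWordList.new
    %i[dog cat dog].each { |w| list.add_word(w) }
    list[:cat] # => 1
    list.size  # => 2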