classifier 1.3.5 → 1.4.0

@@ -2,135 +2,129 @@
 # Copyright:: Copyright (c) 2005 Lucas Carlson
 # License:: LGPL
 
-require "set"
-
 # These are extensions to the String class to provide convenience
 # methods for the Classifier package.
 class String
-
   # Removes common punctuation symbols, returning a new string.
   # E.g.,
   #   "Hello (greeting's), with {braces} < >...?".without_punctuation
   #   => "Hello greetings with braces "
   def without_punctuation
-    tr( ',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', " " ) .tr( "'\-", "")
+    tr(',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', ' ').tr("'\-", '')
   end
 
   # Return a Hash of strings => ints. Each word in the string is stemmed,
   # interned, and indexes to its frequency in the document.
-  def word_hash
-    word_hash = clean_word_hash()
-    symbol_hash = word_hash_for_symbols(gsub(/[\w]/," ").split)
-    return word_hash.merge(symbol_hash)
-  end
-
-  # Return a word hash without extra punctuation or short symbols, just stemmed words
-  def clean_word_hash
-    word_hash_for_words gsub(/[^\w\s]/,"").split
-  end
+  def word_hash
+    word_hash = clean_word_hash
+    symbol_hash = word_hash_for_symbols(gsub(/\w/, ' ').split)
+    word_hash.merge(symbol_hash)
+  end
 
-  private
+  # Return a word hash without extra punctuation or short symbols, just stemmed words
+  def clean_word_hash
+    word_hash_for_words gsub(/[^\w\s]/, '').split
+  end
 
-  def word_hash_for_words(words)
-    d = Hash.new(0)
-    words.each do |word|
-      word.downcase!
-      if ! CORPUS_SKIP_WORDS.include?(word) && word.length > 2
-        d[word.stem.intern] += 1
-      end
-    end
-    return d
-  end
+  private
 
+  def word_hash_for_words(words)
+    d = Hash.new(0)
+    words.each do |word|
+      word.downcase!
+      d[word.stem.intern] += 1 if !CORPUS_SKIP_WORDS.include?(word) && word.length > 2
+    end
+    d
+  end
 
-  def word_hash_for_symbols(words)
-    d = Hash.new(0)
-    words.each do |word|
-      d[word.intern] += 1
-    end
-    return d
-  end
+  def word_hash_for_symbols(words)
+    d = Hash.new(0)
+    words.each do |word|
+      d[word.intern] += 1
+    end
+    d
+  end
 
-  CORPUS_SKIP_WORDS = Set.new([
-    "a",
-    "again",
-    "all",
-    "along",
-    "are",
-    "also",
-    "an",
-    "and",
-    "as",
-    "at",
-    "but",
-    "by",
-    "came",
-    "can",
-    "cant",
-    "couldnt",
-    "did",
-    "didn",
-    "didnt",
-    "do",
-    "doesnt",
-    "dont",
-    "ever",
-    "first",
-    "from",
-    "have",
-    "her",
-    "here",
-    "him",
-    "how",
-    "i",
-    "if",
-    "in",
-    "into",
-    "is",
-    "isnt",
-    "it",
-    "itll",
-    "just",
-    "last",
-    "least",
-    "like",
-    "most",
-    "my",
-    "new",
-    "no",
-    "not",
-    "now",
-    "of",
-    "on",
-    "or",
-    "should",
-    "sinc",
-    "so",
-    "some",
-    "th",
-    "than",
-    "this",
-    "that",
-    "the",
-    "their",
-    "then",
-    "those",
-    "to",
-    "told",
-    "too",
-    "true",
-    "try",
-    "until",
-    "url",
-    "us",
-    "were",
-    "when",
-    "whether",
-    "while",
-    "with",
-    "within",
-    "yes",
-    "you",
-    "youll",
-  ])
+  CORPUS_SKIP_WORDS = Set.new(%w[
+    a
+    again
+    all
+    along
+    are
+    also
+    an
+    and
+    as
+    at
+    but
+    by
+    came
+    can
+    cant
+    couldnt
+    did
+    didn
+    didnt
+    do
+    doesnt
+    dont
+    ever
+    first
+    from
+    have
+    her
+    here
+    him
+    how
+    i
+    if
+    in
+    into
+    is
+    isnt
+    it
+    itll
+    just
+    last
+    least
+    like
+    most
+    my
+    new
+    no
+    not
+    now
+    of
+    on
+    or
+    should
+    sinc
+    so
+    some
+    th
+    than
+    this
+    that
+    the
+    their
+    then
+    those
+    to
+    told
+    too
+    true
+    try
+    until
+    url
+    us
+    were
+    when
+    whether
+    while
+    with
+    within
+    yes
+    you
+    youll
+  ])
 end
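
For orientation, here is a minimal usage sketch of these String extensions, assuming the classifier gem (and the stemmer it relies on) is loaded via require 'classifier'; the hash contents in the comments are illustrative rather than verified output:

    require 'classifier'

    text = 'A dog chased the dogs, quickly!'
    text.without_punctuation  # punctuation replaced by spaces: "A dog chased the dogs quickly "
    text.clean_word_hash      # stemmed words only, skip/short words dropped, e.g. { dog: 2, chase: 1, quickli: 1 }
    text.word_hash            # same, plus leftover symbols such as :',' and :'!'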
@@ -3,21 +3,21 @@
 # License:: LGPL
 
 module Classifier
-
-  # This is an internal data structure class for the LSI node. Save for
-  # raw_vector_with, it should be fairly straightforward to understand.
-  # You should never have to use it directly.
+  # This is an internal data structure class for the LSI node. Save for
+  # raw_vector_with, it should be fairly straightforward to understand.
+  # You should never have to use it directly.
   class ContentNode
     attr_accessor :raw_vector, :raw_norm,
                   :lsi_vector, :lsi_norm,
                   :categories
 
     attr_reader :word_hash
+
     # If text_proc is not specified, the source will be duck-typed
     # via source.to_s
-    def initialize( word_hash, *categories )
+    def initialize(word_frequencies, *categories)
       @categories = categories || []
-      @word_hash = word_hash
+      @word_hash = word_frequencies
     end
 
     # Use this to fetch the appropriate search vector.
@@ -32,41 +32,43 @@ module Classifier
 
     # Creates the raw vector out of word_hash using word_list as the
     # key for mapping the vector space.
-    def raw_vector_with( word_list )
-      if $GSL
-        vec = GSL::Vector.alloc(word_list.size)
-      else
-        vec = Array.new(word_list.size, 0)
-      end
+    def raw_vector_with(word_list)
+      vec = if $GSL
+              GSL::Vector.alloc(word_list.size)
+            else
+              Array.new(word_list.size, 0)
+            end
 
       @word_hash.each_key do |word|
         vec[word_list[word]] = @word_hash[word] if word_list[word]
       end
 
       # Perform the scaling transform
-      total_words = vec.sum
+      total_words = $GSL ? vec.sum : vec.sum_with_identity
 
       # Perform first-order association transform if this vector has more
       # than one word in it.
       if total_words > 1.0
         weighted_total = 0.0
+
         vec.each do |term|
-          if ( term > 0 )
-            weighted_total += (( term / total_words ) * Math.log( term / total_words ))
-          end
+          next unless term.positive?
+          next if total_words.zero?
+
+          term_over_total = term / total_words
+          val = term_over_total * Math.log(term_over_total)
+          weighted_total += val unless val.nan?
         end
-        vec = vec.collect { |val| Math.log( val + 1 ) / -weighted_total }
+        vec = vec.collect { |val| Math.log(val + 1) / -weighted_total }
       end
 
       if $GSL
-        @raw_norm = vec.normalize
-        @raw_vector = vec
+        @raw_norm = vec.normalize
+        @raw_vector = vec
       else
-        @raw_norm = Vector[*vec].normalize
-        @raw_vector = Vector[*vec]
+        @raw_norm = Vector[*vec].normalize
+        @raw_vector = Vector[*vec]
       end
     end
-
   end
-
 end
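
The first-order association transform above is an entropy-style weighting: each count is divided by the document total, weighted by the log of that ratio, and the summed (negative) weight then scales Math.log(count + 1) for every term. A standalone sketch of the same arithmetic, written here purely as an illustration and not part of the gem:

    counts = [2.0, 1.0, 1.0]           # raw term frequencies for one document
    total  = counts.sum                # 4.0
    weighted_total = counts.sum do |term|
      ratio = term / total
      ratio * Math.log(ratio)          # negative for 0 < ratio < 1
    end
    counts.map { |c| Math.log(c + 1) / -weighted_total }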
@@ -3,29 +3,29 @@
 # License:: LGPL
 
 class String
-  def summary( count=10, separator=" [...] " )
-    perform_lsi split_sentences, count, separator
-  end
+  def summary(count = 10, separator = ' [...] ')
+    perform_lsi split_sentences, count, separator
+  end
 
-  def paragraph_summary( count=1, separator=" [...] " )
-    perform_lsi split_paragraphs, count, separator
-  end
+  def paragraph_summary(count = 1, separator = ' [...] ')
+    perform_lsi split_paragraphs, count, separator
+  end
 
-  def split_sentences
-    split /(\.|\!|\?)/ # TODO: make this less primitive
-  end
+  def split_sentences
+    split(/(\.|!|\?)/) # TODO: make this less primitive
+  end
 
-  def split_paragraphs
-    split /(\n\n|\r\r|\r\n\r\n)/ # TODO: make this less primitive
-  end
+  def split_paragraphs
+    split(/(\n\n|\r\r|\r\n\r\n)/) # TODO: make this less primitive
+  end
 
-  private
+  private
 
-  def perform_lsi(chunks, count, separator)
-    lsi = Classifier::LSI.new :auto_rebuild => false
-    chunks.each { |chunk| lsi << chunk unless chunk.strip.empty? || chunk.strip.split.size == 1 }
-    lsi.build_index
-    summaries = lsi.highest_relative_content count
-    return summaries.reject { |chunk| !summaries.include? chunk }.map { |x| x.strip }.join(separator)
-  end
+  def perform_lsi(chunks, count, separator)
+    lsi = Classifier::LSI.new auto_rebuild: false
+    chunks.each { |chunk| lsi << chunk unless chunk.strip.empty? || chunk.strip.split.size == 1 }
+    lsi.build_index
+    summaries = lsi.highest_relative_content count
+    summaries.select { |chunk| summaries.include?(chunk) }.map(&:strip).join(separator)
+  end
 end
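
These summary helpers are the gem's public entry points for LSI-based summarization. A short usage sketch, assuming require 'classifier' and an LSI backend (the GSL bindings if installed, otherwise the pure-Ruby path); the output naturally depends on the input text:

    require 'classifier'

    article = File.read('article.txt')   # any multi-sentence text
    article.summary(2)                   # two most representative sentences, joined by ' [...] '
    article.paragraph_summary(1)         # single most representative paragraph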
@@ -8,7 +8,7 @@ module Classifier
 
   class WordList
     def initialize
-      @location_table = Hash.new
+      @location_table = {}
     end
 
     # Adds a word (if it is new) and assigns it a unique dimension.
@@ -31,6 +31,5 @@ module Classifier
     def size
       @location_table.size
     end
-
   end
 end
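
WordList is simply the word-to-dimension map the LSI code uses when laying out vectors. A small sketch of how it behaves, assuming the add_word and [] accessors defined elsewhere in this file keep their existing signatures:

    wl = Classifier::WordList.new
    wl.add_word 'dog'   # new word, assigned dimension 0
    wl.add_word 'cat'   # new word, assigned dimension 1
    wl.add_word 'dog'   # already known, keeps dimension 0
    wl['cat']           # => 1
    wl.size             # => 2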