classifier 1.3.5 → 1.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/classifier/bayes.rb +128 -120
- data/lib/classifier/extensions/string.rb +1 -1
- data/lib/classifier/extensions/vector.rb +66 -72
- data/lib/classifier/extensions/vector_serialize.rb +6 -8
- data/lib/classifier/extensions/word_hash.rb +108 -114
- data/lib/classifier/lsi/content_node.rb +25 -23
- data/lib/classifier/lsi/summary.rb +20 -20
- data/lib/classifier/lsi/word_list.rb +1 -2
- data/lib/classifier/lsi.rb +112 -89
- data/lib/classifier.rb +1 -0
- data/test/test_helper.rb +5 -0
- metadata +7 -21
@@ -2,135 +2,129 @@
|
|
2
2
|
# Copyright:: Copyright (c) 2005 Lucas Carlson
|
3
3
|
# License:: LGPL
|
4
4
|
|
5
|
-
require "set"
|
6
|
-
|
7
5
|
# These are extensions to the String class to provide convenience
|
8
6
|
# methods for the Classifier package.
|
9
7
|
class String
|
10
|
-
|
11
8
|
# Removes common punctuation symbols, returning a new string.
|
12
9
|
# E.g.,
|
13
10
|
# "Hello (greeting's), with {braces} < >...?".without_punctuation
|
14
11
|
# => "Hello greetings with braces "
|
15
12
|
def without_punctuation
|
16
|
-
tr(
|
13
|
+
tr(',?.!;:"@#$%^&*()_=+[]{}\|<>/`~', ' ').tr("'\-", '')
|
17
14
|
end
|
18
15
|
|
19
16
|
# Return a Hash of strings => ints. Each word in the string is stemmed,
|
20
17
|
# interned, and indexes to its frequency in the document.
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
# Return a word hash without extra punctuation or short symbols, just stemmed words
|
28
|
-
def clean_word_hash
|
29
|
-
word_hash_for_words gsub(/[^\w\s]/,"").split
|
30
|
-
end
|
18
|
+
def word_hash
|
19
|
+
word_hash = clean_word_hash
|
20
|
+
symbol_hash = word_hash_for_symbols(gsub(/\w/, ' ').split)
|
21
|
+
word_hash.merge(symbol_hash)
|
22
|
+
end
|
31
23
|
|
32
|
-
|
24
|
+
# Return a word hash without extra punctuation or short symbols, just stemmed words
|
25
|
+
def clean_word_hash
|
26
|
+
word_hash_for_words gsub(/[^\w\s]/, '').split
|
27
|
+
end
|
33
28
|
|
34
|
-
|
35
|
-
d = Hash.new(0)
|
36
|
-
words.each do |word|
|
37
|
-
word.downcase!
|
38
|
-
if ! CORPUS_SKIP_WORDS.include?(word) && word.length > 2
|
39
|
-
d[word.stem.intern] += 1
|
40
|
-
end
|
41
|
-
end
|
42
|
-
return d
|
43
|
-
end
|
29
|
+
private
|
44
30
|
|
31
|
+
def word_hash_for_words(words)
|
32
|
+
d = Hash.new(0)
|
33
|
+
words.each do |word|
|
34
|
+
word.downcase!
|
35
|
+
d[word.stem.intern] += 1 if !CORPUS_SKIP_WORDS.include?(word) && word.length > 2
|
36
|
+
end
|
37
|
+
d
|
38
|
+
end
|
45
39
|
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
40
|
+
def word_hash_for_symbols(words)
|
41
|
+
d = Hash.new(0)
|
42
|
+
words.each do |word|
|
43
|
+
d[word.intern] += 1
|
44
|
+
end
|
45
|
+
d
|
46
|
+
end
|
53
47
|
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
48
|
+
CORPUS_SKIP_WORDS = Set.new(%w[
|
49
|
+
a
|
50
|
+
again
|
51
|
+
all
|
52
|
+
along
|
53
|
+
are
|
54
|
+
also
|
55
|
+
an
|
56
|
+
and
|
57
|
+
as
|
58
|
+
at
|
59
|
+
but
|
60
|
+
by
|
61
|
+
came
|
62
|
+
can
|
63
|
+
cant
|
64
|
+
couldnt
|
65
|
+
did
|
66
|
+
didn
|
67
|
+
didnt
|
68
|
+
do
|
69
|
+
doesnt
|
70
|
+
dont
|
71
|
+
ever
|
72
|
+
first
|
73
|
+
from
|
74
|
+
have
|
75
|
+
her
|
76
|
+
here
|
77
|
+
him
|
78
|
+
how
|
79
|
+
i
|
80
|
+
if
|
81
|
+
in
|
82
|
+
into
|
83
|
+
is
|
84
|
+
isnt
|
85
|
+
it
|
86
|
+
itll
|
87
|
+
just
|
88
|
+
last
|
89
|
+
least
|
90
|
+
like
|
91
|
+
most
|
92
|
+
my
|
93
|
+
new
|
94
|
+
no
|
95
|
+
not
|
96
|
+
now
|
97
|
+
of
|
98
|
+
on
|
99
|
+
or
|
100
|
+
should
|
101
|
+
sinc
|
102
|
+
so
|
103
|
+
some
|
104
|
+
th
|
105
|
+
than
|
106
|
+
this
|
107
|
+
that
|
108
|
+
the
|
109
|
+
their
|
110
|
+
then
|
111
|
+
those
|
112
|
+
to
|
113
|
+
told
|
114
|
+
too
|
115
|
+
true
|
116
|
+
try
|
117
|
+
until
|
118
|
+
url
|
119
|
+
us
|
120
|
+
were
|
121
|
+
when
|
122
|
+
whether
|
123
|
+
while
|
124
|
+
with
|
125
|
+
within
|
126
|
+
yes
|
127
|
+
you
|
128
|
+
youll
|
129
|
+
])
|
136
130
|
end
|
@@ -3,21 +3,21 @@
|
|
3
3
|
# License:: LGPL
|
4
4
|
|
5
5
|
module Classifier
|
6
|
-
|
7
|
-
#
|
8
|
-
#
|
9
|
-
# You should never have to use it directly.
|
6
|
+
# This is an internal data structure class for the LSI node. Save for
|
7
|
+
# raw_vector_with, it should be fairly straightforward to understand.
|
8
|
+
# You should never have to use it directly.
|
10
9
|
class ContentNode
|
11
10
|
attr_accessor :raw_vector, :raw_norm,
|
12
11
|
:lsi_vector, :lsi_norm,
|
13
12
|
:categories
|
14
13
|
|
15
14
|
attr_reader :word_hash
|
15
|
+
|
16
16
|
# If text_proc is not specified, the source will be duck-typed
|
17
17
|
# via source.to_s
|
18
|
-
def initialize(
|
18
|
+
def initialize(word_frequencies, *categories)
|
19
19
|
@categories = categories || []
|
20
|
-
@word_hash =
|
20
|
+
@word_hash = word_frequencies
|
21
21
|
end
|
22
22
|
|
23
23
|
# Use this to fetch the appropriate search vector.
|
@@ -32,41 +32,43 @@ module Classifier
|
|
32
32
|
|
33
33
|
# Creates the raw vector out of word_hash using word_list as the
|
34
34
|
# key for mapping the vector space.
|
35
|
-
def raw_vector_with(
|
36
|
-
if $GSL
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
35
|
+
def raw_vector_with(word_list)
|
36
|
+
vec = if $GSL
|
37
|
+
GSL::Vector.alloc(word_list.size)
|
38
|
+
else
|
39
|
+
Array.new(word_list.size, 0)
|
40
|
+
end
|
41
41
|
|
42
42
|
@word_hash.each_key do |word|
|
43
43
|
vec[word_list[word]] = @word_hash[word] if word_list[word]
|
44
44
|
end
|
45
45
|
|
46
46
|
# Perform the scaling transform
|
47
|
-
total_words = vec.sum
|
47
|
+
total_words = $GSL ? vec.sum : vec.sum_with_identity
|
48
48
|
|
49
49
|
# Perform first-order association transform if this vector has more
|
50
50
|
# than one word in it.
|
51
51
|
if total_words > 1.0
|
52
52
|
weighted_total = 0.0
|
53
|
+
|
53
54
|
vec.each do |term|
|
54
|
-
|
55
|
-
|
56
|
-
|
55
|
+
next unless term.positive?
|
56
|
+
next if total_words.zero?
|
57
|
+
|
58
|
+
term_over_total = term / total_words
|
59
|
+
val = term_over_total * Math.log(term_over_total)
|
60
|
+
weighted_total += val unless val.nan?
|
57
61
|
end
|
58
|
-
vec = vec.collect { |val| Math.log(
|
62
|
+
vec = vec.collect { |val| Math.log(val + 1) / -weighted_total }
|
59
63
|
end
|
60
64
|
|
61
65
|
if $GSL
|
62
|
-
|
63
|
-
|
66
|
+
@raw_norm = vec.normalize
|
67
|
+
@raw_vector = vec
|
64
68
|
else
|
65
|
-
|
66
|
-
|
69
|
+
@raw_norm = Vector[*vec].normalize
|
70
|
+
@raw_vector = Vector[*vec]
|
67
71
|
end
|
68
72
|
end
|
69
|
-
|
70
73
|
end
|
71
|
-
|
72
74
|
end
|
@@ -3,29 +3,29 @@
|
|
3
3
|
# License:: LGPL
|
4
4
|
|
5
5
|
class String
|
6
|
-
|
7
|
-
|
8
|
-
|
6
|
+
def summary(count = 10, separator = ' [...] ')
|
7
|
+
perform_lsi split_sentences, count, separator
|
8
|
+
end
|
9
9
|
|
10
|
-
|
11
|
-
|
12
|
-
|
10
|
+
def paragraph_summary(count = 1, separator = ' [...] ')
|
11
|
+
perform_lsi split_paragraphs, count, separator
|
12
|
+
end
|
13
13
|
|
14
|
-
|
15
|
-
|
16
|
-
|
14
|
+
def split_sentences
|
15
|
+
split(/(\.|!|\?)/) # TODO: make this less primitive
|
16
|
+
end
|
17
17
|
|
18
|
-
|
19
|
-
|
20
|
-
|
18
|
+
def split_paragraphs
|
19
|
+
split(/(\n\n|\r\r|\r\n\r\n)/) # TODO: make this less primitive
|
20
|
+
end
|
21
21
|
|
22
|
-
|
22
|
+
private
|
23
23
|
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
24
|
+
def perform_lsi(chunks, count, separator)
|
25
|
+
lsi = Classifier::LSI.new auto_rebuild: false
|
26
|
+
chunks.each { |chunk| lsi << chunk unless chunk.strip.empty? || chunk.strip.split.size == 1 }
|
27
|
+
lsi.build_index
|
28
|
+
summaries = lsi.highest_relative_content count
|
29
|
+
summaries.select { |chunk| summaries.include?(chunk) }.map(&:strip).join(separator)
|
30
|
+
end
|
31
31
|
end
|
@@ -8,7 +8,7 @@ module Classifier
|
|
8
8
|
|
9
9
|
class WordList
|
10
10
|
def initialize
|
11
|
-
@location_table =
|
11
|
+
@location_table = {}
|
12
12
|
end
|
13
13
|
|
14
14
|
# Adds a word (if it is new) and assigns it a unique dimension.
|
@@ -31,6 +31,5 @@ module Classifier
|
|
31
31
|
def size
|
32
32
|
@location_table.size
|
33
33
|
end
|
34
|
-
|
35
34
|
end
|
36
35
|
end
|