classifier-reborn 2.0.3 → 2.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.markdown +130 -14
- data/data/stopwords/ca +126 -0
- data/data/stopwords/cs +138 -0
- data/data/stopwords/da +101 -0
- data/data/stopwords/de +604 -0
- data/data/stopwords/en +80 -0
- data/data/stopwords/es +351 -0
- data/data/stopwords/fi +747 -0
- data/data/stopwords/fr +463 -0
- data/data/stopwords/hu +35 -0
- data/data/stopwords/it +430 -0
- data/data/stopwords/nl +48 -0
- data/data/stopwords/no +119 -0
- data/data/stopwords/pl +93 -0
- data/data/stopwords/pt +356 -0
- data/data/stopwords/se +386 -0
- data/data/stopwords/tr +114 -0
- data/lib/classifier-reborn/bayes.rb +86 -16
- data/lib/classifier-reborn/category_namer.rb +3 -1
- data/lib/classifier-reborn/extensions/hasher.rb +25 -100
- data/lib/classifier-reborn/extensions/vector.rb +0 -1
- data/lib/classifier-reborn/lsi.rb +36 -25
- data/lib/classifier-reborn/lsi/cached_content_node.rb +48 -0
- data/lib/classifier-reborn/lsi/content_node.rb +27 -10
- data/lib/classifier-reborn/lsi/summarizer.rb +2 -2
- data/lib/classifier-reborn/version.rb +1 -1
- metadata +37 -3
@@ -0,0 +1,48 @@
|
|
1
|
+
# Author:: Kelley Reynolds (mailto:kelley@insidesystems.net)
|
2
|
+
# Copyright:: Copyright (c) 2015 Kelley Reynolds
|
3
|
+
# License:: LGPL
|
4
|
+
|
5
|
+
module ClassifierReborn
|
6
|
+
|
7
|
+
# Subclass of ContentNode which caches the search_vector transpositions.
|
8
|
+
# It's great because it's much faster for large indexes, but at the cost of more RAM. Additionally,
|
9
|
+
# if you Marshal your classifier and want to keep the size down, you'll need to manually
|
10
|
+
# clear the cache before you dump
|
11
|
+
class CachedContentNode < ContentNode
|
12
|
+
module InstanceMethods
|
13
|
+
# Go through each item in this index and clear the cache
|
14
|
+
def clear_cache!
|
15
|
+
@items.each_value(&:clear_cache!)
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
def initialize( word_hash, *categories )
|
20
|
+
clear_cache!
|
21
|
+
super
|
22
|
+
end
|
23
|
+
|
24
|
+
def clear_cache!
|
25
|
+
@transposed_search_vector = nil
|
26
|
+
end
|
27
|
+
|
28
|
+
# Cache the transposed vector; it gets used a lot
|
29
|
+
def transposed_search_vector
|
30
|
+
@transposed_search_vector ||= super
|
31
|
+
end
|
32
|
+
|
33
|
+
# Clear the cache before we continue on
|
34
|
+
def raw_vector_with( word_list )
|
35
|
+
clear_cache!
|
36
|
+
super
|
37
|
+
end
|
38
|
+
|
39
|
+
# We don't want the cached_data here
|
40
|
+
def marshal_dump
|
41
|
+
[@lsi_vector, @lsi_norm, @raw_vector, @raw_norm, @categories, @word_hash]
|
42
|
+
end
|
43
|
+
|
44
|
+
def marshal_load(array)
|
45
|
+
@lsi_vector, @lsi_norm, @raw_vector, @raw_norm, @categories, @word_hash = array
|
46
|
+
end
|
47
|
+
end
|
48
|
+
end
|
@@ -18,6 +18,7 @@ module ClassifierReborn
|
|
18
18
|
def initialize( word_hash, *categories )
|
19
19
|
@categories = categories || []
|
20
20
|
@word_hash = word_hash
|
21
|
+
@lsi_norm, @lsi_vector = nil
|
21
22
|
end
|
22
23
|
|
23
24
|
# Use this to fetch the appropriate search vector.
|
@@ -25,6 +26,11 @@ module ClassifierReborn
|
|
25
26
|
@lsi_vector || @raw_vector
|
26
27
|
end
|
27
28
|
|
29
|
+
# Method to access the transposed search vector
|
30
|
+
def transposed_search_vector
|
31
|
+
search_vector.col
|
32
|
+
end
|
33
|
+
|
28
34
|
# Use this to fetch the appropriate search vector in normalized form.
|
29
35
|
def search_norm
|
30
36
|
@lsi_norm || @raw_norm
|
@@ -46,7 +52,7 @@ module ClassifierReborn
|
|
46
52
|
# Perform the scaling transform and force floating point arithmetic
|
47
53
|
if $GSL
|
48
54
|
sum = 0.0
|
49
|
-
vec.
|
55
|
+
vec.each {|v| sum += v }
|
50
56
|
total_words = sum
|
51
57
|
else
|
52
58
|
total_words = vec.reduce(0, :+).to_f
|
@@ -55,7 +61,7 @@ module ClassifierReborn
|
|
55
61
|
total_unique_words = 0
|
56
62
|
|
57
63
|
if $GSL
|
58
|
-
vec.each { |word| total_unique_words += 1 if word != 0 }
|
64
|
+
vec.each { |word| total_unique_words += 1 if word != 0.0 }
|
59
65
|
else
|
60
66
|
total_unique_words = vec.count{ |word| word != 0 }
|
61
67
|
end
|
@@ -64,20 +70,31 @@ module ClassifierReborn
|
|
64
70
|
# than one word in it.
|
65
71
|
if total_words > 1.0 && total_unique_words > 1
|
66
72
|
weighted_total = 0.0
|
73
|
+
# Cache calculations, this takes too long on large indexes
|
74
|
+
cached_calcs = Hash.new { |hash, term|
|
75
|
+
hash[term] = (( term / total_words ) * Math.log( term / total_words ))
|
76
|
+
}
|
77
|
+
|
67
78
|
vec.each do |term|
|
68
|
-
|
69
|
-
weighted_total += (( term / total_words ) * Math.log( term / total_words ))
|
70
|
-
end
|
79
|
+
weighted_total += cached_calcs[term] if term > 0.0
|
71
80
|
end
|
72
|
-
|
81
|
+
|
82
|
+
# Cache calculations, this takes too long on large indexes
|
83
|
+
cached_calcs = Hash.new do |hash, val|
|
84
|
+
hash[val] = Math.log( val + 1 ) / -weighted_total
|
85
|
+
end
|
86
|
+
|
87
|
+
vec.collect! { |val|
|
88
|
+
cached_calcs[val]
|
89
|
+
}
|
73
90
|
end
|
74
91
|
|
75
92
|
if $GSL
|
76
|
-
|
77
|
-
|
93
|
+
@raw_norm = vec.normalize
|
94
|
+
@raw_vector = vec
|
78
95
|
else
|
79
|
-
|
80
|
-
|
96
|
+
@raw_norm = Vector[*vec].normalize
|
97
|
+
@raw_vector = Vector[*vec]
|
81
98
|
end
|
82
99
|
end
|
83
100
|
|
@@ -15,11 +15,11 @@ module ClassifierReborn
|
|
15
15
|
end
|
16
16
|
|
17
17
|
def split_sentences(str)
|
18
|
-
str.split
|
18
|
+
str.split(/(\.|\!|\?)/) # TODO: make this less primitive
|
19
19
|
end
|
20
20
|
|
21
21
|
def split_paragraphs(str)
|
22
|
-
str.split
|
22
|
+
str.split(/(\n\n|\r\r|\r\n\r\n)/) # TODO: make this less primitive
|
23
23
|
end
|
24
24
|
|
25
25
|
def perform_lsi(chunks, count, separator)
|
metadata
CHANGED
@@ -1,15 +1,16 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: classifier-reborn
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.0.
|
4
|
+
version: 2.0.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Lucas Carlson
|
8
8
|
- Parker Moore
|
9
|
+
- Chase Gilliam
|
9
10
|
autorequire:
|
10
11
|
bindir: bin
|
11
12
|
cert_chain: []
|
12
|
-
date:
|
13
|
+
date: 2015-10-31 00:00:00.000000000 Z
|
13
14
|
dependencies:
|
14
15
|
- !ruby/object:Gem::Dependency
|
15
16
|
name: fast-stemmer
|
@@ -53,10 +54,25 @@ dependencies:
|
|
53
54
|
- - ">="
|
54
55
|
- !ruby/object:Gem::Version
|
55
56
|
version: '0'
|
57
|
+
- !ruby/object:Gem::Dependency
|
58
|
+
name: test-unit
|
59
|
+
requirement: !ruby/object:Gem::Requirement
|
60
|
+
requirements:
|
61
|
+
- - ">="
|
62
|
+
- !ruby/object:Gem::Version
|
63
|
+
version: '0'
|
64
|
+
type: :development
|
65
|
+
prerelease: false
|
66
|
+
version_requirements: !ruby/object:Gem::Requirement
|
67
|
+
requirements:
|
68
|
+
- - ">="
|
69
|
+
- !ruby/object:Gem::Version
|
70
|
+
version: '0'
|
56
71
|
description:
|
57
72
|
email:
|
58
73
|
- lucas@rufy.com
|
59
74
|
- parkrmoore@gmail.com
|
75
|
+
- chase.gilliam@gmail.com
|
60
76
|
executables:
|
61
77
|
- bayes.rb
|
62
78
|
- summarize.rb
|
@@ -69,6 +85,22 @@ files:
|
|
69
85
|
- README.markdown
|
70
86
|
- bin/bayes.rb
|
71
87
|
- bin/summarize.rb
|
88
|
+
- data/stopwords/ca
|
89
|
+
- data/stopwords/cs
|
90
|
+
- data/stopwords/da
|
91
|
+
- data/stopwords/de
|
92
|
+
- data/stopwords/en
|
93
|
+
- data/stopwords/es
|
94
|
+
- data/stopwords/fi
|
95
|
+
- data/stopwords/fr
|
96
|
+
- data/stopwords/hu
|
97
|
+
- data/stopwords/it
|
98
|
+
- data/stopwords/nl
|
99
|
+
- data/stopwords/no
|
100
|
+
- data/stopwords/pl
|
101
|
+
- data/stopwords/pt
|
102
|
+
- data/stopwords/se
|
103
|
+
- data/stopwords/tr
|
72
104
|
- lib/classifier-reborn.rb
|
73
105
|
- lib/classifier-reborn/bayes.rb
|
74
106
|
- lib/classifier-reborn/category_namer.rb
|
@@ -76,6 +108,7 @@ files:
|
|
76
108
|
- lib/classifier-reborn/extensions/vector.rb
|
77
109
|
- lib/classifier-reborn/extensions/vector_serialize.rb
|
78
110
|
- lib/classifier-reborn/lsi.rb
|
111
|
+
- lib/classifier-reborn/lsi/cached_content_node.rb
|
79
112
|
- lib/classifier-reborn/lsi/content_node.rb
|
80
113
|
- lib/classifier-reborn/lsi/summarizer.rb
|
81
114
|
- lib/classifier-reborn/lsi/word_list.rb
|
@@ -101,8 +134,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
101
134
|
version: '0'
|
102
135
|
requirements: []
|
103
136
|
rubyforge_project:
|
104
|
-
rubygems_version: 2.
|
137
|
+
rubygems_version: 2.4.8
|
105
138
|
signing_key:
|
106
139
|
specification_version: 2
|
107
140
|
summary: A general classifier module to allow Bayesian and other types of classifications.
|
108
141
|
test_files: []
|
142
|
+
has_rdoc: true
|