classifier-reborn 2.0.3 → 2.0.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,48 @@
1
+ # Author:: Kelley Reynolds (mailto:kelley@insidesystems.net)
2
+ # Copyright:: Copyright (c) 2015 Kelley Reynolds
3
+ # License:: LGPL
4
+
5
+ module ClassifierReborn
6
+
7
+ # Subclass of ContentNode which caches the search_vector transpositions.
8
+ # It's great because it's much faster for large indexes, but at the cost of more RAM. Additionally,
9
+ # if you Marshal your classifier and want to keep the size down, you'll need to manually
10
+ # clear the cache before you dump
11
+ class CachedContentNode < ContentNode
12
+ module InstanceMethods
13
+ # Go through each item in this index and clear the cache
14
+ def clear_cache!
15
+ @items.each_value(&:clear_cache!)
16
+ end
17
+ end
18
+
19
+ def initialize( word_hash, *categories )
20
+ clear_cache!
21
+ super
22
+ end
23
+
24
+ def clear_cache!
25
+ @transposed_search_vector = nil
26
+ end
27
+
28
+ # Cache the transposed vector, it gets used a lot
29
+ def transposed_search_vector
30
+ @transposed_search_vector ||= super
31
+ end
32
+
33
+ # Clear the cache before we continue on
34
+ def raw_vector_with( word_list )
35
+ clear_cache!
36
+ super
37
+ end
38
+
39
+ # We don't want the cached_data here
40
+ def marshal_dump
41
+ [@lsi_vector, @lsi_norm, @raw_vector, @raw_norm, @categories, @word_hash]
42
+ end
43
+
44
+ def marshal_load(array)
45
+ @lsi_vector, @lsi_norm, @raw_vector, @raw_norm, @categories, @word_hash = array
46
+ end
47
+ end
48
+ end
@@ -18,6 +18,7 @@ module ClassifierReborn
18
18
  def initialize( word_hash, *categories )
19
19
  @categories = categories || []
20
20
  @word_hash = word_hash
21
+ @lsi_norm, @lsi_vector = nil
21
22
  end
22
23
 
23
24
  # Use this to fetch the appropriate search vector.
@@ -25,6 +26,11 @@ module ClassifierReborn
25
26
  @lsi_vector || @raw_vector
26
27
  end
27
28
 
29
+ # Method to access the transposed search vector
30
+ def transposed_search_vector
31
+ search_vector.col
32
+ end
33
+
28
34
  # Use this to fetch the appropriate search vector in normalized form.
29
35
  def search_norm
30
36
  @lsi_norm || @raw_norm
@@ -46,7 +52,7 @@ module ClassifierReborn
46
52
  # Perform the scaling transform and force floating point arithmetic
47
53
  if $GSL
48
54
  sum = 0.0
49
- vec.collect{|v| sum += v}
55
+ vec.each {|v| sum += v }
50
56
  total_words = sum
51
57
  else
52
58
  total_words = vec.reduce(0, :+).to_f
@@ -55,7 +61,7 @@ module ClassifierReborn
55
61
  total_unique_words = 0
56
62
 
57
63
  if $GSL
58
- vec.each { |word| total_unique_words += 1 if word != 0 }
64
+ vec.each { |word| total_unique_words += 1 if word != 0.0 }
59
65
  else
60
66
  total_unique_words = vec.count{ |word| word != 0 }
61
67
  end
@@ -64,20 +70,31 @@ module ClassifierReborn
64
70
  # than one word in it.
65
71
  if total_words > 1.0 && total_unique_words > 1
66
72
  weighted_total = 0.0
73
+ # Cache calculations; recomputing them per term takes too long on large indexes
74
+ cached_calcs = Hash.new { |hash, term|
75
+ hash[term] = (( term / total_words ) * Math.log( term / total_words ))
76
+ }
77
+
67
78
  vec.each do |term|
68
- if ( term > 0 )
69
- weighted_total += (( term / total_words ) * Math.log( term / total_words ))
70
- end
79
+ weighted_total += cached_calcs[term] if term > 0.0
71
80
  end
72
- vec = vec.collect { |val| Math.log( val + 1 ) / -weighted_total }
81
+
82
+ # Cache calculations, this takes too long on large indexes
83
+ cached_calcs = Hash.new do |hash, val|
84
+ hash[val] = Math.log( val + 1 ) / -weighted_total
85
+ end
86
+
87
+ vec.collect! { |val|
88
+ cached_calcs[val]
89
+ }
73
90
  end
74
91
 
75
92
  if $GSL
76
- @raw_norm = vec.normalize
77
- @raw_vector = vec
93
+ @raw_norm = vec.normalize
94
+ @raw_vector = vec
78
95
  else
79
- @raw_norm = Vector[*vec].normalize
80
- @raw_vector = Vector[*vec]
96
+ @raw_norm = Vector[*vec].normalize
97
+ @raw_vector = Vector[*vec]
81
98
  end
82
99
  end
83
100
 
@@ -15,11 +15,11 @@ module ClassifierReborn
15
15
  end
16
16
 
17
17
  def split_sentences(str)
18
- str.split /(\.|\!|\?)/ # TODO: make this less primitive
18
+ str.split(/(\.|\!|\?)/) # TODO: make this less primitive
19
19
  end
20
20
 
21
21
  def split_paragraphs(str)
22
- str.split /(\n\n|\r\r|\r\n\r\n)/ # TODO: make this less primitive
22
+ str.split(/(\n\n|\r\r|\r\n\r\n)/) # TODO: make this less primitive
23
23
  end
24
24
 
25
25
  def perform_lsi(chunks, count, separator)
@@ -1,3 +1,3 @@
1
1
  module ClassifierReborn
2
- VERSION = '2.0.3'
2
+ VERSION = '2.0.4'
3
3
  end
metadata CHANGED
@@ -1,15 +1,16 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: classifier-reborn
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.0.3
4
+ version: 2.0.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Lucas Carlson
8
8
  - Parker Moore
9
+ - Chase Gilliam
9
10
  autorequire:
10
11
  bindir: bin
11
12
  cert_chain: []
12
- date: 2014-12-23 00:00:00.000000000 Z
13
+ date: 2015-10-31 00:00:00.000000000 Z
13
14
  dependencies:
14
15
  - !ruby/object:Gem::Dependency
15
16
  name: fast-stemmer
@@ -53,10 +54,25 @@ dependencies:
53
54
  - - ">="
54
55
  - !ruby/object:Gem::Version
55
56
  version: '0'
57
+ - !ruby/object:Gem::Dependency
58
+ name: test-unit
59
+ requirement: !ruby/object:Gem::Requirement
60
+ requirements:
61
+ - - ">="
62
+ - !ruby/object:Gem::Version
63
+ version: '0'
64
+ type: :development
65
+ prerelease: false
66
+ version_requirements: !ruby/object:Gem::Requirement
67
+ requirements:
68
+ - - ">="
69
+ - !ruby/object:Gem::Version
70
+ version: '0'
56
71
  description:
57
72
  email:
58
73
  - lucas@rufy.com
59
74
  - parkrmoore@gmail.com
75
+ - chase.gilliam@gmail.com
60
76
  executables:
61
77
  - bayes.rb
62
78
  - summarize.rb
@@ -69,6 +85,22 @@ files:
69
85
  - README.markdown
70
86
  - bin/bayes.rb
71
87
  - bin/summarize.rb
88
+ - data/stopwords/ca
89
+ - data/stopwords/cs
90
+ - data/stopwords/da
91
+ - data/stopwords/de
92
+ - data/stopwords/en
93
+ - data/stopwords/es
94
+ - data/stopwords/fi
95
+ - data/stopwords/fr
96
+ - data/stopwords/hu
97
+ - data/stopwords/it
98
+ - data/stopwords/nl
99
+ - data/stopwords/no
100
+ - data/stopwords/pl
101
+ - data/stopwords/pt
102
+ - data/stopwords/se
103
+ - data/stopwords/tr
72
104
  - lib/classifier-reborn.rb
73
105
  - lib/classifier-reborn/bayes.rb
74
106
  - lib/classifier-reborn/category_namer.rb
@@ -76,6 +108,7 @@ files:
76
108
  - lib/classifier-reborn/extensions/vector.rb
77
109
  - lib/classifier-reborn/extensions/vector_serialize.rb
78
110
  - lib/classifier-reborn/lsi.rb
111
+ - lib/classifier-reborn/lsi/cached_content_node.rb
79
112
  - lib/classifier-reborn/lsi/content_node.rb
80
113
  - lib/classifier-reborn/lsi/summarizer.rb
81
114
  - lib/classifier-reborn/lsi/word_list.rb
@@ -101,8 +134,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
101
134
  version: '0'
102
135
  requirements: []
103
136
  rubyforge_project:
104
- rubygems_version: 2.2.2
137
+ rubygems_version: 2.4.8
105
138
  signing_key:
106
139
  specification_version: 2
107
140
  summary: A general classifier module to allow Bayesian and other types of classifications.
108
141
  test_files: []
142
+ has_rdoc: true