classifier-reborn 2.0.3 → 2.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,48 @@
1
+ # Author:: Kelley Reynolds (mailto:kelley@insidesystems.net)
2
+ # Copyright:: Copyright (c) 2015 Kelley Reynolds
3
+ # License:: LGPL
4
+
5
+ module ClassifierReborn
6
+
7
+ # Subclass of ContentNode which caches the search_vector transpositions.
8
+ # It's great because it's much faster for large indexes, but at the cost of more RAM. Additionally,
9
+ # if you Marshal your classifier and want to keep the size down, you'll need to manually
10
+ # clear the cache before you dump
11
+ class CachedContentNode < ContentNode
12
+ module InstanceMethods
13
+ # Go through each item in this index and clear the cache
14
+ def clear_cache!
15
+ @items.each_value(&:clear_cache!)
16
+ end
17
+ end
18
+
19
+ def initialize( word_hash, *categories )
20
+ clear_cache!
21
+ super
22
+ end
23
+
24
+ def clear_cache!
25
+ @transposed_search_vector = nil
26
+ end
27
+
28
+ # Cache the transposed vector; it gets used a lot
29
+ def transposed_search_vector
30
+ @transposed_search_vector ||= super
31
+ end
32
+
33
+ # Clear the cache before we continue on
34
+ def raw_vector_with( word_list )
35
+ clear_cache!
36
+ super
37
+ end
38
+
39
+ # We don't want the cached_data here
40
+ def marshal_dump
41
+ [@lsi_vector, @lsi_norm, @raw_vector, @raw_norm, @categories, @word_hash]
42
+ end
43
+
44
+ def marshal_load(array)
45
+ @lsi_vector, @lsi_norm, @raw_vector, @raw_norm, @categories, @word_hash = array
46
+ end
47
+ end
48
+ end
@@ -18,6 +18,7 @@ module ClassifierReborn
18
18
  def initialize( word_hash, *categories )
19
19
  @categories = categories || []
20
20
  @word_hash = word_hash
21
+ @lsi_norm, @lsi_vector = nil
21
22
  end
22
23
 
23
24
  # Use this to fetch the appropriate search vector.
@@ -25,6 +26,11 @@ module ClassifierReborn
25
26
  @lsi_vector || @raw_vector
26
27
  end
27
28
 
29
+ # Method to access the transposed search vector
30
+ def transposed_search_vector
31
+ search_vector.col
32
+ end
33
+
28
34
  # Use this to fetch the appropriate search vector in normalized form.
29
35
  def search_norm
30
36
  @lsi_norm || @raw_norm
@@ -46,7 +52,7 @@ module ClassifierReborn
46
52
  # Perform the scaling transform and force floating point arithmetic
47
53
  if $GSL
48
54
  sum = 0.0
49
- vec.collect{|v| sum += v}
55
+ vec.each {|v| sum += v }
50
56
  total_words = sum
51
57
  else
52
58
  total_words = vec.reduce(0, :+).to_f
@@ -55,7 +61,7 @@ module ClassifierReborn
55
61
  total_unique_words = 0
56
62
 
57
63
  if $GSL
58
- vec.each { |word| total_unique_words += 1 if word != 0 }
64
+ vec.each { |word| total_unique_words += 1 if word != 0.0 }
59
65
  else
60
66
  total_unique_words = vec.count{ |word| word != 0 }
61
67
  end
@@ -64,20 +70,31 @@ module ClassifierReborn
64
70
  # than one word in it.
65
71
  if total_words > 1.0 && total_unique_words > 1
66
72
  weighted_total = 0.0
73
+ # Cache calculations, this takes too long on large indexes
74
+ cached_calcs = Hash.new { |hash, term|
75
+ hash[term] = (( term / total_words ) * Math.log( term / total_words ))
76
+ }
77
+
67
78
  vec.each do |term|
68
- if ( term > 0 )
69
- weighted_total += (( term / total_words ) * Math.log( term / total_words ))
70
- end
79
+ weighted_total += cached_calcs[term] if term > 0.0
71
80
  end
72
- vec = vec.collect { |val| Math.log( val + 1 ) / -weighted_total }
81
+
82
+ # Cache calculations, this takes too long on large indexes
83
+ cached_calcs = Hash.new do |hash, val|
84
+ hash[val] = Math.log( val + 1 ) / -weighted_total
85
+ end
86
+
87
+ vec.collect! { |val|
88
+ cached_calcs[val]
89
+ }
73
90
  end
74
91
 
75
92
  if $GSL
76
- @raw_norm = vec.normalize
77
- @raw_vector = vec
93
+ @raw_norm = vec.normalize
94
+ @raw_vector = vec
78
95
  else
79
- @raw_norm = Vector[*vec].normalize
80
- @raw_vector = Vector[*vec]
96
+ @raw_norm = Vector[*vec].normalize
97
+ @raw_vector = Vector[*vec]
81
98
  end
82
99
  end
83
100
 
@@ -15,11 +15,11 @@ module ClassifierReborn
15
15
  end
16
16
 
17
17
  def split_sentences(str)
18
- str.split /(\.|\!|\?)/ # TODO: make this less primitive
18
+ str.split(/(\.|\!|\?)/) # TODO: make this less primitive
19
19
  end
20
20
 
21
21
  def split_paragraphs(str)
22
- str.split /(\n\n|\r\r|\r\n\r\n)/ # TODO: make this less primitive
22
+ str.split(/(\n\n|\r\r|\r\n\r\n)/) # TODO: make this less primitive
23
23
  end
24
24
 
25
25
  def perform_lsi(chunks, count, separator)
@@ -1,3 +1,3 @@
1
1
  module ClassifierReborn
2
- VERSION = '2.0.3'
2
+ VERSION = '2.0.4'
3
3
  end
metadata CHANGED
@@ -1,15 +1,16 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: classifier-reborn
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.0.3
4
+ version: 2.0.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Lucas Carlson
8
8
  - Parker Moore
9
+ - Chase Gilliam
9
10
  autorequire:
10
11
  bindir: bin
11
12
  cert_chain: []
12
- date: 2014-12-23 00:00:00.000000000 Z
13
+ date: 2015-10-31 00:00:00.000000000 Z
13
14
  dependencies:
14
15
  - !ruby/object:Gem::Dependency
15
16
  name: fast-stemmer
@@ -53,10 +54,25 @@ dependencies:
53
54
  - - ">="
54
55
  - !ruby/object:Gem::Version
55
56
  version: '0'
57
+ - !ruby/object:Gem::Dependency
58
+ name: test-unit
59
+ requirement: !ruby/object:Gem::Requirement
60
+ requirements:
61
+ - - ">="
62
+ - !ruby/object:Gem::Version
63
+ version: '0'
64
+ type: :development
65
+ prerelease: false
66
+ version_requirements: !ruby/object:Gem::Requirement
67
+ requirements:
68
+ - - ">="
69
+ - !ruby/object:Gem::Version
70
+ version: '0'
56
71
  description:
57
72
  email:
58
73
  - lucas@rufy.com
59
74
  - parkrmoore@gmail.com
75
+ - chase.gilliam@gmail.com
60
76
  executables:
61
77
  - bayes.rb
62
78
  - summarize.rb
@@ -69,6 +85,22 @@ files:
69
85
  - README.markdown
70
86
  - bin/bayes.rb
71
87
  - bin/summarize.rb
88
+ - data/stopwords/ca
89
+ - data/stopwords/cs
90
+ - data/stopwords/da
91
+ - data/stopwords/de
92
+ - data/stopwords/en
93
+ - data/stopwords/es
94
+ - data/stopwords/fi
95
+ - data/stopwords/fr
96
+ - data/stopwords/hu
97
+ - data/stopwords/it
98
+ - data/stopwords/nl
99
+ - data/stopwords/no
100
+ - data/stopwords/pl
101
+ - data/stopwords/pt
102
+ - data/stopwords/se
103
+ - data/stopwords/tr
72
104
  - lib/classifier-reborn.rb
73
105
  - lib/classifier-reborn/bayes.rb
74
106
  - lib/classifier-reborn/category_namer.rb
@@ -76,6 +108,7 @@ files:
76
108
  - lib/classifier-reborn/extensions/vector.rb
77
109
  - lib/classifier-reborn/extensions/vector_serialize.rb
78
110
  - lib/classifier-reborn/lsi.rb
111
+ - lib/classifier-reborn/lsi/cached_content_node.rb
79
112
  - lib/classifier-reborn/lsi/content_node.rb
80
113
  - lib/classifier-reborn/lsi/summarizer.rb
81
114
  - lib/classifier-reborn/lsi/word_list.rb
@@ -101,8 +134,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
101
134
  version: '0'
102
135
  requirements: []
103
136
  rubyforge_project:
104
- rubygems_version: 2.2.2
137
+ rubygems_version: 2.4.8
105
138
  signing_key:
106
139
  specification_version: 2
107
140
  summary: A general classifier module to allow Bayesian and other types of classifications.
108
141
  test_files: []
142
+ has_rdoc: true