ankusa 0.0.12 → 0.0.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -110,6 +110,13 @@ To use the Cassandra storage class:
110
110
  # defaults: host='127.0.0.1', port=9160, keyspace = 'ankusa', max_classes = 100
111
111
  storage = Ankusa::CassandraStorage.new host, port, keyspace, max_classes
112
112
 
113
+ For MongoDB storage:
114
+ require 'ankusa/mongo_db_storage'
115
+ storage = Ankusa::MongoDbStorage.new :host => "localhost", :port => 27017, :db => "ankusa"
116
+ # defaults: :host => "localhost", :port => 27017, :db => "ankusa"
117
+ # no default username or password
118
+ # you can also use the :frequency_tablename and :summary_tablename options
119
+
113
120
 
114
121
  == Running Tests
115
122
  You can run the tests for any of the five storage methods. For instance, for memory storage:
@@ -121,6 +128,8 @@ For the other methods you will need to edit the file test/config.yml and set the
121
128
  rake test_cassandra
122
129
  # or
123
130
  rake test_filesystem
131
+ # or
132
+ rake test_mongo_db
124
133
 
125
134
 
126
135
 
data/Rakefile CHANGED
@@ -40,3 +40,10 @@ Rake::TestTask.new("test_filesystem") { |t|
40
40
  t.test_files = FileList['test/hasher_test.rb', 'test/file_system_classifier_test.rb']
41
41
  t.verbose = true
42
42
  }
43
+
44
# Registers the `test_mongo_db` Rake task, which runs the hasher tests plus
# the MongoDB-backed classifier tests with `lib` on the load path.
desc "Run all unit tests with MongoDb storage"
Rake::TestTask.new("test_mongo_db") do |task|
  task.libs.push("lib")
  task.verbose = true
  task.test_files = FileList['test/hasher_test.rb', 'test/mongo_db_classifier_test.rb']
end
@@ -0,0 +1,127 @@
1
+ require 'mongo'
2
+ #require 'bson_ext'
3
+
4
module Ankusa
  # Classifier storage backed by MongoDB.
  #
  # Two collections are used:
  # * the frequency collection holds one document per word, with one numeric
  #   field per class name (the per-class count of that word);
  # * the summary collection holds one document per class ('klass') with the
  #   counters 'word_count', 'doc_count' and 'vocabulary_size'.
  class MongoDbStorage

    # Connects to MongoDB and makes sure both collections exist.
    #
    # opts - Hash of options, merged over these defaults:
    #        :host                => "localhost"
    #        :port                => 27017
    #        :db                  => "ankusa"
    #        :frequency_tablename => "word_frequencies"
    #        :summary_tablename   => "summary"
    #        :username/:password  - no default; authentication is attempted
    #                               only when :password is given.
    def initialize(opts={})
      options = { :host => "localhost", :port => 27017, :db => "ankusa",
        :frequency_tablename => "word_frequencies", :summary_tablename => "summary"
      }.merge(opts)

      @db = Mongo::Connection.new(options[:host], options[:port]).db(options[:db])
      @db.authenticate(options[:username], options[:password]) if options[:password]

      @ftablename = options[:frequency_tablename]
      @stablename = options[:summary_tablename]

      init_tables
    end

    # Creates the frequency and summary collections when they are missing and
    # (re)declares the lookup indexes on 'word' and 'klass'.
    def init_tables
      ensure_collection(@ftablename)
      freq_table.create_index('word')
      ensure_collection(@stablename)
      summary_table.create_index('klass')
    end

    # Drops both collections, discarding all stored counts.
    def drop_tables
      @db.drop_collection(@ftablename)
      @db.drop_collection(@stablename)
    end

    # Returns the distinct class names present in the summary collection.
    def classnames
      summary_table.distinct('klass')
    end

    # Wipes all stored data and recreates empty collections and indexes.
    def reset
      drop_tables
      init_tables
    end

    # Adds +count+ (which may be negative when unlearning) to the stored count
    # of +word+ for +klass+ and keeps the class's 'vocabulary_size' in step.
    #
    # The vocabulary grows only when a previously unseen word receives a
    # positive count, and shrinks only when a negative count brings a word
    # back to zero. A zero count, or unlearning a word that was never seen,
    # leaves the vocabulary size alone.
    #
    # NOTE: the increment and the read-back are two separate operations, so
    # concurrent writers to the same word may observe each other's updates.
    #
    # Returns the word's new count for +klass+.
    def incr_word_count(klass, word, count)
      freq_table.update({:word => word}, { '$inc' => {klass => count} }, :upsert => true)

      # Field names come back from the database as strings.
      field = klass.to_s
      word_doc = freq_table.find_one({:word => word})
      new_count = (word_doc && word_doc[field]) || 0
      prior_count = new_count - count

      if prior_count == 0 && count > 0
        increment_summary_klass(klass, 'vocabulary_size', 1)
      elsif new_count == 0 && count < 0
        increment_summary_klass(klass, 'vocabulary_size', -1)
      end

      new_count
    end

    # Adds +count+ to the total number of words seen for +klass+.
    def incr_total_word_count(klass, count)
      increment_summary_klass(klass, 'word_count', count)
    end

    # Adds +count+ to the number of documents seen for +klass+.
    def incr_doc_count(klass, count)
      increment_summary_klass(klass, 'doc_count', count)
    end

    # Returns a Hash (default value 0) mapping class name symbols to the
    # stored count of +word+. The hash is empty when the word is unknown.
    def get_word_counts(word)
      counts = Hash.new(0)

      word_doc = freq_table.find_one({:word => word})
      if word_doc
        word_doc.each do |key, value|
          # '_id' and 'word' are bookkeeping fields, not class counts.
          next if key == "_id" || key == "word"
          counts[key.respond_to?(:to_sym) ? key.to_sym : key] = value
        end
      end

      counts
    end

    # Returns the total word count for +klass+ as a Float (0.0 if unknown).
    def get_total_word_count(klass)
      klass_doc = summary_table.find_one(:klass => klass)
      klass_doc ? klass_doc['word_count'].to_f : 0.0
    end

    # Returns a Hash (default value 0) mapping each class to its document
    # count. Classes whose summary has no 'doc_count' yet are reported as 0.
    def doc_count_totals
      summary_field_by_klass('doc_count')
    end

    # Returns a Hash (default value 0) mapping each class to its vocabulary
    # size. Classes whose summary has no 'vocabulary_size' yet are reported as 0.
    def get_vocabulary_sizes
      summary_field_by_klass('vocabulary_size')
    end

    # Returns the document count for +klass+ as a Float (0.0 if unknown).
    def get_doc_count(klass)
      klass_doc = summary_table.find_one(:klass => klass)
      klass_doc ? klass_doc['doc_count'].to_f : 0.0
    end

    # No-op; present so this class offers the same #close call as a storage
    # backend. The underlying connection is left open.
    def close
    end

    private

    # Memoized handle on the summary collection.
    def summary_table
      @stable ||= @db[@stablename]
    end

    # Memoized handle on the word-frequency collection.
    def freq_table
      @ftable ||= @db[@ftablename]
    end

    # Creates the collection +name+ unless the database already lists it.
    def ensure_collection(name)
      @db.create_collection(name) unless @db.collection_names.include?(name)
    end

    # Collects +field+ from every summary document into a Hash keyed by class,
    # substituting 0 when a document does not carry the field yet.
    def summary_field_by_klass(field)
      totals = Hash.new(0)
      summary_table.find.each do |doc|
        totals[ doc['klass'] ] = doc[field] || 0
      end
      totals
    end

    # Adds +count+ to +field+ of the summary document for +klass+, creating
    # the document if it does not exist.
    def increment_summary_klass(klass, field, count)
      summary_table.update({:klass => klass}, { '$inc' => {field => count} }, :upsert => true)
    end

  end
end
@@ -1,3 +1,3 @@
1
1
  module Ankusa
2
- VERSION = "0.0.12"
2
+ VERSION = "0.0.13"
3
3
  end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ankusa
3
3
  version: !ruby/object:Gem::Version
4
- hash: 7
4
+ hash: 5
5
5
  prerelease:
6
6
  segments:
7
7
  - 0
8
8
  - 0
9
- - 12
10
- version: 0.0.12
9
+ - 13
10
+ version: 0.0.13
11
11
  platform: ruby
12
12
  authors:
13
13
  - Brian Muller
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2012-04-10 00:00:00 Z
18
+ date: 2012-04-16 00:00:00 Z
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
21
21
  name: fast-stemmer
@@ -50,6 +50,7 @@ files:
50
50
  - lib/ankusa/hbase_storage.rb
51
51
  - lib/ankusa/kl_divergence.rb
52
52
  - lib/ankusa/memory_storage.rb
53
+ - lib/ankusa/mongo_db_storage.rb
53
54
  - lib/ankusa/naive_bayes.rb
54
55
  - lib/ankusa/stopwords.rb
55
56
  - lib/ankusa/version.rb