ankusa 0.0.12 → 0.0.13

Sign up to get free protection for your applications and to get access to all the features.
@@ -110,6 +110,13 @@ To use the Cassandra storage class:
110
110
  # defaults: host='127.0.0.1', port=9160, keyspace = 'ankusa', max_classes = 100
111
111
  storage = Ankusa::CassandraStorage.new host, port, keyspace, max_classes
112
112
 
113
+ For MongoDB storage:
114
+ require 'ankusa/mongo_db_storage'
115
+ storage = Ankusa::MongoDbStorage.new :host => "localhost", :port => 27017, :db => "ankusa"
116
+ # defaults: :host => "localhost", :port => 27017, :db => "ankusa"
117
+ # no default username or password
118
+ # you can also use the :frequency_tablename and :summary_tablename options
119
+
113
120
 
114
121
  == Running Tests
115
122
  You can run the tests for any of the five storage methods. For instance, for memory storage:
@@ -121,6 +128,8 @@ For the other methods you will need to edit the file test/config.yml and set the
121
128
  rake test_cassandra
122
129
  # or
123
130
  rake test_filesystem
131
+ # or
132
+ rake test_mongo_db
124
133
 
125
134
 
126
135
 
data/Rakefile CHANGED
@@ -40,3 +40,10 @@ Rake::TestTask.new("test_filesystem") { |t|
40
40
  t.test_files = FileList['test/hasher_test.rb', 'test/file_system_classifier_test.rb']
41
41
  t.verbose = true
42
42
  }
43
+
44
+ desc "Run all unit tests with MongoDb storage"
45
+ Rake::TestTask.new("test_mongo_db") { |t|
46
+ t.libs << "lib"
47
+ t.test_files = FileList['test/hasher_test.rb', 'test/mongo_db_classifier_test.rb']
48
+ t.verbose = true
49
+ }
@@ -0,0 +1,127 @@
1
+ require 'mongo'
2
+ #require 'bson_ext'
3
+
4
+ module Ankusa
5
+ class MongoDbStorage
6
+
7
+ def initialize(opts={})
8
+ options = { :host => "localhost", :port => 27017, :db => "ankusa",
9
+ :frequency_tablename => "word_frequencies", :summary_tablename => "summary"
10
+ }.merge(opts)
11
+
12
+ @db = Mongo::Connection.new(options[:host], options[:port]).db(options[:db])
13
+ @db.authenticate(options[:username], options[:password]) if options[:password]
14
+
15
+ @ftablename = options[:frequency_tablename]
16
+ @stablename = options[:summary_tablename]
17
+
18
+ @klass_word_counts = {}
19
+ @klass_doc_counts = {}
20
+
21
+ init_tables
22
+ end
23
+
24
+ def init_tables
25
+ @db.create_collection(@ftablename) unless @db.collection_names.include?(@ftablename)
26
+ freq_table.create_index('word')
27
+ @db.create_collection(@stablename) unless @db.collection_names.include?(@stablename)
28
+ summary_table.create_index('klass')
29
+ end
30
+
31
+ def drop_tables
32
+ @db.drop_collection(@ftablename)
33
+ @db.drop_collection(@stablename)
34
+ end
35
+
36
+ def classnames
37
+ summary_table.distinct('klass')
38
+ end
39
+
40
+ def reset
41
+ drop_tables
42
+ init_tables
43
+ end
44
+
45
+ def incr_word_count(klass, word, count)
46
+ freq_table.update({:word => word}, { '$inc' => {klass => count} }, :upsert => true)
47
+
48
+ #update vocabulary size
49
+ word_doc = freq_table.find_one({:word => word})
50
+ if word_doc[klass.to_s] == count
51
+ increment_summary_klass(klass, 'vocabulary_size', 1)
52
+ elsif word_doc[klass.to_s] == 0
53
+ increment_summary_klass(klass, 'vocabulary_size', -1)
54
+ end
55
+ word_doc[klass.to_s]
56
+ end
57
+
58
+ def incr_total_word_count(klass, count)
59
+ increment_summary_klass(klass, 'word_count', count)
60
+ end
61
+
62
+ def incr_doc_count(klass, count)
63
+ increment_summary_klass(klass, 'doc_count', count)
64
+ end
65
+
66
+ def get_word_counts(word)
67
+ counts = Hash.new(0)
68
+
69
+ word_doc = freq_table.find_one({:word => word})
70
+ if word_doc
71
+ word_doc.delete("_id")
72
+ word_doc.delete("word")
73
+ #convert keys to symbols
74
+ counts.merge!(word_doc.inject({}){|h, (k, v)| h[(k.to_sym rescue k) || k] = v; h})
75
+ end
76
+
77
+ counts
78
+ end
79
+
80
+ def get_total_word_count(klass)
81
+ klass_doc = summary_table.find_one(:klass => klass)
82
+ klass_doc ? klass_doc['word_count'].to_f : 0.0
83
+ end
84
+
85
+ def doc_count_totals
86
+ count = Hash.new(0)
87
+
88
+ summary_table.find.each do |doc|
89
+ count[ doc['klass'] ] = doc['doc_count']
90
+ end
91
+
92
+ count
93
+ end
94
+
95
+ def get_vocabulary_sizes
96
+ count = Hash.new(0)
97
+
98
+ summary_table.find.each do |doc|
99
+ count[ doc['klass'] ] = doc['vocabulary_size']
100
+ end
101
+
102
+ count
103
+ end
104
+
105
+ def get_doc_count(klass)
106
+ klass_doc = summary_table.find_one(:klass => klass)
107
+ klass_doc ? klass_doc['doc_count'].to_f : 0.0
108
+ end
109
+
110
+ def close
111
+ end
112
+
113
+ private
114
+ def summary_table
115
+ @stable ||= @db[@stablename]
116
+ end
117
+
118
+ def freq_table
119
+ @ftable ||= @db[@ftablename]
120
+ end
121
+
122
+ def increment_summary_klass(klass, field, count)
123
+ summary_table.update({:klass => klass}, { '$inc' => {field => count} }, :upsert => true)
124
+ end
125
+
126
+ end
127
+ end
@@ -1,3 +1,3 @@
1
1
  module Ankusa
2
- VERSION = "0.0.12"
2
+ VERSION = "0.0.13"
3
3
  end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ankusa
3
3
  version: !ruby/object:Gem::Version
4
- hash: 7
4
+ hash: 5
5
5
  prerelease:
6
6
  segments:
7
7
  - 0
8
8
  - 0
9
- - 12
10
- version: 0.0.12
9
+ - 13
10
+ version: 0.0.13
11
11
  platform: ruby
12
12
  authors:
13
13
  - Brian Muller
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2012-04-10 00:00:00 Z
18
+ date: 2012-04-16 00:00:00 Z
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
21
21
  name: fast-stemmer
@@ -50,6 +50,7 @@ files:
50
50
  - lib/ankusa/hbase_storage.rb
51
51
  - lib/ankusa/kl_divergence.rb
52
52
  - lib/ankusa/memory_storage.rb
53
+ - lib/ankusa/mongo_db_storage.rb
53
54
  - lib/ankusa/naive_bayes.rb
54
55
  - lib/ankusa/stopwords.rb
55
56
  - lib/ankusa/version.rb