ankusa 0.0.12 → 0.0.13
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +9 -0
- data/Rakefile +7 -0
- data/lib/ankusa/mongo_db_storage.rb +127 -0
- data/lib/ankusa/version.rb +1 -1
- metadata +5 -4
data/README.rdoc
CHANGED
@@ -110,6 +110,13 @@ To use the Cassandra storage class:
|
|
110
110
|
# defaults: host='127.0.0.1', port=9160, keyspace = 'ankusa', max_classes = 100
|
111
111
|
storage = Ankusa::CassandraStorage.new host, port, keyspace, max_classes
|
112
112
|
|
113
|
+
For MongoDB storage:
|
114
|
+
require 'ankusa/mongo_db_storage'
|
115
|
+
storage = Ankusa::MongoDbStorage.new :host => "localhost", :port => 27017, :db => "ankusa"
|
116
|
+
# defaults: :host => "localhost", :port => 27017, :db => "ankusa"
|
117
|
+
# no default username or password
|
118
|
+
# you can also use the :frequency_tablename and :summary_tablename options
|
119
|
+
|
113
120
|
|
114
121
|
== Running Tests
|
115
122
|
You can run the tests for any of the five storage methods. For instance, for memory storage:
|
@@ -121,6 +128,8 @@ For the other methods you will need to edit the file test/config.yml and set the
|
|
121
128
|
rake test_cassandra
|
122
129
|
# or
|
123
130
|
rake test_filesystem
|
131
|
+
# or
|
132
|
+
rake test_mongo_db
|
124
133
|
|
125
134
|
|
126
135
|
|
data/Rakefile
CHANGED
@@ -40,3 +40,10 @@ Rake::TestTask.new("test_filesystem") { |t|
|
|
40
40
|
t.test_files = FileList['test/hasher_test.rb', 'test/file_system_classifier_test.rb']
|
41
41
|
t.verbose = true
|
42
42
|
}
|
43
|
+
|
44
|
+
desc "Run all unit tests with MongoDb storage"
|
45
|
+
Rake::TestTask.new("test_mongo_db") { |t|
|
46
|
+
t.libs << "lib"
|
47
|
+
t.test_files = FileList['test/hasher_test.rb', 'test/mongo_db_classifier_test.rb']
|
48
|
+
t.verbose = true
|
49
|
+
}
|
@@ -0,0 +1,127 @@
|
|
1
|
+
require 'mongo'
|
2
|
+
#require 'bson_ext'
|
3
|
+
|
4
|
+
module Ankusa
  # Storage backend that keeps classifier statistics in MongoDB.
  #
  # Two collections are used:
  # * the frequency collection holds one document per word ('word' field)
  #   plus one numeric field per class name holding that word's count;
  # * the summary collection holds one document per class ('klass' field)
  #   with the counters 'word_count', 'doc_count' and 'vocabulary_size'.
  class MongoDbStorage

    # Connection and collection defaults; any key may be overridden via the
    # options hash given to #initialize.
    DEFAULT_OPTIONS = {
      :host => "localhost", :port => 27017, :db => "ankusa",
      :frequency_tablename => "word_frequencies", :summary_tablename => "summary"
    }.freeze

    # Opens the connection and makes sure both collections exist.
    #
    # opts - Hash of overrides for DEFAULT_OPTIONS. :username / :password are
    #        optional; authentication is only attempted when :password is set.
    def initialize(opts={})
      options = DEFAULT_OPTIONS.merge(opts)

      # Keep the connection handle so that #close can release it.
      @connection = Mongo::Connection.new(options[:host], options[:port])
      @db = @connection.db(options[:db])
      @db.authenticate(options[:username], options[:password]) if options[:password]

      @ftablename = options[:frequency_tablename]
      @stablename = options[:summary_tablename]

      @klass_word_counts = {}
      @klass_doc_counts = {}

      init_tables
    end

    # Creates the frequency and summary collections when missing and ensures
    # the lookup indexes on 'word' and 'klass'.
    def init_tables
      @db.create_collection(@ftablename) unless @db.collection_names.include?(@ftablename)
      freq_table.create_index('word')
      @db.create_collection(@stablename) unless @db.collection_names.include?(@stablename)
      summary_table.create_index('klass')
    end

    # Drops both collections, discarding all stored statistics.
    def drop_tables
      @db.drop_collection(@ftablename)
      @db.drop_collection(@stablename)
    end

    # Returns the distinct 'klass' values present in the summary collection.
    def classnames
      summary_table.distinct('klass')
    end

    # Drops and recreates both collections.
    def reset
      drop_tables
      init_tables
    end

    # Adds +count+ (may be negative) to the stored count of +word+ for
    # +klass+ and keeps the class' 'vocabulary_size' in sync.
    #
    # Returns the word's new count for +klass+.
    def incr_word_count(klass, word, count)
      freq_table.update({:word => word}, { '$inc' => {klass => count} }, :upsert => true)

      # The document is read back with a string key, hence klass.to_s.
      word_doc = freq_table.find_one({:word => word})
      current = word_doc[klass.to_s]
      previous = current - count

      # A word belongs to the vocabulary while its count is positive. Only
      # adjust the size when this update moved the count across that
      # boundary, so zero or negative increments on unseen words do not
      # inflate the vocabulary.
      if previous <= 0 && current > 0
        increment_summary_klass(klass, 'vocabulary_size', 1)
      elsif previous > 0 && current <= 0
        increment_summary_klass(klass, 'vocabulary_size', -1)
      end

      current
    end

    # Adds +count+ to the 'word_count' counter of +klass+.
    def incr_total_word_count(klass, count)
      increment_summary_klass(klass, 'word_count', count)
    end

    # Adds +count+ to the 'doc_count' counter of +klass+.
    def incr_doc_count(klass, count)
      increment_summary_klass(klass, 'doc_count', count)
    end

    # Returns a Hash (default value 0) mapping class name symbols to the
    # stored count of +word+; empty when the word has no document.
    def get_word_counts(word)
      counts = Hash.new(0)

      word_doc = freq_table.find_one({:word => word})
      return counts unless word_doc

      word_doc.each do |field, value|
        # '_id' and 'word' are bookkeeping fields, not class counters.
        next if field == '_id' || field == 'word'
        counts[field.to_sym] = value
      end

      counts
    end

    # Returns the 'word_count' of +klass+ as a Float (0.0 when unknown).
    def get_total_word_count(klass)
      summary_value(klass, 'word_count')
    end

    # Returns a Hash (default value 0) of class => 'doc_count'.
    def doc_count_totals
      summary_totals('doc_count')
    end

    # Returns a Hash (default value 0) of class => 'vocabulary_size'.
    def get_vocabulary_sizes
      summary_totals('vocabulary_size')
    end

    # Returns the 'doc_count' of +klass+ as a Float (0.0 when unknown).
    def get_doc_count(klass)
      summary_value(klass, 'doc_count')
    end

    # Closes the MongoDB connection opened by #initialize, if any.
    # Always returns nil.
    def close
      @connection.close if @connection
      nil
    end

    private

    # Memoized handle on the summary collection.
    def summary_table
      @stable ||= @db[@stablename]
    end

    # Memoized handle on the frequency collection.
    def freq_table
      @ftable ||= @db[@ftablename]
    end

    # Adds +count+ to +field+ of the summary document of +klass+, creating
    # the document on first use (upsert).
    def increment_summary_klass(klass, field, count)
      summary_table.update({:klass => klass}, { '$inc' => {field => count} }, :upsert => true)
    end

    # Reads +field+ from the summary document of +klass+ as a Float;
    # 0.0 when the class has no document or the field is not set.
    def summary_value(klass, field)
      klass_doc = summary_table.find_one(:klass => klass)
      klass_doc ? klass_doc[field].to_f : 0.0
    end

    # Builds a Hash (default value 0) of class => +field+ over every summary
    # document. A missing field is reported as 0 rather than nil.
    def summary_totals(field)
      totals = Hash.new(0)
      summary_table.find.each do |doc|
        totals[doc['klass']] = doc[field] || 0
      end
      totals
    end

  end
end
|
data/lib/ankusa/version.rb
CHANGED
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ankusa
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 5
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 0
|
9
|
-
-
|
10
|
-
version: 0.0.
|
9
|
+
- 13
|
10
|
+
version: 0.0.13
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Brian Muller
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2012-04-
|
18
|
+
date: 2012-04-16 00:00:00 Z
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
21
21
|
name: fast-stemmer
|
@@ -50,6 +50,7 @@ files:
|
|
50
50
|
- lib/ankusa/hbase_storage.rb
|
51
51
|
- lib/ankusa/kl_divergence.rb
|
52
52
|
- lib/ankusa/memory_storage.rb
|
53
|
+
- lib/ankusa/mongo_db_storage.rb
|
53
54
|
- lib/ankusa/naive_bayes.rb
|
54
55
|
- lib/ankusa/stopwords.rb
|
55
56
|
- lib/ankusa/version.rb
|