ankusa 0.0.12 → 0.0.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +9 -0
- data/Rakefile +7 -0
- data/lib/ankusa/mongo_db_storage.rb +127 -0
- data/lib/ankusa/version.rb +1 -1
- metadata +5 -4
data/README.rdoc
CHANGED
@@ -110,6 +110,13 @@ To use the Cassandra storage class:
|
|
110
110
|
# defaults: host='127.0.0.1', port=9160, keyspace = 'ankusa', max_classes = 100
|
111
111
|
storage = Ankusa::CassandraStorage.new host, port, keyspace, max_classes
|
112
112
|
|
113
|
+
For MongoDB storage:
|
114
|
+
require 'ankusa/mongo_db_storage'
|
115
|
+
storage = Ankusa::MongoDbStorage.new :host => "localhost", :port => 27017, :db => "ankusa"
|
116
|
+
# defaults: :host => "localhost", :port => 27017, :db => "ankusa"
|
117
|
+
# no default username or password
|
118
|
+
# you can also use the :frequency_tablename and :summary_tablename options
|
119
|
+
|
113
120
|
|
114
121
|
== Running Tests
|
115
122
|
You can run the tests for any of the four storage methods. For instance, for memory storage:
|
@@ -121,6 +128,8 @@ For the other methods you will need to edit the file test/config.yml and set the
|
|
121
128
|
rake test_cassandra
|
122
129
|
# or
|
123
130
|
rake test_filesystem
|
131
|
+
# or
|
132
|
+
rake test_mongo_db
|
124
133
|
|
125
134
|
|
126
135
|
|
data/Rakefile
CHANGED
@@ -40,3 +40,10 @@ Rake::TestTask.new("test_filesystem") { |t|
|
|
40
40
|
t.test_files = FileList['test/hasher_test.rb', 'test/file_system_classifier_test.rb']
|
41
41
|
t.verbose = true
|
42
42
|
}
|
43
|
+
|
44
|
+
desc "Run all unit tests with MongoDb storage"
|
45
|
+
Rake::TestTask.new("test_mongo_db") { |t|
|
46
|
+
t.libs << "lib"
|
47
|
+
t.test_files = FileList['test/hasher_test.rb', 'test/mongo_db_classifier_test.rb']
|
48
|
+
t.verbose = true
|
49
|
+
}
|
@@ -0,0 +1,127 @@
|
|
1
|
+
require 'mongo'
|
2
|
+
#require 'bson_ext'
|
3
|
+
|
4
|
+
module Ankusa
|
5
|
+
class MongoDbStorage
|
6
|
+
|
7
|
+
def initialize(opts={})
|
8
|
+
options = { :host => "localhost", :port => 27017, :db => "ankusa",
|
9
|
+
:frequency_tablename => "word_frequencies", :summary_tablename => "summary"
|
10
|
+
}.merge(opts)
|
11
|
+
|
12
|
+
@db = Mongo::Connection.new(options[:host], options[:port]).db(options[:db])
|
13
|
+
@db.authenticate(options[:username], options[:password]) if options[:password]
|
14
|
+
|
15
|
+
@ftablename = options[:frequency_tablename]
|
16
|
+
@stablename = options[:summary_tablename]
|
17
|
+
|
18
|
+
@klass_word_counts = {}
|
19
|
+
@klass_doc_counts = {}
|
20
|
+
|
21
|
+
init_tables
|
22
|
+
end
|
23
|
+
|
24
|
+
def init_tables
|
25
|
+
@db.create_collection(@ftablename) unless @db.collection_names.include?(@ftablename)
|
26
|
+
freq_table.create_index('word')
|
27
|
+
@db.create_collection(@stablename) unless @db.collection_names.include?(@stablename)
|
28
|
+
summary_table.create_index('klass')
|
29
|
+
end
|
30
|
+
|
31
|
+
def drop_tables
|
32
|
+
@db.drop_collection(@ftablename)
|
33
|
+
@db.drop_collection(@stablename)
|
34
|
+
end
|
35
|
+
|
36
|
+
def classnames
|
37
|
+
summary_table.distinct('klass')
|
38
|
+
end
|
39
|
+
|
40
|
+
def reset
|
41
|
+
drop_tables
|
42
|
+
init_tables
|
43
|
+
end
|
44
|
+
|
45
|
+
def incr_word_count(klass, word, count)
|
46
|
+
freq_table.update({:word => word}, { '$inc' => {klass => count} }, :upsert => true)
|
47
|
+
|
48
|
+
#update vocabulary size
|
49
|
+
word_doc = freq_table.find_one({:word => word})
|
50
|
+
if word_doc[klass.to_s] == count
|
51
|
+
increment_summary_klass(klass, 'vocabulary_size', 1)
|
52
|
+
elsif word_doc[klass.to_s] == 0
|
53
|
+
increment_summary_klass(klass, 'vocabulary_size', -1)
|
54
|
+
end
|
55
|
+
word_doc[klass.to_s]
|
56
|
+
end
|
57
|
+
|
58
|
+
def incr_total_word_count(klass, count)
|
59
|
+
increment_summary_klass(klass, 'word_count', count)
|
60
|
+
end
|
61
|
+
|
62
|
+
def incr_doc_count(klass, count)
|
63
|
+
increment_summary_klass(klass, 'doc_count', count)
|
64
|
+
end
|
65
|
+
|
66
|
+
def get_word_counts(word)
|
67
|
+
counts = Hash.new(0)
|
68
|
+
|
69
|
+
word_doc = freq_table.find_one({:word => word})
|
70
|
+
if word_doc
|
71
|
+
word_doc.delete("_id")
|
72
|
+
word_doc.delete("word")
|
73
|
+
#convert keys to symbols
|
74
|
+
counts.merge!(word_doc.inject({}){|h, (k, v)| h[(k.to_sym rescue k) || k] = v; h})
|
75
|
+
end
|
76
|
+
|
77
|
+
counts
|
78
|
+
end
|
79
|
+
|
80
|
+
def get_total_word_count(klass)
|
81
|
+
klass_doc = summary_table.find_one(:klass => klass)
|
82
|
+
klass_doc ? klass_doc['word_count'].to_f : 0.0
|
83
|
+
end
|
84
|
+
|
85
|
+
def doc_count_totals
|
86
|
+
count = Hash.new(0)
|
87
|
+
|
88
|
+
summary_table.find.each do |doc|
|
89
|
+
count[ doc['klass'] ] = doc['doc_count']
|
90
|
+
end
|
91
|
+
|
92
|
+
count
|
93
|
+
end
|
94
|
+
|
95
|
+
def get_vocabulary_sizes
|
96
|
+
count = Hash.new(0)
|
97
|
+
|
98
|
+
summary_table.find.each do |doc|
|
99
|
+
count[ doc['klass'] ] = doc['vocabulary_size']
|
100
|
+
end
|
101
|
+
|
102
|
+
count
|
103
|
+
end
|
104
|
+
|
105
|
+
def get_doc_count(klass)
|
106
|
+
klass_doc = summary_table.find_one(:klass => klass)
|
107
|
+
klass_doc ? klass_doc['doc_count'].to_f : 0.0
|
108
|
+
end
|
109
|
+
|
110
|
+
def close
|
111
|
+
end
|
112
|
+
|
113
|
+
private
|
114
|
+
def summary_table
|
115
|
+
@stable ||= @db[@stablename]
|
116
|
+
end
|
117
|
+
|
118
|
+
def freq_table
|
119
|
+
@ftable ||= @db[@ftablename]
|
120
|
+
end
|
121
|
+
|
122
|
+
def increment_summary_klass(klass, field, count)
|
123
|
+
summary_table.update({:klass => klass}, { '$inc' => {field => count} }, :upsert => true)
|
124
|
+
end
|
125
|
+
|
126
|
+
end
|
127
|
+
end
|
data/lib/ankusa/version.rb
CHANGED
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ankusa
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 5
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 0
|
9
|
-
- 12
|
10
|
-
version: 0.0.12
|
9
|
+
- 13
|
10
|
+
version: 0.0.13
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Brian Muller
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2012-04-
|
18
|
+
date: 2012-04-16 00:00:00 Z
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
21
21
|
name: fast-stemmer
|
@@ -50,6 +50,7 @@ files:
|
|
50
50
|
- lib/ankusa/hbase_storage.rb
|
51
51
|
- lib/ankusa/kl_divergence.rb
|
52
52
|
- lib/ankusa/memory_storage.rb
|
53
|
+
- lib/ankusa/mongo_db_storage.rb
|
53
54
|
- lib/ankusa/naive_bayes.rb
|
54
55
|
- lib/ankusa/stopwords.rb
|
55
56
|
- lib/ankusa/version.rb
|