tomoto 0.2.2 → 0.2.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/ext/tomoto/ct.cpp +11 -11
- data/ext/tomoto/dmr.cpp +14 -13
- data/ext/tomoto/dt.cpp +14 -14
- data/ext/tomoto/ext.cpp +7 -7
- data/ext/tomoto/extconf.rb +1 -3
- data/ext/tomoto/gdmr.cpp +7 -7
- data/ext/tomoto/hdp.cpp +9 -9
- data/ext/tomoto/hlda.cpp +13 -13
- data/ext/tomoto/hpa.cpp +5 -5
- data/ext/tomoto/lda.cpp +42 -39
- data/ext/tomoto/llda.cpp +6 -6
- data/ext/tomoto/mglda.cpp +15 -15
- data/ext/tomoto/pa.cpp +6 -6
- data/ext/tomoto/plda.cpp +6 -6
- data/ext/tomoto/slda.cpp +8 -8
- data/ext/tomoto/utils.h +16 -70
- data/lib/tomoto/version.rb +1 -1
- data/vendor/tomotopy/README.kr.rst +57 -0
- data/vendor/tomotopy/README.rst +55 -0
- data/vendor/tomotopy/src/Labeling/Phraser.hpp +3 -3
- data/vendor/tomotopy/src/TopicModel/CTModel.hpp +5 -2
- data/vendor/tomotopy/src/TopicModel/DMRModel.hpp +5 -2
- data/vendor/tomotopy/src/TopicModel/DTModel.hpp +5 -2
- data/vendor/tomotopy/src/TopicModel/HDPModel.hpp +4 -4
- data/vendor/tomotopy/src/TopicModel/HLDAModel.hpp +5 -2
- data/vendor/tomotopy/src/TopicModel/HPAModel.hpp +2 -2
- data/vendor/tomotopy/src/TopicModel/LDA.h +3 -3
- data/vendor/tomotopy/src/TopicModel/LDACVB0Model.hpp +3 -3
- data/vendor/tomotopy/src/TopicModel/LDAModel.hpp +34 -14
- data/vendor/tomotopy/src/TopicModel/LLDAModel.hpp +5 -2
- data/vendor/tomotopy/src/TopicModel/MGLDAModel.hpp +2 -2
- data/vendor/tomotopy/src/TopicModel/PAModel.hpp +1 -1
- data/vendor/tomotopy/src/TopicModel/PLDAModel.hpp +5 -2
- data/vendor/tomotopy/src/TopicModel/PTModel.hpp +5 -2
- data/vendor/tomotopy/src/TopicModel/SLDAModel.hpp +4 -1
- data/vendor/tomotopy/src/TopicModel/TopicModel.hpp +48 -21
- data/vendor/tomotopy/src/Utils/AliasMethod.hpp +5 -4
- data/vendor/tomotopy/src/Utils/Dictionary.h +2 -2
- data/vendor/tomotopy/src/Utils/MultiNormalDistribution.hpp +1 -1
- data/vendor/tomotopy/src/Utils/TruncMultiNormal.hpp +1 -1
- data/vendor/tomotopy/src/Utils/math.h +2 -2
- data/vendor/tomotopy/src/Utils/serializer.hpp +30 -5
- metadata +6 -6
data/ext/tomoto/plda.cpp
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
#include <PLDA.h>
|
2
2
|
|
3
|
-
#include <rice/
|
3
|
+
#include <rice/rice.hpp>
|
4
4
|
|
5
5
|
#include "utils.h"
|
6
6
|
|
7
7
|
void init_plda(Rice::Module& m) {
|
8
8
|
Rice::define_class_under<tomoto::IPLDAModel, tomoto::ILLDAModel>(m, "PLDA")
|
9
|
-
.
|
9
|
+
.define_singleton_function(
|
10
10
|
"_new",
|
11
|
-
|
11
|
+
[](size_t tw, size_t latent_topics, tomoto::Float alpha, tomoto::Float eta, size_t seed) {
|
12
12
|
tomoto::PLDAArgs args;
|
13
13
|
args.numLatentTopics = latent_topics;
|
14
14
|
args.alpha = {alpha};
|
@@ -17,17 +17,17 @@ void init_plda(Rice::Module& m) {
|
|
17
17
|
args.seed = seed;
|
18
18
|
}
|
19
19
|
return tomoto::IPLDAModel::create((tomoto::TermWeight)tw, args);
|
20
|
-
})
|
20
|
+
}, Rice::Return().takeOwnership())
|
21
21
|
.define_method(
|
22
22
|
"_add_doc",
|
23
|
-
|
23
|
+
[](tomoto::IPLDAModel& self, std::vector<std::string> words, std::vector<std::string> labels) {
|
24
24
|
auto doc = buildDoc(words);
|
25
25
|
doc.misc["labels"] = labels;
|
26
26
|
return self.addDoc(doc);
|
27
27
|
})
|
28
28
|
.define_method(
|
29
29
|
"latent_topics",
|
30
|
-
|
30
|
+
[](tomoto::IPLDAModel& self) {
|
31
31
|
return self.getNumLatentTopics();
|
32
32
|
});
|
33
33
|
}
|
data/ext/tomoto/slda.cpp
CHANGED
@@ -1,18 +1,18 @@
|
|
1
1
|
#include <SLDA.h>
|
2
2
|
|
3
|
-
#include <rice/
|
3
|
+
#include <rice/rice.hpp>
|
4
4
|
|
5
5
|
#include "utils.h"
|
6
6
|
|
7
7
|
void init_slda(Rice::Module& m) {
|
8
8
|
Rice::define_class_under<tomoto::ISLDAModel, tomoto::ILDAModel>(m, "SLDA")
|
9
|
-
.
|
9
|
+
.define_singleton_function(
|
10
10
|
"_new",
|
11
|
-
|
11
|
+
[](size_t tw, size_t k, Array rb_vars, tomoto::Float alpha, tomoto::Float eta, std::vector<tomoto::Float> mu, std::vector<tomoto::Float> nu_sq, std::vector<tomoto::Float> glm_param, size_t seed) {
|
12
12
|
std::vector<tomoto::ISLDAModel::GLM> vars;
|
13
13
|
vars.reserve(rb_vars.size());
|
14
14
|
for (auto const& v : rb_vars) {
|
15
|
-
vars.push_back((tomoto::ISLDAModel::GLM)
|
15
|
+
vars.push_back((tomoto::ISLDAModel::GLM) Rice::detail::From_Ruby<int>().convert(v.value()));
|
16
16
|
}
|
17
17
|
tomoto::SLDAArgs args;
|
18
18
|
args.k = k;
|
@@ -26,22 +26,22 @@ void init_slda(Rice::Module& m) {
|
|
26
26
|
args.seed = seed;
|
27
27
|
}
|
28
28
|
return tomoto::ISLDAModel::create((tomoto::TermWeight)tw, args);
|
29
|
-
})
|
29
|
+
}, Rice::Return().takeOwnership())
|
30
30
|
.define_method(
|
31
31
|
"_add_doc",
|
32
|
-
|
32
|
+
[](tomoto::ISLDAModel& self, std::vector<std::string> words, std::vector<tomoto::Float> y) {
|
33
33
|
auto doc = buildDoc(words);
|
34
34
|
doc.misc["y"] = y;
|
35
35
|
return self.addDoc(doc);
|
36
36
|
})
|
37
37
|
.define_method(
|
38
38
|
"f",
|
39
|
-
|
39
|
+
[](tomoto::ISLDAModel& self) {
|
40
40
|
return self.getF();
|
41
41
|
})
|
42
42
|
.define_method(
|
43
43
|
"_var_type",
|
44
|
-
|
44
|
+
[](tomoto::ISLDAModel& self, size_t var_id) {
|
45
45
|
if (var_id >= self.getF()) throw std::runtime_error{ "'var_id' must be < 'f'" };
|
46
46
|
return self.getTypeOfVar(var_id) == tomoto::ISLDAModel::GLM::linear ? "l" : "b";
|
47
47
|
});
|
data/ext/tomoto/utils.h
CHANGED
@@ -1,80 +1,26 @@
|
|
1
1
|
#pragma once
|
2
2
|
|
3
|
-
#include <rice/
|
3
|
+
#include <rice/rice.hpp>
|
4
|
+
#include <rice/stl.hpp>
|
4
5
|
|
5
6
|
using Rice::Array;
|
6
7
|
using Rice::Object;
|
7
8
|
|
8
|
-
|
9
|
-
inline
|
10
|
-
Object to_ruby<std::vector<tomoto::Float>>(std::vector<tomoto::Float> const & x)
|
9
|
+
namespace Rice::detail
|
11
10
|
{
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
res.push(v);
|
26
|
-
}
|
27
|
-
return res;
|
28
|
-
}
|
29
|
-
|
30
|
-
template<>
|
31
|
-
inline
|
32
|
-
Object to_ruby<std::vector<uint64_t>>(std::vector<uint64_t> const & x)
|
33
|
-
{
|
34
|
-
Array res;
|
35
|
-
for (auto const& v : x) {
|
36
|
-
res.push(v);
|
37
|
-
}
|
38
|
-
return res;
|
39
|
-
}
|
40
|
-
|
41
|
-
template<>
|
42
|
-
inline
|
43
|
-
std::vector<std::string> from_ruby<std::vector<std::string>>(Object x)
|
44
|
-
{
|
45
|
-
Array a = Array(x);
|
46
|
-
std::vector<std::string> res;
|
47
|
-
res.reserve(a.size());
|
48
|
-
for (auto const& v : a) {
|
49
|
-
res.push_back(from_ruby<std::string>(v));
|
50
|
-
}
|
51
|
-
return res;
|
52
|
-
}
|
53
|
-
|
54
|
-
template<>
|
55
|
-
inline
|
56
|
-
std::vector<tomoto::Float> from_ruby<std::vector<tomoto::Float>>(Object x)
|
57
|
-
{
|
58
|
-
Array a = Array(x);
|
59
|
-
std::vector<tomoto::Float> res;
|
60
|
-
res.reserve(a.size());
|
61
|
-
for (auto const& v : a) {
|
62
|
-
res.push_back(from_ruby<tomoto::Float>(v));
|
63
|
-
}
|
64
|
-
return res;
|
65
|
-
}
|
66
|
-
|
67
|
-
template<>
|
68
|
-
inline
|
69
|
-
std::vector<uint64_t> from_ruby<std::vector<uint64_t>>(Object x)
|
70
|
-
{
|
71
|
-
Array a = Array(x);
|
72
|
-
std::vector<uint64_t> res;
|
73
|
-
res.reserve(a.size());
|
74
|
-
for (auto const& v : a) {
|
75
|
-
res.push_back(from_ruby<uint64_t>(v));
|
76
|
-
}
|
77
|
-
return res;
|
11
|
+
template<typename T>
|
12
|
+
class To_Ruby<std::vector<T>>
|
13
|
+
{
|
14
|
+
public:
|
15
|
+
VALUE convert(std::vector<T> const & x)
|
16
|
+
{
|
17
|
+
auto a = rb_ary_new2(x.size());
|
18
|
+
for (const auto& v : x) {
|
19
|
+
detail::protect(rb_ary_push, a, To_Ruby<T>().convert(v));
|
20
|
+
}
|
21
|
+
return a;
|
22
|
+
}
|
23
|
+
};
|
78
24
|
}
|
79
25
|
|
80
26
|
inline tomoto::RawDoc buildDoc(std::vector<std::string>& words) {
|
data/lib/tomoto/version.rb
CHANGED
@@ -198,6 +198,57 @@ add_doc은 `tomotopy.LDAModel.train`을 시작하기 전까지만 사용할 수
|
|
198
198
|
infer 메소드는 `tomotopy.Document` 인스턴스 하나를 추론하거나 `tomotopy.Document` 인스턴스의 `list`를 추론하는데 사용할 수 있습니다.
|
199
199
|
자세한 것은 `tomotopy.LDAModel.infer`을 참조하길 바랍니다.
|
200
200
|
|
201
|
+
Corpus와 transform
|
202
|
+
--------------------
|
203
|
+
`tomotopy`의 모든 토픽 모델들은 각자 별도의 내부적인 문헌 타입을 가지고 있습니다.
|
204
|
+
그리고 이 문헌 타입들에 맞는 문헌들은 각 모델의 `add_doc` 메소드를 통해 생성될 수 있습니다.
|
205
|
+
하지만 이 때문에 동일한 목록의 문헌들을 서로 다른 토픽 모델에 입력해야 하는 경우
|
206
|
+
매 모델에 각 문헌을 추가할때마다 `add_doc`을 호출해야하기 때문에 비효율이 발생합니다.
|
207
|
+
따라서 `tomotopy`에서는 여러 문헌을 묶어서 관리해주는 `tomotopy.utils.Corpus` 클래스를 제공합니다.
|
208
|
+
토픽 모델 객체를 생성할때 `tomotopy.utils.Corpus`를 `__init__` 메소드의 `corpus` 인자로 넘겨줌으로써
|
209
|
+
어떤 모델에든 쉽게 문헌들을 삽입할 수 있게 해줍니다.
|
210
|
+
`tomotopy.utils.Corpus`를 토픽 모델에 삽입하면 corpus 객체가 가지고 있는 문헌들 전부가 모델에 자동으로 삽입됩니다.
|
211
|
+
|
212
|
+
그런데 일부 토픽 모델의 경우 문헌을 생성하기 위해 서로 다른 데이터를 요구합니다.
|
213
|
+
예를 들어 `tomotopy.DMRModel`는 `metadata`라는 `str` 타입의 데이터를 요구하고,
|
214
|
+
`tomotopy.PLDAModel`는 `labels`라는 `List[str]` 타입의 데이터를 요구합니다.
|
215
|
+
그러나 `tomotopy.utils.Corpus`는 토픽 모델에 종속되지 않은 독립적인 문헌 데이터를 보관하기 때문에,
|
216
|
+
corpus가 가지고 있는 문헌 데이터가 실제 토픽 모델이 요구하는 데이터와 일치하지 않을 가능성이 있습니다.
|
217
|
+
이 경우 `transform`라는 인자를 통해 corpus 내의 데이터를 변형시켜 토픽 모델이 요구하는 실제 데이터와 일치시킬 수 있습니다.
|
218
|
+
자세한 내용은 아래의 코드를 확인해주세요:
|
219
|
+
|
220
|
+
::
|
221
|
+
|
222
|
+
from tomotopy import DMRModel
|
223
|
+
from tomotopy.utils import Corpus
|
224
|
+
|
225
|
+
corpus = Corpus()
|
226
|
+
corpus.add_doc("a b c d e".split(), a_data=1)
|
227
|
+
corpus.add_doc("e f g h i".split(), a_data=2)
|
228
|
+
corpus.add_doc("i j k l m".split(), a_data=3)
|
229
|
+
|
230
|
+
model = DMRModel(k=10)
|
231
|
+
model.add_corpus(corpus)
|
232
|
+
# `corpus`에 있던 `a_data`는 사라지고
|
233
|
+
# `DMRModel`이 요구하는 `metadata`에는 기본값인 빈 문자열이 채워집니다.
|
234
|
+
|
235
|
+
assert model.docs[0].metadata == ''
|
236
|
+
assert model.docs[1].metadata == ''
|
237
|
+
assert model.docs[2].metadata == ''
|
238
|
+
|
239
|
+
def transform_a_data_to_metadata(misc: dict):
|
240
|
+
return {'metadata': str(misc['a_data'])}
|
241
|
+
# 이 함수는 `a_data`를 `metadata`로 변환합니다.
|
242
|
+
|
243
|
+
model = DMRModel(k=10)
|
244
|
+
model.add_corpus(corpus, transform=transform_a_data_to_metadata)
|
245
|
+
# 이제 `model`에는 기본값이 아닌 `metadata`가 입력됩니다. 이들은 `transform`에 의해 `a_data`로부터 생성됩니다.
|
246
|
+
|
247
|
+
assert model.docs[0].metadata == '1'
|
248
|
+
assert model.docs[1].metadata == '2'
|
249
|
+
assert model.docs[2].metadata == '3'
|
250
|
+
|
251
|
+
|
201
252
|
병렬 샘플링 알고리즘
|
202
253
|
----------------------------
|
203
254
|
`tomotopy`는 0.5.0버전부터 병렬 알고리즘을 고를 수 있는 선택지를 제공합니다.
|
@@ -254,6 +305,12 @@ tomotopy의 Python3 예제 코드는 https://github.com/bab2min/tomotopy/blob/ma
|
|
254
305
|
|
255
306
|
역사
|
256
307
|
-------
|
308
|
+
* 0.12.1 (2021-06-20)
|
309
|
+
* `tomotopy.LDAModel.set_word_prior()`가 크래시를 발생시키던 문제를 해결했습니다.
|
310
|
+
* 이제 `tomotopy.LDAModel.perplexity`와 `tomotopy.LDAModel.ll_per_word`가 TermWeight가 ONE이 아닌 경우에도 정확한 값을 반환합니다.
|
311
|
+
* 용어가중치가 적용된 빈도수를 반환하는 `tomotopy.LDAModel.used_vocab_weighted_freq`가 추가되었습니다.
|
312
|
+
* 이제 `tomotopy.LDAModel.summary()`가 단어의 엔트로피뿐만 아니라, 용어 가중치가 적용된 단어의 엔트로피도 함께 보여줍니다.
|
313
|
+
|
257
314
|
* 0.12.0 (2021-04-26)
|
258
315
|
* 이제 `tomotopy.DMRModel`와 `tomotopy.GDMRModel`가 다중 메타데이터를 지원합니다. (https://github.com/bab2min/tomotopy/blob/main/examples/dmr_multi_label.py 참조)
|
259
316
|
* `tomotopy.GDMRModel`의 성능이 개선되었습니다.
|
data/vendor/tomotopy/README.rst
CHANGED
@@ -202,6 +202,55 @@ Inference for unseen document should be performed using `tomotopy.LDAModel.infer
|
|
202
202
|
The `infer` method can infer either a single instance of `tomotopy.Document` or a `list` of instances of `tomotopy.Document`.
|
203
203
|
See more at `tomotopy.LDAModel.infer`.
|
204
204
|
|
205
|
+
Corpus and transform
|
206
|
+
--------------------
|
207
|
+
Every topic model in `tomotopy` has its own internal document type.
|
208
|
+
A document of the type suitable for each model can be created and added through that model's `add_doc` method.
|
209
|
+
However, trying to add the same list of documents to different models becomes quite inconvenient,
|
210
|
+
because `add_doc` has to be called on the same list of documents separately for each model.
|
211
|
+
Thus, `tomotopy` provides the `tomotopy.utils.Corpus` class, which holds a list of documents.
|
212
|
+
A `tomotopy.utils.Corpus` can be inserted into any model by passing it as the `corpus` argument to the `__init__` or `add_corpus` method of that model.
|
213
|
+
So, inserting a `tomotopy.utils.Corpus` has the same effect as inserting the documents that the corpus holds.
|
214
|
+
|
215
|
+
Some topic models require different data for their documents.
|
216
|
+
For example, `tomotopy.DMRModel` requires a `metadata` argument of type `str`,
|
217
|
+
but `tomotopy.PLDAModel` requires a `labels` argument of type `List[str]`.
|
218
|
+
Since `tomotopy.utils.Corpus` holds an independent set of documents rather than being tied to a specific topic model,
|
219
|
+
the data held by a corpus may not match the data types required by a topic model when the corpus is added into that topic model.
|
220
|
+
In this case, the miscellaneous data can be transformed to fit the target topic model using the `transform` argument.
|
221
|
+
See more details in the following code:
|
222
|
+
|
223
|
+
::
|
224
|
+
|
225
|
+
from tomotopy import DMRModel
|
226
|
+
from tomotopy.utils import Corpus
|
227
|
+
|
228
|
+
corpus = Corpus()
|
229
|
+
corpus.add_doc("a b c d e".split(), a_data=1)
|
230
|
+
corpus.add_doc("e f g h i".split(), a_data=2)
|
231
|
+
corpus.add_doc("i j k l m".split(), a_data=3)
|
232
|
+
|
233
|
+
model = DMRModel(k=10)
|
234
|
+
model.add_corpus(corpus)
|
235
|
+
# You lose `a_data` field in `corpus`,
|
236
|
+
# and `metadata` that `DMRModel` requires is filled with the default value, empty str.
|
237
|
+
|
238
|
+
assert model.docs[0].metadata == ''
|
239
|
+
assert model.docs[1].metadata == ''
|
240
|
+
assert model.docs[2].metadata == ''
|
241
|
+
|
242
|
+
def transform_a_data_to_metadata(misc: dict):
|
243
|
+
return {'metadata': str(misc['a_data'])}
|
244
|
+
# this function transforms `a_data` to `metadata`
|
245
|
+
|
246
|
+
model = DMRModel(k=10)
|
247
|
+
model.add_corpus(corpus, transform=transform_a_data_to_metadata)
|
248
|
+
# Now the docs in `model` have non-default `metadata`, which is generated from the `a_data` field.
|
249
|
+
|
250
|
+
assert model.docs[0].metadata == '1'
|
251
|
+
assert model.docs[1].metadata == '2'
|
252
|
+
assert model.docs[2].metadata == '3'
|
253
|
+
|
205
254
|
Parallel Sampling Algorithms
|
206
255
|
----------------------------
|
207
256
|
Since version 0.5.0, `tomotopy` allows you to choose a parallelism algorithm.
|
@@ -260,6 +309,12 @@ meaning you can use it for any reasonable purpose and remain in complete ownersh
|
|
260
309
|
|
261
310
|
History
|
262
311
|
-------
|
312
|
+
* 0.12.1 (2021-06-20)
|
313
|
+
* An issue where `tomotopy.LDAModel.set_word_prior()` causes a crash has been fixed.
|
314
|
+
* Now `tomotopy.LDAModel.perplexity` and `tomotopy.LDAModel.ll_per_word` return the accurate value when `TermWeight` is not `ONE`.
|
315
|
+
* `tomotopy.LDAModel.used_vocab_weighted_freq` was added, which returns term-weighted frequencies of words.
|
316
|
+
* Now `tomotopy.LDAModel.summary()` shows not only the entropy of words, but also the entropy of term-weighted words.
|
317
|
+
|
263
318
|
* 0.12.0 (2021-04-26)
|
264
319
|
* Now `tomotopy.DMRModel` and `tomotopy.GDMRModel` support multiple values of metadata (see https://github.com/bab2min/tomotopy/blob/main/examples/dmr_multi_label.py )
|
265
320
|
* The performance of `tomotopy.GDMRModel` was improved.
|
@@ -316,7 +316,7 @@ namespace tomoto
|
|
316
316
|
}
|
317
317
|
}
|
318
318
|
|
319
|
-
float totN = std::accumulate(vocabFreqs.begin(), vocabFreqs.end(), (size_t)0);
|
319
|
+
float totN = (float)std::accumulate(vocabFreqs.begin(), vocabFreqs.end(), (size_t)0);
|
320
320
|
const float logTotN = std::log(totN);
|
321
321
|
|
322
322
|
// calculating PMIs
|
@@ -489,7 +489,7 @@ namespace tomoto
|
|
489
489
|
|
490
490
|
float rbe = branchingEntropy(trieNodes[0].getNext(bigram.first)->getNext(bigram.second), candMinCnt);
|
491
491
|
float lbe = branchingEntropy(trieNodesBw[0].getNext(bigram.second)->getNext(bigram.first), candMinCnt);
|
492
|
-
float nbe = std::sqrt(rbe * lbe) / std::log(p.second);
|
492
|
+
float nbe = std::sqrt(rbe * lbe) / (float)std::log(p.second);
|
493
493
|
if (nbe < minNBE) continue;
|
494
494
|
candidates.emplace_back(npmi * nbe, bigram.first, bigram.second);
|
495
495
|
candidates.back().cf = p.second;
|
@@ -512,7 +512,7 @@ namespace tomoto
|
|
512
512
|
|
513
513
|
float rbe = branchingEntropy(node, candMinCnt);
|
514
514
|
float lbe = branchingEntropy(trieNodesBw[0].findNode(rkeys.rbegin(), rkeys.rend()), candMinCnt);
|
515
|
-
float nbe = std::sqrt(rbe * lbe) / std::log(node->val);
|
515
|
+
float nbe = std::sqrt(rbe * lbe) / (float)std::log(node->val);
|
516
516
|
if (nbe < minNBE) return;
|
517
517
|
candidates.emplace_back(npmi * nbe, rkeys);
|
518
518
|
candidates.back().cf = node->val;
|
@@ -33,7 +33,10 @@ namespace tomoto
|
|
33
33
|
friend typename BaseClass::BaseClass;
|
34
34
|
using WeightType = typename BaseClass::WeightType;
|
35
35
|
|
36
|
-
static constexpr
|
36
|
+
static constexpr auto tmid()
|
37
|
+
{
|
38
|
+
return serializer::to_key("CTM\0");
|
39
|
+
}
|
37
40
|
|
38
41
|
uint64_t numBetaSample = 10;
|
39
42
|
uint64_t numTMNSample = 5;
|
@@ -247,7 +250,7 @@ namespace tomoto
|
|
247
250
|
this->optimInterval = 2;
|
248
251
|
}
|
249
252
|
|
250
|
-
std::vector<Float>
|
253
|
+
std::vector<Float> _getTopicsByDoc(const _DocType& doc, bool normalize) const
|
251
254
|
{
|
252
255
|
std::vector<Float> ret(this->K);
|
253
256
|
Eigen::Map<Eigen::Array<Float, -1, 1>> m{ ret.data(), this->K };
|
@@ -47,7 +47,10 @@ namespace tomoto
|
|
47
47
|
friend typename BaseClass::BaseClass;
|
48
48
|
using WeightType = typename BaseClass::WeightType;
|
49
49
|
|
50
|
-
static constexpr
|
50
|
+
static constexpr auto tmid()
|
51
|
+
{
|
52
|
+
return serializer::to_key("DMR\0");
|
53
|
+
}
|
51
54
|
|
52
55
|
Matrix lambda;
|
53
56
|
mutable std::unordered_map<std::pair<uint64_t, Vector>, size_t, MdHash> mdHashMap;
|
@@ -449,7 +452,7 @@ namespace tomoto
|
|
449
452
|
optimRepeat = _optimRepeat;
|
450
453
|
}
|
451
454
|
|
452
|
-
std::vector<Float>
|
455
|
+
std::vector<Float> _getTopicsByDoc(const _DocType& doc, bool normalize) const
|
453
456
|
{
|
454
457
|
std::vector<Float> ret(this->K);
|
455
458
|
auto alphaDoc = getCachedAlpha(doc);
|
@@ -41,7 +41,10 @@ namespace tomoto
|
|
41
41
|
friend typename BaseClass::BaseClass;
|
42
42
|
using WeightType = typename BaseClass::WeightType;
|
43
43
|
|
44
|
-
static constexpr
|
44
|
+
static constexpr auto tmid()
|
45
|
+
{
|
46
|
+
return serializer::to_key("DTM\0");
|
47
|
+
}
|
45
48
|
|
46
49
|
uint64_t T;
|
47
50
|
Float shapeA = 0.03f, shapeB = 0.1f, shapeC = 0.55f;
|
@@ -54,7 +57,7 @@ namespace tomoto
|
|
54
57
|
std::vector<sample::AliasMethod<>> wordAliasTables; // Dim: (Word * Time)
|
55
58
|
|
56
59
|
template<int _inc>
|
57
|
-
inline void addWordTo(_ModelState& ld, _DocType& doc,
|
60
|
+
inline void addWordTo(_ModelState& ld, _DocType& doc, size_t pid, Vid vid, Tid tid) const
|
58
61
|
{
|
59
62
|
assert(tid < this->K);
|
60
63
|
assert(vid < this->realV);
|
@@ -168,7 +168,7 @@ namespace tomoto
|
|
168
168
|
}
|
169
169
|
|
170
170
|
template<int _inc>
|
171
|
-
inline void addWordTo(_ModelState& ld, _DocType& doc,
|
171
|
+
inline void addWordTo(_ModelState& ld, _DocType& doc, size_t pid, Vid vid, size_t tableId, Tid tid) const
|
172
172
|
{
|
173
173
|
addOnlyWordTo<_inc>(ld, doc, pid, vid, tid);
|
174
174
|
constexpr bool _dec = _inc < 0 && _tw != TermWeight::one;
|
@@ -490,7 +490,7 @@ namespace tomoto
|
|
490
490
|
THROW_ERROR_WITH_INFO(exc::Unimplemented, "HDPModel doesn't provide setWordPrior function.");
|
491
491
|
}
|
492
492
|
|
493
|
-
std::vector<Float>
|
493
|
+
std::vector<Float> _getTopicsByDoc(const _DocType& doc, bool normalize) const
|
494
494
|
{
|
495
495
|
std::vector<Float> ret(this->K);
|
496
496
|
Eigen::Map<Eigen::Array<Float, -1, 1>> m{ ret.data(), this->K };
|
@@ -522,7 +522,7 @@ namespace tomoto
|
|
522
522
|
for (size_t i = 0; i < cntIdx.size(); ++i)
|
523
523
|
{
|
524
524
|
if (i && cntIdx[i].first / sum <= topicThreshold) break;
|
525
|
-
newK[cntIdx[i].second] = i;
|
525
|
+
newK[cntIdx[i].second] = (Tid)i;
|
526
526
|
liveK++;
|
527
527
|
}
|
528
528
|
|
@@ -558,7 +558,7 @@ namespace tomoto
|
|
558
558
|
lda->docs[i].Zs[j] = non_topic_id;
|
559
559
|
continue;
|
560
560
|
}
|
561
|
-
|
561
|
+
Tid newTopic = newK[this->docs[i].numTopicByTable[this->docs[i].Zs[j]].topic];
|
562
562
|
while (newTopic == (Tid)-1) newTopic = newK[randomTopic(rng)];
|
563
563
|
lda->docs[i].Zs[j] = newTopic;
|
564
564
|
}
|