tomoto 0.4.0 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/README.md +1 -1
- data/ext/tomoto/extconf.rb +4 -2
- data/lib/tomoto/version.rb +1 -1
- data/vendor/tomotopy/README.kr.rst +10 -1
- data/vendor/tomotopy/README.rst +10 -1
- data/vendor/tomotopy/src/TopicModel/CT.h +2 -2
- data/vendor/tomotopy/src/TopicModel/CTModel.cpp +5 -0
- data/vendor/tomotopy/src/TopicModel/CTModel.hpp +1 -0
- data/vendor/tomotopy/src/TopicModel/DMR.h +2 -2
- data/vendor/tomotopy/src/TopicModel/DMRModel.cpp +5 -0
- data/vendor/tomotopy/src/TopicModel/DMRModel.hpp +1 -0
- data/vendor/tomotopy/src/TopicModel/DT.h +2 -2
- data/vendor/tomotopy/src/TopicModel/DTModel.cpp +5 -0
- data/vendor/tomotopy/src/TopicModel/DTModel.hpp +3 -0
- data/vendor/tomotopy/src/TopicModel/GDMR.h +2 -2
- data/vendor/tomotopy/src/TopicModel/GDMRModel.cpp +5 -0
- data/vendor/tomotopy/src/TopicModel/GDMRModel.hpp +1 -0
- data/vendor/tomotopy/src/TopicModel/HDP.h +2 -2
- data/vendor/tomotopy/src/TopicModel/HDPModel.cpp +5 -0
- data/vendor/tomotopy/src/TopicModel/HDPModel.hpp +2 -0
- data/vendor/tomotopy/src/TopicModel/HLDA.h +2 -2
- data/vendor/tomotopy/src/TopicModel/HLDAModel.cpp +5 -0
- data/vendor/tomotopy/src/TopicModel/HLDAModel.hpp +9 -0
- data/vendor/tomotopy/src/TopicModel/HPA.h +2 -2
- data/vendor/tomotopy/src/TopicModel/HPAModel.cpp +5 -0
- data/vendor/tomotopy/src/TopicModel/HPAModel.hpp +2 -0
- data/vendor/tomotopy/src/TopicModel/LDA.h +8 -2
- data/vendor/tomotopy/src/TopicModel/LDAModel.cpp +5 -0
- data/vendor/tomotopy/src/TopicModel/LDAModel.hpp +8 -0
- data/vendor/tomotopy/src/TopicModel/LLDA.h +2 -2
- data/vendor/tomotopy/src/TopicModel/LLDAModel.cpp +5 -0
- data/vendor/tomotopy/src/TopicModel/LLDAModel.hpp +1 -0
- data/vendor/tomotopy/src/TopicModel/MGLDA.h +2 -2
- data/vendor/tomotopy/src/TopicModel/MGLDAModel.cpp +5 -0
- data/vendor/tomotopy/src/TopicModel/MGLDAModel.hpp +1 -0
- data/vendor/tomotopy/src/TopicModel/PA.h +2 -2
- data/vendor/tomotopy/src/TopicModel/PAModel.cpp +5 -0
- data/vendor/tomotopy/src/TopicModel/PAModel.hpp +2 -0
- data/vendor/tomotopy/src/TopicModel/PLDAModel.hpp +1 -0
- data/vendor/tomotopy/src/TopicModel/PT.h +3 -3
- data/vendor/tomotopy/src/TopicModel/PTModel.cpp +5 -0
- data/vendor/tomotopy/src/TopicModel/PTModel.hpp +1 -0
- data/vendor/tomotopy/src/TopicModel/SLDA.h +3 -2
- data/vendor/tomotopy/src/TopicModel/SLDAModel.cpp +5 -0
- data/vendor/tomotopy/src/TopicModel/SLDAModel.hpp +1 -0
- data/vendor/tomotopy/src/TopicModel/TopicModel.hpp +77 -3
- data/vendor/tomotopy/src/Utils/Dictionary.cpp +102 -0
- data/vendor/tomotopy/src/Utils/Dictionary.h +26 -75
- data/vendor/tomotopy/src/Utils/Mmap.cpp +146 -0
- data/vendor/tomotopy/src/Utils/Mmap.h +139 -0
- data/vendor/tomotopy/src/Utils/MultiNormalDistribution.hpp +1 -0
- data/vendor/tomotopy/src/Utils/SharedString.cpp +134 -0
- data/vendor/tomotopy/src/Utils/SharedString.h +104 -0
- data/vendor/tomotopy/src/Utils/serializer.cpp +166 -0
- data/vendor/tomotopy/src/Utils/serializer.hpp +261 -85
- metadata +9 -4
- data/vendor/tomotopy/src/Utils/SharedString.hpp +0 -206
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 86215ec57ae6cf6e36531ee2896e2b81d591f61909eb5454ef70b69c5db0a39d
|
4
|
+
data.tar.gz: 3f31adcb38a1793caaaedc516f99c9ffce4b82ff0c93f2a169b85377e116433b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: db0a4bd9831cecae6711e150ecc2c5d23b87ada83d418f784474cf2f260627e52e40d871d9af974c6a79790e8d0d060ac08fbb69775e1d2de316085421ef76af
|
7
|
+
data.tar.gz: '07779f29aa9bdb4b71d9a0acdfe26c84d041312ad769c54539d6377b0ab01327dba045d97b51179a50498b438d137b567a3708bd3a2ab902c16adec37ad3a779'
|
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
:tomato: [tomoto](https://github.com/bab2min/tomotopy) - high performance topic modeling - for Ruby
|
4
4
|
|
5
|
-
[](https://github.com/ankane/tomoto-ruby/actions)
|
6
6
|
|
7
7
|
## Installation
|
8
8
|
|
data/ext/tomoto/extconf.rb
CHANGED
@@ -27,16 +27,18 @@ else
|
|
27
27
|
end
|
28
28
|
|
29
29
|
# silence tomoto warnings
|
30
|
-
$CXXFLAGS += " -Wno-unused-variable -Wno-switch"
|
30
|
+
$CXXFLAGS += " -Wno-unused-variable -Wno-switch -Wno-unqualified-std-cast-call"
|
31
31
|
|
32
32
|
ext = File.expand_path(".", __dir__)
|
33
33
|
tomoto = File.expand_path("../../vendor/tomotopy/src/TopicModel", __dir__)
|
34
|
+
tomoto_utils = File.expand_path("../../vendor/tomotopy/src/Utils", __dir__)
|
34
35
|
eigen = File.expand_path("../../vendor/eigen", __dir__)
|
35
36
|
eigen_rand = File.expand_path("../../vendor/EigenRand", __dir__)
|
36
37
|
variant = File.expand_path("../../vendor/variant/include", __dir__)
|
37
38
|
|
38
|
-
$srcs = Dir["{#{ext},#{tomoto}}/*.cpp"]
|
39
|
+
$srcs = Dir["{#{ext},#{tomoto},#{tomoto_utils}}/*.cpp"]
|
39
40
|
$INCFLAGS += " -I#{tomoto} -I#{eigen} -I#{eigen_rand} -I#{variant}"
|
40
41
|
$VPATH << tomoto
|
42
|
+
$VPATH << tomoto_utils
|
41
43
|
|
42
44
|
create_makefile("tomoto/tomoto")
|
data/lib/tomoto/version.rb
CHANGED
data/vendor/tomotopy/README.kr.rst
CHANGED
@@ -7,7 +7,7 @@ tomotopy
|
|
7
7
|
.. image:: https://zenodo.org/badge/186155463.svg
|
8
8
|
:target: https://zenodo.org/badge/latestdoi/186155463
|
9
9
|
|
10
|
-
|
10
|
+
🌐
|
11
11
|
`English`_,
|
12
12
|
**한국어**.
|
13
13
|
|
@@ -305,6 +305,15 @@ tomotopy의 Python3 예제 코드는 https://github.com/bab2min/tomotopy/blob/ma
|
|
305
305
|
|
306
306
|
역사
|
307
307
|
-------
|
308
|
+
* 0.13.0 (2024-08-05)
|
309
|
+
* 신규 기능
|
310
|
+
* 토픽 모델 뷰어인 `tomotopy.viewer.open_viewer()`의 주요 기능이 완성되었습니다.
|
311
|
+
* `tomotopy.LDAModel.get_hash()`가 추가되었습니다. 모델의 128bit 해시를 구해줍니다.
|
312
|
+
* `ngram_list` 인자가 `tomotopy.utils.SimpleTokenizer`에 추가되었습니다.
|
313
|
+
* Bug fixes
|
314
|
+
* `Corpus.concat_ngrams` 호출 후에 `spans`이 비일관적인 버그가 수정되었습니다.
|
315
|
+
* `tomotopy.LDAModel.load()`와 `tomotopy.LDAModel.save()`의 병목을 최적화하여 속도를 10배 이상 개선했습니다.
|
316
|
+
|
308
317
|
* 0.12.7 (2023-12-19)
|
309
318
|
* 신규 기능
|
310
319
|
* 토픽 모델 뷰어인 `tomotopy.viewer.open_viewer()`가 추가되었습니다.
|
data/vendor/tomotopy/README.rst
CHANGED
@@ -7,7 +7,7 @@ tomotopy
|
|
7
7
|
.. image:: https://zenodo.org/badge/186155463.svg
|
8
8
|
:target: https://zenodo.org/badge/latestdoi/186155463
|
9
9
|
|
10
|
-
|
10
|
+
🌐
|
11
11
|
**English**,
|
12
12
|
`한국어`_.
|
13
13
|
|
@@ -309,6 +309,15 @@ meaning you can use it for any reasonable purpose and remain in complete ownersh
|
|
309
309
|
|
310
310
|
History
|
311
311
|
-------
|
312
|
+
* 0.13.0 (2024-08-05)
|
313
|
+
* New features
|
314
|
+
* Major features of Topic Model Viewer `tomotopy.viewer.open_viewer()` are ready now.
|
315
|
+
* `tomotopy.LDAModel.get_hash()` is added. You can get 128bit hash value of the model.
|
316
|
+
* Add an argument `ngram_list` to `tomotopy.utils.SimpleTokenizer`.
|
317
|
+
* Bug fixes
|
318
|
+
* Fixed inconsistent `spans` bug after `Corpus.concat_ngrams` is called.
|
319
|
+
* Optimized the bottleneck of `tomotopy.LDAModel.load()` and `tomotopy.LDAModel.save()` and improved its speed more than 10 times.
|
320
|
+
|
312
321
|
* 0.12.7 (2023-12-19)
|
313
322
|
* New features
|
314
323
|
* Added Topic Model Viewer `tomotopy.viewer.open_viewer()`
|
data/vendor/tomotopy/src/TopicModel/CT.h
CHANGED
@@ -11,8 +11,8 @@ namespace tomoto
|
|
11
11
|
Matrix beta; // Dim: (K, betaSample)
|
12
12
|
Vector smBeta; // Dim: K
|
13
13
|
|
14
|
-
|
15
|
-
|
14
|
+
DECLARE_SERIALIZER_WITH_VERSION(0);
|
15
|
+
DECLARE_SERIALIZER_WITH_VERSION(1);
|
16
16
|
};
|
17
17
|
|
18
18
|
struct CTArgs : public LDAArgs
|
data/vendor/tomotopy/src/TopicModel/CTModel.cpp
CHANGED
@@ -2,6 +2,11 @@
|
|
2
2
|
|
3
3
|
namespace tomoto
|
4
4
|
{
|
5
|
+
DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentCTM, BaseDocument, 0, smBeta);
|
6
|
+
DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentCTM, BaseDocument, 1, 0x00010001, smBeta);
|
7
|
+
|
8
|
+
TMT_INSTANTIATE_DOC(DocumentCTM);
|
9
|
+
|
5
10
|
ICTModel* ICTModel::create(TermWeight _weight, const CTArgs& args, bool scalarRng)
|
6
11
|
{
|
7
12
|
TMT_SWITCH_TW(_weight, scalarRng, CTModel, args);
|
data/vendor/tomotopy/src/TopicModel/CTModel.hpp
CHANGED
@@ -243,6 +243,7 @@ namespace tomoto
|
|
243
243
|
public:
|
244
244
|
DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 0, numBetaSample, numTMNSample, topicPrior);
|
245
245
|
DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, numBetaSample, numTMNSample, topicPrior);
|
246
|
+
DEFINE_HASHER_AFTER_BASE(BaseClass, numBetaSample, numTMNSample, topicPrior);
|
246
247
|
|
247
248
|
CTModel(const CTArgs& args)
|
248
249
|
: BaseClass(args)
|
data/vendor/tomotopy/src/TopicModel/DMR.h
CHANGED
@@ -18,8 +18,8 @@ namespace tomoto
|
|
18
18
|
|
19
19
|
RawDoc::MiscType makeMisc(const ITopicModel* tm) const override;
|
20
20
|
|
21
|
-
|
22
|
-
|
21
|
+
DECLARE_SERIALIZER_WITH_VERSION(0);
|
22
|
+
DECLARE_SERIALIZER_WITH_VERSION(1);
|
23
23
|
};
|
24
24
|
|
25
25
|
struct DMRArgs : public LDAArgs
|
data/vendor/tomotopy/src/TopicModel/DMRModel.cpp
CHANGED
@@ -2,6 +2,11 @@
|
|
2
2
|
|
3
3
|
namespace tomoto
|
4
4
|
{
|
5
|
+
DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentDMR, BaseDocument, 0, metadata);
|
6
|
+
DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentDMR, BaseDocument, 1, 0x00010001, metadata, multiMetadata);
|
7
|
+
|
8
|
+
TMT_INSTANTIATE_DOC(DocumentDMR);
|
9
|
+
|
5
10
|
IDMRModel* IDMRModel::create(TermWeight _weight, const DMRArgs& args, bool scalarRng)
|
6
11
|
{
|
7
12
|
TMT_SWITCH_TW(_weight, scalarRng, DMRModel, args);
|
data/vendor/tomotopy/src/TopicModel/DMRModel.hpp
CHANGED
@@ -364,6 +364,7 @@ namespace tomoto
|
|
364
364
|
public:
|
365
365
|
DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 0, sigma, alphaEps, metadataDict, lambda);
|
366
366
|
DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, sigma, alphaEps, metadataDict, lambda, multiMetadataDict);
|
367
|
+
DEFINE_HASHER_AFTER_BASE(BaseClass, sigma, alphaEps, metadataDict, lambda, multiMetadataDict);
|
367
368
|
|
368
369
|
DMRModel(const DMRArgs& args)
|
369
370
|
: BaseClass(args), sigma(args.sigma), alphaEps(args.alphaEps)
|
data/vendor/tomotopy/src/TopicModel/DT.h
CHANGED
@@ -21,8 +21,8 @@ namespace tomoto
|
|
21
21
|
return ret;
|
22
22
|
}
|
23
23
|
|
24
|
-
|
25
|
-
|
24
|
+
DECLARE_SERIALIZER_WITH_VERSION(0);
|
25
|
+
DECLARE_SERIALIZER_WITH_VERSION(1);
|
26
26
|
};
|
27
27
|
|
28
28
|
struct DTArgs : public LDAArgs
|
data/vendor/tomotopy/src/TopicModel/DTModel.cpp
CHANGED
@@ -2,6 +2,11 @@
|
|
2
2
|
|
3
3
|
namespace tomoto
|
4
4
|
{
|
5
|
+
DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentDTM, BaseDocument, 0, timepoint);
|
6
|
+
DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentDTM, BaseDocument, 1, 0x00010001, timepoint);
|
7
|
+
|
8
|
+
TMT_INSTANTIATE_DOC(DocumentDTM);
|
9
|
+
|
5
10
|
IDTModel* IDTModel::create(TermWeight _weight, const DTArgs& args, bool scalarRng)
|
6
11
|
{
|
7
12
|
TMT_SWITCH_TW(_weight, scalarRng, DTModel, args);
|
data/vendor/tomotopy/src/TopicModel/DTModel.hpp
CHANGED
@@ -22,6 +22,7 @@ namespace tomoto
|
|
22
22
|
Eigen::Matrix<WeightType, -1, -1> numByTopicWord; // Dim: (Topic * Time, Vocabs)
|
23
23
|
//ShareableMatrix<WeightType, -1, -1> numByTopicWord; // Dim: (Topic * Time, Vocabs)
|
24
24
|
DEFINE_SERIALIZER(numByTopic, numByTopicWord);
|
25
|
+
DEFINE_HASHER(numByTopic, numByTopicWord);
|
25
26
|
};
|
26
27
|
|
27
28
|
template<TermWeight _tw, typename _RandGen,
|
@@ -496,6 +497,8 @@ namespace tomoto
|
|
496
497
|
T, shapeA, shapeB, shapeC, alphaVar, etaVar, phiVar, alphas, etaByDoc, phi);
|
497
498
|
DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001,
|
498
499
|
T, shapeA, shapeB, shapeC, alphaVar, etaVar, phiVar, alphas, etaByDoc, phi);
|
500
|
+
DEFINE_HASHER_AFTER_BASE(BaseClass,
|
501
|
+
T, shapeA, shapeB, shapeC, alphaVar, etaVar, phiVar, alphas, etaByDoc, phi);
|
499
502
|
|
500
503
|
GETTER(T, size_t, T);
|
501
504
|
GETTER(NumDocsByT, std::vector<uint32_t>, numDocsByTime);
|
data/vendor/tomotopy/src/TopicModel/GDMR.h
CHANGED
@@ -17,8 +17,8 @@ namespace tomoto
|
|
17
17
|
return ret;
|
18
18
|
}
|
19
19
|
|
20
|
-
|
21
|
-
|
20
|
+
DECLARE_SERIALIZER_WITH_VERSION(0);
|
21
|
+
DECLARE_SERIALIZER_WITH_VERSION(1);
|
22
22
|
};
|
23
23
|
|
24
24
|
struct GDMRArgs : public DMRArgs
|
data/vendor/tomotopy/src/TopicModel/GDMRModel.cpp
CHANGED
@@ -2,6 +2,11 @@
|
|
2
2
|
|
3
3
|
namespace tomoto
|
4
4
|
{
|
5
|
+
DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentGDMR, BaseDocument, 0, metadataOrg);
|
6
|
+
DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentGDMR, BaseDocument, 1, 0x00010001, metadataOrg, metadataNormalized);
|
7
|
+
|
8
|
+
TMT_INSTANTIATE_DOC(DocumentGDMR);
|
9
|
+
|
5
10
|
IGDMRModel* IGDMRModel::create(TermWeight _weight, const GDMRArgs& args, bool scalarRng)
|
6
11
|
{
|
7
12
|
TMT_SWITCH_TW(_weight, scalarRng, GDMRModel, args);
|
data/vendor/tomotopy/src/TopicModel/GDMRModel.hpp
CHANGED
@@ -412,6 +412,7 @@ namespace tomoto
|
|
412
412
|
public:
|
413
413
|
DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 0, sigma0, degreeByF, mdCoefs, mdIntercepts);
|
414
414
|
DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, sigma0, orderDecay, degreeByF, mdCoefs, mdIntercepts, mdMax);
|
415
|
+
DEFINE_HASHER_AFTER_BASE(BaseClass, sigma0, orderDecay, degreeByF, mdCoefs, mdIntercepts, mdMax);
|
415
416
|
|
416
417
|
GDMRModel(const GDMRArgs& args)
|
417
418
|
: BaseClass(args), sigma0(args.sigma0), orderDecay(args.orderDecay), degreeByF(args.degrees)
|
data/vendor/tomotopy/src/TopicModel/HDP.h
CHANGED
@@ -39,8 +39,8 @@ namespace tomoto
|
|
39
39
|
};
|
40
40
|
std::vector<TableTopicInfo> numTopicByTable;
|
41
41
|
|
42
|
-
|
43
|
-
|
42
|
+
DECLARE_SERIALIZER_WITH_VERSION(0);
|
43
|
+
DECLARE_SERIALIZER_WITH_VERSION(1);
|
44
44
|
|
45
45
|
size_t getNumTable() const
|
46
46
|
{
|
data/vendor/tomotopy/src/TopicModel/HDPModel.cpp
CHANGED
@@ -2,6 +2,11 @@
|
|
2
2
|
|
3
3
|
namespace tomoto
|
4
4
|
{
|
5
|
+
DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentHDP, BaseDocument, 0, numTopicByTable);
|
6
|
+
DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentHDP, BaseDocument, 1, 0x00010001, numTopicByTable);
|
7
|
+
|
8
|
+
TMT_INSTANTIATE_DOC(DocumentHDP);
|
9
|
+
|
5
10
|
IHDPModel* IHDPModel::create(TermWeight _weight, const HDPArgs& args, bool scalarRng)
|
6
11
|
{
|
7
12
|
TMT_SWITCH_TW(_weight, scalarRng, HDPModel, args);
|
data/vendor/tomotopy/src/TopicModel/HDPModel.hpp
CHANGED
@@ -19,6 +19,7 @@ namespace tomoto
|
|
19
19
|
size_t totalTable = 0;
|
20
20
|
|
21
21
|
DEFINE_SERIALIZER_AFTER_BASE(ModelStateLDA<_tw>, numTableByTopic, totalTable);
|
22
|
+
DEFINE_HASHER_AFTER_BASE(ModelStateLDA<_tw>, numTableByTopic, totalTable);
|
22
23
|
};
|
23
24
|
|
24
25
|
template<TermWeight _tw, typename _RandGen,
|
@@ -457,6 +458,7 @@ namespace tomoto
|
|
457
458
|
public:
|
458
459
|
DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 0, gamma);
|
459
460
|
DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, gamma);
|
461
|
+
DEFINE_HASHER_AFTER_BASE(BaseClass, gamma);
|
460
462
|
|
461
463
|
HDPModel(const HDPArgs& args)
|
462
464
|
: BaseClass(args), gamma(args.gamma)
|
data/vendor/tomotopy/src/TopicModel/HLDA.h
CHANGED
@@ -16,8 +16,8 @@ namespace tomoto
|
|
16
16
|
|
17
17
|
template<typename _TopicModel> void update(WeightType* ptr, const _TopicModel& mdl);
|
18
18
|
|
19
|
-
|
20
|
-
|
19
|
+
DECLARE_SERIALIZER_WITH_VERSION(0);
|
20
|
+
DECLARE_SERIALIZER_WITH_VERSION(1);
|
21
21
|
};
|
22
22
|
|
23
23
|
struct HLDAArgs : public LDAArgs
|
data/vendor/tomotopy/src/TopicModel/HLDAModel.cpp
CHANGED
@@ -2,6 +2,11 @@
|
|
2
2
|
|
3
3
|
namespace tomoto
|
4
4
|
{
|
5
|
+
DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentHLDA, BaseDocument, 0, path);
|
6
|
+
DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentHLDA, BaseDocument, 1, 0x00010001, path);
|
7
|
+
|
8
|
+
TMT_INSTANTIATE_DOC(DocumentHLDA);
|
9
|
+
|
5
10
|
IHLDAModel* IHLDAModel::create(TermWeight _weight, const HLDAArgs& args, bool scalarRng)
|
6
11
|
{
|
7
12
|
TMT_SWITCH_TW(_weight, scalarRng, HLDAModel, args);
|
data/vendor/tomotopy/src/TopicModel/HLDAModel.hpp
CHANGED
@@ -18,6 +18,7 @@ namespace tomoto
|
|
18
18
|
int32_t parent = 0, sibling = 0, child = 0;
|
19
19
|
|
20
20
|
DEFINE_SERIALIZER(numCustomers, level, parent, sibling, child);
|
21
|
+
DEFINE_HASHER(numCustomers, level, parent, sibling, child);
|
21
22
|
|
22
23
|
NCRPNode* getParent() const
|
23
24
|
{
|
@@ -118,6 +119,7 @@ namespace tomoto
|
|
118
119
|
Vector nodeWLikelihoods; //
|
119
120
|
|
120
121
|
DEFINE_SERIALIZER(nodes, levelBlocks);
|
122
|
+
DEFINE_HASHER(nodes, levelBlocks);
|
121
123
|
|
122
124
|
template<bool _makeNewPath = true>
|
123
125
|
void calcNodeLikelihood(Float gamma, size_t levelDepth)
|
@@ -317,6 +319,12 @@ namespace tomoto
|
|
317
319
|
ModelStateLDA<_tw>::serializerWrite(ostr);
|
318
320
|
nt->serializerWrite(ostr);
|
319
321
|
}
|
322
|
+
|
323
|
+
uint64_t computeHash(uint64_t seed) const
|
324
|
+
{
|
325
|
+
seed = ModelStateLDA<_tw>::computeHash(seed);
|
326
|
+
return nt->computeHash(seed);
|
327
|
+
}
|
320
328
|
};
|
321
329
|
|
322
330
|
template<TermWeight _tw, typename _RandGen,
|
@@ -596,6 +604,7 @@ namespace tomoto
|
|
596
604
|
public:
|
597
605
|
DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 0, gamma);
|
598
606
|
DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, gamma);
|
607
|
+
DEFINE_HASHER_AFTER_BASE(BaseClass, gamma);
|
599
608
|
|
600
609
|
HLDAModel(const HLDAArgs& args)
|
601
610
|
: BaseClass(args), gamma(args.gamma)
|
data/vendor/tomotopy/src/TopicModel/HPA.h
CHANGED
@@ -12,8 +12,8 @@ namespace tomoto
|
|
12
12
|
|
13
13
|
template<typename _TopicModel> void update(WeightType* ptr, const _TopicModel& mdl);
|
14
14
|
|
15
|
-
|
16
|
-
|
15
|
+
DECLARE_SERIALIZER_WITH_VERSION(0);
|
16
|
+
DECLARE_SERIALIZER_WITH_VERSION(1);
|
17
17
|
};
|
18
18
|
|
19
19
|
struct HPAArgs : public PAArgs
|
data/vendor/tomotopy/src/TopicModel/HPAModel.cpp
CHANGED
@@ -2,6 +2,11 @@
|
|
2
2
|
|
3
3
|
namespace tomoto
|
4
4
|
{
|
5
|
+
DEFINE_OUT_SERIALIZER_BASE_WITH_VERSION(DocumentHPA, BaseDocument, 0);
|
6
|
+
DEFINE_OUT_SERIALIZER_BASE_WITH_VERSION(DocumentHPA, BaseDocument, 1);
|
7
|
+
|
8
|
+
TMT_INSTANTIATE_DOC(DocumentHPA);
|
9
|
+
|
5
10
|
IHPAModel* IHPAModel::create(TermWeight _weight, bool _exclusive, const HPAArgs& args, bool scalarRng)
|
6
11
|
{
|
7
12
|
if (_exclusive)
|
data/vendor/tomotopy/src/TopicModel/HPAModel.hpp
CHANGED
@@ -21,6 +21,7 @@ namespace tomoto
|
|
21
21
|
Eigen::Matrix<WeightType, -1, -1> numByTopic1_2;
|
22
22
|
|
23
23
|
DEFINE_SERIALIZER_AFTER_BASE(ModelStateLDA<_tw>, numByTopicWord, numByTopic, numByTopic1_2);
|
24
|
+
DEFINE_HASHER_AFTER_BASE(ModelStateLDA<_tw>, numByTopicWord, numByTopic, numByTopic1_2);
|
24
25
|
};
|
25
26
|
|
26
27
|
template<TermWeight _tw, typename _RandGen,
|
@@ -439,6 +440,7 @@ namespace tomoto
|
|
439
440
|
public:
|
440
441
|
DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 0, K2, subAlphas, subAlphaSum);
|
441
442
|
DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, K2, subAlphas, subAlphaSum);
|
443
|
+
DEFINE_HASHER_AFTER_BASE(BaseClass, K2, subAlphas, subAlphaSum);
|
442
444
|
|
443
445
|
HPAModel(const HPAArgs& args)
|
444
446
|
: BaseClass(args, false), K2(args.k2)
|
data/vendor/tomotopy/src/TopicModel/LDA.h
CHANGED
@@ -93,6 +93,12 @@ namespace tomoto
|
|
93
93
|
if (!ostr.write((const char*)this->data(), sizeof(_Scalar) * this->size()))
|
94
94
|
throw std::ios_base::failure(std::string("writing type '") + typeid(_Scalar).name() + std::string("' is failed"));
|
95
95
|
}
|
96
|
+
|
97
|
+
uint64_t computeHash(uint64_t seed) const
|
98
|
+
{
|
99
|
+
seed = serializer::computeHashMany(seed, (uint32_t)this->rows(), (uint32_t)this->cols());
|
100
|
+
return serializer::computeFastHash(this->data(), sizeof(_Scalar) * this->size(), seed);
|
101
|
+
}
|
96
102
|
};
|
97
103
|
|
98
104
|
template<typename _Base, TermWeight _tw>
|
@@ -139,8 +145,8 @@ namespace tomoto
|
|
139
145
|
tvector<Float> wordWeights;
|
140
146
|
ShareableMatrix<WeightType, -1, 1> numByTopic;
|
141
147
|
|
142
|
-
|
143
|
-
|
148
|
+
DECLARE_SERIALIZER_WITH_VERSION(0);
|
149
|
+
DECLARE_SERIALIZER_WITH_VERSION(1);
|
144
150
|
|
145
151
|
template<typename _TopicModel> void update(WeightType* ptr, const _TopicModel& mdl);
|
146
152
|
|
data/vendor/tomotopy/src/TopicModel/LDAModel.cpp
CHANGED
@@ -2,6 +2,11 @@
|
|
2
2
|
|
3
3
|
namespace tomoto
|
4
4
|
{
|
5
|
+
DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentLDA, DocumentBase, 0, Zs, wordWeights);
|
6
|
+
DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentLDA, DocumentBase, 1, 0x00010001, Zs, wordWeights);
|
7
|
+
|
8
|
+
TMT_INSTANTIATE_DOC(DocumentLDA);
|
9
|
+
|
5
10
|
ILDAModel* ILDAModel::create(TermWeight _weight, const LDAArgs& args, bool scalarRng)
|
6
11
|
{
|
7
12
|
TMT_SWITCH_TW(_weight, scalarRng, LDAModel, args);
|
data/vendor/tomotopy/src/TopicModel/LDAModel.hpp
CHANGED
@@ -47,6 +47,10 @@ Term Weighting Scheme is based on following paper:
|
|
47
47
|
return nullptr; } while(0)
|
48
48
|
#endif
|
49
49
|
|
50
|
+
#define TMT_INSTANTIATE_DOC(CLS) template struct CLS<TermWeight::one>; \
|
51
|
+
template struct CLS<TermWeight::idf>; \
|
52
|
+
template struct CLS<TermWeight::pmi>;
|
53
|
+
|
50
54
|
#define GETTER(name, type, field) type get##name() const override { return field; }
|
51
55
|
|
52
56
|
namespace tomoto
|
@@ -61,6 +65,7 @@ namespace tomoto
|
|
61
65
|
//Eigen::Matrix<WeightType, -1, -1> numByTopicWord; // Dim: (Topic, Vocabs)
|
62
66
|
ShareableMatrix<WeightType, -1, -1> numByTopicWord; // Dim: (Topic, Vocabs)
|
63
67
|
DEFINE_SERIALIZER(numByTopic, numByTopicWord);
|
68
|
+
DEFINE_HASHER(numByTopic, numByTopicWord);
|
64
69
|
};
|
65
70
|
|
66
71
|
namespace flags
|
@@ -954,6 +959,8 @@ namespace tomoto
|
|
954
959
|
DEFINE_TAGGED_SERIALIZER_WITH_VERSION(1, 0x00010001, vocabWeights, alpha, alphas, eta, K, etaByWord,
|
955
960
|
burnIn, optimInterval);
|
956
961
|
|
962
|
+
DEFINE_HASHER(vocabWeights, alpha, alphas, eta, K, /*etaByWord,*/ burnIn, optimInterval);
|
963
|
+
|
957
964
|
LDAModel(const LDAArgs& args, bool checkAlpha = true)
|
958
965
|
: BaseClass(args.seed), K(args.k), alpha(args.alpha[0]), eta(args.eta)
|
959
966
|
{
|
@@ -1066,6 +1073,7 @@ namespace tomoto
|
|
1066
1073
|
void prepare(bool initDocs = true, size_t minWordCnt = 0, size_t minWordDf = 0, size_t removeTopN = 0, bool updateStopwords = true) override
|
1067
1074
|
{
|
1068
1075
|
if (initDocs && updateStopwords) this->removeStopwords(minWordCnt, minWordDf, removeTopN);
|
1076
|
+
static_cast<DerivedClass*>(this)->updateWordFormCnts();
|
1069
1077
|
static_cast<DerivedClass*>(this)->updateWeakArray();
|
1070
1078
|
static_cast<DerivedClass*>(this)->initGlobalState(initDocs);
|
1071
1079
|
static_cast<DerivedClass*>(this)->prepareWordPriors();
|
data/vendor/tomotopy/src/TopicModel/LLDA.h
CHANGED
@@ -11,8 +11,8 @@ namespace tomoto
|
|
11
11
|
using WeightType = typename DocumentLDA<_tw>::WeightType;
|
12
12
|
Eigen::Matrix<int8_t, -1, 1> labelMask;
|
13
13
|
|
14
|
-
|
15
|
-
|
14
|
+
DECLARE_SERIALIZER_WITH_VERSION(0);
|
15
|
+
DECLARE_SERIALIZER_WITH_VERSION(1);
|
16
16
|
};
|
17
17
|
|
18
18
|
class ILLDAModel : public ILDAModel
|
data/vendor/tomotopy/src/TopicModel/LLDAModel.cpp
CHANGED
@@ -2,6 +2,11 @@
|
|
2
2
|
|
3
3
|
namespace tomoto
|
4
4
|
{
|
5
|
+
DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentLLDA, BaseDocument, 0, labelMask);
|
6
|
+
DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentLLDA, BaseDocument, 1, 0x00010001, labelMask);
|
7
|
+
|
8
|
+
TMT_INSTANTIATE_DOC(DocumentLLDA);
|
9
|
+
|
5
10
|
ILLDAModel* ILLDAModel::create(TermWeight _weight, const LDAArgs& args, bool scalarRng)
|
6
11
|
{
|
7
12
|
TMT_SWITCH_TW(_weight, scalarRng, LLDAModel, args);
|
data/vendor/tomotopy/src/TopicModel/LLDAModel.hpp
CHANGED
@@ -107,6 +107,7 @@ namespace tomoto
|
|
107
107
|
public:
|
108
108
|
DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 0, topicLabelDict);
|
109
109
|
DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, topicLabelDict);
|
110
|
+
DEFINE_HASHER_AFTER_BASE(BaseClass, topicLabelDict);
|
110
111
|
|
111
112
|
LLDAModel(const LDAArgs& args)
|
112
113
|
: BaseClass(args)
|
data/vendor/tomotopy/src/TopicModel/MGLDA.h
CHANGED
@@ -22,8 +22,8 @@ namespace tomoto
|
|
22
22
|
Eigen::Matrix<WeightType, -1, 1> numByWin; // number of words in the window (len = S + T - 1)
|
23
23
|
Eigen::Matrix<WeightType, -1, -1> numByWinTopicL; // number of words in the loc. topic in the window (len = KL * (S + T - 1))
|
24
24
|
|
25
|
-
|
26
|
-
|
25
|
+
DECLARE_SERIALIZER_WITH_VERSION(0);
|
26
|
+
DECLARE_SERIALIZER_WITH_VERSION(1);
|
27
27
|
|
28
28
|
template<typename _TopicModel> void update(WeightType* ptr, const _TopicModel& mdl);
|
29
29
|
};
|
data/vendor/tomotopy/src/TopicModel/MGLDAModel.cpp
CHANGED
@@ -2,6 +2,11 @@
|
|
2
2
|
|
3
3
|
namespace tomoto
|
4
4
|
{
|
5
|
+
DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentMGLDA, BaseDocument, 0, sents, Vs, numGl, numBySentWin, numByWinL, numByWin, numByWinTopicL);
|
6
|
+
DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentMGLDA, BaseDocument, 1, 0x00010001, sents, Vs, numGl, numBySentWin, numByWinL, numByWin, numByWinTopicL);
|
7
|
+
|
8
|
+
TMT_INSTANTIATE_DOC(DocumentMGLDA);
|
9
|
+
|
5
10
|
IMGLDAModel* IMGLDAModel::create(TermWeight _weight, const MGLDAArgs& args, bool scalarRng)
|
6
11
|
{
|
7
12
|
TMT_SWITCH_TW(_weight, scalarRng, MGLDAModel, args);
|
data/vendor/tomotopy/src/TopicModel/MGLDAModel.hpp
CHANGED
@@ -370,6 +370,7 @@ namespace tomoto
|
|
370
370
|
public:
|
371
371
|
DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 0, alphaL, alphaM, alphaML, etaL, gamma, KL, T);
|
372
372
|
DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, alphaL, alphaM, alphaML, etaL, gamma, KL, T);
|
373
|
+
DEFINE_HASHER_AFTER_BASE(BaseClass, alphaL, alphaM, alphaML, etaL, gamma, KL, T);
|
373
374
|
|
374
375
|
MGLDAModel(const MGLDAArgs& args)
|
375
376
|
: BaseClass(args), KL(args.kL), T(args.t),
|
data/vendor/tomotopy/src/TopicModel/PA.h
CHANGED
@@ -15,8 +15,8 @@ namespace tomoto
|
|
15
15
|
|
16
16
|
template<typename _TopicModel> void update(WeightType* ptr, const _TopicModel& mdl);
|
17
17
|
|
18
|
-
|
19
|
-
|
18
|
+
DECLARE_SERIALIZER_WITH_VERSION(0);
|
19
|
+
DECLARE_SERIALIZER_WITH_VERSION(1);
|
20
20
|
};
|
21
21
|
|
22
22
|
struct PAArgs : public LDAArgs
|
data/vendor/tomotopy/src/TopicModel/PAModel.cpp
CHANGED
@@ -2,6 +2,11 @@
|
|
2
2
|
|
3
3
|
namespace tomoto
|
4
4
|
{
|
5
|
+
DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentPA, BaseDocument, 0, Z2s);
|
6
|
+
DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentPA, BaseDocument, 1, 0x00010001, Z2s);
|
7
|
+
|
8
|
+
TMT_INSTANTIATE_DOC(DocumentPA);
|
9
|
+
|
5
10
|
IPAModel* IPAModel::create(TermWeight _weight, const PAArgs& args, bool scalarRng)
|
6
11
|
{
|
7
12
|
TMT_SWITCH_TW(_weight, scalarRng, PAModel, args);
|
data/vendor/tomotopy/src/TopicModel/PAModel.hpp
CHANGED
@@ -19,6 +19,7 @@ namespace tomoto
|
|
19
19
|
Vector subTmp;
|
20
20
|
|
21
21
|
DEFINE_SERIALIZER_AFTER_BASE(ModelStateLDA<_tw>, numByTopic1_2, numByTopic2);
|
22
|
+
DEFINE_HASHER_AFTER_BASE(ModelStateLDA<_tw>, numByTopic1_2, numByTopic2);
|
22
23
|
};
|
23
24
|
|
24
25
|
template<TermWeight _tw, typename _RandGen,
|
@@ -364,6 +365,7 @@ namespace tomoto
|
|
364
365
|
public:
|
365
366
|
DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 0, K2, subAlphas, subAlphaSum);
|
366
367
|
DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, K2, subAlphas, subAlphaSum);
|
368
|
+
DEFINE_HASHER_AFTER_BASE(BaseClass, K2, subAlphas, subAlphaSum);
|
367
369
|
|
368
370
|
PAModel(const PAArgs& args)
|
369
371
|
: BaseClass(args), K2(args.k2)
|
data/vendor/tomotopy/src/TopicModel/PLDAModel.hpp
CHANGED
@@ -111,6 +111,7 @@ namespace tomoto
|
|
111
111
|
public:
|
112
112
|
DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 0, topicLabelDict, numLatentTopics, numTopicsPerLabel);
|
113
113
|
DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, topicLabelDict, numLatentTopics, numTopicsPerLabel);
|
114
|
+
DEFINE_HASHER_AFTER_BASE(BaseClass, topicLabelDict, numLatentTopics, numTopicsPerLabel);
|
114
115
|
|
115
116
|
PLDAModel(const PLDAArgs& args)
|
116
117
|
: BaseClass(args.setK(1)),
|
data/vendor/tomotopy/src/TopicModel/PT.h
CHANGED
@@ -11,9 +11,9 @@ namespace tomoto
|
|
11
11
|
using WeightType = typename DocumentLDA<_tw>::WeightType;
|
12
12
|
|
13
13
|
uint64_t pseudoDoc = 0;
|
14
|
-
|
15
|
-
|
16
|
-
|
14
|
+
|
15
|
+
DECLARE_SERIALIZER_WITH_VERSION(0);
|
16
|
+
DECLARE_SERIALIZER_WITH_VERSION(1);
|
17
17
|
};
|
18
18
|
|
19
19
|
struct PTArgs : public LDAArgs
|
data/vendor/tomotopy/src/TopicModel/PTModel.cpp
CHANGED
@@ -2,6 +2,11 @@
|
|
2
2
|
|
3
3
|
namespace tomoto
|
4
4
|
{
|
5
|
+
DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentPT, BaseDocument, 0, pseudoDoc);
|
6
|
+
DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentPT, BaseDocument, 1, 0x00010001, pseudoDoc);
|
7
|
+
|
8
|
+
TMT_INSTANTIATE_DOC(DocumentPT);
|
9
|
+
|
5
10
|
IPTModel* IPTModel::create(TermWeight _weight, const PTArgs& args, bool scalarRng)
|
6
11
|
{
|
7
12
|
TMT_SWITCH_TW(_weight, scalarRng, PTModel, args);
|
data/vendor/tomotopy/src/TopicModel/PTModel.hpp
CHANGED
@@ -266,6 +266,7 @@ namespace tomoto
|
|
266
266
|
public:
|
267
267
|
DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 0, numPDocs, lambda);
|
268
268
|
DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, numPDocs, lambda);
|
269
|
+
DEFINE_HASHER_AFTER_BASE(BaseClass, numPDocs, lambda);
|
269
270
|
|
270
271
|
GETTER(P, size_t, numPDocs);
|
271
272
|
|