tomoto 0.3.3 → 0.4.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -0
- data/README.md +1 -1
- data/ext/tomoto/extconf.rb +4 -2
- data/lib/tomoto/version.rb +1 -1
- data/lib/tomoto.rb +14 -14
- data/vendor/tomotopy/README.kr.rst +27 -1
- data/vendor/tomotopy/README.rst +27 -1
- data/vendor/tomotopy/src/TopicModel/CT.h +2 -2
- data/vendor/tomotopy/src/TopicModel/CTModel.cpp +5 -0
- data/vendor/tomotopy/src/TopicModel/CTModel.hpp +1 -0
- data/vendor/tomotopy/src/TopicModel/DMR.h +2 -2
- data/vendor/tomotopy/src/TopicModel/DMRModel.cpp +5 -0
- data/vendor/tomotopy/src/TopicModel/DMRModel.hpp +1 -0
- data/vendor/tomotopy/src/TopicModel/DT.h +2 -2
- data/vendor/tomotopy/src/TopicModel/DTModel.cpp +5 -0
- data/vendor/tomotopy/src/TopicModel/DTModel.hpp +4 -0
- data/vendor/tomotopy/src/TopicModel/GDMR.h +2 -2
- data/vendor/tomotopy/src/TopicModel/GDMRModel.cpp +5 -0
- data/vendor/tomotopy/src/TopicModel/GDMRModel.hpp +1 -0
- data/vendor/tomotopy/src/TopicModel/HDP.h +2 -2
- data/vendor/tomotopy/src/TopicModel/HDPModel.cpp +5 -0
- data/vendor/tomotopy/src/TopicModel/HDPModel.hpp +2 -0
- data/vendor/tomotopy/src/TopicModel/HLDA.h +2 -2
- data/vendor/tomotopy/src/TopicModel/HLDAModel.cpp +5 -0
- data/vendor/tomotopy/src/TopicModel/HLDAModel.hpp +9 -0
- data/vendor/tomotopy/src/TopicModel/HPA.h +2 -2
- data/vendor/tomotopy/src/TopicModel/HPAModel.cpp +5 -0
- data/vendor/tomotopy/src/TopicModel/HPAModel.hpp +2 -0
- data/vendor/tomotopy/src/TopicModel/LDA.h +8 -2
- data/vendor/tomotopy/src/TopicModel/LDAModel.cpp +5 -0
- data/vendor/tomotopy/src/TopicModel/LDAModel.hpp +8 -0
- data/vendor/tomotopy/src/TopicModel/LLDA.h +2 -2
- data/vendor/tomotopy/src/TopicModel/LLDAModel.cpp +5 -0
- data/vendor/tomotopy/src/TopicModel/LLDAModel.hpp +1 -0
- data/vendor/tomotopy/src/TopicModel/MGLDA.h +2 -2
- data/vendor/tomotopy/src/TopicModel/MGLDAModel.cpp +5 -0
- data/vendor/tomotopy/src/TopicModel/MGLDAModel.hpp +7 -1
- data/vendor/tomotopy/src/TopicModel/PA.h +2 -2
- data/vendor/tomotopy/src/TopicModel/PAModel.cpp +5 -0
- data/vendor/tomotopy/src/TopicModel/PAModel.hpp +7 -0
- data/vendor/tomotopy/src/TopicModel/PLDAModel.hpp +1 -0
- data/vendor/tomotopy/src/TopicModel/PT.h +3 -3
- data/vendor/tomotopy/src/TopicModel/PTModel.cpp +5 -0
- data/vendor/tomotopy/src/TopicModel/PTModel.hpp +1 -0
- data/vendor/tomotopy/src/TopicModel/SLDA.h +3 -2
- data/vendor/tomotopy/src/TopicModel/SLDAModel.cpp +5 -0
- data/vendor/tomotopy/src/TopicModel/SLDAModel.hpp +1 -0
- data/vendor/tomotopy/src/TopicModel/TopicModel.hpp +83 -3
- data/vendor/tomotopy/src/Utils/Dictionary.cpp +102 -0
- data/vendor/tomotopy/src/Utils/Dictionary.h +26 -75
- data/vendor/tomotopy/src/Utils/EigenAddonOps.hpp +1 -1
- data/vendor/tomotopy/src/Utils/Mmap.cpp +146 -0
- data/vendor/tomotopy/src/Utils/Mmap.h +139 -0
- data/vendor/tomotopy/src/Utils/MultiNormalDistribution.hpp +1 -0
- data/vendor/tomotopy/src/Utils/SharedString.cpp +134 -0
- data/vendor/tomotopy/src/Utils/SharedString.h +104 -0
- data/vendor/tomotopy/src/Utils/serializer.cpp +166 -0
- data/vendor/tomotopy/src/Utils/serializer.hpp +261 -85
- metadata +12 -7
- data/vendor/tomotopy/src/Utils/SharedString.hpp +0 -206
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 86215ec57ae6cf6e36531ee2896e2b81d591f61909eb5454ef70b69c5db0a39d
|
4
|
+
data.tar.gz: 3f31adcb38a1793caaaedc516f99c9ffce4b82ff0c93f2a169b85377e116433b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: db0a4bd9831cecae6711e150ecc2c5d23b87ada83d418f784474cf2f260627e52e40d871d9af974c6a79790e8d0d060ac08fbb69775e1d2de316085421ef76af
|
7
|
+
data.tar.gz: '07779f29aa9bdb4b71d9a0acdfe26c84d041312ad769c54539d6377b0ab01327dba045d97b51179a50498b438d137b567a3708bd3a2ab902c16adec37ad3a779'
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,14 @@
|
|
1
|
+
## 0.4.1 (2024-09-04)
|
2
|
+
|
3
|
+
- Updated tomoto to 0.13.0
|
4
|
+
|
5
|
+
## 0.4.0 (2023-12-28)
|
6
|
+
|
7
|
+
- Added support for Ruby 3.3
|
8
|
+
- Added precompiled gem for Linux ARM
|
9
|
+
- Updated tomoto to 0.12.7
|
10
|
+
- Dropped support for Ruby < 3
|
11
|
+
|
1
12
|
## 0.3.3 (2023-02-01)
|
2
13
|
|
3
14
|
- Added `topic_label_dict` method to `LLDA`
|
data/README.md
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
:tomato: [tomoto](https://github.com/bab2min/tomotopy) - high performance topic modeling - for Ruby
|
4
4
|
|
5
|
-
[![Build Status](https://github.com/ankane/tomoto-ruby/workflows/build/badge.svg
|
5
|
+
[![Build Status](https://github.com/ankane/tomoto-ruby/actions/workflows/build.yml/badge.svg)](https://github.com/ankane/tomoto-ruby/actions)
|
6
6
|
|
7
7
|
## Installation
|
8
8
|
|
data/ext/tomoto/extconf.rb
CHANGED
@@ -27,16 +27,18 @@ else
|
|
27
27
|
end
|
28
28
|
|
29
29
|
# silence tomoto warnings
|
30
|
-
$CXXFLAGS += " -Wno-unused-variable -Wno-switch"
|
30
|
+
$CXXFLAGS += " -Wno-unused-variable -Wno-switch -Wno-unqualified-std-cast-call"
|
31
31
|
|
32
32
|
ext = File.expand_path(".", __dir__)
|
33
33
|
tomoto = File.expand_path("../../vendor/tomotopy/src/TopicModel", __dir__)
|
34
|
+
tomoto_utils = File.expand_path("../../vendor/tomotopy/src/Utils", __dir__)
|
34
35
|
eigen = File.expand_path("../../vendor/eigen", __dir__)
|
35
36
|
eigen_rand = File.expand_path("../../vendor/EigenRand", __dir__)
|
36
37
|
variant = File.expand_path("../../vendor/variant/include", __dir__)
|
37
38
|
|
38
|
-
$srcs = Dir["{#{ext},#{tomoto}}/*.cpp"]
|
39
|
+
$srcs = Dir["{#{ext},#{tomoto},#{tomoto_utils}}/*.cpp"]
|
39
40
|
$INCFLAGS += " -I#{tomoto} -I#{eigen} -I#{eigen_rand} -I#{variant}"
|
40
41
|
$VPATH << tomoto
|
42
|
+
$VPATH << tomoto_utils
|
41
43
|
|
42
44
|
create_makefile("tomoto/tomoto")
|
data/lib/tomoto/version.rb
CHANGED
data/lib/tomoto.rb
CHANGED
@@ -6,20 +6,20 @@ rescue LoadError
|
|
6
6
|
end
|
7
7
|
|
8
8
|
# modules
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
9
|
+
require_relative "tomoto/ct"
|
10
|
+
require_relative "tomoto/dmr"
|
11
|
+
require_relative "tomoto/dt"
|
12
|
+
require_relative "tomoto/gdmr"
|
13
|
+
require_relative "tomoto/hdp"
|
14
|
+
require_relative "tomoto/hlda"
|
15
|
+
require_relative "tomoto/hpa"
|
16
|
+
require_relative "tomoto/lda"
|
17
|
+
require_relative "tomoto/llda"
|
18
|
+
require_relative "tomoto/mglda"
|
19
|
+
require_relative "tomoto/pa"
|
20
|
+
require_relative "tomoto/plda"
|
21
|
+
require_relative "tomoto/slda"
|
22
|
+
require_relative "tomoto/version"
|
23
23
|
|
24
24
|
module Tomoto
|
25
25
|
PARALLEL_SCHEME = [:default, :none, :copy_merge, :partition]
|
@@ -7,7 +7,7 @@ tomotopy
|
|
7
7
|
.. image:: https://zenodo.org/badge/186155463.svg
|
8
8
|
:target: https://zenodo.org/badge/latestdoi/186155463
|
9
9
|
|
10
|
-
|
10
|
+
🌐
|
11
11
|
`English`_,
|
12
12
|
**한국어**.
|
13
13
|
|
@@ -305,6 +305,32 @@ tomotopy의 Python3 예제 코드는 https://github.com/bab2min/tomotopy/blob/ma
|
|
305
305
|
|
306
306
|
역사
|
307
307
|
-------
|
308
|
+
* 0.13.0 (2024-08-05)
|
309
|
+
* 신규 기능
|
310
|
+
* 토픽 모델 뷰어인 `tomotopy.viewer.open_viewer()`의 주요 기능이 완성되었습니다.
|
311
|
+
* `tomotopy.LDAModel.get_hash()`가 추가되었습니다. 모델의 128bit 해시를 구해줍니다.
|
312
|
+
* `ngram_list` 인자가 `tomotopy.utils.SimpleTokenizer`에 추가되었습니다.
|
313
|
+
* Bug fixes
|
314
|
+
* `Corpus.concat_ngrams` 호출 후에 `spans`이 비일관적인 버그가 수정되었습니다.
|
315
|
+
* `tomotopy.LDAModel.load()`와 `tomotopy.LDAModel.save()`의 병목을 최적화하여 속도를 10배 이상 개선했습니다.
|
316
|
+
|
317
|
+
* 0.12.7 (2023-12-19)
|
318
|
+
* 신규 기능
|
319
|
+
* 토픽 모델 뷰어인 `tomotopy.viewer.open_viewer()`가 추가되었습니다.
|
320
|
+
* `tomotopy.utils.Corpus.process()`의 속도를 개선했습니다.
|
321
|
+
* Bug fixes
|
322
|
+
* `Document.span`이 이제 바이트 단위가 아니라 문자 단위로 범위를 제대로 반환합니다.
|
323
|
+
|
324
|
+
* 0.12.6 (2023-12-11)
|
325
|
+
* 신규 기능
|
326
|
+
* `tomotopy.LDAModel.train`과 `tomotopy.LDAModel.set_word_prior`에 몇가지 편의 기능을 추가했습니다.
|
327
|
+
* `LDAModel.train`가 이제 학습 진행상황을 모니터링할 수 있는 `callback`, `callback_interval`, `show_progress` 인자를 지원합니다.
|
328
|
+
* `LDAModel.set_word_prior`가 이제 `prior` 인자로 `Dict[int, float]` 타입도 받을 수 있게 되었습니다.
|
329
|
+
|
330
|
+
* 0.12.5 (2023-08-03)
|
331
|
+
* 신규 기능
|
332
|
+
* Linux ARM64 아키텍처에 대한 지원을 추가했습니다.
|
333
|
+
|
308
334
|
* 0.12.4 (2023-01-22)
|
309
335
|
* New features
|
310
336
|
* macOS ARM64 아키텍처에 대한 지원을 추가했습니다.
|
data/vendor/tomotopy/README.rst
CHANGED
@@ -7,7 +7,7 @@ tomotopy
|
|
7
7
|
.. image:: https://zenodo.org/badge/186155463.svg
|
8
8
|
:target: https://zenodo.org/badge/latestdoi/186155463
|
9
9
|
|
10
|
-
|
10
|
+
🌐
|
11
11
|
**English**,
|
12
12
|
`한국어`_.
|
13
13
|
|
@@ -309,6 +309,32 @@ meaning you can use it for any reasonable purpose and remain in complete ownersh
|
|
309
309
|
|
310
310
|
History
|
311
311
|
-------
|
312
|
+
* 0.13.0 (2024-08-05)
|
313
|
+
* New features
|
314
|
+
* Major features of Topic Model Viewer `tomotopy.viewer.open_viewer()` are ready now.
|
315
|
+
* `tomotopy.LDAModel.get_hash()` has been added. You can get a 128-bit hash value of the model.
|
316
|
+
* Add an argument `ngram_list` to `tomotopy.utils.SimpleTokenizer`.
|
317
|
+
* Bug fixes
|
318
|
+
* Fixed a bug where `spans` became inconsistent after `Corpus.concat_ngrams` was called.
|
319
|
+
* Optimized the bottleneck of `tomotopy.LDAModel.load()` and `tomotopy.LDAModel.save()`, improving their speed by more than 10 times.
|
320
|
+
|
321
|
+
* 0.12.7 (2023-12-19)
|
322
|
+
* New features
|
323
|
+
* Added Topic Model Viewer `tomotopy.viewer.open_viewer()`
|
324
|
+
* Optimized the performance of `tomotopy.utils.Corpus.process()`
|
325
|
+
* Bug fixes
|
326
|
+
* `Document.span` now returns the ranges in character units, not in byte units.
|
327
|
+
|
328
|
+
* 0.12.6 (2023-12-11)
|
329
|
+
* New features
|
330
|
+
* Added some convenience features to `tomotopy.LDAModel.train` and `tomotopy.LDAModel.set_word_prior`.
|
331
|
+
* `LDAModel.train` now has new arguments `callback`, `callback_interval` and `show_progress` to monitor the training progress.
|
332
|
+
* `LDAModel.set_word_prior` can now accept a `Dict[int, float]` as its `prior` argument.
|
333
|
+
|
334
|
+
* 0.12.5 (2023-08-03)
|
335
|
+
* New features
|
336
|
+
* Added support for Linux ARM64 architecture.
|
337
|
+
|
312
338
|
* 0.12.4 (2023-01-22)
|
313
339
|
* New features
|
314
340
|
* Added support for macOS ARM64 architecture.
|
@@ -11,8 +11,8 @@ namespace tomoto
|
|
11
11
|
Matrix beta; // Dim: (K, betaSample)
|
12
12
|
Vector smBeta; // Dim: K
|
13
13
|
|
14
|
-
|
15
|
-
|
14
|
+
DECLARE_SERIALIZER_WITH_VERSION(0);
|
15
|
+
DECLARE_SERIALIZER_WITH_VERSION(1);
|
16
16
|
};
|
17
17
|
|
18
18
|
struct CTArgs : public LDAArgs
|
@@ -2,6 +2,11 @@
|
|
2
2
|
|
3
3
|
namespace tomoto
|
4
4
|
{
|
5
|
+
DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentCTM, BaseDocument, 0, smBeta);
|
6
|
+
DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentCTM, BaseDocument, 1, 0x00010001, smBeta);
|
7
|
+
|
8
|
+
TMT_INSTANTIATE_DOC(DocumentCTM);
|
9
|
+
|
5
10
|
ICTModel* ICTModel::create(TermWeight _weight, const CTArgs& args, bool scalarRng)
|
6
11
|
{
|
7
12
|
TMT_SWITCH_TW(_weight, scalarRng, CTModel, args);
|
@@ -243,6 +243,7 @@ namespace tomoto
|
|
243
243
|
public:
|
244
244
|
DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 0, numBetaSample, numTMNSample, topicPrior);
|
245
245
|
DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, numBetaSample, numTMNSample, topicPrior);
|
246
|
+
DEFINE_HASHER_AFTER_BASE(BaseClass, numBetaSample, numTMNSample, topicPrior);
|
246
247
|
|
247
248
|
CTModel(const CTArgs& args)
|
248
249
|
: BaseClass(args)
|
@@ -18,8 +18,8 @@ namespace tomoto
|
|
18
18
|
|
19
19
|
RawDoc::MiscType makeMisc(const ITopicModel* tm) const override;
|
20
20
|
|
21
|
-
|
22
|
-
|
21
|
+
DECLARE_SERIALIZER_WITH_VERSION(0);
|
22
|
+
DECLARE_SERIALIZER_WITH_VERSION(1);
|
23
23
|
};
|
24
24
|
|
25
25
|
struct DMRArgs : public LDAArgs
|
@@ -2,6 +2,11 @@
|
|
2
2
|
|
3
3
|
namespace tomoto
|
4
4
|
{
|
5
|
+
DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentDMR, BaseDocument, 0, metadata);
|
6
|
+
DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentDMR, BaseDocument, 1, 0x00010001, metadata, multiMetadata);
|
7
|
+
|
8
|
+
TMT_INSTANTIATE_DOC(DocumentDMR);
|
9
|
+
|
5
10
|
IDMRModel* IDMRModel::create(TermWeight _weight, const DMRArgs& args, bool scalarRng)
|
6
11
|
{
|
7
12
|
TMT_SWITCH_TW(_weight, scalarRng, DMRModel, args);
|
@@ -364,6 +364,7 @@ namespace tomoto
|
|
364
364
|
public:
|
365
365
|
DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 0, sigma, alphaEps, metadataDict, lambda);
|
366
366
|
DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, sigma, alphaEps, metadataDict, lambda, multiMetadataDict);
|
367
|
+
DEFINE_HASHER_AFTER_BASE(BaseClass, sigma, alphaEps, metadataDict, lambda, multiMetadataDict);
|
367
368
|
|
368
369
|
DMRModel(const DMRArgs& args)
|
369
370
|
: BaseClass(args), sigma(args.sigma), alphaEps(args.alphaEps)
|
@@ -21,8 +21,8 @@ namespace tomoto
|
|
21
21
|
return ret;
|
22
22
|
}
|
23
23
|
|
24
|
-
|
25
|
-
|
24
|
+
DECLARE_SERIALIZER_WITH_VERSION(0);
|
25
|
+
DECLARE_SERIALIZER_WITH_VERSION(1);
|
26
26
|
};
|
27
27
|
|
28
28
|
struct DTArgs : public LDAArgs
|
@@ -2,6 +2,11 @@
|
|
2
2
|
|
3
3
|
namespace tomoto
|
4
4
|
{
|
5
|
+
DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentDTM, BaseDocument, 0, timepoint);
|
6
|
+
DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentDTM, BaseDocument, 1, 0x00010001, timepoint);
|
7
|
+
|
8
|
+
TMT_INSTANTIATE_DOC(DocumentDTM);
|
9
|
+
|
5
10
|
IDTModel* IDTModel::create(TermWeight _weight, const DTArgs& args, bool scalarRng)
|
6
11
|
{
|
7
12
|
TMT_SWITCH_TW(_weight, scalarRng, DTModel, args);
|
@@ -22,6 +22,7 @@ namespace tomoto
|
|
22
22
|
Eigen::Matrix<WeightType, -1, -1> numByTopicWord; // Dim: (Topic * Time, Vocabs)
|
23
23
|
//ShareableMatrix<WeightType, -1, -1> numByTopicWord; // Dim: (Topic * Time, Vocabs)
|
24
24
|
DEFINE_SERIALIZER(numByTopic, numByTopicWord);
|
25
|
+
DEFINE_HASHER(numByTopic, numByTopicWord);
|
25
26
|
};
|
26
27
|
|
27
28
|
template<TermWeight _tw, typename _RandGen,
|
@@ -365,6 +366,7 @@ namespace tomoto
|
|
365
366
|
{
|
366
367
|
double ll = 0;
|
367
368
|
const size_t V = this->realV;
|
369
|
+
if (V == 0) return 0;
|
368
370
|
for (Tid t = 0; t < T; ++t)
|
369
371
|
{
|
370
372
|
// topic-word distribution
|
@@ -495,6 +497,8 @@ namespace tomoto
|
|
495
497
|
T, shapeA, shapeB, shapeC, alphaVar, etaVar, phiVar, alphas, etaByDoc, phi);
|
496
498
|
DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001,
|
497
499
|
T, shapeA, shapeB, shapeC, alphaVar, etaVar, phiVar, alphas, etaByDoc, phi);
|
500
|
+
DEFINE_HASHER_AFTER_BASE(BaseClass,
|
501
|
+
T, shapeA, shapeB, shapeC, alphaVar, etaVar, phiVar, alphas, etaByDoc, phi);
|
498
502
|
|
499
503
|
GETTER(T, size_t, T);
|
500
504
|
GETTER(NumDocsByT, std::vector<uint32_t>, numDocsByTime);
|
@@ -17,8 +17,8 @@ namespace tomoto
|
|
17
17
|
return ret;
|
18
18
|
}
|
19
19
|
|
20
|
-
|
21
|
-
|
20
|
+
DECLARE_SERIALIZER_WITH_VERSION(0);
|
21
|
+
DECLARE_SERIALIZER_WITH_VERSION(1);
|
22
22
|
};
|
23
23
|
|
24
24
|
struct GDMRArgs : public DMRArgs
|
@@ -2,6 +2,11 @@
|
|
2
2
|
|
3
3
|
namespace tomoto
|
4
4
|
{
|
5
|
+
DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentGDMR, BaseDocument, 0, metadataOrg);
|
6
|
+
DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentGDMR, BaseDocument, 1, 0x00010001, metadataOrg, metadataNormalized);
|
7
|
+
|
8
|
+
TMT_INSTANTIATE_DOC(DocumentGDMR);
|
9
|
+
|
5
10
|
IGDMRModel* IGDMRModel::create(TermWeight _weight, const GDMRArgs& args, bool scalarRng)
|
6
11
|
{
|
7
12
|
TMT_SWITCH_TW(_weight, scalarRng, GDMRModel, args);
|
@@ -412,6 +412,7 @@ namespace tomoto
|
|
412
412
|
public:
|
413
413
|
DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 0, sigma0, degreeByF, mdCoefs, mdIntercepts);
|
414
414
|
DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, sigma0, orderDecay, degreeByF, mdCoefs, mdIntercepts, mdMax);
|
415
|
+
DEFINE_HASHER_AFTER_BASE(BaseClass, sigma0, orderDecay, degreeByF, mdCoefs, mdIntercepts, mdMax);
|
415
416
|
|
416
417
|
GDMRModel(const GDMRArgs& args)
|
417
418
|
: BaseClass(args), sigma0(args.sigma0), orderDecay(args.orderDecay), degreeByF(args.degrees)
|
@@ -39,8 +39,8 @@ namespace tomoto
|
|
39
39
|
};
|
40
40
|
std::vector<TableTopicInfo> numTopicByTable;
|
41
41
|
|
42
|
-
|
43
|
-
|
42
|
+
DECLARE_SERIALIZER_WITH_VERSION(0);
|
43
|
+
DECLARE_SERIALIZER_WITH_VERSION(1);
|
44
44
|
|
45
45
|
size_t getNumTable() const
|
46
46
|
{
|
@@ -2,6 +2,11 @@
|
|
2
2
|
|
3
3
|
namespace tomoto
|
4
4
|
{
|
5
|
+
DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentHDP, BaseDocument, 0, numTopicByTable);
|
6
|
+
DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentHDP, BaseDocument, 1, 0x00010001, numTopicByTable);
|
7
|
+
|
8
|
+
TMT_INSTANTIATE_DOC(DocumentHDP);
|
9
|
+
|
5
10
|
IHDPModel* IHDPModel::create(TermWeight _weight, const HDPArgs& args, bool scalarRng)
|
6
11
|
{
|
7
12
|
TMT_SWITCH_TW(_weight, scalarRng, HDPModel, args);
|
@@ -19,6 +19,7 @@ namespace tomoto
|
|
19
19
|
size_t totalTable = 0;
|
20
20
|
|
21
21
|
DEFINE_SERIALIZER_AFTER_BASE(ModelStateLDA<_tw>, numTableByTopic, totalTable);
|
22
|
+
DEFINE_HASHER_AFTER_BASE(ModelStateLDA<_tw>, numTableByTopic, totalTable);
|
22
23
|
};
|
23
24
|
|
24
25
|
template<TermWeight _tw, typename _RandGen,
|
@@ -457,6 +458,7 @@ namespace tomoto
|
|
457
458
|
public:
|
458
459
|
DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 0, gamma);
|
459
460
|
DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, gamma);
|
461
|
+
DEFINE_HASHER_AFTER_BASE(BaseClass, gamma);
|
460
462
|
|
461
463
|
HDPModel(const HDPArgs& args)
|
462
464
|
: BaseClass(args), gamma(args.gamma)
|
@@ -16,8 +16,8 @@ namespace tomoto
|
|
16
16
|
|
17
17
|
template<typename _TopicModel> void update(WeightType* ptr, const _TopicModel& mdl);
|
18
18
|
|
19
|
-
|
20
|
-
|
19
|
+
DECLARE_SERIALIZER_WITH_VERSION(0);
|
20
|
+
DECLARE_SERIALIZER_WITH_VERSION(1);
|
21
21
|
};
|
22
22
|
|
23
23
|
struct HLDAArgs : public LDAArgs
|
@@ -2,6 +2,11 @@
|
|
2
2
|
|
3
3
|
namespace tomoto
|
4
4
|
{
|
5
|
+
DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentHLDA, BaseDocument, 0, path);
|
6
|
+
DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentHLDA, BaseDocument, 1, 0x00010001, path);
|
7
|
+
|
8
|
+
TMT_INSTANTIATE_DOC(DocumentHLDA);
|
9
|
+
|
5
10
|
IHLDAModel* IHLDAModel::create(TermWeight _weight, const HLDAArgs& args, bool scalarRng)
|
6
11
|
{
|
7
12
|
TMT_SWITCH_TW(_weight, scalarRng, HLDAModel, args);
|
@@ -18,6 +18,7 @@ namespace tomoto
|
|
18
18
|
int32_t parent = 0, sibling = 0, child = 0;
|
19
19
|
|
20
20
|
DEFINE_SERIALIZER(numCustomers, level, parent, sibling, child);
|
21
|
+
DEFINE_HASHER(numCustomers, level, parent, sibling, child);
|
21
22
|
|
22
23
|
NCRPNode* getParent() const
|
23
24
|
{
|
@@ -118,6 +119,7 @@ namespace tomoto
|
|
118
119
|
Vector nodeWLikelihoods; //
|
119
120
|
|
120
121
|
DEFINE_SERIALIZER(nodes, levelBlocks);
|
122
|
+
DEFINE_HASHER(nodes, levelBlocks);
|
121
123
|
|
122
124
|
template<bool _makeNewPath = true>
|
123
125
|
void calcNodeLikelihood(Float gamma, size_t levelDepth)
|
@@ -317,6 +319,12 @@ namespace tomoto
|
|
317
319
|
ModelStateLDA<_tw>::serializerWrite(ostr);
|
318
320
|
nt->serializerWrite(ostr);
|
319
321
|
}
|
322
|
+
|
323
|
+
uint64_t computeHash(uint64_t seed) const
|
324
|
+
{
|
325
|
+
seed = ModelStateLDA<_tw>::computeHash(seed);
|
326
|
+
return nt->computeHash(seed);
|
327
|
+
}
|
320
328
|
};
|
321
329
|
|
322
330
|
template<TermWeight _tw, typename _RandGen,
|
@@ -596,6 +604,7 @@ namespace tomoto
|
|
596
604
|
public:
|
597
605
|
DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 0, gamma);
|
598
606
|
DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, gamma);
|
607
|
+
DEFINE_HASHER_AFTER_BASE(BaseClass, gamma);
|
599
608
|
|
600
609
|
HLDAModel(const HLDAArgs& args)
|
601
610
|
: BaseClass(args), gamma(args.gamma)
|
@@ -12,8 +12,8 @@ namespace tomoto
|
|
12
12
|
|
13
13
|
template<typename _TopicModel> void update(WeightType* ptr, const _TopicModel& mdl);
|
14
14
|
|
15
|
-
|
16
|
-
|
15
|
+
DECLARE_SERIALIZER_WITH_VERSION(0);
|
16
|
+
DECLARE_SERIALIZER_WITH_VERSION(1);
|
17
17
|
};
|
18
18
|
|
19
19
|
struct HPAArgs : public PAArgs
|
@@ -2,6 +2,11 @@
|
|
2
2
|
|
3
3
|
namespace tomoto
|
4
4
|
{
|
5
|
+
DEFINE_OUT_SERIALIZER_BASE_WITH_VERSION(DocumentHPA, BaseDocument, 0);
|
6
|
+
DEFINE_OUT_SERIALIZER_BASE_WITH_VERSION(DocumentHPA, BaseDocument, 1);
|
7
|
+
|
8
|
+
TMT_INSTANTIATE_DOC(DocumentHPA);
|
9
|
+
|
5
10
|
IHPAModel* IHPAModel::create(TermWeight _weight, bool _exclusive, const HPAArgs& args, bool scalarRng)
|
6
11
|
{
|
7
12
|
if (_exclusive)
|
@@ -21,6 +21,7 @@ namespace tomoto
|
|
21
21
|
Eigen::Matrix<WeightType, -1, -1> numByTopic1_2;
|
22
22
|
|
23
23
|
DEFINE_SERIALIZER_AFTER_BASE(ModelStateLDA<_tw>, numByTopicWord, numByTopic, numByTopic1_2);
|
24
|
+
DEFINE_HASHER_AFTER_BASE(ModelStateLDA<_tw>, numByTopicWord, numByTopic, numByTopic1_2);
|
24
25
|
};
|
25
26
|
|
26
27
|
template<TermWeight _tw, typename _RandGen,
|
@@ -439,6 +440,7 @@ namespace tomoto
|
|
439
440
|
public:
|
440
441
|
DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 0, K2, subAlphas, subAlphaSum);
|
441
442
|
DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, K2, subAlphas, subAlphaSum);
|
443
|
+
DEFINE_HASHER_AFTER_BASE(BaseClass, K2, subAlphas, subAlphaSum);
|
442
444
|
|
443
445
|
HPAModel(const HPAArgs& args)
|
444
446
|
: BaseClass(args, false), K2(args.k2)
|
@@ -93,6 +93,12 @@ namespace tomoto
|
|
93
93
|
if (!ostr.write((const char*)this->data(), sizeof(_Scalar) * this->size()))
|
94
94
|
throw std::ios_base::failure(std::string("writing type '") + typeid(_Scalar).name() + std::string("' is failed"));
|
95
95
|
}
|
96
|
+
|
97
|
+
uint64_t computeHash(uint64_t seed) const
|
98
|
+
{
|
99
|
+
seed = serializer::computeHashMany(seed, (uint32_t)this->rows(), (uint32_t)this->cols());
|
100
|
+
return serializer::computeFastHash(this->data(), sizeof(_Scalar) * this->size(), seed);
|
101
|
+
}
|
96
102
|
};
|
97
103
|
|
98
104
|
template<typename _Base, TermWeight _tw>
|
@@ -139,8 +145,8 @@ namespace tomoto
|
|
139
145
|
tvector<Float> wordWeights;
|
140
146
|
ShareableMatrix<WeightType, -1, 1> numByTopic;
|
141
147
|
|
142
|
-
|
143
|
-
|
148
|
+
DECLARE_SERIALIZER_WITH_VERSION(0);
|
149
|
+
DECLARE_SERIALIZER_WITH_VERSION(1);
|
144
150
|
|
145
151
|
template<typename _TopicModel> void update(WeightType* ptr, const _TopicModel& mdl);
|
146
152
|
|
@@ -2,6 +2,11 @@
|
|
2
2
|
|
3
3
|
namespace tomoto
|
4
4
|
{
|
5
|
+
DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentLDA, DocumentBase, 0, Zs, wordWeights);
|
6
|
+
DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentLDA, DocumentBase, 1, 0x00010001, Zs, wordWeights);
|
7
|
+
|
8
|
+
TMT_INSTANTIATE_DOC(DocumentLDA);
|
9
|
+
|
5
10
|
ILDAModel* ILDAModel::create(TermWeight _weight, const LDAArgs& args, bool scalarRng)
|
6
11
|
{
|
7
12
|
TMT_SWITCH_TW(_weight, scalarRng, LDAModel, args);
|
@@ -47,6 +47,10 @@ Term Weighting Scheme is based on following paper:
|
|
47
47
|
return nullptr; } while(0)
|
48
48
|
#endif
|
49
49
|
|
50
|
+
#define TMT_INSTANTIATE_DOC(CLS) template struct CLS<TermWeight::one>; \
|
51
|
+
template struct CLS<TermWeight::idf>; \
|
52
|
+
template struct CLS<TermWeight::pmi>;
|
53
|
+
|
50
54
|
#define GETTER(name, type, field) type get##name() const override { return field; }
|
51
55
|
|
52
56
|
namespace tomoto
|
@@ -61,6 +65,7 @@ namespace tomoto
|
|
61
65
|
//Eigen::Matrix<WeightType, -1, -1> numByTopicWord; // Dim: (Topic, Vocabs)
|
62
66
|
ShareableMatrix<WeightType, -1, -1> numByTopicWord; // Dim: (Topic, Vocabs)
|
63
67
|
DEFINE_SERIALIZER(numByTopic, numByTopicWord);
|
68
|
+
DEFINE_HASHER(numByTopic, numByTopicWord);
|
64
69
|
};
|
65
70
|
|
66
71
|
namespace flags
|
@@ -954,6 +959,8 @@ namespace tomoto
|
|
954
959
|
DEFINE_TAGGED_SERIALIZER_WITH_VERSION(1, 0x00010001, vocabWeights, alpha, alphas, eta, K, etaByWord,
|
955
960
|
burnIn, optimInterval);
|
956
961
|
|
962
|
+
DEFINE_HASHER(vocabWeights, alpha, alphas, eta, K, /*etaByWord,*/ burnIn, optimInterval);
|
963
|
+
|
957
964
|
LDAModel(const LDAArgs& args, bool checkAlpha = true)
|
958
965
|
: BaseClass(args.seed), K(args.k), alpha(args.alpha[0]), eta(args.eta)
|
959
966
|
{
|
@@ -1066,6 +1073,7 @@ namespace tomoto
|
|
1066
1073
|
void prepare(bool initDocs = true, size_t minWordCnt = 0, size_t minWordDf = 0, size_t removeTopN = 0, bool updateStopwords = true) override
|
1067
1074
|
{
|
1068
1075
|
if (initDocs && updateStopwords) this->removeStopwords(minWordCnt, minWordDf, removeTopN);
|
1076
|
+
static_cast<DerivedClass*>(this)->updateWordFormCnts();
|
1069
1077
|
static_cast<DerivedClass*>(this)->updateWeakArray();
|
1070
1078
|
static_cast<DerivedClass*>(this)->initGlobalState(initDocs);
|
1071
1079
|
static_cast<DerivedClass*>(this)->prepareWordPriors();
|
@@ -11,8 +11,8 @@ namespace tomoto
|
|
11
11
|
using WeightType = typename DocumentLDA<_tw>::WeightType;
|
12
12
|
Eigen::Matrix<int8_t, -1, 1> labelMask;
|
13
13
|
|
14
|
-
|
15
|
-
|
14
|
+
DECLARE_SERIALIZER_WITH_VERSION(0);
|
15
|
+
DECLARE_SERIALIZER_WITH_VERSION(1);
|
16
16
|
};
|
17
17
|
|
18
18
|
class ILLDAModel : public ILDAModel
|
@@ -2,6 +2,11 @@
|
|
2
2
|
|
3
3
|
namespace tomoto
|
4
4
|
{
|
5
|
+
DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentLLDA, BaseDocument, 0, labelMask);
|
6
|
+
DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentLLDA, BaseDocument, 1, 0x00010001, labelMask);
|
7
|
+
|
8
|
+
TMT_INSTANTIATE_DOC(DocumentLLDA);
|
9
|
+
|
5
10
|
ILLDAModel* ILLDAModel::create(TermWeight _weight, const LDAArgs& args, bool scalarRng)
|
6
11
|
{
|
7
12
|
TMT_SWITCH_TW(_weight, scalarRng, LLDAModel, args);
|
@@ -107,6 +107,7 @@ namespace tomoto
|
|
107
107
|
public:
|
108
108
|
DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 0, topicLabelDict);
|
109
109
|
DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, topicLabelDict);
|
110
|
+
DEFINE_HASHER_AFTER_BASE(BaseClass, topicLabelDict);
|
110
111
|
|
111
112
|
LLDAModel(const LDAArgs& args)
|
112
113
|
: BaseClass(args)
|
@@ -22,8 +22,8 @@ namespace tomoto
|
|
22
22
|
Eigen::Matrix<WeightType, -1, 1> numByWin; // number of words in the window (len = S + T - 1)
|
23
23
|
Eigen::Matrix<WeightType, -1, -1> numByWinTopicL; // number of words in the loc. topic in the window (len = KL * (S + T - 1))
|
24
24
|
|
25
|
-
|
26
|
-
|
25
|
+
DECLARE_SERIALIZER_WITH_VERSION(0);
|
26
|
+
DECLARE_SERIALIZER_WITH_VERSION(1);
|
27
27
|
|
28
28
|
template<typename _TopicModel> void update(WeightType* ptr, const _TopicModel& mdl);
|
29
29
|
};
|
@@ -2,6 +2,11 @@
|
|
2
2
|
|
3
3
|
namespace tomoto
|
4
4
|
{
|
5
|
+
DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentMGLDA, BaseDocument, 0, sents, Vs, numGl, numBySentWin, numByWinL, numByWin, numByWinTopicL);
|
6
|
+
DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentMGLDA, BaseDocument, 1, 0x00010001, sents, Vs, numGl, numBySentWin, numByWinL, numByWin, numByWinTopicL);
|
7
|
+
|
8
|
+
TMT_INSTANTIATE_DOC(DocumentMGLDA);
|
9
|
+
|
5
10
|
IMGLDAModel* IMGLDAModel::create(TermWeight _weight, const MGLDAArgs& args, bool scalarRng)
|
6
11
|
{
|
7
12
|
TMT_SWITCH_TW(_weight, scalarRng, MGLDAModel, args);
|
@@ -370,6 +370,7 @@ namespace tomoto
|
|
370
370
|
public:
|
371
371
|
DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 0, alphaL, alphaM, alphaML, etaL, gamma, KL, T);
|
372
372
|
DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, alphaL, alphaM, alphaML, etaL, gamma, KL, T);
|
373
|
+
DEFINE_HASHER_AFTER_BASE(BaseClass, alphaL, alphaM, alphaML, etaL, gamma, KL, T);
|
373
374
|
|
374
375
|
MGLDAModel(const MGLDAArgs& args)
|
375
376
|
: BaseClass(args), KL(args.kL), T(args.t),
|
@@ -516,9 +517,14 @@ namespace tomoto
|
|
516
517
|
return std::make_unique<_DocType>(as_mutable(this)->template _makeFromRawDoc<true>(rawDoc));
|
517
518
|
}
|
518
519
|
|
520
|
+
size_t getNumTopicsForPrior() const override
|
521
|
+
{
|
522
|
+
return this->K + KL;
|
523
|
+
}
|
524
|
+
|
519
525
|
void setWordPrior(const std::string& word, const std::vector<Float>& priors) override
|
520
526
|
{
|
521
|
-
if (priors.size() != this->K + KL) THROW_ERROR_WITH_INFO(exc::InvalidArgument, "priors.size() must be equal to K.");
|
527
|
+
if (priors.size() != this->K + KL) THROW_ERROR_WITH_INFO(exc::InvalidArgument, "priors.size() must be equal to K + KL.");
|
522
528
|
for (auto p : priors)
|
523
529
|
{
|
524
530
|
if (p < 0) THROW_ERROR_WITH_INFO(exc::InvalidArgument, "priors must not be less than 0.");
|