tomoto 0.4.0 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +5 -0
  3. data/README.md +1 -1
  4. data/ext/tomoto/extconf.rb +4 -2
  5. data/lib/tomoto/version.rb +1 -1
  6. data/vendor/tomotopy/README.kr.rst +10 -1
  7. data/vendor/tomotopy/README.rst +10 -1
  8. data/vendor/tomotopy/src/TopicModel/CT.h +2 -2
  9. data/vendor/tomotopy/src/TopicModel/CTModel.cpp +5 -0
  10. data/vendor/tomotopy/src/TopicModel/CTModel.hpp +1 -0
  11. data/vendor/tomotopy/src/TopicModel/DMR.h +2 -2
  12. data/vendor/tomotopy/src/TopicModel/DMRModel.cpp +5 -0
  13. data/vendor/tomotopy/src/TopicModel/DMRModel.hpp +1 -0
  14. data/vendor/tomotopy/src/TopicModel/DT.h +2 -2
  15. data/vendor/tomotopy/src/TopicModel/DTModel.cpp +5 -0
  16. data/vendor/tomotopy/src/TopicModel/DTModel.hpp +3 -0
  17. data/vendor/tomotopy/src/TopicModel/GDMR.h +2 -2
  18. data/vendor/tomotopy/src/TopicModel/GDMRModel.cpp +5 -0
  19. data/vendor/tomotopy/src/TopicModel/GDMRModel.hpp +1 -0
  20. data/vendor/tomotopy/src/TopicModel/HDP.h +2 -2
  21. data/vendor/tomotopy/src/TopicModel/HDPModel.cpp +5 -0
  22. data/vendor/tomotopy/src/TopicModel/HDPModel.hpp +2 -0
  23. data/vendor/tomotopy/src/TopicModel/HLDA.h +2 -2
  24. data/vendor/tomotopy/src/TopicModel/HLDAModel.cpp +5 -0
  25. data/vendor/tomotopy/src/TopicModel/HLDAModel.hpp +9 -0
  26. data/vendor/tomotopy/src/TopicModel/HPA.h +2 -2
  27. data/vendor/tomotopy/src/TopicModel/HPAModel.cpp +5 -0
  28. data/vendor/tomotopy/src/TopicModel/HPAModel.hpp +2 -0
  29. data/vendor/tomotopy/src/TopicModel/LDA.h +8 -2
  30. data/vendor/tomotopy/src/TopicModel/LDAModel.cpp +5 -0
  31. data/vendor/tomotopy/src/TopicModel/LDAModel.hpp +8 -0
  32. data/vendor/tomotopy/src/TopicModel/LLDA.h +2 -2
  33. data/vendor/tomotopy/src/TopicModel/LLDAModel.cpp +5 -0
  34. data/vendor/tomotopy/src/TopicModel/LLDAModel.hpp +1 -0
  35. data/vendor/tomotopy/src/TopicModel/MGLDA.h +2 -2
  36. data/vendor/tomotopy/src/TopicModel/MGLDAModel.cpp +5 -0
  37. data/vendor/tomotopy/src/TopicModel/MGLDAModel.hpp +1 -0
  38. data/vendor/tomotopy/src/TopicModel/PA.h +2 -2
  39. data/vendor/tomotopy/src/TopicModel/PAModel.cpp +5 -0
  40. data/vendor/tomotopy/src/TopicModel/PAModel.hpp +2 -0
  41. data/vendor/tomotopy/src/TopicModel/PLDAModel.hpp +1 -0
  42. data/vendor/tomotopy/src/TopicModel/PT.h +3 -3
  43. data/vendor/tomotopy/src/TopicModel/PTModel.cpp +5 -0
  44. data/vendor/tomotopy/src/TopicModel/PTModel.hpp +1 -0
  45. data/vendor/tomotopy/src/TopicModel/SLDA.h +3 -2
  46. data/vendor/tomotopy/src/TopicModel/SLDAModel.cpp +5 -0
  47. data/vendor/tomotopy/src/TopicModel/SLDAModel.hpp +1 -0
  48. data/vendor/tomotopy/src/TopicModel/TopicModel.hpp +77 -3
  49. data/vendor/tomotopy/src/Utils/Dictionary.cpp +102 -0
  50. data/vendor/tomotopy/src/Utils/Dictionary.h +26 -75
  51. data/vendor/tomotopy/src/Utils/Mmap.cpp +146 -0
  52. data/vendor/tomotopy/src/Utils/Mmap.h +139 -0
  53. data/vendor/tomotopy/src/Utils/MultiNormalDistribution.hpp +1 -0
  54. data/vendor/tomotopy/src/Utils/SharedString.cpp +134 -0
  55. data/vendor/tomotopy/src/Utils/SharedString.h +104 -0
  56. data/vendor/tomotopy/src/Utils/serializer.cpp +166 -0
  57. data/vendor/tomotopy/src/Utils/serializer.hpp +261 -85
  58. metadata +9 -4
  59. data/vendor/tomotopy/src/Utils/SharedString.hpp +0 -206
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d7d16410002670991fd881e13f64195db9de29dfa5c383da2287d44c9053b500
4
- data.tar.gz: 290254c48ed1c3ce1ff51e2bbe07a46ed02d05dab6bcc095f38cdbf499883561
3
+ metadata.gz: 86215ec57ae6cf6e36531ee2896e2b81d591f61909eb5454ef70b69c5db0a39d
4
+ data.tar.gz: 3f31adcb38a1793caaaedc516f99c9ffce4b82ff0c93f2a169b85377e116433b
5
5
  SHA512:
6
- metadata.gz: 817a074c0f9969ded7592d70a2b3096ca91142470552e019bc95668b45b658d24010160bce53b356810b09f25288fd9fb9c070841b3587ea23e6099f528f94b0
7
- data.tar.gz: 94764d26429358b30766a36ef899b6856b61c848ed575cf62911dae9a352344c9736cd86be859601642cde479ba810b607dd41730db920d0fa9170b71dc9fdf2
6
+ metadata.gz: db0a4bd9831cecae6711e150ecc2c5d23b87ada83d418f784474cf2f260627e52e40d871d9af974c6a79790e8d0d060ac08fbb69775e1d2de316085421ef76af
7
+ data.tar.gz: '07779f29aa9bdb4b71d9a0acdfe26c84d041312ad769c54539d6377b0ab01327dba045d97b51179a50498b438d137b567a3708bd3a2ab902c16adec37ad3a779'
data/CHANGELOG.md CHANGED
@@ -1,5 +1,10 @@
1
+ ## 0.4.1 (2024-09-04)
2
+
3
+ - Updated tomoto to 0.13.0
4
+
1
5
  ## 0.4.0 (2023-12-28)
2
6
 
7
+ - Added support for Ruby 3.3
3
8
  - Added precompiled gem for Linux ARM
4
9
  - Updated tomoto to 0.12.7
5
10
  - Dropped support for Ruby < 3
data/README.md CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  :tomato: [tomoto](https://github.com/bab2min/tomotopy) - high performance topic modeling - for Ruby
4
4
 
5
- [![Build Status](https://github.com/ankane/tomoto-ruby/workflows/build/badge.svg?branch=master)](https://github.com/ankane/tomoto-ruby/actions)
5
+ [![Build Status](https://github.com/ankane/tomoto-ruby/actions/workflows/build.yml/badge.svg)](https://github.com/ankane/tomoto-ruby/actions)
6
6
 
7
7
  ## Installation
8
8
 
@@ -27,16 +27,18 @@ else
27
27
  end
28
28
 
29
29
  # silence tomoto warnings
30
- $CXXFLAGS += " -Wno-unused-variable -Wno-switch"
30
+ $CXXFLAGS += " -Wno-unused-variable -Wno-switch -Wno-unqualified-std-cast-call"
31
31
 
32
32
  ext = File.expand_path(".", __dir__)
33
33
  tomoto = File.expand_path("../../vendor/tomotopy/src/TopicModel", __dir__)
34
+ tomoto_utils = File.expand_path("../../vendor/tomotopy/src/Utils", __dir__)
34
35
  eigen = File.expand_path("../../vendor/eigen", __dir__)
35
36
  eigen_rand = File.expand_path("../../vendor/EigenRand", __dir__)
36
37
  variant = File.expand_path("../../vendor/variant/include", __dir__)
37
38
 
38
- $srcs = Dir["{#{ext},#{tomoto}}/*.cpp"]
39
+ $srcs = Dir["{#{ext},#{tomoto},#{tomoto_utils}}/*.cpp"]
39
40
  $INCFLAGS += " -I#{tomoto} -I#{eigen} -I#{eigen_rand} -I#{variant}"
40
41
  $VPATH << tomoto
42
+ $VPATH << tomoto_utils
41
43
 
42
44
  create_makefile("tomoto/tomoto")
@@ -1,3 +1,3 @@
1
1
  module Tomoto
2
- VERSION = "0.4.0"
2
+ VERSION = "0.4.1"
3
3
  end
@@ -7,7 +7,7 @@ tomotopy
7
7
  .. image:: https://zenodo.org/badge/186155463.svg
8
8
  :target: https://zenodo.org/badge/latestdoi/186155463
9
9
 
10
- 🎌
10
+ 🌐
11
11
  `English`_,
12
12
  **한국어**.
13
13
 
@@ -305,6 +305,15 @@ tomotopy의 Python3 예제 코드는 https://github.com/bab2min/tomotopy/blob/ma
305
305
 
306
306
  역사
307
307
  -------
308
+ * 0.13.0 (2024-08-05)
309
+ * 신규 기능
310
+ * 토픽 모델 뷰어인 `tomotopy.viewer.open_viewer()`의 주요 기능이 완성되었습니다.
311
+ * `tomotopy.LDAModel.get_hash()`가 추가되었습니다. 모델의 128bit 해시를 구해줍니다.
312
+ * `ngram_list` 인자가 `tomotopy.utils.SimpleTokenizer`에 추가되었습니다.
313
+ * 버그 수정
314
+ * `Corpus.concat_ngrams` 호출 후에 `spans`가 일관되지 않던 버그가 수정되었습니다.
315
+ * `tomotopy.LDAModel.load()`와 `tomotopy.LDAModel.save()`의 병목을 최적화하여 속도를 10배 이상 개선했습니다.
316
+
308
317
  * 0.12.7 (2023-12-19)
309
318
  * 신규 기능
310
319
  * 토픽 모델 뷰어인 `tomotopy.viewer.open_viewer()`가 추가되었습니다.
@@ -7,7 +7,7 @@ tomotopy
7
7
  .. image:: https://zenodo.org/badge/186155463.svg
8
8
  :target: https://zenodo.org/badge/latestdoi/186155463
9
9
 
10
- 🎌
10
+ 🌐
11
11
  **English**,
12
12
  `한국어`_.
13
13
 
@@ -309,6 +309,15 @@ meaning you can use it for any reasonable purpose and remain in complete ownersh
309
309
 
310
310
  History
311
311
  -------
312
+ * 0.13.0 (2024-08-05)
313
+ * New features
314
+ * The major features of the Topic Model Viewer `tomotopy.viewer.open_viewer()` are now complete.
315
+ * `tomotopy.LDAModel.get_hash()` is added. You can use it to get a 128-bit hash value of the model.
316
+ * Added an argument `ngram_list` to `tomotopy.utils.SimpleTokenizer`.
317
+ * Bug fixes
318
+ * Fixed a bug where `spans` became inconsistent after `Corpus.concat_ngrams` was called.
319
+ * Optimized the bottlenecks of `tomotopy.LDAModel.load()` and `tomotopy.LDAModel.save()`, improving their speed by more than 10 times.
320
+
312
321
  * 0.12.7 (2023-12-19)
313
322
  * New features
314
323
  * Added Topic Model Viewer `tomotopy.viewer.open_viewer()`
@@ -11,8 +11,8 @@ namespace tomoto
11
11
  Matrix beta; // Dim: (K, betaSample)
12
12
  Vector smBeta; // Dim: K
13
13
 
14
- DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 0, smBeta);
15
- DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 1, 0x00010001, smBeta);
14
+ DECLARE_SERIALIZER_WITH_VERSION(0);
15
+ DECLARE_SERIALIZER_WITH_VERSION(1);
16
16
  };
17
17
 
18
18
  struct CTArgs : public LDAArgs
@@ -2,6 +2,11 @@
2
2
 
3
3
  namespace tomoto
4
4
  {
5
+ DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentCTM, BaseDocument, 0, smBeta);
6
+ DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentCTM, BaseDocument, 1, 0x00010001, smBeta);
7
+
8
+ TMT_INSTANTIATE_DOC(DocumentCTM);
9
+
5
10
  ICTModel* ICTModel::create(TermWeight _weight, const CTArgs& args, bool scalarRng)
6
11
  {
7
12
  TMT_SWITCH_TW(_weight, scalarRng, CTModel, args);
@@ -243,6 +243,7 @@ namespace tomoto
243
243
  public:
244
244
  DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 0, numBetaSample, numTMNSample, topicPrior);
245
245
  DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, numBetaSample, numTMNSample, topicPrior);
246
+ DEFINE_HASHER_AFTER_BASE(BaseClass, numBetaSample, numTMNSample, topicPrior);
246
247
 
247
248
  CTModel(const CTArgs& args)
248
249
  : BaseClass(args)
@@ -18,8 +18,8 @@ namespace tomoto
18
18
 
19
19
  RawDoc::MiscType makeMisc(const ITopicModel* tm) const override;
20
20
 
21
- DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 0, metadata);
22
- DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 1, 0x00010001, metadata, multiMetadata);
21
+ DECLARE_SERIALIZER_WITH_VERSION(0);
22
+ DECLARE_SERIALIZER_WITH_VERSION(1);
23
23
  };
24
24
 
25
25
  struct DMRArgs : public LDAArgs
@@ -2,6 +2,11 @@
2
2
 
3
3
  namespace tomoto
4
4
  {
5
+ DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentDMR, BaseDocument, 0, metadata);
6
+ DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentDMR, BaseDocument, 1, 0x00010001, metadata, multiMetadata);
7
+
8
+ TMT_INSTANTIATE_DOC(DocumentDMR);
9
+
5
10
  IDMRModel* IDMRModel::create(TermWeight _weight, const DMRArgs& args, bool scalarRng)
6
11
  {
7
12
  TMT_SWITCH_TW(_weight, scalarRng, DMRModel, args);
@@ -364,6 +364,7 @@ namespace tomoto
364
364
  public:
365
365
  DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 0, sigma, alphaEps, metadataDict, lambda);
366
366
  DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, sigma, alphaEps, metadataDict, lambda, multiMetadataDict);
367
+ DEFINE_HASHER_AFTER_BASE(BaseClass, sigma, alphaEps, metadataDict, lambda, multiMetadataDict);
367
368
 
368
369
  DMRModel(const DMRArgs& args)
369
370
  : BaseClass(args), sigma(args.sigma), alphaEps(args.alphaEps)
@@ -21,8 +21,8 @@ namespace tomoto
21
21
  return ret;
22
22
  }
23
23
 
24
- DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 0, timepoint);
25
- DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 1, 0x00010001, timepoint);
24
+ DECLARE_SERIALIZER_WITH_VERSION(0);
25
+ DECLARE_SERIALIZER_WITH_VERSION(1);
26
26
  };
27
27
 
28
28
  struct DTArgs : public LDAArgs
@@ -2,6 +2,11 @@
2
2
 
3
3
  namespace tomoto
4
4
  {
5
+ DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentDTM, BaseDocument, 0, timepoint);
6
+ DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentDTM, BaseDocument, 1, 0x00010001, timepoint);
7
+
8
+ TMT_INSTANTIATE_DOC(DocumentDTM);
9
+
5
10
  IDTModel* IDTModel::create(TermWeight _weight, const DTArgs& args, bool scalarRng)
6
11
  {
7
12
  TMT_SWITCH_TW(_weight, scalarRng, DTModel, args);
@@ -22,6 +22,7 @@ namespace tomoto
22
22
  Eigen::Matrix<WeightType, -1, -1> numByTopicWord; // Dim: (Topic * Time, Vocabs)
23
23
  //ShareableMatrix<WeightType, -1, -1> numByTopicWord; // Dim: (Topic * Time, Vocabs)
24
24
  DEFINE_SERIALIZER(numByTopic, numByTopicWord);
25
+ DEFINE_HASHER(numByTopic, numByTopicWord);
25
26
  };
26
27
 
27
28
  template<TermWeight _tw, typename _RandGen,
@@ -496,6 +497,8 @@ namespace tomoto
496
497
  T, shapeA, shapeB, shapeC, alphaVar, etaVar, phiVar, alphas, etaByDoc, phi);
497
498
  DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001,
498
499
  T, shapeA, shapeB, shapeC, alphaVar, etaVar, phiVar, alphas, etaByDoc, phi);
500
+ DEFINE_HASHER_AFTER_BASE(BaseClass,
501
+ T, shapeA, shapeB, shapeC, alphaVar, etaVar, phiVar, alphas, etaByDoc, phi);
499
502
 
500
503
  GETTER(T, size_t, T);
501
504
  GETTER(NumDocsByT, std::vector<uint32_t>, numDocsByTime);
@@ -17,8 +17,8 @@ namespace tomoto
17
17
  return ret;
18
18
  }
19
19
 
20
- DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 0, metadataOrg);
21
- DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 1, 0x00010001, metadataOrg, metadataNormalized);
20
+ DECLARE_SERIALIZER_WITH_VERSION(0);
21
+ DECLARE_SERIALIZER_WITH_VERSION(1);
22
22
  };
23
23
 
24
24
  struct GDMRArgs : public DMRArgs
@@ -2,6 +2,11 @@
2
2
 
3
3
  namespace tomoto
4
4
  {
5
+ DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentGDMR, BaseDocument, 0, metadataOrg);
6
+ DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentGDMR, BaseDocument, 1, 0x00010001, metadataOrg, metadataNormalized);
7
+
8
+ TMT_INSTANTIATE_DOC(DocumentGDMR);
9
+
5
10
  IGDMRModel* IGDMRModel::create(TermWeight _weight, const GDMRArgs& args, bool scalarRng)
6
11
  {
7
12
  TMT_SWITCH_TW(_weight, scalarRng, GDMRModel, args);
@@ -412,6 +412,7 @@ namespace tomoto
412
412
  public:
413
413
  DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 0, sigma0, degreeByF, mdCoefs, mdIntercepts);
414
414
  DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, sigma0, orderDecay, degreeByF, mdCoefs, mdIntercepts, mdMax);
415
+ DEFINE_HASHER_AFTER_BASE(BaseClass, sigma0, orderDecay, degreeByF, mdCoefs, mdIntercepts, mdMax);
415
416
 
416
417
  GDMRModel(const GDMRArgs& args)
417
418
  : BaseClass(args), sigma0(args.sigma0), orderDecay(args.orderDecay), degreeByF(args.degrees)
@@ -39,8 +39,8 @@ namespace tomoto
39
39
  };
40
40
  std::vector<TableTopicInfo> numTopicByTable;
41
41
 
42
- DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 0, numTopicByTable);
43
- DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 1, 0x00010001, numTopicByTable);
42
+ DECLARE_SERIALIZER_WITH_VERSION(0);
43
+ DECLARE_SERIALIZER_WITH_VERSION(1);
44
44
 
45
45
  size_t getNumTable() const
46
46
  {
@@ -2,6 +2,11 @@
2
2
 
3
3
  namespace tomoto
4
4
  {
5
+ DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentHDP, BaseDocument, 0, numTopicByTable);
6
+ DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentHDP, BaseDocument, 1, 0x00010001, numTopicByTable);
7
+
8
+ TMT_INSTANTIATE_DOC(DocumentHDP);
9
+
5
10
  IHDPModel* IHDPModel::create(TermWeight _weight, const HDPArgs& args, bool scalarRng)
6
11
  {
7
12
  TMT_SWITCH_TW(_weight, scalarRng, HDPModel, args);
@@ -19,6 +19,7 @@ namespace tomoto
19
19
  size_t totalTable = 0;
20
20
 
21
21
  DEFINE_SERIALIZER_AFTER_BASE(ModelStateLDA<_tw>, numTableByTopic, totalTable);
22
+ DEFINE_HASHER_AFTER_BASE(ModelStateLDA<_tw>, numTableByTopic, totalTable);
22
23
  };
23
24
 
24
25
  template<TermWeight _tw, typename _RandGen,
@@ -457,6 +458,7 @@ namespace tomoto
457
458
  public:
458
459
  DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 0, gamma);
459
460
  DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, gamma);
461
+ DEFINE_HASHER_AFTER_BASE(BaseClass, gamma);
460
462
 
461
463
  HDPModel(const HDPArgs& args)
462
464
  : BaseClass(args), gamma(args.gamma)
@@ -16,8 +16,8 @@ namespace tomoto
16
16
 
17
17
  template<typename _TopicModel> void update(WeightType* ptr, const _TopicModel& mdl);
18
18
 
19
- DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 0, path);
20
- DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 1, 0x00010001, path);
19
+ DECLARE_SERIALIZER_WITH_VERSION(0);
20
+ DECLARE_SERIALIZER_WITH_VERSION(1);
21
21
  };
22
22
 
23
23
  struct HLDAArgs : public LDAArgs
@@ -2,6 +2,11 @@
2
2
 
3
3
  namespace tomoto
4
4
  {
5
+ DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentHLDA, BaseDocument, 0, path);
6
+ DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentHLDA, BaseDocument, 1, 0x00010001, path);
7
+
8
+ TMT_INSTANTIATE_DOC(DocumentHLDA);
9
+
5
10
  IHLDAModel* IHLDAModel::create(TermWeight _weight, const HLDAArgs& args, bool scalarRng)
6
11
  {
7
12
  TMT_SWITCH_TW(_weight, scalarRng, HLDAModel, args);
@@ -18,6 +18,7 @@ namespace tomoto
18
18
  int32_t parent = 0, sibling = 0, child = 0;
19
19
 
20
20
  DEFINE_SERIALIZER(numCustomers, level, parent, sibling, child);
21
+ DEFINE_HASHER(numCustomers, level, parent, sibling, child);
21
22
 
22
23
  NCRPNode* getParent() const
23
24
  {
@@ -118,6 +119,7 @@ namespace tomoto
118
119
  Vector nodeWLikelihoods; //
119
120
 
120
121
  DEFINE_SERIALIZER(nodes, levelBlocks);
122
+ DEFINE_HASHER(nodes, levelBlocks);
121
123
 
122
124
  template<bool _makeNewPath = true>
123
125
  void calcNodeLikelihood(Float gamma, size_t levelDepth)
@@ -317,6 +319,12 @@ namespace tomoto
317
319
  ModelStateLDA<_tw>::serializerWrite(ostr);
318
320
  nt->serializerWrite(ostr);
319
321
  }
322
+
323
+ uint64_t computeHash(uint64_t seed) const
324
+ {
325
+ seed = ModelStateLDA<_tw>::computeHash(seed);
326
+ return nt->computeHash(seed);
327
+ }
320
328
  };
321
329
 
322
330
  template<TermWeight _tw, typename _RandGen,
@@ -596,6 +604,7 @@ namespace tomoto
596
604
  public:
597
605
  DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 0, gamma);
598
606
  DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, gamma);
607
+ DEFINE_HASHER_AFTER_BASE(BaseClass, gamma);
599
608
 
600
609
  HLDAModel(const HLDAArgs& args)
601
610
  : BaseClass(args), gamma(args.gamma)
@@ -12,8 +12,8 @@ namespace tomoto
12
12
 
13
13
  template<typename _TopicModel> void update(WeightType* ptr, const _TopicModel& mdl);
14
14
 
15
- DEFINE_SERIALIZER_BASE_WITH_VERSION(BaseDocument, 0);
16
- DEFINE_SERIALIZER_BASE_WITH_VERSION(BaseDocument, 1);
15
+ DECLARE_SERIALIZER_WITH_VERSION(0);
16
+ DECLARE_SERIALIZER_WITH_VERSION(1);
17
17
  };
18
18
 
19
19
  struct HPAArgs : public PAArgs
@@ -2,6 +2,11 @@
2
2
 
3
3
  namespace tomoto
4
4
  {
5
+ DEFINE_OUT_SERIALIZER_BASE_WITH_VERSION(DocumentHPA, BaseDocument, 0);
6
+ DEFINE_OUT_SERIALIZER_BASE_WITH_VERSION(DocumentHPA, BaseDocument, 1);
7
+
8
+ TMT_INSTANTIATE_DOC(DocumentHPA);
9
+
5
10
  IHPAModel* IHPAModel::create(TermWeight _weight, bool _exclusive, const HPAArgs& args, bool scalarRng)
6
11
  {
7
12
  if (_exclusive)
@@ -21,6 +21,7 @@ namespace tomoto
21
21
  Eigen::Matrix<WeightType, -1, -1> numByTopic1_2;
22
22
 
23
23
  DEFINE_SERIALIZER_AFTER_BASE(ModelStateLDA<_tw>, numByTopicWord, numByTopic, numByTopic1_2);
24
+ DEFINE_HASHER_AFTER_BASE(ModelStateLDA<_tw>, numByTopicWord, numByTopic, numByTopic1_2);
24
25
  };
25
26
 
26
27
  template<TermWeight _tw, typename _RandGen,
@@ -439,6 +440,7 @@ namespace tomoto
439
440
  public:
440
441
  DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 0, K2, subAlphas, subAlphaSum);
441
442
  DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, K2, subAlphas, subAlphaSum);
443
+ DEFINE_HASHER_AFTER_BASE(BaseClass, K2, subAlphas, subAlphaSum);
442
444
 
443
445
  HPAModel(const HPAArgs& args)
444
446
  : BaseClass(args, false), K2(args.k2)
@@ -93,6 +93,12 @@ namespace tomoto
93
93
  if (!ostr.write((const char*)this->data(), sizeof(_Scalar) * this->size()))
94
94
  throw std::ios_base::failure(std::string("writing type '") + typeid(_Scalar).name() + std::string("' is failed"));
95
95
  }
96
+
97
+ uint64_t computeHash(uint64_t seed) const
98
+ {
99
+ seed = serializer::computeHashMany(seed, (uint32_t)this->rows(), (uint32_t)this->cols());
100
+ return serializer::computeFastHash(this->data(), sizeof(_Scalar) * this->size(), seed);
101
+ }
96
102
  };
97
103
 
98
104
  template<typename _Base, TermWeight _tw>
@@ -139,8 +145,8 @@ namespace tomoto
139
145
  tvector<Float> wordWeights;
140
146
  ShareableMatrix<WeightType, -1, 1> numByTopic;
141
147
 
142
- DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentBase, 0, Zs, wordWeights);
143
- DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentBase, 1, 0x00010001, Zs, wordWeights);
148
+ DECLARE_SERIALIZER_WITH_VERSION(0);
149
+ DECLARE_SERIALIZER_WITH_VERSION(1);
144
150
 
145
151
  template<typename _TopicModel> void update(WeightType* ptr, const _TopicModel& mdl);
146
152
 
@@ -2,6 +2,11 @@
2
2
 
3
3
  namespace tomoto
4
4
  {
5
+ DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentLDA, DocumentBase, 0, Zs, wordWeights);
6
+ DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentLDA, DocumentBase, 1, 0x00010001, Zs, wordWeights);
7
+
8
+ TMT_INSTANTIATE_DOC(DocumentLDA);
9
+
5
10
  ILDAModel* ILDAModel::create(TermWeight _weight, const LDAArgs& args, bool scalarRng)
6
11
  {
7
12
  TMT_SWITCH_TW(_weight, scalarRng, LDAModel, args);
@@ -47,6 +47,10 @@ Term Weighting Scheme is based on following paper:
47
47
  return nullptr; } while(0)
48
48
  #endif
49
49
 
50
+ #define TMT_INSTANTIATE_DOC(CLS) template struct CLS<TermWeight::one>; \
51
+ template struct CLS<TermWeight::idf>; \
52
+ template struct CLS<TermWeight::pmi>;
53
+
50
54
  #define GETTER(name, type, field) type get##name() const override { return field; }
51
55
 
52
56
  namespace tomoto
@@ -61,6 +65,7 @@ namespace tomoto
61
65
  //Eigen::Matrix<WeightType, -1, -1> numByTopicWord; // Dim: (Topic, Vocabs)
62
66
  ShareableMatrix<WeightType, -1, -1> numByTopicWord; // Dim: (Topic, Vocabs)
63
67
  DEFINE_SERIALIZER(numByTopic, numByTopicWord);
68
+ DEFINE_HASHER(numByTopic, numByTopicWord);
64
69
  };
65
70
 
66
71
  namespace flags
@@ -954,6 +959,8 @@ namespace tomoto
954
959
  DEFINE_TAGGED_SERIALIZER_WITH_VERSION(1, 0x00010001, vocabWeights, alpha, alphas, eta, K, etaByWord,
955
960
  burnIn, optimInterval);
956
961
 
962
+ DEFINE_HASHER(vocabWeights, alpha, alphas, eta, K, /*etaByWord,*/ burnIn, optimInterval);
963
+
957
964
  LDAModel(const LDAArgs& args, bool checkAlpha = true)
958
965
  : BaseClass(args.seed), K(args.k), alpha(args.alpha[0]), eta(args.eta)
959
966
  {
@@ -1066,6 +1073,7 @@ namespace tomoto
1066
1073
  void prepare(bool initDocs = true, size_t minWordCnt = 0, size_t minWordDf = 0, size_t removeTopN = 0, bool updateStopwords = true) override
1067
1074
  {
1068
1075
  if (initDocs && updateStopwords) this->removeStopwords(minWordCnt, minWordDf, removeTopN);
1076
+ static_cast<DerivedClass*>(this)->updateWordFormCnts();
1069
1077
  static_cast<DerivedClass*>(this)->updateWeakArray();
1070
1078
  static_cast<DerivedClass*>(this)->initGlobalState(initDocs);
1071
1079
  static_cast<DerivedClass*>(this)->prepareWordPriors();
@@ -11,8 +11,8 @@ namespace tomoto
11
11
  using WeightType = typename DocumentLDA<_tw>::WeightType;
12
12
  Eigen::Matrix<int8_t, -1, 1> labelMask;
13
13
 
14
- DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 0, labelMask);
15
- DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 1, 0x00010001, labelMask);
14
+ DECLARE_SERIALIZER_WITH_VERSION(0);
15
+ DECLARE_SERIALIZER_WITH_VERSION(1);
16
16
  };
17
17
 
18
18
  class ILLDAModel : public ILDAModel
@@ -2,6 +2,11 @@
2
2
 
3
3
  namespace tomoto
4
4
  {
5
+ DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentLLDA, BaseDocument, 0, labelMask);
6
+ DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentLLDA, BaseDocument, 1, 0x00010001, labelMask);
7
+
8
+ TMT_INSTANTIATE_DOC(DocumentLLDA);
9
+
5
10
  ILLDAModel* ILLDAModel::create(TermWeight _weight, const LDAArgs& args, bool scalarRng)
6
11
  {
7
12
  TMT_SWITCH_TW(_weight, scalarRng, LLDAModel, args);
@@ -107,6 +107,7 @@ namespace tomoto
107
107
  public:
108
108
  DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 0, topicLabelDict);
109
109
  DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, topicLabelDict);
110
+ DEFINE_HASHER_AFTER_BASE(BaseClass, topicLabelDict);
110
111
 
111
112
  LLDAModel(const LDAArgs& args)
112
113
  : BaseClass(args)
@@ -22,8 +22,8 @@ namespace tomoto
22
22
  Eigen::Matrix<WeightType, -1, 1> numByWin; // number of words in the window (len = S + T - 1)
23
23
  Eigen::Matrix<WeightType, -1, -1> numByWinTopicL; // number of words in the loc. topic in the window (len = KL * (S + T - 1))
24
24
 
25
- DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 0, sents, Vs, numGl, numBySentWin, numByWinL, numByWin, numByWinTopicL);
26
- DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 1, 0x00010001, sents, Vs, numGl, numBySentWin, numByWinL, numByWin, numByWinTopicL);
25
+ DECLARE_SERIALIZER_WITH_VERSION(0);
26
+ DECLARE_SERIALIZER_WITH_VERSION(1);
27
27
 
28
28
  template<typename _TopicModel> void update(WeightType* ptr, const _TopicModel& mdl);
29
29
  };
@@ -2,6 +2,11 @@
2
2
 
3
3
  namespace tomoto
4
4
  {
5
+ DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentMGLDA, BaseDocument, 0, sents, Vs, numGl, numBySentWin, numByWinL, numByWin, numByWinTopicL);
6
+ DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentMGLDA, BaseDocument, 1, 0x00010001, sents, Vs, numGl, numBySentWin, numByWinL, numByWin, numByWinTopicL);
7
+
8
+ TMT_INSTANTIATE_DOC(DocumentMGLDA);
9
+
5
10
  IMGLDAModel* IMGLDAModel::create(TermWeight _weight, const MGLDAArgs& args, bool scalarRng)
6
11
  {
7
12
  TMT_SWITCH_TW(_weight, scalarRng, MGLDAModel, args);
@@ -370,6 +370,7 @@ namespace tomoto
370
370
  public:
371
371
  DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 0, alphaL, alphaM, alphaML, etaL, gamma, KL, T);
372
372
  DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, alphaL, alphaM, alphaML, etaL, gamma, KL, T);
373
+ DEFINE_HASHER_AFTER_BASE(BaseClass, alphaL, alphaM, alphaML, etaL, gamma, KL, T);
373
374
 
374
375
  MGLDAModel(const MGLDAArgs& args)
375
376
  : BaseClass(args), KL(args.kL), T(args.t),
@@ -15,8 +15,8 @@ namespace tomoto
15
15
 
16
16
  template<typename _TopicModel> void update(WeightType* ptr, const _TopicModel& mdl);
17
17
 
18
- DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 0, Z2s);
19
- DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 1, 0x00010001, Z2s);
18
+ DECLARE_SERIALIZER_WITH_VERSION(0);
19
+ DECLARE_SERIALIZER_WITH_VERSION(1);
20
20
  };
21
21
 
22
22
  struct PAArgs : public LDAArgs
@@ -2,6 +2,11 @@
2
2
 
3
3
  namespace tomoto
4
4
  {
5
+ DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentPA, BaseDocument, 0, Z2s);
6
+ DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentPA, BaseDocument, 1, 0x00010001, Z2s);
7
+
8
+ TMT_INSTANTIATE_DOC(DocumentPA);
9
+
5
10
  IPAModel* IPAModel::create(TermWeight _weight, const PAArgs& args, bool scalarRng)
6
11
  {
7
12
  TMT_SWITCH_TW(_weight, scalarRng, PAModel, args);
@@ -19,6 +19,7 @@ namespace tomoto
19
19
  Vector subTmp;
20
20
 
21
21
  DEFINE_SERIALIZER_AFTER_BASE(ModelStateLDA<_tw>, numByTopic1_2, numByTopic2);
22
+ DEFINE_HASHER_AFTER_BASE(ModelStateLDA<_tw>, numByTopic1_2, numByTopic2);
22
23
  };
23
24
 
24
25
  template<TermWeight _tw, typename _RandGen,
@@ -364,6 +365,7 @@ namespace tomoto
364
365
  public:
365
366
  DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 0, K2, subAlphas, subAlphaSum);
366
367
  DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, K2, subAlphas, subAlphaSum);
368
+ DEFINE_HASHER_AFTER_BASE(BaseClass, K2, subAlphas, subAlphaSum);
367
369
 
368
370
  PAModel(const PAArgs& args)
369
371
  : BaseClass(args), K2(args.k2)
@@ -111,6 +111,7 @@ namespace tomoto
111
111
  public:
112
112
  DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 0, topicLabelDict, numLatentTopics, numTopicsPerLabel);
113
113
  DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, topicLabelDict, numLatentTopics, numTopicsPerLabel);
114
+ DEFINE_HASHER_AFTER_BASE(BaseClass, topicLabelDict, numLatentTopics, numTopicsPerLabel);
114
115
 
115
116
  PLDAModel(const PLDAArgs& args)
116
117
  : BaseClass(args.setK(1)),
@@ -11,9 +11,9 @@ namespace tomoto
11
11
  using WeightType = typename DocumentLDA<_tw>::WeightType;
12
12
 
13
13
  uint64_t pseudoDoc = 0;
14
-
15
- DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 0, pseudoDoc);
16
- DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 1, 0x00010001, pseudoDoc);
14
+
15
+ DECLARE_SERIALIZER_WITH_VERSION(0);
16
+ DECLARE_SERIALIZER_WITH_VERSION(1);
17
17
  };
18
18
 
19
19
  struct PTArgs : public LDAArgs
@@ -2,6 +2,11 @@
2
2
 
3
3
  namespace tomoto
4
4
  {
5
+ DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentPT, BaseDocument, 0, pseudoDoc);
6
+ DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentPT, BaseDocument, 1, 0x00010001, pseudoDoc);
7
+
8
+ TMT_INSTANTIATE_DOC(DocumentPT);
9
+
5
10
  IPTModel* IPTModel::create(TermWeight _weight, const PTArgs& args, bool scalarRng)
6
11
  {
7
12
  TMT_SWITCH_TW(_weight, scalarRng, PTModel, args);
@@ -266,6 +266,7 @@ namespace tomoto
266
266
  public:
267
267
  DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 0, numPDocs, lambda);
268
268
  DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, numPDocs, lambda);
269
+ DEFINE_HASHER_AFTER_BASE(BaseClass, numPDocs, lambda);
269
270
 
270
271
  GETTER(P, size_t, numPDocs);
271
272