tomoto 0.4.0 → 0.4.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (59) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +5 -0
  3. data/README.md +1 -1
  4. data/ext/tomoto/extconf.rb +4 -2
  5. data/lib/tomoto/version.rb +1 -1
  6. data/vendor/tomotopy/README.kr.rst +10 -1
  7. data/vendor/tomotopy/README.rst +10 -1
  8. data/vendor/tomotopy/src/TopicModel/CT.h +2 -2
  9. data/vendor/tomotopy/src/TopicModel/CTModel.cpp +5 -0
  10. data/vendor/tomotopy/src/TopicModel/CTModel.hpp +1 -0
  11. data/vendor/tomotopy/src/TopicModel/DMR.h +2 -2
  12. data/vendor/tomotopy/src/TopicModel/DMRModel.cpp +5 -0
  13. data/vendor/tomotopy/src/TopicModel/DMRModel.hpp +1 -0
  14. data/vendor/tomotopy/src/TopicModel/DT.h +2 -2
  15. data/vendor/tomotopy/src/TopicModel/DTModel.cpp +5 -0
  16. data/vendor/tomotopy/src/TopicModel/DTModel.hpp +3 -0
  17. data/vendor/tomotopy/src/TopicModel/GDMR.h +2 -2
  18. data/vendor/tomotopy/src/TopicModel/GDMRModel.cpp +5 -0
  19. data/vendor/tomotopy/src/TopicModel/GDMRModel.hpp +1 -0
  20. data/vendor/tomotopy/src/TopicModel/HDP.h +2 -2
  21. data/vendor/tomotopy/src/TopicModel/HDPModel.cpp +5 -0
  22. data/vendor/tomotopy/src/TopicModel/HDPModel.hpp +2 -0
  23. data/vendor/tomotopy/src/TopicModel/HLDA.h +2 -2
  24. data/vendor/tomotopy/src/TopicModel/HLDAModel.cpp +5 -0
  25. data/vendor/tomotopy/src/TopicModel/HLDAModel.hpp +9 -0
  26. data/vendor/tomotopy/src/TopicModel/HPA.h +2 -2
  27. data/vendor/tomotopy/src/TopicModel/HPAModel.cpp +5 -0
  28. data/vendor/tomotopy/src/TopicModel/HPAModel.hpp +2 -0
  29. data/vendor/tomotopy/src/TopicModel/LDA.h +8 -2
  30. data/vendor/tomotopy/src/TopicModel/LDAModel.cpp +5 -0
  31. data/vendor/tomotopy/src/TopicModel/LDAModel.hpp +8 -0
  32. data/vendor/tomotopy/src/TopicModel/LLDA.h +2 -2
  33. data/vendor/tomotopy/src/TopicModel/LLDAModel.cpp +5 -0
  34. data/vendor/tomotopy/src/TopicModel/LLDAModel.hpp +1 -0
  35. data/vendor/tomotopy/src/TopicModel/MGLDA.h +2 -2
  36. data/vendor/tomotopy/src/TopicModel/MGLDAModel.cpp +5 -0
  37. data/vendor/tomotopy/src/TopicModel/MGLDAModel.hpp +1 -0
  38. data/vendor/tomotopy/src/TopicModel/PA.h +2 -2
  39. data/vendor/tomotopy/src/TopicModel/PAModel.cpp +5 -0
  40. data/vendor/tomotopy/src/TopicModel/PAModel.hpp +2 -0
  41. data/vendor/tomotopy/src/TopicModel/PLDAModel.hpp +1 -0
  42. data/vendor/tomotopy/src/TopicModel/PT.h +3 -3
  43. data/vendor/tomotopy/src/TopicModel/PTModel.cpp +5 -0
  44. data/vendor/tomotopy/src/TopicModel/PTModel.hpp +1 -0
  45. data/vendor/tomotopy/src/TopicModel/SLDA.h +3 -2
  46. data/vendor/tomotopy/src/TopicModel/SLDAModel.cpp +5 -0
  47. data/vendor/tomotopy/src/TopicModel/SLDAModel.hpp +1 -0
  48. data/vendor/tomotopy/src/TopicModel/TopicModel.hpp +77 -3
  49. data/vendor/tomotopy/src/Utils/Dictionary.cpp +102 -0
  50. data/vendor/tomotopy/src/Utils/Dictionary.h +26 -75
  51. data/vendor/tomotopy/src/Utils/Mmap.cpp +146 -0
  52. data/vendor/tomotopy/src/Utils/Mmap.h +139 -0
  53. data/vendor/tomotopy/src/Utils/MultiNormalDistribution.hpp +1 -0
  54. data/vendor/tomotopy/src/Utils/SharedString.cpp +134 -0
  55. data/vendor/tomotopy/src/Utils/SharedString.h +104 -0
  56. data/vendor/tomotopy/src/Utils/serializer.cpp +166 -0
  57. data/vendor/tomotopy/src/Utils/serializer.hpp +261 -85
  58. metadata +9 -4
  59. data/vendor/tomotopy/src/Utils/SharedString.hpp +0 -206
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d7d16410002670991fd881e13f64195db9de29dfa5c383da2287d44c9053b500
4
- data.tar.gz: 290254c48ed1c3ce1ff51e2bbe07a46ed02d05dab6bcc095f38cdbf499883561
3
+ metadata.gz: 86215ec57ae6cf6e36531ee2896e2b81d591f61909eb5454ef70b69c5db0a39d
4
+ data.tar.gz: 3f31adcb38a1793caaaedc516f99c9ffce4b82ff0c93f2a169b85377e116433b
5
5
  SHA512:
6
- metadata.gz: 817a074c0f9969ded7592d70a2b3096ca91142470552e019bc95668b45b658d24010160bce53b356810b09f25288fd9fb9c070841b3587ea23e6099f528f94b0
7
- data.tar.gz: 94764d26429358b30766a36ef899b6856b61c848ed575cf62911dae9a352344c9736cd86be859601642cde479ba810b607dd41730db920d0fa9170b71dc9fdf2
6
+ metadata.gz: db0a4bd9831cecae6711e150ecc2c5d23b87ada83d418f784474cf2f260627e52e40d871d9af974c6a79790e8d0d060ac08fbb69775e1d2de316085421ef76af
7
+ data.tar.gz: '07779f29aa9bdb4b71d9a0acdfe26c84d041312ad769c54539d6377b0ab01327dba045d97b51179a50498b438d137b567a3708bd3a2ab902c16adec37ad3a779'
data/CHANGELOG.md CHANGED
@@ -1,5 +1,10 @@
1
+ ## 0.4.1 (2024-09-04)
2
+
3
+ - Updated tomoto to 0.13.0
4
+
1
5
  ## 0.4.0 (2023-12-28)
2
6
 
7
+ - Added support for Ruby 3.3
3
8
  - Added precompiled gem for Linux ARM
4
9
  - Updated tomoto to 0.12.7
5
10
  - Dropped support for Ruby < 3
data/README.md CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  :tomato: [tomoto](https://github.com/bab2min/tomotopy) - high performance topic modeling - for Ruby
4
4
 
5
- [![Build Status](https://github.com/ankane/tomoto-ruby/workflows/build/badge.svg?branch=master)](https://github.com/ankane/tomoto-ruby/actions)
5
+ [![Build Status](https://github.com/ankane/tomoto-ruby/actions/workflows/build.yml/badge.svg)](https://github.com/ankane/tomoto-ruby/actions)
6
6
 
7
7
  ## Installation
8
8
 
@@ -27,16 +27,18 @@ else
27
27
  end
28
28
 
29
29
  # silence tomoto warnings
30
- $CXXFLAGS += " -Wno-unused-variable -Wno-switch"
30
+ $CXXFLAGS += " -Wno-unused-variable -Wno-switch -Wno-unqualified-std-cast-call"
31
31
 
32
32
  ext = File.expand_path(".", __dir__)
33
33
  tomoto = File.expand_path("../../vendor/tomotopy/src/TopicModel", __dir__)
34
+ tomoto_utils = File.expand_path("../../vendor/tomotopy/src/Utils", __dir__)
34
35
  eigen = File.expand_path("../../vendor/eigen", __dir__)
35
36
  eigen_rand = File.expand_path("../../vendor/EigenRand", __dir__)
36
37
  variant = File.expand_path("../../vendor/variant/include", __dir__)
37
38
 
38
- $srcs = Dir["{#{ext},#{tomoto}}/*.cpp"]
39
+ $srcs = Dir["{#{ext},#{tomoto},#{tomoto_utils}}/*.cpp"]
39
40
  $INCFLAGS += " -I#{tomoto} -I#{eigen} -I#{eigen_rand} -I#{variant}"
40
41
  $VPATH << tomoto
42
+ $VPATH << tomoto_utils
41
43
 
42
44
  create_makefile("tomoto/tomoto")
@@ -1,3 +1,3 @@
1
1
  module Tomoto
2
- VERSION = "0.4.0"
2
+ VERSION = "0.4.1"
3
3
  end
@@ -7,7 +7,7 @@ tomotopy
7
7
  .. image:: https://zenodo.org/badge/186155463.svg
8
8
  :target: https://zenodo.org/badge/latestdoi/186155463
9
9
 
10
- 🎌
10
+ 🌐
11
11
  `English`_,
12
12
  **한국어**.
13
13
 
@@ -305,6 +305,15 @@ tomotopy의 Python3 예제 코드는 https://github.com/bab2min/tomotopy/blob/ma
305
305
 
306
306
  역사
307
307
  -------
308
+ * 0.13.0 (2024-08-05)
309
+ * 신규 기능
310
+ * 토픽 모델 뷰어인 `tomotopy.viewer.open_viewer()`의 주요 기능이 완성되었습니다.
311
+ * `tomotopy.LDAModel.get_hash()`가 추가되었습니다. 모델의 128bit 해시를 구해줍니다.
312
+ * `ngram_list` 인자가 `tomotopy.utils.SimpleTokenizer`에 추가되었습니다.
313
+ * 버그 수정
314
+ * `Corpus.concat_ngrams` 호출 후에 `spans`이 비일관적인 버그가 수정되었습니다.
315
+ * `tomotopy.LDAModel.load()`와 `tomotopy.LDAModel.save()`의 병목을 최적화하여 속도를 10배 이상 개선했습니다.
316
+
308
317
  * 0.12.7 (2023-12-19)
309
318
  * 신규 기능
310
319
  * 토픽 모델 뷰어인 `tomotopy.viewer.open_viewer()`가 추가되었습니다.
@@ -7,7 +7,7 @@ tomotopy
7
7
  .. image:: https://zenodo.org/badge/186155463.svg
8
8
  :target: https://zenodo.org/badge/latestdoi/186155463
9
9
 
10
- 🎌
10
+ 🌐
11
11
  **English**,
12
12
  `한국어`_.
13
13
 
@@ -309,6 +309,15 @@ meaning you can use it for any reasonable purpose and remain in complete ownersh
309
309
 
310
310
  History
311
311
  -------
312
+ * 0.13.0 (2024-08-05)
313
+ * New features
314
+ * The major features of the Topic Model Viewer `tomotopy.viewer.open_viewer()` are now complete.
315
+ * Added `tomotopy.LDAModel.get_hash()`, which returns a 128-bit hash value of the model.
316
+ * Added an `ngram_list` argument to `tomotopy.utils.SimpleTokenizer`.
317
+ * Bug fixes
318
+ * Fixed a bug where `spans` became inconsistent after `Corpus.concat_ngrams` was called.
319
+ * Optimized a bottleneck in `tomotopy.LDAModel.load()` and `tomotopy.LDAModel.save()`, making them more than 10 times faster.
320
+
312
321
  * 0.12.7 (2023-12-19)
313
322
  * New features
314
323
  * Added Topic Model Viewer `tomotopy.viewer.open_viewer()`
@@ -11,8 +11,8 @@ namespace tomoto
11
11
  Matrix beta; // Dim: (K, betaSample)
12
12
  Vector smBeta; // Dim: K
13
13
 
14
- DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 0, smBeta);
15
- DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 1, 0x00010001, smBeta);
14
+ DECLARE_SERIALIZER_WITH_VERSION(0);
15
+ DECLARE_SERIALIZER_WITH_VERSION(1);
16
16
  };
17
17
 
18
18
  struct CTArgs : public LDAArgs
@@ -2,6 +2,11 @@
2
2
 
3
3
  namespace tomoto
4
4
  {
5
+ DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentCTM, BaseDocument, 0, smBeta);
6
+ DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentCTM, BaseDocument, 1, 0x00010001, smBeta);
7
+
8
+ TMT_INSTANTIATE_DOC(DocumentCTM);
9
+
5
10
  ICTModel* ICTModel::create(TermWeight _weight, const CTArgs& args, bool scalarRng)
6
11
  {
7
12
  TMT_SWITCH_TW(_weight, scalarRng, CTModel, args);
@@ -243,6 +243,7 @@ namespace tomoto
243
243
  public:
244
244
  DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 0, numBetaSample, numTMNSample, topicPrior);
245
245
  DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, numBetaSample, numTMNSample, topicPrior);
246
+ DEFINE_HASHER_AFTER_BASE(BaseClass, numBetaSample, numTMNSample, topicPrior);
246
247
 
247
248
  CTModel(const CTArgs& args)
248
249
  : BaseClass(args)
@@ -18,8 +18,8 @@ namespace tomoto
18
18
 
19
19
  RawDoc::MiscType makeMisc(const ITopicModel* tm) const override;
20
20
 
21
- DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 0, metadata);
22
- DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 1, 0x00010001, metadata, multiMetadata);
21
+ DECLARE_SERIALIZER_WITH_VERSION(0);
22
+ DECLARE_SERIALIZER_WITH_VERSION(1);
23
23
  };
24
24
 
25
25
  struct DMRArgs : public LDAArgs
@@ -2,6 +2,11 @@
2
2
 
3
3
  namespace tomoto
4
4
  {
5
+ DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentDMR, BaseDocument, 0, metadata);
6
+ DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentDMR, BaseDocument, 1, 0x00010001, metadata, multiMetadata);
7
+
8
+ TMT_INSTANTIATE_DOC(DocumentDMR);
9
+
5
10
  IDMRModel* IDMRModel::create(TermWeight _weight, const DMRArgs& args, bool scalarRng)
6
11
  {
7
12
  TMT_SWITCH_TW(_weight, scalarRng, DMRModel, args);
@@ -364,6 +364,7 @@ namespace tomoto
364
364
  public:
365
365
  DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 0, sigma, alphaEps, metadataDict, lambda);
366
366
  DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, sigma, alphaEps, metadataDict, lambda, multiMetadataDict);
367
+ DEFINE_HASHER_AFTER_BASE(BaseClass, sigma, alphaEps, metadataDict, lambda, multiMetadataDict);
367
368
 
368
369
  DMRModel(const DMRArgs& args)
369
370
  : BaseClass(args), sigma(args.sigma), alphaEps(args.alphaEps)
@@ -21,8 +21,8 @@ namespace tomoto
21
21
  return ret;
22
22
  }
23
23
 
24
- DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 0, timepoint);
25
- DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 1, 0x00010001, timepoint);
24
+ DECLARE_SERIALIZER_WITH_VERSION(0);
25
+ DECLARE_SERIALIZER_WITH_VERSION(1);
26
26
  };
27
27
 
28
28
  struct DTArgs : public LDAArgs
@@ -2,6 +2,11 @@
2
2
 
3
3
  namespace tomoto
4
4
  {
5
+ DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentDTM, BaseDocument, 0, timepoint);
6
+ DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentDTM, BaseDocument, 1, 0x00010001, timepoint);
7
+
8
+ TMT_INSTANTIATE_DOC(DocumentDTM);
9
+
5
10
  IDTModel* IDTModel::create(TermWeight _weight, const DTArgs& args, bool scalarRng)
6
11
  {
7
12
  TMT_SWITCH_TW(_weight, scalarRng, DTModel, args);
@@ -22,6 +22,7 @@ namespace tomoto
22
22
  Eigen::Matrix<WeightType, -1, -1> numByTopicWord; // Dim: (Topic * Time, Vocabs)
23
23
  //ShareableMatrix<WeightType, -1, -1> numByTopicWord; // Dim: (Topic * Time, Vocabs)
24
24
  DEFINE_SERIALIZER(numByTopic, numByTopicWord);
25
+ DEFINE_HASHER(numByTopic, numByTopicWord);
25
26
  };
26
27
 
27
28
  template<TermWeight _tw, typename _RandGen,
@@ -496,6 +497,8 @@ namespace tomoto
496
497
  T, shapeA, shapeB, shapeC, alphaVar, etaVar, phiVar, alphas, etaByDoc, phi);
497
498
  DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001,
498
499
  T, shapeA, shapeB, shapeC, alphaVar, etaVar, phiVar, alphas, etaByDoc, phi);
500
+ DEFINE_HASHER_AFTER_BASE(BaseClass,
501
+ T, shapeA, shapeB, shapeC, alphaVar, etaVar, phiVar, alphas, etaByDoc, phi);
499
502
 
500
503
  GETTER(T, size_t, T);
501
504
  GETTER(NumDocsByT, std::vector<uint32_t>, numDocsByTime);
@@ -17,8 +17,8 @@ namespace tomoto
17
17
  return ret;
18
18
  }
19
19
 
20
- DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 0, metadataOrg);
21
- DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 1, 0x00010001, metadataOrg, metadataNormalized);
20
+ DECLARE_SERIALIZER_WITH_VERSION(0);
21
+ DECLARE_SERIALIZER_WITH_VERSION(1);
22
22
  };
23
23
 
24
24
  struct GDMRArgs : public DMRArgs
@@ -2,6 +2,11 @@
2
2
 
3
3
  namespace tomoto
4
4
  {
5
+ DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentGDMR, BaseDocument, 0, metadataOrg);
6
+ DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentGDMR, BaseDocument, 1, 0x00010001, metadataOrg, metadataNormalized);
7
+
8
+ TMT_INSTANTIATE_DOC(DocumentGDMR);
9
+
5
10
  IGDMRModel* IGDMRModel::create(TermWeight _weight, const GDMRArgs& args, bool scalarRng)
6
11
  {
7
12
  TMT_SWITCH_TW(_weight, scalarRng, GDMRModel, args);
@@ -412,6 +412,7 @@ namespace tomoto
412
412
  public:
413
413
  DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 0, sigma0, degreeByF, mdCoefs, mdIntercepts);
414
414
  DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, sigma0, orderDecay, degreeByF, mdCoefs, mdIntercepts, mdMax);
415
+ DEFINE_HASHER_AFTER_BASE(BaseClass, sigma0, orderDecay, degreeByF, mdCoefs, mdIntercepts, mdMax);
415
416
 
416
417
  GDMRModel(const GDMRArgs& args)
417
418
  : BaseClass(args), sigma0(args.sigma0), orderDecay(args.orderDecay), degreeByF(args.degrees)
@@ -39,8 +39,8 @@ namespace tomoto
39
39
  };
40
40
  std::vector<TableTopicInfo> numTopicByTable;
41
41
 
42
- DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 0, numTopicByTable);
43
- DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 1, 0x00010001, numTopicByTable);
42
+ DECLARE_SERIALIZER_WITH_VERSION(0);
43
+ DECLARE_SERIALIZER_WITH_VERSION(1);
44
44
 
45
45
  size_t getNumTable() const
46
46
  {
@@ -2,6 +2,11 @@
2
2
 
3
3
  namespace tomoto
4
4
  {
5
+ DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentHDP, BaseDocument, 0, numTopicByTable);
6
+ DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentHDP, BaseDocument, 1, 0x00010001, numTopicByTable);
7
+
8
+ TMT_INSTANTIATE_DOC(DocumentHDP);
9
+
5
10
  IHDPModel* IHDPModel::create(TermWeight _weight, const HDPArgs& args, bool scalarRng)
6
11
  {
7
12
  TMT_SWITCH_TW(_weight, scalarRng, HDPModel, args);
@@ -19,6 +19,7 @@ namespace tomoto
19
19
  size_t totalTable = 0;
20
20
 
21
21
  DEFINE_SERIALIZER_AFTER_BASE(ModelStateLDA<_tw>, numTableByTopic, totalTable);
22
+ DEFINE_HASHER_AFTER_BASE(ModelStateLDA<_tw>, numTableByTopic, totalTable);
22
23
  };
23
24
 
24
25
  template<TermWeight _tw, typename _RandGen,
@@ -457,6 +458,7 @@ namespace tomoto
457
458
  public:
458
459
  DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 0, gamma);
459
460
  DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, gamma);
461
+ DEFINE_HASHER_AFTER_BASE(BaseClass, gamma);
460
462
 
461
463
  HDPModel(const HDPArgs& args)
462
464
  : BaseClass(args), gamma(args.gamma)
@@ -16,8 +16,8 @@ namespace tomoto
16
16
 
17
17
  template<typename _TopicModel> void update(WeightType* ptr, const _TopicModel& mdl);
18
18
 
19
- DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 0, path);
20
- DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 1, 0x00010001, path);
19
+ DECLARE_SERIALIZER_WITH_VERSION(0);
20
+ DECLARE_SERIALIZER_WITH_VERSION(1);
21
21
  };
22
22
 
23
23
  struct HLDAArgs : public LDAArgs
@@ -2,6 +2,11 @@
2
2
 
3
3
  namespace tomoto
4
4
  {
5
+ DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentHLDA, BaseDocument, 0, path);
6
+ DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentHLDA, BaseDocument, 1, 0x00010001, path);
7
+
8
+ TMT_INSTANTIATE_DOC(DocumentHLDA);
9
+
5
10
  IHLDAModel* IHLDAModel::create(TermWeight _weight, const HLDAArgs& args, bool scalarRng)
6
11
  {
7
12
  TMT_SWITCH_TW(_weight, scalarRng, HLDAModel, args);
@@ -18,6 +18,7 @@ namespace tomoto
18
18
  int32_t parent = 0, sibling = 0, child = 0;
19
19
 
20
20
  DEFINE_SERIALIZER(numCustomers, level, parent, sibling, child);
21
+ DEFINE_HASHER(numCustomers, level, parent, sibling, child);
21
22
 
22
23
  NCRPNode* getParent() const
23
24
  {
@@ -118,6 +119,7 @@ namespace tomoto
118
119
  Vector nodeWLikelihoods; //
119
120
 
120
121
  DEFINE_SERIALIZER(nodes, levelBlocks);
122
+ DEFINE_HASHER(nodes, levelBlocks);
121
123
 
122
124
  template<bool _makeNewPath = true>
123
125
  void calcNodeLikelihood(Float gamma, size_t levelDepth)
@@ -317,6 +319,12 @@ namespace tomoto
317
319
  ModelStateLDA<_tw>::serializerWrite(ostr);
318
320
  nt->serializerWrite(ostr);
319
321
  }
322
+
323
+ uint64_t computeHash(uint64_t seed) const
324
+ {
325
+ seed = ModelStateLDA<_tw>::computeHash(seed);
326
+ return nt->computeHash(seed);
327
+ }
320
328
  };
321
329
 
322
330
  template<TermWeight _tw, typename _RandGen,
@@ -596,6 +604,7 @@ namespace tomoto
596
604
  public:
597
605
  DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 0, gamma);
598
606
  DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, gamma);
607
+ DEFINE_HASHER_AFTER_BASE(BaseClass, gamma);
599
608
 
600
609
  HLDAModel(const HLDAArgs& args)
601
610
  : BaseClass(args), gamma(args.gamma)
@@ -12,8 +12,8 @@ namespace tomoto
12
12
 
13
13
  template<typename _TopicModel> void update(WeightType* ptr, const _TopicModel& mdl);
14
14
 
15
- DEFINE_SERIALIZER_BASE_WITH_VERSION(BaseDocument, 0);
16
- DEFINE_SERIALIZER_BASE_WITH_VERSION(BaseDocument, 1);
15
+ DECLARE_SERIALIZER_WITH_VERSION(0);
16
+ DECLARE_SERIALIZER_WITH_VERSION(1);
17
17
  };
18
18
 
19
19
  struct HPAArgs : public PAArgs
@@ -2,6 +2,11 @@
2
2
 
3
3
  namespace tomoto
4
4
  {
5
+ DEFINE_OUT_SERIALIZER_BASE_WITH_VERSION(DocumentHPA, BaseDocument, 0);
6
+ DEFINE_OUT_SERIALIZER_BASE_WITH_VERSION(DocumentHPA, BaseDocument, 1);
7
+
8
+ TMT_INSTANTIATE_DOC(DocumentHPA);
9
+
5
10
  IHPAModel* IHPAModel::create(TermWeight _weight, bool _exclusive, const HPAArgs& args, bool scalarRng)
6
11
  {
7
12
  if (_exclusive)
@@ -21,6 +21,7 @@ namespace tomoto
21
21
  Eigen::Matrix<WeightType, -1, -1> numByTopic1_2;
22
22
 
23
23
  DEFINE_SERIALIZER_AFTER_BASE(ModelStateLDA<_tw>, numByTopicWord, numByTopic, numByTopic1_2);
24
+ DEFINE_HASHER_AFTER_BASE(ModelStateLDA<_tw>, numByTopicWord, numByTopic, numByTopic1_2);
24
25
  };
25
26
 
26
27
  template<TermWeight _tw, typename _RandGen,
@@ -439,6 +440,7 @@ namespace tomoto
439
440
  public:
440
441
  DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 0, K2, subAlphas, subAlphaSum);
441
442
  DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, K2, subAlphas, subAlphaSum);
443
+ DEFINE_HASHER_AFTER_BASE(BaseClass, K2, subAlphas, subAlphaSum);
442
444
 
443
445
  HPAModel(const HPAArgs& args)
444
446
  : BaseClass(args, false), K2(args.k2)
@@ -93,6 +93,12 @@ namespace tomoto
93
93
  if (!ostr.write((const char*)this->data(), sizeof(_Scalar) * this->size()))
94
94
  throw std::ios_base::failure(std::string("writing type '") + typeid(_Scalar).name() + std::string("' is failed"));
95
95
  }
96
+
97
+ uint64_t computeHash(uint64_t seed) const
98
+ {
99
+ seed = serializer::computeHashMany(seed, (uint32_t)this->rows(), (uint32_t)this->cols());
100
+ return serializer::computeFastHash(this->data(), sizeof(_Scalar) * this->size(), seed);
101
+ }
96
102
  };
97
103
 
98
104
  template<typename _Base, TermWeight _tw>
@@ -139,8 +145,8 @@ namespace tomoto
139
145
  tvector<Float> wordWeights;
140
146
  ShareableMatrix<WeightType, -1, 1> numByTopic;
141
147
 
142
- DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentBase, 0, Zs, wordWeights);
143
- DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentBase, 1, 0x00010001, Zs, wordWeights);
148
+ DECLARE_SERIALIZER_WITH_VERSION(0);
149
+ DECLARE_SERIALIZER_WITH_VERSION(1);
144
150
 
145
151
  template<typename _TopicModel> void update(WeightType* ptr, const _TopicModel& mdl);
146
152
 
@@ -2,6 +2,11 @@
2
2
 
3
3
  namespace tomoto
4
4
  {
5
+ DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentLDA, DocumentBase, 0, Zs, wordWeights);
6
+ DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentLDA, DocumentBase, 1, 0x00010001, Zs, wordWeights);
7
+
8
+ TMT_INSTANTIATE_DOC(DocumentLDA);
9
+
5
10
  ILDAModel* ILDAModel::create(TermWeight _weight, const LDAArgs& args, bool scalarRng)
6
11
  {
7
12
  TMT_SWITCH_TW(_weight, scalarRng, LDAModel, args);
@@ -47,6 +47,10 @@ Term Weighting Scheme is based on following paper:
47
47
  return nullptr; } while(0)
48
48
  #endif
49
49
 
50
+ #define TMT_INSTANTIATE_DOC(CLS) template struct CLS<TermWeight::one>; \
51
+ template struct CLS<TermWeight::idf>; \
52
+ template struct CLS<TermWeight::pmi>;
53
+
50
54
  #define GETTER(name, type, field) type get##name() const override { return field; }
51
55
 
52
56
  namespace tomoto
@@ -61,6 +65,7 @@ namespace tomoto
61
65
  //Eigen::Matrix<WeightType, -1, -1> numByTopicWord; // Dim: (Topic, Vocabs)
62
66
  ShareableMatrix<WeightType, -1, -1> numByTopicWord; // Dim: (Topic, Vocabs)
63
67
  DEFINE_SERIALIZER(numByTopic, numByTopicWord);
68
+ DEFINE_HASHER(numByTopic, numByTopicWord);
64
69
  };
65
70
 
66
71
  namespace flags
@@ -954,6 +959,8 @@ namespace tomoto
954
959
  DEFINE_TAGGED_SERIALIZER_WITH_VERSION(1, 0x00010001, vocabWeights, alpha, alphas, eta, K, etaByWord,
955
960
  burnIn, optimInterval);
956
961
 
962
+ DEFINE_HASHER(vocabWeights, alpha, alphas, eta, K, /*etaByWord,*/ burnIn, optimInterval);
963
+
957
964
  LDAModel(const LDAArgs& args, bool checkAlpha = true)
958
965
  : BaseClass(args.seed), K(args.k), alpha(args.alpha[0]), eta(args.eta)
959
966
  {
@@ -1066,6 +1073,7 @@ namespace tomoto
1066
1073
  void prepare(bool initDocs = true, size_t minWordCnt = 0, size_t minWordDf = 0, size_t removeTopN = 0, bool updateStopwords = true) override
1067
1074
  {
1068
1075
  if (initDocs && updateStopwords) this->removeStopwords(minWordCnt, minWordDf, removeTopN);
1076
+ static_cast<DerivedClass*>(this)->updateWordFormCnts();
1069
1077
  static_cast<DerivedClass*>(this)->updateWeakArray();
1070
1078
  static_cast<DerivedClass*>(this)->initGlobalState(initDocs);
1071
1079
  static_cast<DerivedClass*>(this)->prepareWordPriors();
@@ -11,8 +11,8 @@ namespace tomoto
11
11
  using WeightType = typename DocumentLDA<_tw>::WeightType;
12
12
  Eigen::Matrix<int8_t, -1, 1> labelMask;
13
13
 
14
- DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 0, labelMask);
15
- DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 1, 0x00010001, labelMask);
14
+ DECLARE_SERIALIZER_WITH_VERSION(0);
15
+ DECLARE_SERIALIZER_WITH_VERSION(1);
16
16
  };
17
17
 
18
18
  class ILLDAModel : public ILDAModel
@@ -2,6 +2,11 @@
2
2
 
3
3
  namespace tomoto
4
4
  {
5
+ DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentLLDA, BaseDocument, 0, labelMask);
6
+ DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentLLDA, BaseDocument, 1, 0x00010001, labelMask);
7
+
8
+ TMT_INSTANTIATE_DOC(DocumentLLDA);
9
+
5
10
  ILLDAModel* ILLDAModel::create(TermWeight _weight, const LDAArgs& args, bool scalarRng)
6
11
  {
7
12
  TMT_SWITCH_TW(_weight, scalarRng, LLDAModel, args);
@@ -107,6 +107,7 @@ namespace tomoto
107
107
  public:
108
108
  DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 0, topicLabelDict);
109
109
  DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, topicLabelDict);
110
+ DEFINE_HASHER_AFTER_BASE(BaseClass, topicLabelDict);
110
111
 
111
112
  LLDAModel(const LDAArgs& args)
112
113
  : BaseClass(args)
@@ -22,8 +22,8 @@ namespace tomoto
22
22
  Eigen::Matrix<WeightType, -1, 1> numByWin; // number of words in the window (len = S + T - 1)
23
23
  Eigen::Matrix<WeightType, -1, -1> numByWinTopicL; // number of words in the loc. topic in the window (len = KL * (S + T - 1))
24
24
 
25
- DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 0, sents, Vs, numGl, numBySentWin, numByWinL, numByWin, numByWinTopicL);
26
- DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 1, 0x00010001, sents, Vs, numGl, numBySentWin, numByWinL, numByWin, numByWinTopicL);
25
+ DECLARE_SERIALIZER_WITH_VERSION(0);
26
+ DECLARE_SERIALIZER_WITH_VERSION(1);
27
27
 
28
28
  template<typename _TopicModel> void update(WeightType* ptr, const _TopicModel& mdl);
29
29
  };
@@ -2,6 +2,11 @@
2
2
 
3
3
  namespace tomoto
4
4
  {
5
+ DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentMGLDA, BaseDocument, 0, sents, Vs, numGl, numBySentWin, numByWinL, numByWin, numByWinTopicL);
6
+ DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentMGLDA, BaseDocument, 1, 0x00010001, sents, Vs, numGl, numBySentWin, numByWinL, numByWin, numByWinTopicL);
7
+
8
+ TMT_INSTANTIATE_DOC(DocumentMGLDA);
9
+
5
10
  IMGLDAModel* IMGLDAModel::create(TermWeight _weight, const MGLDAArgs& args, bool scalarRng)
6
11
  {
7
12
  TMT_SWITCH_TW(_weight, scalarRng, MGLDAModel, args);
@@ -370,6 +370,7 @@ namespace tomoto
370
370
  public:
371
371
  DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 0, alphaL, alphaM, alphaML, etaL, gamma, KL, T);
372
372
  DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, alphaL, alphaM, alphaML, etaL, gamma, KL, T);
373
+ DEFINE_HASHER_AFTER_BASE(BaseClass, alphaL, alphaM, alphaML, etaL, gamma, KL, T);
373
374
 
374
375
  MGLDAModel(const MGLDAArgs& args)
375
376
  : BaseClass(args), KL(args.kL), T(args.t),
@@ -15,8 +15,8 @@ namespace tomoto
15
15
 
16
16
  template<typename _TopicModel> void update(WeightType* ptr, const _TopicModel& mdl);
17
17
 
18
- DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 0, Z2s);
19
- DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 1, 0x00010001, Z2s);
18
+ DECLARE_SERIALIZER_WITH_VERSION(0);
19
+ DECLARE_SERIALIZER_WITH_VERSION(1);
20
20
  };
21
21
 
22
22
  struct PAArgs : public LDAArgs
@@ -2,6 +2,11 @@
2
2
 
3
3
  namespace tomoto
4
4
  {
5
+ DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentPA, BaseDocument, 0, Z2s);
6
+ DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentPA, BaseDocument, 1, 0x00010001, Z2s);
7
+
8
+ TMT_INSTANTIATE_DOC(DocumentPA);
9
+
5
10
  IPAModel* IPAModel::create(TermWeight _weight, const PAArgs& args, bool scalarRng)
6
11
  {
7
12
  TMT_SWITCH_TW(_weight, scalarRng, PAModel, args);
@@ -19,6 +19,7 @@ namespace tomoto
19
19
  Vector subTmp;
20
20
 
21
21
  DEFINE_SERIALIZER_AFTER_BASE(ModelStateLDA<_tw>, numByTopic1_2, numByTopic2);
22
+ DEFINE_HASHER_AFTER_BASE(ModelStateLDA<_tw>, numByTopic1_2, numByTopic2);
22
23
  };
23
24
 
24
25
  template<TermWeight _tw, typename _RandGen,
@@ -364,6 +365,7 @@ namespace tomoto
364
365
  public:
365
366
  DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 0, K2, subAlphas, subAlphaSum);
366
367
  DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, K2, subAlphas, subAlphaSum);
368
+ DEFINE_HASHER_AFTER_BASE(BaseClass, K2, subAlphas, subAlphaSum);
367
369
 
368
370
  PAModel(const PAArgs& args)
369
371
  : BaseClass(args), K2(args.k2)
@@ -111,6 +111,7 @@ namespace tomoto
111
111
  public:
112
112
  DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 0, topicLabelDict, numLatentTopics, numTopicsPerLabel);
113
113
  DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, topicLabelDict, numLatentTopics, numTopicsPerLabel);
114
+ DEFINE_HASHER_AFTER_BASE(BaseClass, topicLabelDict, numLatentTopics, numTopicsPerLabel);
114
115
 
115
116
  PLDAModel(const PLDAArgs& args)
116
117
  : BaseClass(args.setK(1)),
@@ -11,9 +11,9 @@ namespace tomoto
11
11
  using WeightType = typename DocumentLDA<_tw>::WeightType;
12
12
 
13
13
  uint64_t pseudoDoc = 0;
14
-
15
- DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 0, pseudoDoc);
16
- DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 1, 0x00010001, pseudoDoc);
14
+
15
+ DECLARE_SERIALIZER_WITH_VERSION(0);
16
+ DECLARE_SERIALIZER_WITH_VERSION(1);
17
17
  };
18
18
 
19
19
  struct PTArgs : public LDAArgs
@@ -2,6 +2,11 @@
2
2
 
3
3
  namespace tomoto
4
4
  {
5
+ DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentPT, BaseDocument, 0, pseudoDoc);
6
+ DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentPT, BaseDocument, 1, 0x00010001, pseudoDoc);
7
+
8
+ TMT_INSTANTIATE_DOC(DocumentPT);
9
+
5
10
  IPTModel* IPTModel::create(TermWeight _weight, const PTArgs& args, bool scalarRng)
6
11
  {
7
12
  TMT_SWITCH_TW(_weight, scalarRng, PTModel, args);
@@ -266,6 +266,7 @@ namespace tomoto
266
266
  public:
267
267
  DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 0, numPDocs, lambda);
268
268
  DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, numPDocs, lambda);
269
+ DEFINE_HASHER_AFTER_BASE(BaseClass, numPDocs, lambda);
269
270
 
270
271
  GETTER(P, size_t, numPDocs);
271
272