tomoto 0.3.3 → 0.4.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (61) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +11 -0
  3. data/README.md +1 -1
  4. data/ext/tomoto/extconf.rb +4 -2
  5. data/lib/tomoto/version.rb +1 -1
  6. data/lib/tomoto.rb +14 -14
  7. data/vendor/tomotopy/README.kr.rst +27 -1
  8. data/vendor/tomotopy/README.rst +27 -1
  9. data/vendor/tomotopy/src/TopicModel/CT.h +2 -2
  10. data/vendor/tomotopy/src/TopicModel/CTModel.cpp +5 -0
  11. data/vendor/tomotopy/src/TopicModel/CTModel.hpp +1 -0
  12. data/vendor/tomotopy/src/TopicModel/DMR.h +2 -2
  13. data/vendor/tomotopy/src/TopicModel/DMRModel.cpp +5 -0
  14. data/vendor/tomotopy/src/TopicModel/DMRModel.hpp +1 -0
  15. data/vendor/tomotopy/src/TopicModel/DT.h +2 -2
  16. data/vendor/tomotopy/src/TopicModel/DTModel.cpp +5 -0
  17. data/vendor/tomotopy/src/TopicModel/DTModel.hpp +4 -0
  18. data/vendor/tomotopy/src/TopicModel/GDMR.h +2 -2
  19. data/vendor/tomotopy/src/TopicModel/GDMRModel.cpp +5 -0
  20. data/vendor/tomotopy/src/TopicModel/GDMRModel.hpp +1 -0
  21. data/vendor/tomotopy/src/TopicModel/HDP.h +2 -2
  22. data/vendor/tomotopy/src/TopicModel/HDPModel.cpp +5 -0
  23. data/vendor/tomotopy/src/TopicModel/HDPModel.hpp +2 -0
  24. data/vendor/tomotopy/src/TopicModel/HLDA.h +2 -2
  25. data/vendor/tomotopy/src/TopicModel/HLDAModel.cpp +5 -0
  26. data/vendor/tomotopy/src/TopicModel/HLDAModel.hpp +9 -0
  27. data/vendor/tomotopy/src/TopicModel/HPA.h +2 -2
  28. data/vendor/tomotopy/src/TopicModel/HPAModel.cpp +5 -0
  29. data/vendor/tomotopy/src/TopicModel/HPAModel.hpp +2 -0
  30. data/vendor/tomotopy/src/TopicModel/LDA.h +8 -2
  31. data/vendor/tomotopy/src/TopicModel/LDAModel.cpp +5 -0
  32. data/vendor/tomotopy/src/TopicModel/LDAModel.hpp +8 -0
  33. data/vendor/tomotopy/src/TopicModel/LLDA.h +2 -2
  34. data/vendor/tomotopy/src/TopicModel/LLDAModel.cpp +5 -0
  35. data/vendor/tomotopy/src/TopicModel/LLDAModel.hpp +1 -0
  36. data/vendor/tomotopy/src/TopicModel/MGLDA.h +2 -2
  37. data/vendor/tomotopy/src/TopicModel/MGLDAModel.cpp +5 -0
  38. data/vendor/tomotopy/src/TopicModel/MGLDAModel.hpp +7 -1
  39. data/vendor/tomotopy/src/TopicModel/PA.h +2 -2
  40. data/vendor/tomotopy/src/TopicModel/PAModel.cpp +5 -0
  41. data/vendor/tomotopy/src/TopicModel/PAModel.hpp +7 -0
  42. data/vendor/tomotopy/src/TopicModel/PLDAModel.hpp +1 -0
  43. data/vendor/tomotopy/src/TopicModel/PT.h +3 -3
  44. data/vendor/tomotopy/src/TopicModel/PTModel.cpp +5 -0
  45. data/vendor/tomotopy/src/TopicModel/PTModel.hpp +1 -0
  46. data/vendor/tomotopy/src/TopicModel/SLDA.h +3 -2
  47. data/vendor/tomotopy/src/TopicModel/SLDAModel.cpp +5 -0
  48. data/vendor/tomotopy/src/TopicModel/SLDAModel.hpp +1 -0
  49. data/vendor/tomotopy/src/TopicModel/TopicModel.hpp +83 -3
  50. data/vendor/tomotopy/src/Utils/Dictionary.cpp +102 -0
  51. data/vendor/tomotopy/src/Utils/Dictionary.h +26 -75
  52. data/vendor/tomotopy/src/Utils/EigenAddonOps.hpp +1 -1
  53. data/vendor/tomotopy/src/Utils/Mmap.cpp +146 -0
  54. data/vendor/tomotopy/src/Utils/Mmap.h +139 -0
  55. data/vendor/tomotopy/src/Utils/MultiNormalDistribution.hpp +1 -0
  56. data/vendor/tomotopy/src/Utils/SharedString.cpp +134 -0
  57. data/vendor/tomotopy/src/Utils/SharedString.h +104 -0
  58. data/vendor/tomotopy/src/Utils/serializer.cpp +166 -0
  59. data/vendor/tomotopy/src/Utils/serializer.hpp +261 -85
  60. metadata +12 -7
  61. data/vendor/tomotopy/src/Utils/SharedString.hpp +0 -206
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: daa9c454c4cf09e120dbbe17305d225be58ac5937c463886e87ea1b3c3b5d466
4
- data.tar.gz: f6c0c353a0efcc6026964e9125f1156b50e0d119506ecab2812522f7b716042d
3
+ metadata.gz: 86215ec57ae6cf6e36531ee2896e2b81d591f61909eb5454ef70b69c5db0a39d
4
+ data.tar.gz: 3f31adcb38a1793caaaedc516f99c9ffce4b82ff0c93f2a169b85377e116433b
5
5
  SHA512:
6
- metadata.gz: 874f531a75a62d2291793ded080f380f8103682c2ae2b087dd31a014533443d5f35a7ea4e634aabd246fab1564ece35679c60c0b6ffbb6a627d57048e32bf790
7
- data.tar.gz: e91bf3c618394f34f208fe4945729db6719a2cab1a8e7192e646b232d7e38274f2087808d145dccddb94b3cc632b9dddce82f942f051a4bf522ec5ec9d2c43b3
6
+ metadata.gz: db0a4bd9831cecae6711e150ecc2c5d23b87ada83d418f784474cf2f260627e52e40d871d9af974c6a79790e8d0d060ac08fbb69775e1d2de316085421ef76af
7
+ data.tar.gz: '07779f29aa9bdb4b71d9a0acdfe26c84d041312ad769c54539d6377b0ab01327dba045d97b51179a50498b438d137b567a3708bd3a2ab902c16adec37ad3a779'
data/CHANGELOG.md CHANGED
@@ -1,3 +1,14 @@
1
+ ## 0.4.1 (2024-09-04)
2
+
3
+ - Updated tomoto to 0.13.0
4
+
5
+ ## 0.4.0 (2023-12-28)
6
+
7
+ - Added support for Ruby 3.3
8
+ - Added precompiled gem for Linux ARM
9
+ - Updated tomoto to 0.12.7
10
+ - Dropped support for Ruby < 3
11
+
1
12
  ## 0.3.3 (2023-02-01)
2
13
 
3
14
  - Added `topic_label_dict` method to `LLDA`
data/README.md CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  :tomato: [tomoto](https://github.com/bab2min/tomotopy) - high performance topic modeling - for Ruby
4
4
 
5
- [![Build Status](https://github.com/ankane/tomoto-ruby/workflows/build/badge.svg?branch=master)](https://github.com/ankane/tomoto-ruby/actions)
5
+ [![Build Status](https://github.com/ankane/tomoto-ruby/actions/workflows/build.yml/badge.svg)](https://github.com/ankane/tomoto-ruby/actions)
6
6
 
7
7
  ## Installation
8
8
 
@@ -27,16 +27,18 @@ else
27
27
  end
28
28
 
29
29
  # silence tomoto warnings
30
- $CXXFLAGS += " -Wno-unused-variable -Wno-switch"
30
+ $CXXFLAGS += " -Wno-unused-variable -Wno-switch -Wno-unqualified-std-cast-call"
31
31
 
32
32
  ext = File.expand_path(".", __dir__)
33
33
  tomoto = File.expand_path("../../vendor/tomotopy/src/TopicModel", __dir__)
34
+ tomoto_utils = File.expand_path("../../vendor/tomotopy/src/Utils", __dir__)
34
35
  eigen = File.expand_path("../../vendor/eigen", __dir__)
35
36
  eigen_rand = File.expand_path("../../vendor/EigenRand", __dir__)
36
37
  variant = File.expand_path("../../vendor/variant/include", __dir__)
37
38
 
38
- $srcs = Dir["{#{ext},#{tomoto}}/*.cpp"]
39
+ $srcs = Dir["{#{ext},#{tomoto},#{tomoto_utils}}/*.cpp"]
39
40
  $INCFLAGS += " -I#{tomoto} -I#{eigen} -I#{eigen_rand} -I#{variant}"
40
41
  $VPATH << tomoto
42
+ $VPATH << tomoto_utils
41
43
 
42
44
  create_makefile("tomoto/tomoto")
@@ -1,3 +1,3 @@
1
1
  module Tomoto
2
- VERSION = "0.3.3"
2
+ VERSION = "0.4.1"
3
3
  end
data/lib/tomoto.rb CHANGED
@@ -6,20 +6,20 @@ rescue LoadError
6
6
  end
7
7
 
8
8
  # modules
9
- require "tomoto/ct"
10
- require "tomoto/dmr"
11
- require "tomoto/dt"
12
- require "tomoto/gdmr"
13
- require "tomoto/hdp"
14
- require "tomoto/hlda"
15
- require "tomoto/hpa"
16
- require "tomoto/lda"
17
- require "tomoto/llda"
18
- require "tomoto/mglda"
19
- require "tomoto/pa"
20
- require "tomoto/plda"
21
- require "tomoto/slda"
22
- require "tomoto/version"
9
+ require_relative "tomoto/ct"
10
+ require_relative "tomoto/dmr"
11
+ require_relative "tomoto/dt"
12
+ require_relative "tomoto/gdmr"
13
+ require_relative "tomoto/hdp"
14
+ require_relative "tomoto/hlda"
15
+ require_relative "tomoto/hpa"
16
+ require_relative "tomoto/lda"
17
+ require_relative "tomoto/llda"
18
+ require_relative "tomoto/mglda"
19
+ require_relative "tomoto/pa"
20
+ require_relative "tomoto/plda"
21
+ require_relative "tomoto/slda"
22
+ require_relative "tomoto/version"
23
23
 
24
24
  module Tomoto
25
25
  PARALLEL_SCHEME = [:default, :none, :copy_merge, :partition]
@@ -7,7 +7,7 @@ tomotopy
7
7
  .. image:: https://zenodo.org/badge/186155463.svg
8
8
  :target: https://zenodo.org/badge/latestdoi/186155463
9
9
 
10
- 🎌
10
+ 🌐
11
11
  `English`_,
12
12
  **한국어**.
13
13
 
@@ -305,6 +305,32 @@ tomotopy의 Python3 예제 코드는 https://github.com/bab2min/tomotopy/blob/ma
305
305
 
306
306
  역사
307
307
  -------
308
+ * 0.13.0 (2024-08-05)
309
+ * 신규 기능
310
+ * 토픽 모델 뷰어인 `tomotopy.viewer.open_viewer()`의 주요 기능이 완성되었습니다.
311
+ * `tomotopy.LDAModel.get_hash()`가 추가되었습니다. 모델의 128bit 해시를 구해줍니다.
312
+ * `ngram_list` 인자가 `tomotopy.utils.SimpleTokenizer`에 추가되었습니다.
313
+ * Bug fixes
314
+ * `Corpus.concat_ngrams` 호출 후에 `spans`이 비일관적인 버그가 수정되었습니다.
315
+ * `tomotopy.LDAModel.load()`와 `tomotopy.LDAModel.save()`의 병목을 최적화하여 속도를 10배 이상 개선했습니다.
316
+
317
+ * 0.12.7 (2023-12-19)
318
+ * 신규 기능
319
+ * 토픽 모델 뷰어인 `tomotopy.viewer.open_viewer()`가 추가되었습니다.
320
+ * `tomotopy.utils.Corpus.process()`의 속도를 개선했습니다.
321
+ * Bug fixes
322
+ * `Document.span`이 이제 바이트 단위가 아니라 문자 단위로 범위를 제대로 반환합니다.
323
+
324
+ * 0.12.6 (2023-12-11)
325
+ * 신규 기능
326
+ * `tomotopy.LDAModel.train`과 `tomotopy.LDAModel.set_word_prior`에 몇가지 편의 기능을 추가했습니다.
327
+ * `LDAModel.train`이 이제 학습 진행상황을 모니터링할 수 있는 `callback`, `callback_interval`, `show_progress` 인자를 지원합니다.
328
+ * `LDAModel.set_word_prior`가 이제 `prior` 인자로 `Dict[int, float]` 타입도 받을 수 있게 되었습니다.
329
+
330
+ * 0.12.5 (2023-08-03)
331
+ * 신규 기능
332
+ * Linux ARM64 아키텍처에 대한 지원을 추가했습니다.
333
+
308
334
  * 0.12.4 (2023-01-22)
309
335
  * New features
310
336
  * macOS ARM64 아키텍처에 대한 지원을 추가했습니다.
@@ -7,7 +7,7 @@ tomotopy
7
7
  .. image:: https://zenodo.org/badge/186155463.svg
8
8
  :target: https://zenodo.org/badge/latestdoi/186155463
9
9
 
10
- 🎌
10
+ 🌐
11
11
  **English**,
12
12
  `한국어`_.
13
13
 
@@ -309,6 +309,32 @@ meaning you can use it for any reasonable purpose and remain in complete ownersh
309
309
 
310
310
  History
311
311
  -------
312
+ * 0.13.0 (2024-08-05)
313
+ * New features
314
+ * Major features of Topic Model Viewer `tomotopy.viewer.open_viewer()` are ready now.
315
+ * `tomotopy.LDAModel.get_hash()` is added. It returns a 128-bit hash value of the model.
316
+ * Add an argument `ngram_list` to `tomotopy.utils.SimpleTokenizer`.
317
+ * Bug fixes
318
+ * Fixed a bug where `spans` became inconsistent after `Corpus.concat_ngrams` was called.
319
+ * Optimized the bottlenecks of `tomotopy.LDAModel.load()` and `tomotopy.LDAModel.save()`, making them more than 10 times faster.
320
+
321
+ * 0.12.7 (2023-12-19)
322
+ * New features
323
+ * Added Topic Model Viewer `tomotopy.viewer.open_viewer()`
324
+ * Optimized the performance of `tomotopy.utils.Corpus.process()`
325
+ * Bug fixes
326
+ * `Document.span` now returns ranges in character units, not byte units.
327
+
328
+ * 0.12.6 (2023-12-11)
329
+ * New features
330
+ * Added some convenience features to `tomotopy.LDAModel.train` and `tomotopy.LDAModel.set_word_prior`.
331
+ * `LDAModel.train` now has new arguments `callback`, `callback_interval` and `show_progress` to monitor the training progress.
332
+ * `LDAModel.set_word_prior` now can accept `Dict[int, float]` type as its argument `prior`.
333
+
334
+ * 0.12.5 (2023-08-03)
335
+ * New features
336
+ * Added support for Linux ARM64 architecture.
337
+
312
338
  * 0.12.4 (2023-01-22)
313
339
  * New features
314
340
  * Added support for macOS ARM64 architecture.
@@ -11,8 +11,8 @@ namespace tomoto
11
11
  Matrix beta; // Dim: (K, betaSample)
12
12
  Vector smBeta; // Dim: K
13
13
 
14
- DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 0, smBeta);
15
- DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 1, 0x00010001, smBeta);
14
+ DECLARE_SERIALIZER_WITH_VERSION(0);
15
+ DECLARE_SERIALIZER_WITH_VERSION(1);
16
16
  };
17
17
 
18
18
  struct CTArgs : public LDAArgs
@@ -2,6 +2,11 @@
2
2
 
3
3
  namespace tomoto
4
4
  {
5
+ DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentCTM, BaseDocument, 0, smBeta);
6
+ DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentCTM, BaseDocument, 1, 0x00010001, smBeta);
7
+
8
+ TMT_INSTANTIATE_DOC(DocumentCTM);
9
+
5
10
  ICTModel* ICTModel::create(TermWeight _weight, const CTArgs& args, bool scalarRng)
6
11
  {
7
12
  TMT_SWITCH_TW(_weight, scalarRng, CTModel, args);
@@ -243,6 +243,7 @@ namespace tomoto
243
243
  public:
244
244
  DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 0, numBetaSample, numTMNSample, topicPrior);
245
245
  DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, numBetaSample, numTMNSample, topicPrior);
246
+ DEFINE_HASHER_AFTER_BASE(BaseClass, numBetaSample, numTMNSample, topicPrior);
246
247
 
247
248
  CTModel(const CTArgs& args)
248
249
  : BaseClass(args)
@@ -18,8 +18,8 @@ namespace tomoto
18
18
 
19
19
  RawDoc::MiscType makeMisc(const ITopicModel* tm) const override;
20
20
 
21
- DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 0, metadata);
22
- DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 1, 0x00010001, metadata, multiMetadata);
21
+ DECLARE_SERIALIZER_WITH_VERSION(0);
22
+ DECLARE_SERIALIZER_WITH_VERSION(1);
23
23
  };
24
24
 
25
25
  struct DMRArgs : public LDAArgs
@@ -2,6 +2,11 @@
2
2
 
3
3
  namespace tomoto
4
4
  {
5
+ DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentDMR, BaseDocument, 0, metadata);
6
+ DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentDMR, BaseDocument, 1, 0x00010001, metadata, multiMetadata);
7
+
8
+ TMT_INSTANTIATE_DOC(DocumentDMR);
9
+
5
10
  IDMRModel* IDMRModel::create(TermWeight _weight, const DMRArgs& args, bool scalarRng)
6
11
  {
7
12
  TMT_SWITCH_TW(_weight, scalarRng, DMRModel, args);
@@ -364,6 +364,7 @@ namespace tomoto
364
364
  public:
365
365
  DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 0, sigma, alphaEps, metadataDict, lambda);
366
366
  DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, sigma, alphaEps, metadataDict, lambda, multiMetadataDict);
367
+ DEFINE_HASHER_AFTER_BASE(BaseClass, sigma, alphaEps, metadataDict, lambda, multiMetadataDict);
367
368
 
368
369
  DMRModel(const DMRArgs& args)
369
370
  : BaseClass(args), sigma(args.sigma), alphaEps(args.alphaEps)
@@ -21,8 +21,8 @@ namespace tomoto
21
21
  return ret;
22
22
  }
23
23
 
24
- DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 0, timepoint);
25
- DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 1, 0x00010001, timepoint);
24
+ DECLARE_SERIALIZER_WITH_VERSION(0);
25
+ DECLARE_SERIALIZER_WITH_VERSION(1);
26
26
  };
27
27
 
28
28
  struct DTArgs : public LDAArgs
@@ -2,6 +2,11 @@
2
2
 
3
3
  namespace tomoto
4
4
  {
5
+ DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentDTM, BaseDocument, 0, timepoint);
6
+ DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentDTM, BaseDocument, 1, 0x00010001, timepoint);
7
+
8
+ TMT_INSTANTIATE_DOC(DocumentDTM);
9
+
5
10
  IDTModel* IDTModel::create(TermWeight _weight, const DTArgs& args, bool scalarRng)
6
11
  {
7
12
  TMT_SWITCH_TW(_weight, scalarRng, DTModel, args);
@@ -22,6 +22,7 @@ namespace tomoto
22
22
  Eigen::Matrix<WeightType, -1, -1> numByTopicWord; // Dim: (Topic * Time, Vocabs)
23
23
  //ShareableMatrix<WeightType, -1, -1> numByTopicWord; // Dim: (Topic * Time, Vocabs)
24
24
  DEFINE_SERIALIZER(numByTopic, numByTopicWord);
25
+ DEFINE_HASHER(numByTopic, numByTopicWord);
25
26
  };
26
27
 
27
28
  template<TermWeight _tw, typename _RandGen,
@@ -365,6 +366,7 @@ namespace tomoto
365
366
  {
366
367
  double ll = 0;
367
368
  const size_t V = this->realV;
369
+ if (V == 0) return 0;
368
370
  for (Tid t = 0; t < T; ++t)
369
371
  {
370
372
  // topic-word distribution
@@ -495,6 +497,8 @@ namespace tomoto
495
497
  T, shapeA, shapeB, shapeC, alphaVar, etaVar, phiVar, alphas, etaByDoc, phi);
496
498
  DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001,
497
499
  T, shapeA, shapeB, shapeC, alphaVar, etaVar, phiVar, alphas, etaByDoc, phi);
500
+ DEFINE_HASHER_AFTER_BASE(BaseClass,
501
+ T, shapeA, shapeB, shapeC, alphaVar, etaVar, phiVar, alphas, etaByDoc, phi);
498
502
 
499
503
  GETTER(T, size_t, T);
500
504
  GETTER(NumDocsByT, std::vector<uint32_t>, numDocsByTime);
@@ -17,8 +17,8 @@ namespace tomoto
17
17
  return ret;
18
18
  }
19
19
 
20
- DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 0, metadataOrg);
21
- DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 1, 0x00010001, metadataOrg, metadataNormalized);
20
+ DECLARE_SERIALIZER_WITH_VERSION(0);
21
+ DECLARE_SERIALIZER_WITH_VERSION(1);
22
22
  };
23
23
 
24
24
  struct GDMRArgs : public DMRArgs
@@ -2,6 +2,11 @@
2
2
 
3
3
  namespace tomoto
4
4
  {
5
+ DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentGDMR, BaseDocument, 0, metadataOrg);
6
+ DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentGDMR, BaseDocument, 1, 0x00010001, metadataOrg, metadataNormalized);
7
+
8
+ TMT_INSTANTIATE_DOC(DocumentGDMR);
9
+
5
10
  IGDMRModel* IGDMRModel::create(TermWeight _weight, const GDMRArgs& args, bool scalarRng)
6
11
  {
7
12
  TMT_SWITCH_TW(_weight, scalarRng, GDMRModel, args);
@@ -412,6 +412,7 @@ namespace tomoto
412
412
  public:
413
413
  DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 0, sigma0, degreeByF, mdCoefs, mdIntercepts);
414
414
  DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, sigma0, orderDecay, degreeByF, mdCoefs, mdIntercepts, mdMax);
415
+ DEFINE_HASHER_AFTER_BASE(BaseClass, sigma0, orderDecay, degreeByF, mdCoefs, mdIntercepts, mdMax);
415
416
 
416
417
  GDMRModel(const GDMRArgs& args)
417
418
  : BaseClass(args), sigma0(args.sigma0), orderDecay(args.orderDecay), degreeByF(args.degrees)
@@ -39,8 +39,8 @@ namespace tomoto
39
39
  };
40
40
  std::vector<TableTopicInfo> numTopicByTable;
41
41
 
42
- DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 0, numTopicByTable);
43
- DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 1, 0x00010001, numTopicByTable);
42
+ DECLARE_SERIALIZER_WITH_VERSION(0);
43
+ DECLARE_SERIALIZER_WITH_VERSION(1);
44
44
 
45
45
  size_t getNumTable() const
46
46
  {
@@ -2,6 +2,11 @@
2
2
 
3
3
  namespace tomoto
4
4
  {
5
+ DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentHDP, BaseDocument, 0, numTopicByTable);
6
+ DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentHDP, BaseDocument, 1, 0x00010001, numTopicByTable);
7
+
8
+ TMT_INSTANTIATE_DOC(DocumentHDP);
9
+
5
10
  IHDPModel* IHDPModel::create(TermWeight _weight, const HDPArgs& args, bool scalarRng)
6
11
  {
7
12
  TMT_SWITCH_TW(_weight, scalarRng, HDPModel, args);
@@ -19,6 +19,7 @@ namespace tomoto
19
19
  size_t totalTable = 0;
20
20
 
21
21
  DEFINE_SERIALIZER_AFTER_BASE(ModelStateLDA<_tw>, numTableByTopic, totalTable);
22
+ DEFINE_HASHER_AFTER_BASE(ModelStateLDA<_tw>, numTableByTopic, totalTable);
22
23
  };
23
24
 
24
25
  template<TermWeight _tw, typename _RandGen,
@@ -457,6 +458,7 @@ namespace tomoto
457
458
  public:
458
459
  DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 0, gamma);
459
460
  DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, gamma);
461
+ DEFINE_HASHER_AFTER_BASE(BaseClass, gamma);
460
462
 
461
463
  HDPModel(const HDPArgs& args)
462
464
  : BaseClass(args), gamma(args.gamma)
@@ -16,8 +16,8 @@ namespace tomoto
16
16
 
17
17
  template<typename _TopicModel> void update(WeightType* ptr, const _TopicModel& mdl);
18
18
 
19
- DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 0, path);
20
- DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 1, 0x00010001, path);
19
+ DECLARE_SERIALIZER_WITH_VERSION(0);
20
+ DECLARE_SERIALIZER_WITH_VERSION(1);
21
21
  };
22
22
 
23
23
  struct HLDAArgs : public LDAArgs
@@ -2,6 +2,11 @@
2
2
 
3
3
  namespace tomoto
4
4
  {
5
+ DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentHLDA, BaseDocument, 0, path);
6
+ DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentHLDA, BaseDocument, 1, 0x00010001, path);
7
+
8
+ TMT_INSTANTIATE_DOC(DocumentHLDA);
9
+
5
10
  IHLDAModel* IHLDAModel::create(TermWeight _weight, const HLDAArgs& args, bool scalarRng)
6
11
  {
7
12
  TMT_SWITCH_TW(_weight, scalarRng, HLDAModel, args);
@@ -18,6 +18,7 @@ namespace tomoto
18
18
  int32_t parent = 0, sibling = 0, child = 0;
19
19
 
20
20
  DEFINE_SERIALIZER(numCustomers, level, parent, sibling, child);
21
+ DEFINE_HASHER(numCustomers, level, parent, sibling, child);
21
22
 
22
23
  NCRPNode* getParent() const
23
24
  {
@@ -118,6 +119,7 @@ namespace tomoto
118
119
  Vector nodeWLikelihoods; //
119
120
 
120
121
  DEFINE_SERIALIZER(nodes, levelBlocks);
122
+ DEFINE_HASHER(nodes, levelBlocks);
121
123
 
122
124
  template<bool _makeNewPath = true>
123
125
  void calcNodeLikelihood(Float gamma, size_t levelDepth)
@@ -317,6 +319,12 @@ namespace tomoto
317
319
  ModelStateLDA<_tw>::serializerWrite(ostr);
318
320
  nt->serializerWrite(ostr);
319
321
  }
322
+
323
+ uint64_t computeHash(uint64_t seed) const
324
+ {
325
+ seed = ModelStateLDA<_tw>::computeHash(seed);
326
+ return nt->computeHash(seed);
327
+ }
320
328
  };
321
329
 
322
330
  template<TermWeight _tw, typename _RandGen,
@@ -596,6 +604,7 @@ namespace tomoto
596
604
  public:
597
605
  DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 0, gamma);
598
606
  DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, gamma);
607
+ DEFINE_HASHER_AFTER_BASE(BaseClass, gamma);
599
608
 
600
609
  HLDAModel(const HLDAArgs& args)
601
610
  : BaseClass(args), gamma(args.gamma)
@@ -12,8 +12,8 @@ namespace tomoto
12
12
 
13
13
  template<typename _TopicModel> void update(WeightType* ptr, const _TopicModel& mdl);
14
14
 
15
- DEFINE_SERIALIZER_BASE_WITH_VERSION(BaseDocument, 0);
16
- DEFINE_SERIALIZER_BASE_WITH_VERSION(BaseDocument, 1);
15
+ DECLARE_SERIALIZER_WITH_VERSION(0);
16
+ DECLARE_SERIALIZER_WITH_VERSION(1);
17
17
  };
18
18
 
19
19
  struct HPAArgs : public PAArgs
@@ -2,6 +2,11 @@
2
2
 
3
3
  namespace tomoto
4
4
  {
5
+ DEFINE_OUT_SERIALIZER_BASE_WITH_VERSION(DocumentHPA, BaseDocument, 0);
6
+ DEFINE_OUT_SERIALIZER_BASE_WITH_VERSION(DocumentHPA, BaseDocument, 1);
7
+
8
+ TMT_INSTANTIATE_DOC(DocumentHPA);
9
+
5
10
  IHPAModel* IHPAModel::create(TermWeight _weight, bool _exclusive, const HPAArgs& args, bool scalarRng)
6
11
  {
7
12
  if (_exclusive)
@@ -21,6 +21,7 @@ namespace tomoto
21
21
  Eigen::Matrix<WeightType, -1, -1> numByTopic1_2;
22
22
 
23
23
  DEFINE_SERIALIZER_AFTER_BASE(ModelStateLDA<_tw>, numByTopicWord, numByTopic, numByTopic1_2);
24
+ DEFINE_HASHER_AFTER_BASE(ModelStateLDA<_tw>, numByTopicWord, numByTopic, numByTopic1_2);
24
25
  };
25
26
 
26
27
  template<TermWeight _tw, typename _RandGen,
@@ -439,6 +440,7 @@ namespace tomoto
439
440
  public:
440
441
  DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 0, K2, subAlphas, subAlphaSum);
441
442
  DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, K2, subAlphas, subAlphaSum);
443
+ DEFINE_HASHER_AFTER_BASE(BaseClass, K2, subAlphas, subAlphaSum);
442
444
 
443
445
  HPAModel(const HPAArgs& args)
444
446
  : BaseClass(args, false), K2(args.k2)
@@ -93,6 +93,12 @@ namespace tomoto
93
93
  if (!ostr.write((const char*)this->data(), sizeof(_Scalar) * this->size()))
94
94
  throw std::ios_base::failure(std::string("writing type '") + typeid(_Scalar).name() + std::string("' is failed"));
95
95
  }
96
+
97
+ uint64_t computeHash(uint64_t seed) const
98
+ {
99
+ seed = serializer::computeHashMany(seed, (uint32_t)this->rows(), (uint32_t)this->cols());
100
+ return serializer::computeFastHash(this->data(), sizeof(_Scalar) * this->size(), seed);
101
+ }
96
102
  };
97
103
 
98
104
  template<typename _Base, TermWeight _tw>
@@ -139,8 +145,8 @@ namespace tomoto
139
145
  tvector<Float> wordWeights;
140
146
  ShareableMatrix<WeightType, -1, 1> numByTopic;
141
147
 
142
- DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentBase, 0, Zs, wordWeights);
143
- DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentBase, 1, 0x00010001, Zs, wordWeights);
148
+ DECLARE_SERIALIZER_WITH_VERSION(0);
149
+ DECLARE_SERIALIZER_WITH_VERSION(1);
144
150
 
145
151
  template<typename _TopicModel> void update(WeightType* ptr, const _TopicModel& mdl);
146
152
 
@@ -2,6 +2,11 @@
2
2
 
3
3
  namespace tomoto
4
4
  {
5
+ DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentLDA, DocumentBase, 0, Zs, wordWeights);
6
+ DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentLDA, DocumentBase, 1, 0x00010001, Zs, wordWeights);
7
+
8
+ TMT_INSTANTIATE_DOC(DocumentLDA);
9
+
5
10
  ILDAModel* ILDAModel::create(TermWeight _weight, const LDAArgs& args, bool scalarRng)
6
11
  {
7
12
  TMT_SWITCH_TW(_weight, scalarRng, LDAModel, args);
@@ -47,6 +47,10 @@ Term Weighting Scheme is based on following paper:
47
47
  return nullptr; } while(0)
48
48
  #endif
49
49
 
50
+ #define TMT_INSTANTIATE_DOC(CLS) template struct CLS<TermWeight::one>; \
51
+ template struct CLS<TermWeight::idf>; \
52
+ template struct CLS<TermWeight::pmi>;
53
+
50
54
  #define GETTER(name, type, field) type get##name() const override { return field; }
51
55
 
52
56
  namespace tomoto
@@ -61,6 +65,7 @@ namespace tomoto
61
65
  //Eigen::Matrix<WeightType, -1, -1> numByTopicWord; // Dim: (Topic, Vocabs)
62
66
  ShareableMatrix<WeightType, -1, -1> numByTopicWord; // Dim: (Topic, Vocabs)
63
67
  DEFINE_SERIALIZER(numByTopic, numByTopicWord);
68
+ DEFINE_HASHER(numByTopic, numByTopicWord);
64
69
  };
65
70
 
66
71
  namespace flags
@@ -954,6 +959,8 @@ namespace tomoto
954
959
  DEFINE_TAGGED_SERIALIZER_WITH_VERSION(1, 0x00010001, vocabWeights, alpha, alphas, eta, K, etaByWord,
955
960
  burnIn, optimInterval);
956
961
 
962
+ DEFINE_HASHER(vocabWeights, alpha, alphas, eta, K, /*etaByWord,*/ burnIn, optimInterval);
963
+
957
964
  LDAModel(const LDAArgs& args, bool checkAlpha = true)
958
965
  : BaseClass(args.seed), K(args.k), alpha(args.alpha[0]), eta(args.eta)
959
966
  {
@@ -1066,6 +1073,7 @@ namespace tomoto
1066
1073
  void prepare(bool initDocs = true, size_t minWordCnt = 0, size_t minWordDf = 0, size_t removeTopN = 0, bool updateStopwords = true) override
1067
1074
  {
1068
1075
  if (initDocs && updateStopwords) this->removeStopwords(minWordCnt, minWordDf, removeTopN);
1076
+ static_cast<DerivedClass*>(this)->updateWordFormCnts();
1069
1077
  static_cast<DerivedClass*>(this)->updateWeakArray();
1070
1078
  static_cast<DerivedClass*>(this)->initGlobalState(initDocs);
1071
1079
  static_cast<DerivedClass*>(this)->prepareWordPriors();
@@ -11,8 +11,8 @@ namespace tomoto
11
11
  using WeightType = typename DocumentLDA<_tw>::WeightType;
12
12
  Eigen::Matrix<int8_t, -1, 1> labelMask;
13
13
 
14
- DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 0, labelMask);
15
- DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 1, 0x00010001, labelMask);
14
+ DECLARE_SERIALIZER_WITH_VERSION(0);
15
+ DECLARE_SERIALIZER_WITH_VERSION(1);
16
16
  };
17
17
 
18
18
  class ILLDAModel : public ILDAModel
@@ -2,6 +2,11 @@
2
2
 
3
3
  namespace tomoto
4
4
  {
5
+ DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentLLDA, BaseDocument, 0, labelMask);
6
+ DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentLLDA, BaseDocument, 1, 0x00010001, labelMask);
7
+
8
+ TMT_INSTANTIATE_DOC(DocumentLLDA);
9
+
5
10
  ILLDAModel* ILLDAModel::create(TermWeight _weight, const LDAArgs& args, bool scalarRng)
6
11
  {
7
12
  TMT_SWITCH_TW(_weight, scalarRng, LLDAModel, args);
@@ -107,6 +107,7 @@ namespace tomoto
107
107
  public:
108
108
  DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 0, topicLabelDict);
109
109
  DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, topicLabelDict);
110
+ DEFINE_HASHER_AFTER_BASE(BaseClass, topicLabelDict);
110
111
 
111
112
  LLDAModel(const LDAArgs& args)
112
113
  : BaseClass(args)
@@ -22,8 +22,8 @@ namespace tomoto
22
22
  Eigen::Matrix<WeightType, -1, 1> numByWin; // number of words in the window (len = S + T - 1)
23
23
  Eigen::Matrix<WeightType, -1, -1> numByWinTopicL; // number of words in the loc. topic in the window (len = KL * (S + T - 1))
24
24
 
25
- DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 0, sents, Vs, numGl, numBySentWin, numByWinL, numByWin, numByWinTopicL);
26
- DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 1, 0x00010001, sents, Vs, numGl, numBySentWin, numByWinL, numByWin, numByWinTopicL);
25
+ DECLARE_SERIALIZER_WITH_VERSION(0);
26
+ DECLARE_SERIALIZER_WITH_VERSION(1);
27
27
 
28
28
  template<typename _TopicModel> void update(WeightType* ptr, const _TopicModel& mdl);
29
29
  };
@@ -2,6 +2,11 @@
2
2
 
3
3
  namespace tomoto
4
4
  {
5
+ DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentMGLDA, BaseDocument, 0, sents, Vs, numGl, numBySentWin, numByWinL, numByWin, numByWinTopicL);
6
+ DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentMGLDA, BaseDocument, 1, 0x00010001, sents, Vs, numGl, numBySentWin, numByWinL, numByWin, numByWinTopicL);
7
+
8
+ TMT_INSTANTIATE_DOC(DocumentMGLDA);
9
+
5
10
  IMGLDAModel* IMGLDAModel::create(TermWeight _weight, const MGLDAArgs& args, bool scalarRng)
6
11
  {
7
12
  TMT_SWITCH_TW(_weight, scalarRng, MGLDAModel, args);
@@ -370,6 +370,7 @@ namespace tomoto
370
370
  public:
371
371
  DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 0, alphaL, alphaM, alphaML, etaL, gamma, KL, T);
372
372
  DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, alphaL, alphaM, alphaML, etaL, gamma, KL, T);
373
+ DEFINE_HASHER_AFTER_BASE(BaseClass, alphaL, alphaM, alphaML, etaL, gamma, KL, T);
373
374
 
374
375
  MGLDAModel(const MGLDAArgs& args)
375
376
  : BaseClass(args), KL(args.kL), T(args.t),
@@ -516,9 +517,14 @@ namespace tomoto
516
517
  return std::make_unique<_DocType>(as_mutable(this)->template _makeFromRawDoc<true>(rawDoc));
517
518
  }
518
519
 
520
+ size_t getNumTopicsForPrior() const override
521
+ {
522
+ return this->K + KL;
523
+ }
524
+
519
525
  void setWordPrior(const std::string& word, const std::vector<Float>& priors) override
520
526
  {
521
- if (priors.size() != this->K + KL) THROW_ERROR_WITH_INFO(exc::InvalidArgument, "priors.size() must be equal to K.");
527
+ if (priors.size() != this->K + KL) THROW_ERROR_WITH_INFO(exc::InvalidArgument, "priors.size() must be equal to K + KL.");
522
528
  for (auto p : priors)
523
529
  {
524
530
  if (p < 0) THROW_ERROR_WITH_INFO(exc::InvalidArgument, "priors must not be less than 0.");