tomoto 0.1.4 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/ext/tomoto/ct.cpp +8 -4
- data/ext/tomoto/dmr.cpp +10 -4
- data/ext/tomoto/dt.cpp +13 -4
- data/ext/tomoto/extconf.rb +1 -1
- data/ext/tomoto/gdmr.cpp +14 -6
- data/ext/tomoto/hdp.cpp +9 -4
- data/ext/tomoto/hlda.cpp +9 -4
- data/ext/tomoto/hpa.cpp +9 -4
- data/ext/tomoto/lda.cpp +8 -4
- data/ext/tomoto/llda.cpp +8 -4
- data/ext/tomoto/mglda.cpp +11 -1
- data/ext/tomoto/pa.cpp +9 -4
- data/ext/tomoto/plda.cpp +8 -4
- data/ext/tomoto/slda.cpp +13 -5
- data/lib/tomoto/gdmr.rb +2 -2
- data/lib/tomoto/version.rb +1 -1
- data/vendor/EigenRand/EigenRand/Core.h +6 -1107
- data/vendor/EigenRand/EigenRand/Dists/Basic.h +490 -43
- data/vendor/EigenRand/EigenRand/Dists/Discrete.h +916 -285
- data/vendor/EigenRand/EigenRand/Dists/GammaPoisson.h +85 -36
- data/vendor/EigenRand/EigenRand/Dists/NormalExp.h +1038 -290
- data/vendor/EigenRand/EigenRand/EigenRand +2 -2
- data/vendor/EigenRand/EigenRand/Macro.h +4 -4
- data/vendor/EigenRand/EigenRand/MorePacketMath.h +54 -22
- data/vendor/EigenRand/EigenRand/MvDists/Multinomial.h +222 -0
- data/vendor/EigenRand/EigenRand/MvDists/MvNormal.h +492 -0
- data/vendor/EigenRand/EigenRand/PacketFilter.h +2 -2
- data/vendor/EigenRand/EigenRand/PacketRandomEngine.h +2 -2
- data/vendor/EigenRand/EigenRand/RandUtils.h +65 -11
- data/vendor/EigenRand/EigenRand/doc.h +142 -25
- data/vendor/EigenRand/LICENSE +1 -1
- data/vendor/EigenRand/README.md +109 -24
- data/vendor/tomotopy/README.kr.rst +27 -6
- data/vendor/tomotopy/README.rst +29 -8
- data/vendor/tomotopy/src/Labeling/FoRelevance.cpp +60 -12
- data/vendor/tomotopy/src/Labeling/FoRelevance.h +2 -2
- data/vendor/tomotopy/src/Labeling/Phraser.hpp +33 -21
- data/vendor/tomotopy/src/TopicModel/CT.h +8 -5
- data/vendor/tomotopy/src/TopicModel/CTModel.cpp +2 -6
- data/vendor/tomotopy/src/TopicModel/CTModel.hpp +29 -23
- data/vendor/tomotopy/src/TopicModel/DMR.h +33 -4
- data/vendor/tomotopy/src/TopicModel/DMRModel.cpp +2 -6
- data/vendor/tomotopy/src/TopicModel/DMRModel.hpp +231 -57
- data/vendor/tomotopy/src/TopicModel/DT.h +24 -5
- data/vendor/tomotopy/src/TopicModel/DTModel.cpp +2 -8
- data/vendor/tomotopy/src/TopicModel/DTModel.hpp +41 -28
- data/vendor/tomotopy/src/TopicModel/GDMR.h +31 -5
- data/vendor/tomotopy/src/TopicModel/GDMRModel.cpp +2 -7
- data/vendor/tomotopy/src/TopicModel/GDMRModel.hpp +211 -104
- data/vendor/tomotopy/src/TopicModel/HDP.h +11 -2
- data/vendor/tomotopy/src/TopicModel/HDPModel.cpp +2 -6
- data/vendor/tomotopy/src/TopicModel/HDPModel.hpp +52 -45
- data/vendor/tomotopy/src/TopicModel/HLDA.h +11 -2
- data/vendor/tomotopy/src/TopicModel/HLDAModel.cpp +2 -6
- data/vendor/tomotopy/src/TopicModel/HLDAModel.hpp +13 -16
- data/vendor/tomotopy/src/TopicModel/HPA.h +5 -2
- data/vendor/tomotopy/src/TopicModel/HPAModel.cpp +2 -6
- data/vendor/tomotopy/src/TopicModel/HPAModel.hpp +51 -21
- data/vendor/tomotopy/src/TopicModel/LDA.h +9 -2
- data/vendor/tomotopy/src/TopicModel/LDACVB0Model.hpp +8 -8
- data/vendor/tomotopy/src/TopicModel/LDAModel.cpp +2 -6
- data/vendor/tomotopy/src/TopicModel/LDAModel.hpp +70 -28
- data/vendor/tomotopy/src/TopicModel/LLDA.h +1 -2
- data/vendor/tomotopy/src/TopicModel/LLDAModel.cpp +2 -6
- data/vendor/tomotopy/src/TopicModel/LLDAModel.hpp +22 -12
- data/vendor/tomotopy/src/TopicModel/MGLDA.h +12 -3
- data/vendor/tomotopy/src/TopicModel/MGLDAModel.cpp +2 -10
- data/vendor/tomotopy/src/TopicModel/MGLDAModel.hpp +42 -19
- data/vendor/tomotopy/src/TopicModel/PA.h +9 -4
- data/vendor/tomotopy/src/TopicModel/PAModel.cpp +2 -6
- data/vendor/tomotopy/src/TopicModel/PAModel.hpp +48 -25
- data/vendor/tomotopy/src/TopicModel/PLDA.h +13 -2
- data/vendor/tomotopy/src/TopicModel/PLDAModel.cpp +2 -6
- data/vendor/tomotopy/src/TopicModel/PLDAModel.hpp +27 -19
- data/vendor/tomotopy/src/TopicModel/PT.h +12 -5
- data/vendor/tomotopy/src/TopicModel/PTModel.cpp +2 -3
- data/vendor/tomotopy/src/TopicModel/PTModel.hpp +29 -14
- data/vendor/tomotopy/src/TopicModel/SLDA.h +18 -6
- data/vendor/tomotopy/src/TopicModel/SLDAModel.cpp +2 -10
- data/vendor/tomotopy/src/TopicModel/SLDAModel.hpp +93 -43
- data/vendor/tomotopy/src/TopicModel/TopicModel.hpp +58 -23
- data/vendor/tomotopy/src/Utils/AliasMethod.hpp +6 -6
- data/vendor/tomotopy/src/Utils/Dictionary.h +11 -0
- data/vendor/tomotopy/src/Utils/SharedString.hpp +26 -1
- data/vendor/tomotopy/src/Utils/Trie.hpp +46 -21
- data/vendor/tomotopy/src/Utils/Utils.hpp +99 -14
- data/vendor/tomotopy/src/Utils/exception.h +1 -1
- data/vendor/tomotopy/src/Utils/math.h +5 -7
- data/vendor/tomotopy/src/Utils/serializer.hpp +329 -201
- data/vendor/tomotopy/src/Utils/text.hpp +8 -0
- data/vendor/tomotopy/src/Utils/tvector.hpp +49 -7
- metadata +9 -7
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
namespace tomoto
|
|
5
5
|
{
|
|
6
6
|
template<TermWeight _tw>
|
|
7
|
-
struct
|
|
7
|
+
struct DocumentPT : public DocumentLDA<_tw>
|
|
8
8
|
{
|
|
9
9
|
using BaseDocument = DocumentLDA<_tw>;
|
|
10
10
|
using DocumentLDA<_tw>::DocumentLDA;
|
|
@@ -16,12 +16,19 @@ namespace tomoto
|
|
|
16
16
|
DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 1, 0x00010001, pseudoDoc);
|
|
17
17
|
};
|
|
18
18
|
|
|
19
|
+
struct PTArgs : public LDAArgs
|
|
20
|
+
{
|
|
21
|
+
size_t p = 100;
|
|
22
|
+
Float lambda = 0.01;
|
|
23
|
+
};
|
|
24
|
+
|
|
19
25
|
class IPTModel : public ILDAModel
|
|
20
26
|
{
|
|
21
27
|
public:
|
|
22
|
-
using DefaultDocType =
|
|
23
|
-
static IPTModel* create(TermWeight _weight,
|
|
24
|
-
Float alpha = 0.1, Float eta = 0.01, Float lambda = 0.01, size_t seed = std::random_device{}(),
|
|
28
|
+
using DefaultDocType = DocumentPT<TermWeight::one>;
|
|
29
|
+
static IPTModel* create(TermWeight _weight, const PTArgs& args,
|
|
25
30
|
bool scalarRng = false);
|
|
31
|
+
|
|
32
|
+
virtual size_t getP() const = 0;
|
|
26
33
|
};
|
|
27
|
-
}
|
|
34
|
+
}
|
|
@@ -2,9 +2,8 @@
|
|
|
2
2
|
|
|
3
3
|
namespace tomoto
|
|
4
4
|
{
|
|
5
|
-
|
|
6
|
-
IPTModel* IPTModel::create(TermWeight _weight, size_t _K, size_t _P, Float _alpha, Float _eta, Float _lambda, size_t seed, bool scalarRng)
|
|
5
|
+
IPTModel* IPTModel::create(TermWeight _weight, const PTArgs& args, bool scalarRng)
|
|
7
6
|
{
|
|
8
|
-
TMT_SWITCH_TW(_weight, scalarRng, PTModel,
|
|
7
|
+
TMT_SWITCH_TW(_weight, scalarRng, PTModel, args);
|
|
9
8
|
}
|
|
10
9
|
}
|
|
@@ -25,7 +25,7 @@ namespace tomoto
|
|
|
25
25
|
template<TermWeight _tw, typename _RandGen,
|
|
26
26
|
typename _Interface = IPTModel,
|
|
27
27
|
typename _Derived = void,
|
|
28
|
-
typename _DocType =
|
|
28
|
+
typename _DocType = DocumentPT<_tw>,
|
|
29
29
|
typename _ModelState = ModelStatePTM<_tw>>
|
|
30
30
|
class PTModel : public LDAModel<_tw, _RandGen, flags::continuous_doc_data | flags::partitioned_multisampling, _Interface,
|
|
31
31
|
typename std::conditional<std::is_same<_Derived, void>::value, PTModel<_tw, _RandGen>, _Derived>::type,
|
|
@@ -158,15 +158,13 @@ namespace tomoto
|
|
|
158
158
|
{
|
|
159
159
|
sortAndWriteOrder(doc.words, doc.wOrder);
|
|
160
160
|
doc.numByTopic.init((WeightType*)this->globalState.numByTopicPDoc.col(0).data(), this->K, 1);
|
|
161
|
-
doc.Zs = tvector<Tid>(wordSize);
|
|
161
|
+
doc.Zs = tvector<Tid>(wordSize, non_topic_id);
|
|
162
162
|
if (_tw != TermWeight::one) doc.wordWeights.resize(wordSize);
|
|
163
163
|
}
|
|
164
164
|
|
|
165
165
|
void initGlobalState(bool initDocs)
|
|
166
166
|
{
|
|
167
|
-
this->
|
|
168
|
-
this->alphas.array() = this->alpha;
|
|
169
|
-
this->globalState.pLikelihood = Eigen::Matrix<Float, -1, 1>::Zero(numPDocs);
|
|
167
|
+
this->globalState.pLikelihood = Vector::Zero(numPDocs);
|
|
170
168
|
this->globalState.numDocsByPDoc = Eigen::ArrayXi::Zero(numPDocs);
|
|
171
169
|
this->globalState.numByTopicPDoc = Eigen::Matrix<WeightType, -1, -1>::Zero(this->K, numPDocs);
|
|
172
170
|
BaseClass::initGlobalState(initDocs);
|
|
@@ -175,15 +173,15 @@ namespace tomoto
|
|
|
175
173
|
struct Generator
|
|
176
174
|
{
|
|
177
175
|
std::uniform_int_distribution<uint64_t> psi;
|
|
178
|
-
|
|
176
|
+
Eigen::Rand::DiscreteGen<int32_t> theta;
|
|
179
177
|
};
|
|
180
178
|
|
|
181
179
|
Generator makeGeneratorForInit(const _DocType*) const
|
|
182
180
|
{
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
181
|
+
Generator g;
|
|
182
|
+
g.psi = std::uniform_int_distribution<uint64_t>{ 0, numPDocs - 1 };
|
|
183
|
+
g.theta = Eigen::Rand::DiscreteGen<int32_t>{ this->alphas.data(), this->alphas.data() + this->alphas.size() };
|
|
184
|
+
return g;
|
|
187
185
|
}
|
|
188
186
|
|
|
189
187
|
template<bool _Infer>
|
|
@@ -256,17 +254,34 @@ namespace tomoto
|
|
|
256
254
|
DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 0, numPDocs, lambda);
|
|
257
255
|
DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, numPDocs, lambda);
|
|
258
256
|
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
257
|
+
GETTER(P, size_t, numPDocs);
|
|
258
|
+
|
|
259
|
+
PTModel(const PTArgs& args)
|
|
260
|
+
: BaseClass(args), numPDocs(args.p), lambda(args.lambda)
|
|
262
261
|
{
|
|
263
262
|
}
|
|
264
263
|
|
|
264
|
+
std::vector<Float> getTopicsByDoc(const _DocType& doc, bool normalize) const
|
|
265
|
+
{
|
|
266
|
+
std::vector<Float> ret(this->K);
|
|
267
|
+
Eigen::Map<Eigen::Array<Float, -1, 1>> m{ ret.data(), this->K };
|
|
268
|
+
m = this->alphas.array();
|
|
269
|
+
for (size_t i = 0; i < doc.words.size(); ++i)
|
|
270
|
+
{
|
|
271
|
+
if (doc.words[i] >= this->realV) continue;
|
|
272
|
+
typename std::conditional<_tw != TermWeight::one, float, int32_t>::type weight
|
|
273
|
+
= _tw != TermWeight::one ? doc.wordWeights[i] : 1;
|
|
274
|
+
ret[doc.Zs[i]] += weight;
|
|
275
|
+
}
|
|
276
|
+
if (normalize) m /= m.sum();
|
|
277
|
+
return ret;
|
|
278
|
+
}
|
|
279
|
+
|
|
265
280
|
void updateDocs()
|
|
266
281
|
{
|
|
267
282
|
for (auto& doc : this->docs)
|
|
268
283
|
{
|
|
269
|
-
doc.template update<>(this->
|
|
284
|
+
doc.template update<>(this->globalState.numByTopicPDoc.col(doc.pseudoDoc).data(), *static_cast<DerivedClass*>(this));
|
|
270
285
|
}
|
|
271
286
|
}
|
|
272
287
|
};
|
|
@@ -9,10 +9,19 @@ namespace tomoto
|
|
|
9
9
|
using BaseDocument = DocumentLDA<_tw>;
|
|
10
10
|
using DocumentLDA<_tw>::DocumentLDA;
|
|
11
11
|
std::vector<Float> y;
|
|
12
|
+
|
|
13
|
+
RawDoc::MiscType makeMisc(const ITopicModel* tm) const override
|
|
14
|
+
{
|
|
15
|
+
RawDoc::MiscType ret = DocumentLDA<_tw>::makeMisc(tm);
|
|
16
|
+
ret["y"] = y;
|
|
17
|
+
return ret;
|
|
18
|
+
}
|
|
12
19
|
DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 0, y);
|
|
13
20
|
DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 1, 0x00010001, y);
|
|
14
21
|
};
|
|
15
22
|
|
|
23
|
+
struct SLDAArgs;
|
|
24
|
+
|
|
16
25
|
class ISLDAModel : public ILDAModel
|
|
17
26
|
{
|
|
18
27
|
public:
|
|
@@ -23,12 +32,7 @@ namespace tomoto
|
|
|
23
32
|
};
|
|
24
33
|
|
|
25
34
|
using DefaultDocType = DocumentSLDA<TermWeight::one>;
|
|
26
|
-
static ISLDAModel* create(TermWeight _weight,
|
|
27
|
-
const std::vector<ISLDAModel::GLM>& vars = {},
|
|
28
|
-
Float alpha = 0.1, Float _eta = 0.01,
|
|
29
|
-
const std::vector<Float>& _mu = {}, const std::vector<Float>& _nuSq = {},
|
|
30
|
-
const std::vector<Float>& _glmParam = {},
|
|
31
|
-
size_t seed = std::random_device{}(),
|
|
35
|
+
static ISLDAModel* create(TermWeight _weight, const SLDAArgs& args,
|
|
32
36
|
bool scalarRng = false);
|
|
33
37
|
|
|
34
38
|
virtual size_t getF() const = 0;
|
|
@@ -36,4 +40,12 @@ namespace tomoto
|
|
|
36
40
|
virtual GLM getTypeOfVar(size_t f) const = 0;
|
|
37
41
|
virtual std::vector<Float> estimateVars(const DocumentBase* doc) const = 0;
|
|
38
42
|
};
|
|
43
|
+
|
|
44
|
+
struct SLDAArgs : public LDAArgs
|
|
45
|
+
{
|
|
46
|
+
std::vector<ISLDAModel::GLM> vars;
|
|
47
|
+
std::vector<Float> mu;
|
|
48
|
+
std::vector<Float> nuSq;
|
|
49
|
+
std::vector<Float> glmParam;
|
|
50
|
+
};
|
|
39
51
|
}
|
|
@@ -2,16 +2,8 @@
|
|
|
2
2
|
|
|
3
3
|
namespace tomoto
|
|
4
4
|
{
|
|
5
|
-
|
|
6
|
-
template class SLDAModel<TermWeight::idf>;
|
|
7
|
-
template class SLDAModel<TermWeight::pmi>;*/
|
|
8
|
-
|
|
9
|
-
ISLDAModel* ISLDAModel::create(TermWeight _weight, size_t _K, const std::vector<ISLDAModel::GLM>& vars,
|
|
10
|
-
Float _alpha, Float _eta,
|
|
11
|
-
const std::vector<Float>& _mu, const std::vector<Float>& _nuSq,
|
|
12
|
-
const std::vector<Float>& _glmParam,
|
|
13
|
-
size_t seed, bool scalarRng)
|
|
5
|
+
ISLDAModel* ISLDAModel::create(TermWeight _weight, const SLDAArgs& args, bool scalarRng)
|
|
14
6
|
{
|
|
15
|
-
TMT_SWITCH_TW(_weight, scalarRng, SLDAModel,
|
|
7
|
+
TMT_SWITCH_TW(_weight, scalarRng, SLDAModel, args);
|
|
16
8
|
}
|
|
17
9
|
}
|
|
@@ -16,22 +16,24 @@ namespace tomoto
|
|
|
16
16
|
template<typename _WeightType>
|
|
17
17
|
struct GLMFunctor
|
|
18
18
|
{
|
|
19
|
-
|
|
19
|
+
Vector regressionCoef; // Dim : (K)
|
|
20
20
|
|
|
21
|
-
GLMFunctor(size_t K = 0, Float mu = 0) : regressionCoef(
|
|
21
|
+
GLMFunctor(size_t K = 0, Float mu = 0) : regressionCoef(Vector::Constant(K, mu))
|
|
22
22
|
{
|
|
23
23
|
}
|
|
24
24
|
|
|
25
25
|
virtual ISLDAModel::GLM getType() const = 0;
|
|
26
26
|
|
|
27
|
+
virtual std::unique_ptr<GLMFunctor> copy() const = 0;
|
|
28
|
+
|
|
27
29
|
virtual void updateZLL(
|
|
28
|
-
|
|
30
|
+
Vector& zLikelihood,
|
|
29
31
|
Float y, const Eigen::Matrix<_WeightType, -1, 1>& numByTopic, size_t docId, Float docSize) const = 0;
|
|
30
32
|
|
|
31
33
|
virtual void optimizeCoef(
|
|
32
|
-
const
|
|
34
|
+
const Matrix& normZ,
|
|
33
35
|
Float mu, Float nuSq,
|
|
34
|
-
Eigen::Block<
|
|
36
|
+
Eigen::Block<Matrix, -1, 1, true> ys
|
|
35
37
|
) = 0;
|
|
36
38
|
|
|
37
39
|
virtual double getLL(Float y, const Eigen::Matrix<_WeightType, -1, 1>& numByTopic,
|
|
@@ -69,8 +71,13 @@ namespace tomoto
|
|
|
69
71
|
|
|
70
72
|
ISLDAModel::GLM getType() const override { return ISLDAModel::GLM::linear; }
|
|
71
73
|
|
|
74
|
+
std::unique_ptr<GLMFunctor<_WeightType>> copy() const override
|
|
75
|
+
{
|
|
76
|
+
return std::make_unique<LinearFunctor>(*this);
|
|
77
|
+
}
|
|
78
|
+
|
|
72
79
|
void updateZLL(
|
|
73
|
-
|
|
80
|
+
Vector& zLikelihood,
|
|
74
81
|
Float y, const Eigen::Matrix<_WeightType, -1, 1>& numByTopic, size_t docId, Float docSize) const override
|
|
75
82
|
{
|
|
76
83
|
Float yErr = y -
|
|
@@ -81,14 +88,14 @@ namespace tomoto
|
|
|
81
88
|
}
|
|
82
89
|
|
|
83
90
|
void optimizeCoef(
|
|
84
|
-
const
|
|
91
|
+
const Matrix& normZ,
|
|
85
92
|
Float mu, Float nuSq,
|
|
86
|
-
Eigen::Block<
|
|
93
|
+
Eigen::Block<Matrix, -1, 1, true> ys
|
|
87
94
|
) override
|
|
88
95
|
{
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
normZZT +=
|
|
96
|
+
Matrix selectedNormZ = normZ.array().rowwise() * (!ys.array().transpose().isNaN()).template cast<Float>();
|
|
97
|
+
Matrix normZZT = selectedNormZ * selectedNormZ.transpose();
|
|
98
|
+
normZZT += Matrix::Identity(normZZT.cols(), normZZT.cols()) / nuSq;
|
|
92
99
|
this->regressionCoef = normZZT.colPivHouseholderQr().solve(selectedNormZ * ys.array().isNaN().select(0, ys).matrix());
|
|
93
100
|
}
|
|
94
101
|
|
|
@@ -113,17 +120,22 @@ namespace tomoto
|
|
|
113
120
|
struct BinaryLogisticFunctor : public GLMFunctor<_WeightType>
|
|
114
121
|
{
|
|
115
122
|
Float b = 1;
|
|
116
|
-
|
|
123
|
+
Vector omega;
|
|
117
124
|
|
|
118
125
|
BinaryLogisticFunctor(size_t K = 0, Float mu = 0, Float _b = 1, size_t numDocs = 0)
|
|
119
|
-
: GLMFunctor<_WeightType>(K, mu), b(_b), omega{
|
|
126
|
+
: GLMFunctor<_WeightType>(K, mu), b(_b), omega{ Vector::Ones(numDocs) }
|
|
120
127
|
{
|
|
121
128
|
}
|
|
122
129
|
|
|
123
130
|
ISLDAModel::GLM getType() const override { return ISLDAModel::GLM::binary_logistic; }
|
|
124
131
|
|
|
132
|
+
std::unique_ptr<GLMFunctor<_WeightType>> copy() const override
|
|
133
|
+
{
|
|
134
|
+
return std::make_unique<BinaryLogisticFunctor>(*this);
|
|
135
|
+
}
|
|
136
|
+
|
|
125
137
|
void updateZLL(
|
|
126
|
-
|
|
138
|
+
Vector& zLikelihood,
|
|
127
139
|
Float y, const Eigen::Matrix<_WeightType, -1, 1>& numByTopic, size_t docId, Float docSize) const override
|
|
128
140
|
{
|
|
129
141
|
Float yErr = b * (y - 0.5f) -
|
|
@@ -134,18 +146,18 @@ namespace tomoto
|
|
|
134
146
|
}
|
|
135
147
|
|
|
136
148
|
void optimizeCoef(
|
|
137
|
-
const
|
|
149
|
+
const Matrix& normZ,
|
|
138
150
|
Float mu, Float nuSq,
|
|
139
|
-
Eigen::Block<
|
|
151
|
+
Eigen::Block<Matrix, -1, 1, true> ys
|
|
140
152
|
) override
|
|
141
153
|
{
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
normZZT +=
|
|
154
|
+
Matrix selectedNormZ = normZ.array().rowwise() * (!ys.array().transpose().isNaN()).template cast<Float>();
|
|
155
|
+
Matrix normZZT = selectedNormZ * Eigen::DiagonalMatrix<Float, -1>{ omega } * selectedNormZ.transpose();
|
|
156
|
+
normZZT += Matrix::Identity(normZZT.cols(), normZZT.cols()) / nuSq;
|
|
145
157
|
|
|
146
158
|
this->regressionCoef = normZZT
|
|
147
159
|
.colPivHouseholderQr().solve(selectedNormZ * ys.array().isNaN().select(0, b * (ys.array() - 0.5f)).matrix()
|
|
148
|
-
+
|
|
160
|
+
+ Vector::Constant(selectedNormZ.rows(), mu / nuSq));
|
|
149
161
|
|
|
150
162
|
RandGen rng;
|
|
151
163
|
for (size_t i = 0; i < (size_t)omega.size(); ++i)
|
|
@@ -173,8 +185,20 @@ namespace tomoto
|
|
|
173
185
|
|
|
174
186
|
DEFINE_SERIALIZER_AFTER_BASE(GLMFunctor<_WeightType>, b, omega);
|
|
175
187
|
};
|
|
188
|
+
|
|
189
|
+
struct CopyGLMFunctor
|
|
190
|
+
{
|
|
191
|
+
template<typename Wt>
|
|
192
|
+
std::vector<std::unique_ptr<GLMFunctor<Wt>>> operator()(const std::vector<std::unique_ptr<GLMFunctor<Wt>>>& o)
|
|
193
|
+
{
|
|
194
|
+
std::vector<std::unique_ptr<GLMFunctor<Wt>>> ret;
|
|
195
|
+
for (auto& p : o) ret.emplace_back(p->copy());
|
|
196
|
+
return ret;
|
|
197
|
+
}
|
|
198
|
+
};
|
|
176
199
|
}
|
|
177
200
|
|
|
201
|
+
|
|
178
202
|
template<TermWeight _tw, typename _RandGen,
|
|
179
203
|
size_t _Flags = flags::partitioned_multisampling,
|
|
180
204
|
typename _Interface = ISLDAModel,
|
|
@@ -198,12 +222,12 @@ namespace tomoto
|
|
|
198
222
|
std::vector<ISLDAModel::GLM> varTypes;
|
|
199
223
|
std::vector<Float> glmParam;
|
|
200
224
|
|
|
201
|
-
|
|
202
|
-
|
|
225
|
+
Vector mu; // Mean of regression coefficients, Dim : (F)
|
|
226
|
+
Vector nuSq; // Variance of regression coefficients, Dim : (F)
|
|
203
227
|
|
|
204
|
-
std::vector<std::unique_ptr<detail::GLMFunctor<WeightType
|
|
205
|
-
|
|
206
|
-
|
|
228
|
+
DelegateCopy<std::vector<std::unique_ptr<detail::GLMFunctor<WeightType>>>, detail::CopyGLMFunctor> responseVars;
|
|
229
|
+
Matrix normZ; // topic proportions for all docs, Dim : (K, D)
|
|
230
|
+
Matrix Ys; // response variables, Dim : (D, F)
|
|
207
231
|
|
|
208
232
|
template<bool _asymEta>
|
|
209
233
|
Float* getZLikelihoods(_ModelState& ld, const _DocType& doc, size_t docId, size_t vid) const
|
|
@@ -299,11 +323,11 @@ namespace tomoto
|
|
|
299
323
|
switch (varTypes[f])
|
|
300
324
|
{
|
|
301
325
|
case ISLDAModel::GLM::linear:
|
|
302
|
-
v = make_unique<detail::LinearFunctor<WeightType>>(this->K, mu[f],
|
|
326
|
+
v = std::make_unique<detail::LinearFunctor<WeightType>>(this->K, mu[f],
|
|
303
327
|
f < glmParam.size() ? glmParam[f] : 1.f);
|
|
304
328
|
break;
|
|
305
329
|
case ISLDAModel::GLM::binary_logistic:
|
|
306
|
-
v = make_unique<detail::BinaryLogisticFunctor<WeightType>>(this->K, mu[f],
|
|
330
|
+
v = std::make_unique<detail::BinaryLogisticFunctor<WeightType>>(this->K, mu[f],
|
|
307
331
|
f < glmParam.size() ? glmParam[f] : 1.f, this->docs.size());
|
|
308
332
|
break;
|
|
309
333
|
}
|
|
@@ -322,22 +346,48 @@ namespace tomoto
|
|
|
322
346
|
DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 0, F, responseVars, mu, nuSq);
|
|
323
347
|
DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, F, responseVars, mu, nuSq);
|
|
324
348
|
|
|
325
|
-
SLDAModel(
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
const std::vector<Float>& _glmParam = {},
|
|
329
|
-
size_t _rg = std::random_device{}())
|
|
330
|
-
: BaseClass(_K, _alpha, _eta, _rg), F(vars.size()), varTypes(vars),
|
|
331
|
-
glmParam(_glmParam)
|
|
349
|
+
SLDAModel(const SLDAArgs& args)
|
|
350
|
+
: BaseClass(args), F(args.vars.size()), varTypes(args.vars),
|
|
351
|
+
glmParam(args.glmParam)
|
|
332
352
|
{
|
|
333
353
|
for (auto t : varTypes)
|
|
334
354
|
{
|
|
335
|
-
if (t
|
|
355
|
+
if ((size_t)t > (size_t)ISLDAModel::GLM::binary_logistic) THROW_ERROR_WITH_INFO(exc::InvalidArgument, "unknown var GLM type in `vars`");
|
|
356
|
+
}
|
|
357
|
+
|
|
358
|
+
if (args.mu.size() == 0)
|
|
359
|
+
{
|
|
360
|
+
mu = Vector::Zero(F);
|
|
361
|
+
}
|
|
362
|
+
else if (args.mu.size() == 1)
|
|
363
|
+
{
|
|
364
|
+
mu = Vector::Constant(F, args.mu[0]);
|
|
365
|
+
}
|
|
366
|
+
else if (args.mu.size() == F)
|
|
367
|
+
{
|
|
368
|
+
mu = Eigen::Map<const Vector>(args.mu.data(), args.mu.size());
|
|
369
|
+
}
|
|
370
|
+
else
|
|
371
|
+
{
|
|
372
|
+
THROW_ERROR_WITH_INFO(exc::InvalidArgument, text::format("wrong mu value (len = %zd)", args.mu.size()));
|
|
373
|
+
}
|
|
374
|
+
|
|
375
|
+
if (args.nuSq.size() == 0)
|
|
376
|
+
{
|
|
377
|
+
nuSq = Vector::Ones(F);
|
|
378
|
+
}
|
|
379
|
+
else if (args.mu.size() == 1)
|
|
380
|
+
{
|
|
381
|
+
nuSq = Vector::Constant(F, args.nuSq[0]);
|
|
382
|
+
}
|
|
383
|
+
else if (args.mu.size() == F)
|
|
384
|
+
{
|
|
385
|
+
nuSq = Eigen::Map<const Vector>(args.nuSq.data(), args.nuSq.size());
|
|
386
|
+
}
|
|
387
|
+
else
|
|
388
|
+
{
|
|
389
|
+
THROW_ERROR_WITH_INFO(exc::InvalidArgument, text::format("wrong nuSq value (len = %zd)", args.nuSq.size()));
|
|
336
390
|
}
|
|
337
|
-
mu = decltype(mu)::Zero(F);
|
|
338
|
-
std::copy(_mu.begin(), _mu.end(), mu.data());
|
|
339
|
-
nuSq = decltype(nuSq)::Ones(F);
|
|
340
|
-
std::copy(_nuSq.begin(), _nuSq.end(), nuSq.data());
|
|
341
391
|
}
|
|
342
392
|
|
|
343
393
|
std::vector<Float> getRegressionCoef(size_t f) const override
|
|
@@ -385,7 +435,7 @@ namespace tomoto
|
|
|
385
435
|
std::unique_ptr<DocumentBase> makeDoc(const RawDoc& rawDoc, const RawDocTokenizer::Factory& tokenizer) const override
|
|
386
436
|
{
|
|
387
437
|
auto doc = as_mutable(this)->template _makeFromRawDoc<true>(rawDoc, tokenizer);
|
|
388
|
-
return make_unique<_DocType>(as_mutable(this)->template _updateDoc<true>(doc, rawDoc.template getMiscDefault<std::vector<Float>>("y")));
|
|
438
|
+
return std::make_unique<_DocType>(as_mutable(this)->template _updateDoc<true>(doc, rawDoc.template getMiscDefault<std::vector<Float>>("y")));
|
|
389
439
|
}
|
|
390
440
|
|
|
391
441
|
size_t addDoc(const RawDoc& rawDoc) override
|
|
@@ -397,7 +447,7 @@ namespace tomoto
|
|
|
397
447
|
std::unique_ptr<DocumentBase> makeDoc(const RawDoc& rawDoc) const override
|
|
398
448
|
{
|
|
399
449
|
auto doc = as_mutable(this)->template _makeFromRawDoc<true>(rawDoc);
|
|
400
|
-
return make_unique<_DocType>(as_mutable(this)->template _updateDoc<true>(doc, rawDoc.template getMiscDefault<std::vector<Float>>("y")));
|
|
450
|
+
return std::make_unique<_DocType>(as_mutable(this)->template _updateDoc<true>(doc, rawDoc.template getMiscDefault<std::vector<Float>>("y")));
|
|
401
451
|
}
|
|
402
452
|
|
|
403
453
|
std::vector<Float> estimateVars(const DocumentBase* doc) const override
|
|
@@ -424,10 +474,10 @@ namespace tomoto
|
|
|
424
474
|
switch ((ISLDAModel::GLM)(t - 1))
|
|
425
475
|
{
|
|
426
476
|
case ISLDAModel::GLM::linear:
|
|
427
|
-
p = make_unique<LinearFunctor<_WeightType>>();
|
|
477
|
+
p = std::make_unique<LinearFunctor<_WeightType>>();
|
|
428
478
|
break;
|
|
429
479
|
case ISLDAModel::GLM::binary_logistic:
|
|
430
|
-
p = make_unique<BinaryLogisticFunctor<_WeightType>>();
|
|
480
|
+
p = std::make_unique<BinaryLogisticFunctor<_WeightType>>();
|
|
431
481
|
break;
|
|
432
482
|
default:
|
|
433
483
|
throw std::ios_base::failure(text::format("wrong GLMFunctor type id %d", (t - 1)));
|