RubyGems - tomoto - Versions diffs - 0.2.2 → 0.2.3 - Mend

tomoto 0.2.2 → 0.2.3

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.

Files changed (45)

checksums.yaml +4 -4
data/CHANGELOG.md +4 -0
data/ext/tomoto/ct.cpp +11 -11
data/ext/tomoto/dmr.cpp +14 -13
data/ext/tomoto/dt.cpp +14 -14
data/ext/tomoto/ext.cpp +7 -7
data/ext/tomoto/extconf.rb +1 -3
data/ext/tomoto/gdmr.cpp +7 -7
data/ext/tomoto/hdp.cpp +9 -9
data/ext/tomoto/hlda.cpp +13 -13
data/ext/tomoto/hpa.cpp +5 -5
data/ext/tomoto/lda.cpp +42 -39
data/ext/tomoto/llda.cpp +6 -6
data/ext/tomoto/mglda.cpp +15 -15
data/ext/tomoto/pa.cpp +6 -6
data/ext/tomoto/plda.cpp +6 -6
data/ext/tomoto/slda.cpp +8 -8
data/ext/tomoto/utils.h +16 -70
data/lib/tomoto/version.rb +1 -1
data/vendor/tomotopy/README.kr.rst +57 -0
data/vendor/tomotopy/README.rst +55 -0
data/vendor/tomotopy/src/Labeling/Phraser.hpp +3 -3
data/vendor/tomotopy/src/TopicModel/CTModel.hpp +5 -2
data/vendor/tomotopy/src/TopicModel/DMRModel.hpp +5 -2
data/vendor/tomotopy/src/TopicModel/DTModel.hpp +5 -2
data/vendor/tomotopy/src/TopicModel/HDPModel.hpp +4 -4
data/vendor/tomotopy/src/TopicModel/HLDAModel.hpp +5 -2
data/vendor/tomotopy/src/TopicModel/HPAModel.hpp +2 -2
data/vendor/tomotopy/src/TopicModel/LDA.h +3 -3
data/vendor/tomotopy/src/TopicModel/LDACVB0Model.hpp +3 -3
data/vendor/tomotopy/src/TopicModel/LDAModel.hpp +34 -14
data/vendor/tomotopy/src/TopicModel/LLDAModel.hpp +5 -2
data/vendor/tomotopy/src/TopicModel/MGLDAModel.hpp +2 -2
data/vendor/tomotopy/src/TopicModel/PAModel.hpp +1 -1
data/vendor/tomotopy/src/TopicModel/PLDAModel.hpp +5 -2
data/vendor/tomotopy/src/TopicModel/PTModel.hpp +5 -2
data/vendor/tomotopy/src/TopicModel/SLDAModel.hpp +4 -1
data/vendor/tomotopy/src/TopicModel/TopicModel.hpp +48 -21
data/vendor/tomotopy/src/Utils/AliasMethod.hpp +5 -4
data/vendor/tomotopy/src/Utils/Dictionary.h +2 -2
data/vendor/tomotopy/src/Utils/MultiNormalDistribution.hpp +1 -1
data/vendor/tomotopy/src/Utils/TruncMultiNormal.hpp +1 -1
data/vendor/tomotopy/src/Utils/math.h +2 -2
data/vendor/tomotopy/src/Utils/serializer.hpp +30 -5
metadata +6 -6

data/vendor/tomotopy/src/TopicModel/HLDAModel.hpp CHANGED Viewed

@@ -335,7 +335,10 @@ namespace tomoto
 		friend typename BaseClass::BaseClass;
 		using WeightType = typename BaseClass::WeightType;
-		static constexpr char TMID[] = "hLDA";
+		static constexpr auto tmid()
+		{
+			return serializer::to_key("hLDA");
+		}
 		Float gamma;
@@ -422,7 +425,7 @@ namespace tomoto
 		}
 		template<int _inc>
-		inline void addWordTo(_ModelState& ld, _DocType& doc, uint32_t pid, Vid vid, Tid level) const
+		inline void addWordTo(_ModelState& ld, _DocType& doc, size_t pid, Vid vid, Tid level) const
 		{
 			assert(vid < this->realV);
 			constexpr bool _dec = _inc < 0 && _tw != TermWeight::one;

data/vendor/tomotopy/src/TopicModel/HPAModel.hpp CHANGED Viewed

@@ -143,7 +143,7 @@ namespace tomoto
 		}
 		template<int _inc>
-		inline void addWordTo(_ModelState& ld, _DocType& doc, uint32_t pid, Vid vid, Tid z1, Tid z2) const
+		inline void addWordTo(_ModelState& ld, _DocType& doc, size_t pid, Vid vid, Tid z1, Tid z2) const
 		{
 			assert(vid < this->realV);
 			constexpr bool _dec = _inc < 0 && _tw != TermWeight::one;
@@ -540,7 +540,7 @@ namespace tomoto
 			return ret;
 		}
-		std::vector<Float> getTopicsByDoc(const _DocType& doc, bool normalize) const
+		std::vector<Float> _getTopicsByDoc(const _DocType& doc, bool normalize) const
 		{
 			std::vector<Float> ret(1 + this->K + K2);
 			Float sum = doc.getSumWordWeight() + this->alphas.sum();

data/vendor/tomotopy/src/TopicModel/LDA.h CHANGED Viewed

@@ -121,7 +121,7 @@ namespace tomoto
 		void updateSumWordWeight(size_t realV)
 		{
-			sumWordWeight = std::count_if(static_cast<_Base*>(this)->words.begin(), static_cast<_Base*>(this)->words.end(), [realV](Vid w)
+			sumWordWeight = (int32_t)std::count_if(static_cast<_Base*>(this)->words.begin(), static_cast<_Base*>(this)->words.end(), [realV](Vid w)
 			{
 				return w < realV;
 			});
@@ -164,8 +164,8 @@ namespace tomoto
 	struct LDAArgs
 	{
 		size_t k = 1;
-		std::vector<Float> alpha = { 0.1 };
-		Float eta = 0.01;
+		std::vector<Float> alpha = { (Float)0.1 };
+		Float eta = (Float)0.01;
 		size_t seed = std::random_device{}();
 	};

data/vendor/tomotopy/src/TopicModel/LDACVB0Model.hpp CHANGED Viewed

@@ -82,7 +82,7 @@ namespace tomoto
 		friend BaseClass;
 		static constexpr const char TWID[] = "one\0";
-		static constexpr static constexpr char TMID[] = "LDA\0";
+		static constexpr const char TMID[] = "LDA\0";
 		Float alpha;
 		Vector alphas;
@@ -125,7 +125,7 @@ namespace tomoto
 		}
 		template<int _Inc, typename _Vec>
-		inline void addWordTo(_ModelState& ld, _DocType& doc, uint32_t pid, Vid vid, _Vec tDist) const
+		inline void addWordTo(_ModelState& ld, _DocType& doc, size_t pid, Vid vid, _Vec tDist) const
 		{
 			assert(vid < this->realV);
 			constexpr bool _dec = _Inc < 0;
@@ -392,7 +392,7 @@ namespace tomoto
 			return static_cast<const DerivedClass*>(this)->_getTopicsCount();
 		}
-		std::vector<Float> getTopicsByDoc(const _DocType& doc) const
+		std::vector<Float> _getTopicsByDoc(const _DocType& doc) const
 		{
 			std::vector<Float> ret(K);
 			Float sum = doc.getSumWordWeight() + K * alpha;

data/vendor/tomotopy/src/TopicModel/LDAModel.hpp CHANGED Viewed

@@ -117,19 +117,28 @@ namespace tomoto
 	template<>
 	struct TwId<TermWeight::one>
 	{
-		static constexpr char TWID[] = "one\0";
+		static constexpr auto twid()
+		{
+			return serializer::to_key("one\0");
+		}
 	};
 	template<>
 	struct TwId<TermWeight::idf>
 	{
-		static constexpr char TWID[] = "idf\0";
+		static constexpr auto twid()
+		{
+			return serializer::to_key("idf\0");
+		}
 	};
 	template<>
 	struct TwId<TermWeight::pmi>
 	{
-		static constexpr char TWID[] = "pmi\0";
+		static constexpr auto twid()
+		{
+			return serializer::to_key("pmi\0");
+		}
 	};
 	// to make HDP friend of LDA for HDPModel::converToLDA
@@ -169,7 +178,11 @@ namespace tomoto
 			typename>
 		friend class HDPModel;
-		static constexpr char TMID[] = "LDA\0";
+		static constexpr auto tmid()
+		{
+			return serializer::to_key("LDA\0");
+		}
 		using WeightType = typename std::conditional<_tw == TermWeight::one, int32_t, float>::type;
 		enum { m_flags = _Flags };
@@ -189,7 +202,7 @@ namespace tomoto
 		struct ExtraDocData
 		{
 			std::vector<Vid> vChunkOffset;
-			Eigen::Matrix<uint32_t, -1, -1> chunkOffsetByDoc;
+			Eigen::Matrix<size_t, -1, -1> chunkOffsetByDoc;
 		};
 		ExtraDocData eddTrain;
@@ -261,7 +274,7 @@ namespace tomoto
 		}
 		template<int _inc>
-		inline void addWordTo(_ModelState& ld, _DocType& doc, uint32_t pid, Vid vid, Tid tid) const
+		inline void addWordTo(_ModelState& ld, _DocType& doc, size_t pid, Vid vid, Tid tid) const
 		{
 			assert(tid < K);
 			assert(vid < this->realV);
@@ -620,7 +633,7 @@ namespace tomoto
 					for (Vid v = 0; v < V; ++v)
 					{
 						if (!ld.numByTopicWord(k, v)) continue;
-						ll += math::lgammaT(ld.numByTopicWord(k, v) + etaByTopicWord(v, k)) - math::lgammaT(etaByTopicWord(v, k));
+						ll += math::lgammaT(ld.numByTopicWord(k, v) + etaByTopicWord(k, v)) - math::lgammaT(etaByTopicWord(k, v));
 						assert(std::isfinite(ll));
 					}
 				}
@@ -972,12 +985,14 @@ namespace tomoto
 		void setOptimInterval(size_t _optimInterval) override
 		{
-			optimInterval = _optimInterval;
+			if (_optimInterval > 0x7FFFFFFF) THROW_ERROR_WITH_INFO(exc::InvalidArgument, "wrong value");
+			optimInterval = (uint32_t)_optimInterval;
 		}
 		void setBurnInIteration(size_t iteration) override
 		{
-			burnIn = iteration;
+			if (iteration > 0x7FFFFFFF) THROW_ERROR_WITH_INFO(exc::InvalidArgument, "wrong value");
+			burnIn = (uint32_t)iteration;
 		}
 		size_t addDoc(const RawDoc& rawDoc, const RawDocTokenizer::Factory& tokenizer) override
@@ -1008,6 +1023,11 @@ namespace tomoto
 				if (p < 0) THROW_ERROR_WITH_INFO(exc::InvalidArgument, "priors must not be less than 0.");
 			}
 			this->dict.add(word);
+			if (this->dict.size() > this->vocabCf.size())
+			{
+				this->vocabCf.resize(this->dict.size());
+				this->vocabDf.resize(this->dict.size());
+			}
 			etaByWord.emplace(word, priors);
 		}
@@ -1049,7 +1069,7 @@ namespace tomoto
 			if (initDocs)
 			{
 				std::vector<uint32_t> df, cf, tf;
-				uint32_t totCf;
+				size_t totCf;
 				// calculate weighting
 				if (_tw != TermWeight::one)
@@ -1064,14 +1084,14 @@ namespace tomoto
 							++df[w];
 						}
 					}
-					totCf = accumulate(this->vocabCf.begin(), this->vocabCf.end(), 0);
+					totCf = std::accumulate(this->vocabCf.begin(), this->vocabCf.end(), 0);
 				}
 				if (_tw == TermWeight::idf)
 				{
 					vocabWeights.resize(V);
 					for (size_t i = 0; i < V; ++i)
 					{
-						vocabWeights[i] = log(this->docs.size() / (Float)df[i]);
+						vocabWeights[i] = (Float)log(this->docs.size() / (double)df[i]);
 					}
 				}
 				else if (_tw == TermWeight::pmi)
@@ -1079,7 +1099,7 @@ namespace tomoto
 					vocabWeights.resize(V);
 					for (size_t i = 0; i < V; ++i)
 					{
-						vocabWeights[i] = this->vocabCf[i] / (float)totCf;
+						vocabWeights[i] = (Float)(this->vocabCf[i] / (double)totCf);
 					}
 				}
@@ -1104,7 +1124,7 @@ namespace tomoto
 			return static_cast<const DerivedClass*>(this)->_getTopicsCount();
 		}
-		std::vector<Float> getTopicsByDoc(const _DocType& doc, bool normalize) const
+		std::vector<Float> _getTopicsByDoc(const _DocType& doc, bool normalize) const
 		{
 			std::vector<Float> ret(K);
 			Eigen::Map<Eigen::Array<Float, -1, 1>> m{ ret.data(), K };

data/vendor/tomotopy/src/TopicModel/LLDAModel.hpp CHANGED Viewed

@@ -26,7 +26,10 @@ namespace tomoto
 		friend typename BaseClass::BaseClass;
 		using WeightType = typename BaseClass::WeightType;
-		static constexpr char TMID[] = "LLDA";
+		static constexpr auto tmid()
+		{
+			return serializer::to_key("LLDA");
+		}
 		Dictionary topicLabelDict;
@@ -171,7 +174,7 @@ namespace tomoto
 			return std::make_unique<_DocType>(as_mutable(this)->template _updateDoc<true>(doc, rawDoc.template getMiscDefault<std::vector<std::string>>("labels")));
 		}
-		std::vector<Float> getTopicsByDoc(const _DocType& doc, bool normalize) const
+		std::vector<Float> _getTopicsByDoc(const _DocType& doc, bool normalize) const
 		{
 			std::vector<Float> ret(this->K);
 			auto maskedAlphas = this->alphas.array() * doc.labelMask.template cast<Float>().array();

data/vendor/tomotopy/src/TopicModel/MGLDAModel.hpp CHANGED Viewed

@@ -63,7 +63,7 @@ namespace tomoto
 		}
 		template<int _inc>
-		inline void addWordTo(_ModelState& ld, _DocType& doc, uint32_t pid, Vid vid, Tid tid, uint16_t s, uint8_t w, uint8_t r) const
+		inline void addWordTo(_ModelState& ld, _DocType& doc, size_t pid, Vid vid, Tid tid, uint16_t s, uint8_t w, uint8_t r) const
 		{
 			const auto K = this->K;
@@ -527,7 +527,7 @@ namespace tomoto
 			this->etaByWord.emplace(word, priors);
 		}
-		std::vector<Float> getTopicsByDoc(const _DocType& doc, bool normalize) const
+		std::vector<Float> _getTopicsByDoc(const _DocType& doc, bool normalize) const
 		{
 			std::vector<Float> ret(this->K + KL);
 			Eigen::Map<Eigen::Array<Float, -1, 1>> m{ ret.data(), this->K + KL };

data/vendor/tomotopy/src/TopicModel/PAModel.hpp CHANGED Viewed

@@ -90,7 +90,7 @@ namespace tomoto
 		}
 		template<int _inc>
-		inline void addWordTo(_ModelState& ld, _DocType& doc, uint32_t pid, Vid vid, Tid z1, Tid z2) const
+		inline void addWordTo(_ModelState& ld, _DocType& doc, size_t pid, Vid vid, Tid z1, Tid z2) const
 		{
 			assert(vid < this->realV);
 			constexpr bool _dec = _inc < 0 && _tw != TermWeight::one;

data/vendor/tomotopy/src/TopicModel/PLDAModel.hpp CHANGED Viewed

@@ -26,7 +26,10 @@ namespace tomoto
 		friend typename BaseClass::BaseClass;
 		using WeightType = typename BaseClass::WeightType;
-		static constexpr char TMID[] = "PLDA";
+		static constexpr auto tmid()
+		{
+			return serializer::to_key("PLDA");
+		}
 		Dictionary topicLabelDict;
@@ -178,7 +181,7 @@ namespace tomoto
 			return std::make_unique<_DocType>(as_mutable(this)->template _updateDoc<true>(doc, rawDoc.template getMiscDefault<std::vector<std::string>>("labels")));
 		}
-		std::vector<Float> getTopicsByDoc(const _DocType& doc, bool normalize) const
+		std::vector<Float> _getTopicsByDoc(const _DocType& doc, bool normalize) const
 		{
 			std::vector<Float> ret(this->K);
 			auto maskedAlphas = this->alphas.array() * doc.labelMask.template cast<Float>().array();

data/vendor/tomotopy/src/TopicModel/PTModel.hpp CHANGED Viewed

@@ -38,7 +38,10 @@ namespace tomoto
 		friend typename BaseClass::BaseClass;
 		using WeightType = typename BaseClass::WeightType;
-		static constexpr char TMID[] = "PTM";
+		static constexpr auto tmid()
+		{
+			return serializer::to_key("PTM");
+		}
 		uint64_t numPDocs;
 		Float lambda;
@@ -261,7 +264,7 @@ namespace tomoto
 		{
 		}
-		std::vector<Float> getTopicsByDoc(const _DocType& doc, bool normalize) const
+		std::vector<Float> _getTopicsByDoc(const _DocType& doc, bool normalize) const
 		{
 			std::vector<Float> ret(this->K);
 			Eigen::Map<Eigen::Array<Float, -1, 1>> m{ ret.data(), this->K };

data/vendor/tomotopy/src/TopicModel/SLDAModel.hpp CHANGED Viewed

@@ -216,7 +216,10 @@ namespace tomoto
 		friend typename BaseClass::BaseClass;
 		using WeightType = typename BaseClass::WeightType;
-		static constexpr char TMID[] = "SLDA";
+		static constexpr auto tmid()
+		{
+			return serializer::to_key("SLDA");
+		}
 		uint64_t F; // number of response variables
 		std::vector<ISLDAModel::GLM> varTypes;

data/vendor/tomotopy/src/TopicModel/TopicModel.hpp CHANGED Viewed

@@ -249,6 +249,7 @@ namespace tomoto
 		virtual size_t getNumDocs() const = 0;
 		virtual const Dictionary& getVocabDict() const = 0;
 		virtual const std::vector<uint64_t>& getVocabCf() const = 0;
+		virtual std::vector<double> getVocabWeightedCf() const = 0;
 		virtual const std::vector<uint64_t>& getVocabDf() const = 0;
 		virtual int train(size_t iteration, size_t numWorkers, ParallelScheme ps = ParallelScheme::default_, bool freeze_topics = false) = 0;
@@ -319,6 +320,7 @@ namespace tomoto
 		Dictionary dict;
 		uint64_t realV = 0; // vocab size after removing stopwords
 		uint64_t realN = 0; // total word size after removing stopwords
+		double weightedN = 0;
 		size_t maxThreads[(size_t)ParallelScheme::size] = { 0, };
 		size_t minWordCf = 0, minWordDf = 0, removeTopN = 0;
@@ -327,15 +329,17 @@ namespace tomoto
 		void _saveModel(std::ostream& writer, bool fullModel, const std::vector<uint8_t>* extra_data) const
 		{
 			serializer::writeMany(writer,
-				serializer::to_keyz(static_cast<const _Derived*>(this)->TMID),
-				serializer::to_keyz(static_cast<const _Derived*>(this)->TWID));
+				serializer::to_keyz(static_cast<const _Derived*>(this)->tmid()),
+				serializer::to_keyz(static_cast<const _Derived*>(this)->twid())
+			);
 			serializer::writeTaggedMany(writer, 0x00010001,
 				serializer::to_keyz("dict"), dict,
 				serializer::to_keyz("vocabCf"), vocabCf,
 				serializer::to_keyz("vocabDf"), vocabDf,
 				serializer::to_keyz("realV"), realV,
 				serializer::to_keyz("globalStep"), globalStep,
-				serializer::to_keyz("extra"), extra_data ? *extra_data : std::vector<uint8_t>(0));
+				serializer::to_keyz("extra"), extra_data ? *extra_data : std::vector<uint8_t>(0)
+			);
 			serializer::writeMany(writer, *static_cast<const _Derived*>(this));
 			globalState.serializerWrite(writer);
 			if (fullModel)
@@ -355,8 +359,9 @@ namespace tomoto
 			{
 				std::vector<uint8_t> extra;
 				serializer::readMany(reader,
-					serializer::to_keyz(static_cast<_Derived*>(this)->TMID),
-					serializer::to_keyz(static_cast<_Derived*>(this)->TWID));
+					serializer::to_keyz(static_cast<_Derived*>(this)->tmid()),
+					serializer::to_keyz(static_cast<_Derived*>(this)->twid())
+				);
 				serializer::readTaggedMany(reader, 0x00010001,
 					serializer::to_keyz("dict"), dict,
 					serializer::to_keyz("vocabCf"), vocabCf,
@@ -370,14 +375,17 @@ namespace tomoto
 			{
 				reader.seekg(start_pos);
 				serializer::readMany(reader,
-					serializer::to_key(static_cast<_Derived*>(this)->TMID),
-					serializer::to_key(static_cast<_Derived*>(this)->TWID),
-					dict, vocabCf, realV);
+					serializer::to_key(static_cast<_Derived*>(this)->tmid()),
+					serializer::to_key(static_cast<_Derived*>(this)->twid()),
+					dict, vocabCf, realV
+				);
 			}
 			serializer::readMany(reader, *static_cast<_Derived*>(this));
 			globalState.serializerRead(reader);
 			serializer::readMany(reader, docs);
-			realN = countRealN();
+			auto p = countRealN();
+			realN = p.first;
+			weightedN = p.second;
 		}
 		template<typename _DocTy>
@@ -490,17 +498,23 @@ namespace tomoto
 			}
 		}
-		size_t countRealN() const
+		std::pair<size_t, double> countRealN() const
 		{
 			size_t n = 0;
+			double weighted = 0;
 			for (auto& doc : docs)
 			{
-				for (auto& w : doc.words)
+				for (size_t i = 0; i < doc.words.size(); ++i)
 				{
-					if (w < realV) ++n;
+					auto w = doc.words[i];
+					if (w < realV)
+					{
+						++n;
+						weighted += doc.wordWeights.empty() ? 1 : doc.wordWeights[i];
+					}
 				}
 			}
-			return n;
+			return std::make_pair(n, weighted);
 		}
 		void removeStopwords(size_t minWordCnt, size_t minWordDf, size_t removeTopN)
@@ -544,14 +558,9 @@ namespace tomoto
 			}
 			dict.reorder(order);
-			realN = 0;
 			for (auto& doc : docs)
 			{
-				for (auto& w : doc.words)
-				{
-					w = order[w];
-					if (w < realV) ++realN;
-				}
+				for (auto& w : doc.words) w = order[w];
 			}
 		}
@@ -598,6 +607,10 @@ namespace tomoto
 		void prepare(bool initDocs = true, size_t minWordCnt = 0, size_t minWordDf = 0, size_t removeTopN = 0) override
 		{
+			auto p = countRealN();
+			realN = p.first;
+			weightedN = p.second;
 			maxThreads[(size_t)ParallelScheme::default_] = -1;
 			maxThreads[(size_t)ParallelScheme::none] = -1;
 			maxThreads[(size_t)ParallelScheme::copy_merge] = static_cast<_Derived*>(this)->template estimateMaxThreads<ParallelScheme::copy_merge>();
@@ -697,7 +710,7 @@ namespace tomoto
 		double getLLPerWord() const override
 		{
-			return words.empty() ? 0 : static_cast<const _Derived*>(this)->getLL() / realN;
+			return words.empty() ? 0 : static_cast<const _Derived*>(this)->getLL() / weightedN;
 		}
 		double getPerplexity() const override
@@ -797,7 +810,7 @@ namespace tomoto
 		std::vector<Float> getTopicsByDoc(const DocumentBase* doc, bool normalize) const override
 		{
-			return static_cast<const _Derived*>(this)->getTopicsByDoc(*static_cast<const DocType*>(doc), normalize);
+			return static_cast<const _Derived*>(this)->_getTopicsByDoc(*static_cast<const DocType*>(doc), normalize);
 		}
 		std::vector<std::pair<Tid, Float>> getTopicsByDocSorted(const DocumentBase* doc, size_t topN) const override
@@ -832,6 +845,20 @@ namespace tomoto
 			return vocabCf;
 		}
+		std::vector<double> getVocabWeightedCf() const override
+		{
+			std::vector<double> ret(realV);
+			for (auto& doc : docs)
+			{
+				for (size_t i = 0; i < doc.words.size(); ++i)
+				{
+					if (doc.words[i] >= realV) continue;
+					ret[doc.words[i]] += doc.wordWeights.empty() ? 1 : doc.wordWeights[i];
+				}
+			}
+			return ret;
+		}
 		const std::vector<uint64_t>& getVocabDf() const override
 		{
 			return vocabDf;