RubyGems - tomoto - Versions diffs - 0.3.3 → 0.4.1 - Mend

tomoto 0.3.3 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (61)

checksums.yaml +4 -4
data/CHANGELOG.md +11 -0
data/README.md +1 -1
data/ext/tomoto/extconf.rb +4 -2
data/lib/tomoto/version.rb +1 -1
data/lib/tomoto.rb +14 -14
data/vendor/tomotopy/README.kr.rst +27 -1
data/vendor/tomotopy/README.rst +27 -1
data/vendor/tomotopy/src/TopicModel/CT.h +2 -2
data/vendor/tomotopy/src/TopicModel/CTModel.cpp +5 -0
data/vendor/tomotopy/src/TopicModel/CTModel.hpp +1 -0
data/vendor/tomotopy/src/TopicModel/DMR.h +2 -2
data/vendor/tomotopy/src/TopicModel/DMRModel.cpp +5 -0
data/vendor/tomotopy/src/TopicModel/DMRModel.hpp +1 -0
data/vendor/tomotopy/src/TopicModel/DT.h +2 -2
data/vendor/tomotopy/src/TopicModel/DTModel.cpp +5 -0
data/vendor/tomotopy/src/TopicModel/DTModel.hpp +4 -0
data/vendor/tomotopy/src/TopicModel/GDMR.h +2 -2
data/vendor/tomotopy/src/TopicModel/GDMRModel.cpp +5 -0
data/vendor/tomotopy/src/TopicModel/GDMRModel.hpp +1 -0
data/vendor/tomotopy/src/TopicModel/HDP.h +2 -2
data/vendor/tomotopy/src/TopicModel/HDPModel.cpp +5 -0
data/vendor/tomotopy/src/TopicModel/HDPModel.hpp +2 -0
data/vendor/tomotopy/src/TopicModel/HLDA.h +2 -2
data/vendor/tomotopy/src/TopicModel/HLDAModel.cpp +5 -0
data/vendor/tomotopy/src/TopicModel/HLDAModel.hpp +9 -0
data/vendor/tomotopy/src/TopicModel/HPA.h +2 -2
data/vendor/tomotopy/src/TopicModel/HPAModel.cpp +5 -0
data/vendor/tomotopy/src/TopicModel/HPAModel.hpp +2 -0
data/vendor/tomotopy/src/TopicModel/LDA.h +8 -2
data/vendor/tomotopy/src/TopicModel/LDAModel.cpp +5 -0
data/vendor/tomotopy/src/TopicModel/LDAModel.hpp +8 -0
data/vendor/tomotopy/src/TopicModel/LLDA.h +2 -2
data/vendor/tomotopy/src/TopicModel/LLDAModel.cpp +5 -0
data/vendor/tomotopy/src/TopicModel/LLDAModel.hpp +1 -0
data/vendor/tomotopy/src/TopicModel/MGLDA.h +2 -2
data/vendor/tomotopy/src/TopicModel/MGLDAModel.cpp +5 -0
data/vendor/tomotopy/src/TopicModel/MGLDAModel.hpp +7 -1
data/vendor/tomotopy/src/TopicModel/PA.h +2 -2
data/vendor/tomotopy/src/TopicModel/PAModel.cpp +5 -0
data/vendor/tomotopy/src/TopicModel/PAModel.hpp +7 -0
data/vendor/tomotopy/src/TopicModel/PLDAModel.hpp +1 -0
data/vendor/tomotopy/src/TopicModel/PT.h +3 -3
data/vendor/tomotopy/src/TopicModel/PTModel.cpp +5 -0
data/vendor/tomotopy/src/TopicModel/PTModel.hpp +1 -0
data/vendor/tomotopy/src/TopicModel/SLDA.h +3 -2
data/vendor/tomotopy/src/TopicModel/SLDAModel.cpp +5 -0
data/vendor/tomotopy/src/TopicModel/SLDAModel.hpp +1 -0
data/vendor/tomotopy/src/TopicModel/TopicModel.hpp +83 -3
data/vendor/tomotopy/src/Utils/Dictionary.cpp +102 -0
data/vendor/tomotopy/src/Utils/Dictionary.h +26 -75
data/vendor/tomotopy/src/Utils/EigenAddonOps.hpp +1 -1
data/vendor/tomotopy/src/Utils/Mmap.cpp +146 -0
data/vendor/tomotopy/src/Utils/Mmap.h +139 -0
data/vendor/tomotopy/src/Utils/MultiNormalDistribution.hpp +1 -0
data/vendor/tomotopy/src/Utils/SharedString.cpp +134 -0
data/vendor/tomotopy/src/Utils/SharedString.h +104 -0
data/vendor/tomotopy/src/Utils/serializer.cpp +166 -0
data/vendor/tomotopy/src/Utils/serializer.hpp +261 -85
metadata +12 -7
data/vendor/tomotopy/src/Utils/SharedString.hpp +0 -206

data/vendor/tomotopy/src/TopicModel/PA.h CHANGED Viewed

@@ -15,8 +15,8 @@ namespace tomoto
 		template<typename _TopicModel> void update(WeightType* ptr, const _TopicModel& mdl);
-		DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 0, Z2s);
-		DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 1, 0x00010001, Z2s);
+		DECLARE_SERIALIZER_WITH_VERSION(0);
+		DECLARE_SERIALIZER_WITH_VERSION(1);
 	};
 	struct PAArgs : public LDAArgs

data/vendor/tomotopy/src/TopicModel/PAModel.cpp CHANGED Viewed

@@ -2,6 +2,11 @@
 namespace tomoto
 {
+	DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentPA, BaseDocument, 0, Z2s);
+	DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentPA, BaseDocument, 1, 0x00010001, Z2s);
+	TMT_INSTANTIATE_DOC(DocumentPA);
 	IPAModel* IPAModel::create(TermWeight _weight, const PAArgs& args, bool scalarRng)
 	{
 		TMT_SWITCH_TW(_weight, scalarRng, PAModel, args);

data/vendor/tomotopy/src/TopicModel/PAModel.hpp CHANGED Viewed

@@ -19,6 +19,7 @@ namespace tomoto
 		Vector subTmp;
 		DEFINE_SERIALIZER_AFTER_BASE(ModelStateLDA<_tw>, numByTopic1_2, numByTopic2);
+		DEFINE_HASHER_AFTER_BASE(ModelStateLDA<_tw>, numByTopic1_2, numByTopic2);
 	};
 	template<TermWeight _tw, typename _RandGen,
@@ -364,6 +365,7 @@ namespace tomoto
 	public:
 		DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 0, K2, subAlphas, subAlphaSum);
 		DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, K2, subAlphas, subAlphaSum);
+		DEFINE_HASHER_AFTER_BASE(BaseClass, K2, subAlphas, subAlphaSum);
 		PAModel(const PAArgs& args)
 			: BaseClass(args), K2(args.k2)
@@ -460,6 +462,11 @@ namespace tomoto
 			return ret;
 		}
+		// Number of entries a word-prior vector is expected to have for this model.
+		// This override reports K2 rather than the base-class value.
+		size_t getNumTopicsForPrior() const override
+		{
+			return this->K2;
+		}
 		void setWordPrior(const std::string& word, const std::vector<Float>& priors) override
 		{
 			if (priors.size() != K2) THROW_ERROR_WITH_INFO(exc::InvalidArgument, "priors.size() must be equal to K2.");

data/vendor/tomotopy/src/TopicModel/PLDAModel.hpp CHANGED Viewed

@@ -111,6 +111,7 @@ namespace tomoto
 	public:
 		DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 0, topicLabelDict, numLatentTopics, numTopicsPerLabel);
 		DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, topicLabelDict, numLatentTopics, numTopicsPerLabel);
+		DEFINE_HASHER_AFTER_BASE(BaseClass, topicLabelDict, numLatentTopics, numTopicsPerLabel);
 		PLDAModel(const PLDAArgs& args)
 			: BaseClass(args.setK(1)),

data/vendor/tomotopy/src/TopicModel/PT.h CHANGED Viewed

@@ -11,9 +11,9 @@ namespace tomoto
 		using WeightType = typename DocumentLDA<_tw>::WeightType;
 		uint64_t pseudoDoc = 0;
-		DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 0, pseudoDoc);
-		DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 1, 0x00010001, pseudoDoc);
+		DECLARE_SERIALIZER_WITH_VERSION(0);
+		DECLARE_SERIALIZER_WITH_VERSION(1);
 	};
 	struct PTArgs : public LDAArgs

data/vendor/tomotopy/src/TopicModel/PTModel.cpp CHANGED Viewed

@@ -2,6 +2,11 @@
 namespace tomoto
 {
+	DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentPT, BaseDocument, 0, pseudoDoc);
+	DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentPT, BaseDocument, 1, 0x00010001, pseudoDoc);
+	TMT_INSTANTIATE_DOC(DocumentPT);
 	IPTModel* IPTModel::create(TermWeight _weight, const PTArgs& args, bool scalarRng)
 	{
 		TMT_SWITCH_TW(_weight, scalarRng, PTModel, args);

data/vendor/tomotopy/src/TopicModel/PTModel.hpp CHANGED Viewed

@@ -266,6 +266,7 @@ namespace tomoto
 	public:
 		DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 0, numPDocs, lambda);
 		DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, numPDocs, lambda);
+		DEFINE_HASHER_AFTER_BASE(BaseClass, numPDocs, lambda);
 		GETTER(P, size_t, numPDocs);

data/vendor/tomotopy/src/TopicModel/SLDA.h CHANGED Viewed

@@ -16,8 +16,9 @@ namespace tomoto
 			ret["y"] = y;
 			return ret;
 		}
-		DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 0, y);
-		DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 1, 0x00010001, y);
+		DECLARE_SERIALIZER_WITH_VERSION(0);
+		DECLARE_SERIALIZER_WITH_VERSION(1);
 	};
 	struct SLDAArgs;

data/vendor/tomotopy/src/TopicModel/SLDAModel.cpp CHANGED Viewed

@@ -2,6 +2,11 @@
 namespace tomoto
 {
+	DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentSLDA, BaseDocument, 0, y);
+	DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentSLDA, BaseDocument, 1, 0x00010001, y);
+	TMT_INSTANTIATE_DOC(DocumentSLDA);
     ISLDAModel* ISLDAModel::create(TermWeight _weight, const SLDAArgs& args, bool scalarRng)
 	{
 		TMT_SWITCH_TW(_weight, scalarRng, SLDAModel, args);

data/vendor/tomotopy/src/TopicModel/SLDAModel.hpp CHANGED Viewed

@@ -348,6 +348,7 @@ namespace tomoto
 	public:
 		DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 0, F, responseVars, mu, nuSq);
 		DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, F, responseVars, mu, nuSq);
+		DEFINE_HASHER_AFTER_BASE(BaseClass, F, mu, nuSq);
 		SLDAModel(const SLDAArgs& args)
 			: BaseClass(args), F(args.vars.size()), varTypes(args.vars),

data/vendor/tomotopy/src/TopicModel/TopicModel.hpp CHANGED Viewed

@@ -1,4 +1,4 @@
-#pragma once
+#pragma once
 #include <numeric>
 #include <unordered_set>
 #include "../Utils/Utils.hpp"
@@ -7,7 +7,7 @@
 #include "../Utils/ThreadPool.hpp"
 #include "../Utils/serializer.hpp"
 #include "../Utils/exception.h"
-#include "../Utils/SharedString.hpp"
+#include "../Utils/SharedString.h"
 #include <EigenRand/EigenRand>
 #include <mapbox/variant.hpp>
@@ -107,7 +107,7 @@ namespace tomoto
 		virtual operator RawDoc() const
 		{
-			RawDoc raw{ *this };
+			RawDoc raw{ *static_cast<const RawDocKernel*>(this) };
 			if (wOrder.empty())
 			{
 				raw.words.insert(raw.words.begin(), words.begin(), words.end());
@@ -224,6 +224,8 @@ namespace tomoto
 		virtual void loadModel(std::istream& reader,
 			std::vector<uint8_t>* extra_data = nullptr) = 0;
+		virtual std::array<uint64_t, 2> getHash() const = 0;
 		virtual std::unique_ptr<ITopicModel> copy() const = 0;
 		virtual const DocumentBase* getDoc(size_t docId) const = 0;
@@ -251,14 +253,17 @@ namespace tomoto
 		virtual const std::vector<uint64_t>& getVocabCf() const = 0;
 		virtual std::vector<double> getVocabWeightedCf() const = 0;
 		virtual const std::vector<uint64_t>& getVocabDf() const = 0;
+		virtual const std::vector<std::vector<std::pair<std::string, size_t>>>& getWordFormCnts() const = 0;
 		virtual int train(size_t iteration, size_t numWorkers, ParallelScheme ps = ParallelScheme::default_, bool freeze_topics = false) = 0;
 		virtual size_t getGlobalStep() const = 0;
 		virtual void prepare(bool initDocs = true, size_t minWordCnt = 0, size_t minWordDf = 0, size_t removeTopN = 0, bool updateStopwords = true) = 0;
 		virtual size_t getK() const = 0;
+		virtual size_t getNumTopicsForPrior() const = 0;
 		virtual std::vector<Float> getWidsByTopic(size_t tid, bool normalize = true) const = 0;
 		virtual std::vector<std::pair<std::string, Float>> getWordsByTopicSorted(size_t tid, size_t topN) const = 0;
+		virtual std::vector<std::tuple<std::string, Vid, Float>> getWordIdsByTopicSorted(size_t tid, size_t topN) const = 0;
 		virtual std::vector<std::pair<std::string, Float>> getWordsByDocSorted(const DocumentBase* doc, size_t topN) const = 0;
@@ -318,6 +323,7 @@ namespace tomoto
 		size_t globalStep = 0;
 		_ModelState globalState, tState;
 		Dictionary dict;
+		std::vector<std::vector<std::pair<std::string, size_t>>> wordFormCnts;
 		uint64_t realV = 0; // vocab size after removing stopwords
 		uint64_t realN = 0; // total word size after removing stopwords
 		double weightedN = 0;
@@ -564,6 +570,44 @@ namespace tomoto
 			}
 		}
+		// Rebuilds `wordFormCnts`: for every vocabulary id below `realV`, the list of
+		// surface forms seen in the documents together with their occurrence counts,
+		// sorted by count (descending) and then by form (ascending) so that the result
+		// does not depend on hash-map iteration order.
+		//
+		// The surface form of a token is cut out of the document's raw string when the
+		// raw string and both offset tables cover that token; otherwise the dictionary
+		// entry for the id is counted instead.
+		void updateWordFormCnts()
+		{
+			using FormCount = std::pair<std::string, size_t>;
+			wordFormCnts.clear();
+			wordFormCnts.resize(realV);
+			std::vector<std::unordered_map<std::string, size_t>> cnts(realV);
+			for (auto& doc : docs)
+			{
+				const bool hasRaw = !doc.rawStr.empty();
+				for (size_t i = 0; i < doc.words.size(); ++i)
+				{
+					// `i` walks tokens in original order; wOrder (when present) maps it to the stored slot.
+					auto w = doc.words[doc.wOrder.empty() ? i : doc.wOrder[i]];
+					if (w >= realV) continue;
+					auto& cnt = cnts[w];
+					std::string word;
+					// Both offset tables are indexed below, so both must cover `i`.
+					if (hasRaw && i < doc.origWordPos.size() && i < doc.origWordLen.size())
+					{
+						word = doc.rawStr.substr(doc.origWordPos[i], doc.origWordLen[i]);
+					}
+					else
+					{
+						word = dict.toWord(w);
+					}
+					++cnt[word];
+				}
+			}
+			for (size_t i = 0; i < realV; ++i)
+			{
+				auto& cnt = cnts[i];
+				std::vector<FormCount> v{ std::make_move_iterator(cnt.begin()), std::make_move_iterator(cnt.end()) };
+				std::sort(v.begin(), v.end(), [](const FormCount& a, const FormCount& b)
+				{
+					// Most frequent first; equal counts fall back to the form itself for a stable order.
+					return a.second > b.second || (a.second == b.second && a.first < b.first);
+				});
+				wordFormCnts[i] = std::move(v);
+				cnt.clear();
+			}
+		}
 		int restoreFromTrainingError(const exc::TrainingError& e, ThreadPool& pool, _ModelState* localData, _RandGen* rgs)
 		{
 			throw e;
@@ -725,6 +769,11 @@ namespace tomoto
 			return 0;
 		}
+		// Default implementation of the prior-size query: simply forwards to getK().
+		size_t getNumTopicsForPrior() const override
+		{
+			return this->getK();
+		}
 		std::vector<Float> getWidsByTopic(size_t tid, bool normalize) const override
 		{
 			return static_cast<const _Derived*>(this)->_getWidsByTopic(tid, normalize);
@@ -745,11 +794,26 @@ namespace tomoto
 			return ret;
 		}
+		// Expands (vocabulary id, score) pairs into (word, vocabulary id, score) tuples,
+		// looking each word up in `dict` and keeping the input order.
+		std::vector<std::tuple<std::string, Vid, Float>> vid2StringVid(const std::vector<std::pair<Vid, Float>>& vids) const
+		{
+			std::vector<std::tuple<std::string, Vid, Float>> converted;
+			converted.reserve(vids.size());
+			for (const auto& entry : vids)
+			{
+				converted.emplace_back(dict.toWord(entry.first), entry.first, entry.second);
+			}
+			return converted;
+		}
 		std::vector<std::pair<std::string, Float>> getWordsByTopicSorted(size_t tid, size_t topN) const override
 		{
 			return vid2String(getWidsByTopicSorted(tid, topN));
 		}
+		// Same query as getWordsByTopicSorted(), but each result also carries the
+		// vocabulary id: (word, id, score) tuples built from getWidsByTopicSorted(tid, topN).
+		std::vector<std::tuple<std::string, Vid, Float>> getWordIdsByTopicSorted(size_t tid, size_t topN) const override
+		{
+			return vid2StringVid(getWidsByTopicSorted(tid, topN));
+		}
 		std::vector<std::pair<Vid, Float>> getWidsByDocSorted(const DocumentBase* doc, size_t topN) const
 		{
 			std::vector<Float> cnt(dict.size());
@@ -866,6 +930,11 @@ namespace tomoto
 			return vocabDf;
 		}
+		// Read-only access to the stored `wordFormCnts` member (returned by reference, not copied).
+		const std::vector<std::vector<std::pair<std::string, size_t>>>& getWordFormCnts() const override
+		{
+			return wordFormCnts;
+		}
 		void saveModel(std::ostream& writer, bool fullModel, const std::vector<uint8_t>* extra_data) const override
 		{
 			static_cast<const _Derived*>(this)->_saveModel(writer, fullModel, extra_data);
@@ -876,6 +945,17 @@ namespace tomoto
 			static_cast<_Derived*>(this)->_loadModel(reader, extra_data);
 			static_cast<_Derived*>(this)->prepare(false);
 		}
+		// Computes a two-part hash of the model.
+		//   ret[0]: the dictionary hash, then combined with the concatenated tmid()/twid()
+		//           strings of the derived model, realV, globalStep and the number of documents.
+		//   ret[1]: the hash of globalState, then passed as the seed to the derived model's computeHash().
+		std::array<uint64_t, 2> getHash() const override
+		{
+			std::array<uint64_t, 2> ret;
+			ret[0] = dict.computeHash(0);
+			const std::string s = static_cast<const _Derived*>(this)->tmid().str() + static_cast<const _Derived*>(this)->twid().str();
+			ret[0] = serializer::computeHashMany(ret[0], s, realV, globalStep, docs.size());
+			ret[1] = globalState.computeHash(0);
+			ret[1] = static_cast<const _Derived*>(this)->computeHash(ret[1]);
+			return ret;
+		}
 	};
 }

data/vendor/tomotopy/src/Utils/Dictionary.cpp ADDED Viewed

@@ -0,0 +1,102 @@
+#include "Dictionary.h"
+namespace tomoto
+{
+    // All special member functions are compiler-generated; they are defaulted here,
+    // out of line, instead of inside the class definition.
+    Dictionary::Dictionary() = default;
+    Dictionary::~Dictionary() = default;
+    Dictionary::Dictionary(const Dictionary&) = default;
+    Dictionary& Dictionary::operator=(const Dictionary&) = default;
+    Dictionary::Dictionary(Dictionary&&) noexcept = default;
+    Dictionary& Dictionary::operator=(Dictionary&&) noexcept = default;
+    // Registers `word` if it is new and returns its id; an already known word
+    // keeps its existing id. New ids are assigned in insertion order.
+    Vid Dictionary::add(const std::string& word)
+    {
+        // emplace leaves the map untouched when the key exists; either way
+        // `first` refers to the entry for `word`.
+        const auto inserted = dict.emplace(word, (Vid)dict.size());
+        if (inserted.second)
+        {
+            id2word.emplace_back(word);
+        }
+        return inserted.first->second;
+    }
+    // Returns the word stored for `vid`. The id must be in range: this is only
+    // checked by the assert, and the vector access itself is unchecked.
+    const std::string& Dictionary::toWord(Vid vid) const
+    {
+        assert(vid < id2word.size());
+        return id2word[vid];
+    }
+    // Looks up the id of `word`; yields non_vocab_id when the word is not registered.
+    Vid Dictionary::toWid(const std::string& word) const
+    {
+        const auto found = dict.find(word);
+        return found != dict.end() ? found->second : non_vocab_id;
+    }
+    // Writes the dictionary to `writer`: the "Dict" key followed by the id-ordered
+    // word list. Only id2word is written; the word->id map is not.
+    void Dictionary::serializerWrite(std::ostream& writer) const
+    {
+        serializer::writeMany(writer, serializer::to_key("Dict"), id2word);
+    }
+    // Reads the id-ordered word list stored under the "Dict" key from `reader`
+    // and rebuilds the word->id map from it.
+    //
+    // The map is cleared first so that it is always exactly the inverse of
+    // id2word, even when this object already held words before the read
+    // (otherwise stale entries would survive and size() would be wrong).
+    // The clear happens only after the read has succeeded, so a throwing read
+    // leaves the map as it was.
+    void Dictionary::serializerRead(std::istream& reader)
+    {
+        serializer::readMany(reader, serializer::to_key("Dict"), id2word);
+        dict.clear();
+        dict.reserve(id2word.size());
+        for (size_t i = 0; i < id2word.size(); ++i)
+        {
+            dict.emplace(id2word[i], (Vid)i);
+        }
+    }
+    // Hashes the id-ordered word list, starting from `seed`.
+    uint64_t Dictionary::computeHash(uint64_t seed) const
+	{
+        return serializer::computeHashMany(seed, id2word);
+	}
+    // Exchanges the complete contents (both lookup directions) with `rhs`.
+    void Dictionary::swap(Dictionary& rhs)
+    {
+        dict.swap(rhs.dict);
+        id2word.swap(rhs.id2word);
+    }
+    // Relabels every word in place: a word whose current id is `old` receives the
+    // id order[old], and id2word is overwritten at the new position.
+    // NOTE(review): `order` is indexed without a bounds check and id2word is not
+    // resized, so this presumably expects `order` to be a permutation of the
+    // current ids - confirm against callers.
+    void Dictionary::reorder(const std::vector<Vid>& order)
+    {
+        for (auto& p : dict)
+        {
+            p.second = order[p.second];
+            id2word[p.second] = p.first;
+        }
+    }
+    // Read-only access to the id-ordered word list (index == vocabulary id).
+    const std::vector<std::string>& Dictionary::getRaw() const
+    {
+        return id2word;
+    }
+    // Translates id `v` of this dictionary into the id the same word has in
+    // `newDict`; the result is whatever newDict.toWid() yields for that word.
+    Vid Dictionary::mapToNewDict(Vid v, const Dictionary& newDict) const
+    {
+        return newDict.toWid(toWord(v));
+    }
+    // Element-wise version of the single-id overload: translates every id in `v`
+    // from this dictionary to `newDict`, preserving order and length.
+    std::vector<Vid> Dictionary::mapToNewDict(const std::vector<Vid>& v, const Dictionary& newDict) const
+    {
+        std::vector<Vid> mapped;
+        mapped.reserve(v.size());
+        for (const Vid oldId : v)
+        {
+            mapped.push_back(newDict.toWid(toWord(oldId)));
+        }
+        return mapped;
+    }
+    // Translates every id in `v` from this dictionary to `newDict`, registering
+    // in `newDict` any word it does not contain yet, so every returned id is a
+    // valid id of `newDict`.
+    //
+    // The previous body was a verbatim copy of mapToNewDict(): it never touched
+    // the mutable `newDict` and returned the result of toWid() for unknown words.
+    std::vector<Vid> Dictionary::mapToNewDictAdd(const std::vector<Vid>& v, Dictionary& newDict) const
+    {
+        std::vector<Vid> r(v.size());
+        for (size_t i = 0; i < v.size(); ++i)
+        {
+            r[i] = newDict.add(toWord(v[i]));
+        }
+        return r;
+    }
+}

data/vendor/tomotopy/src/Utils/Dictionary.h CHANGED Viewed

@@ -12,8 +12,9 @@ namespace tomoto
 {
 	using Vid = uint32_t;
 	static constexpr Vid non_vocab_id = (Vid)-1;
+	static constexpr Vid rm_vocab_id = (Vid)-2;
 	using Tid = uint16_t;
-	static constexpr Vid non_topic_id = (Tid)-1;
+	static constexpr Tid non_topic_id = (Tid)-1;
 	using Float = float;
 	struct VidPair : public std::pair<Vid, Vid>
@@ -27,91 +28,41 @@ namespace tomoto
 		std::unordered_map<std::string, Vid> dict;
 		std::vector<std::string> id2word;
 	public:
-		Vid add(const std::string& word)
-		{
-			auto it = dict.find(word);
-			if (it == dict.end())
-			{
-				dict.emplace(word, (Vid)dict.size());
-				id2word.emplace_back(word);
-				return (Vid)(dict.size() - 1);
-			}
-			return it->second;
-		}
+		Dictionary();
+		~Dictionary();
+		Dictionary(const Dictionary&);
+		Dictionary& operator=(const Dictionary&);
+		Dictionary(Dictionary&&) noexcept;
+		Dictionary& operator=(Dictionary&&) noexcept;
+		Vid add(const std::string& word);
 		size_t size() const { return dict.size(); }
-		const std::string& toWord(Vid vid) const
-		{
-			assert(vid < id2word.size());
-			return id2word[vid];
-		}
+		const std::string& toWord(Vid vid) const;
-		Vid toWid(const std::string& word) const
-		{
-			auto it = dict.find(word);
-			if (it == dict.end()) return non_vocab_id;
-			return it->second;
-		}
+		Vid toWid(const std::string& word) const;
-		void serializerWrite(std::ostream& writer) const
-		{
-			serializer::writeMany(writer, serializer::to_key("Dict"), id2word);
-		}
+		void serializerWrite(std::ostream& writer) const;
-		void serializerRead(std::istream& reader)
-		{
-			serializer::readMany(reader, serializer::to_key("Dict"), id2word);
-			for (size_t i = 0; i < id2word.size(); ++i)
-			{
-				dict.emplace(id2word[i], (Vid)i);
-			}
-		}
+		void serializerRead(std::istream& reader);
-		void swap(Dictionary& rhs)
-		{
-			std::swap(dict, rhs.dict);
-			std::swap(id2word, rhs.id2word);
-		}
+		uint64_t computeHash(uint64_t seed) const;
-		void reorder(const std::vector<Vid>& order)
-		{
-			for (auto& p : dict)
-			{
-				p.second = order[p.second];
-				id2word[p.second] = p.first;
-			}
-		}
+		void swap(Dictionary& rhs);
-		const std::vector<std::string>& getRaw() const
-		{
-			return id2word;
-		}
+		void reorder(const std::vector<Vid>& order);
-		Vid mapToNewDict(Vid v, const Dictionary& newDict) const
-		{
-			return newDict.toWid(toWord(v));
-		}
+		const std::vector<std::string>& getRaw() const;
-		std::vector<Vid> mapToNewDict(const std::vector<Vid>& v, const Dictionary& newDict) const
-		{
-			std::vector<Vid> r(v.size());
-			for (size_t i = 0; i < v.size(); ++i)
-			{
-				r[i] = mapToNewDict(v[i], newDict);
-			}
-			return r;
-		}
+		Vid mapToNewDict(Vid v, const Dictionary& newDict) const;
-		std::vector<Vid> mapToNewDictAdd(const std::vector<Vid>& v, Dictionary& newDict) const
-		{
-			std::vector<Vid> r(v.size());
-			for (size_t i = 0; i < v.size(); ++i)
-			{
-				r[i] = mapToNewDict(v[i], newDict);
-			}
-			return r;
-		}
+		std::vector<Vid> mapToNewDict(const std::vector<Vid>& v, const Dictionary& newDict) const;
+		std::vector<Vid> mapToNewDictAdd(const std::vector<Vid>& v, Dictionary& newDict) const;
 	};
 }
@@ -126,4 +77,4 @@ namespace std
 			return hash<size_t>{}(p.first) ^ hash<size_t>{}(p.second);
 		}
 	};
-}
+}

data/vendor/tomotopy/src/Utils/EigenAddonOps.hpp CHANGED Viewed

@@ -116,7 +116,7 @@ namespace Eigen
 		EIGEN_STRONG_INLINE Packet4f p_bool2float(const Packet4f& a)
 		{
-			return vcvtq_f32_s32(vandq_s32(a, vdupq_n_s32(1)));
+			return vcvtq_f32_s32(vandq_s32((Packet4i)a, vdupq_n_s32(1)));
 		}
 		EIGEN_STRONG_INLINE Packet4f p_bool2float(const Packet4i& a)

data/vendor/tomotopy/src/Utils/Mmap.cpp ADDED Viewed

@@ -0,0 +1,146 @@
+#include <cstdint>
+#include "Mmap.h"
+namespace tomoto
+{
+	namespace utils
+	{
+		// Converts a UTF-8 byte string to UTF-16.
+		//
+		// Code points below U+10000 become a single code unit; code points up to and
+		// including U+10FFFF become a surrogate pair.
+		// Throws std::invalid_argument when a sequence is cut off ("unexpected ending"),
+		// when a continuation byte is not of the form 10xxxxxx ("unexpected trailing
+		// byte"), or when the lead byte or resulting code point is invalid ("unicode error").
+		static std::u16string utf8To16(const std::string& str)
+		{
+			std::u16string ret;
+			ret.reserve(str.size()); // UTF-16 never needs more code units than UTF-8 needs bytes
+			auto it = str.begin();
+			const auto last = str.end();
+			// Advances to the next byte, checks that it is a continuation byte and
+			// returns its six payload bits.
+			const auto nextTrail = [&]() -> uint32_t
+			{
+				if (++it == last) throw std::invalid_argument{ "unexpected ending" };
+				const uint32_t b = (uint8_t)*it; // go through uint8_t so a signed char cannot sign-extend
+				if ((b & 0xC0) != 0x80) throw std::invalid_argument{ "unexpected trailing byte" };
+				return b & 0x3F;
+			};
+			for (; it != last; ++it)
+			{
+				const uint32_t lead = (uint8_t)*it;
+				uint32_t code = 0;
+				if ((lead & 0xF8) == 0xF0) // 11110xxx: four-byte sequence
+				{
+					code = (lead & 0x07) << 18;
+					code |= nextTrail() << 12;
+					code |= nextTrail() << 6;
+					code |= nextTrail();
+				}
+				else if ((lead & 0xF0) == 0xE0) // 1110xxxx: three-byte sequence
+				{
+					code = (lead & 0x0F) << 12;
+					code |= nextTrail() << 6;
+					code |= nextTrail();
+				}
+				else if ((lead & 0xE0) == 0xC0) // 110xxxxx: two-byte sequence
+				{
+					code = (lead & 0x1F) << 6;
+					code |= nextTrail();
+				}
+				else if ((lead & 0x80) == 0x00) // 0xxxxxxx: ASCII
+				{
+					code = lead;
+				}
+				else
+				{
+					throw std::invalid_argument{ "unicode error" };
+				}
+
+				if (code < 0x10000)
+				{
+					ret.push_back((char16_t)code);
+				}
+				else if (code <= 0x10FFFF) // U+10FFFF is the last valid code point
+				{
+					code -= 0x10000;
+					ret.push_back((char16_t)(0xD800 | (code >> 10)));
+					ret.push_back((char16_t)(0xDC00 | (code & 0x3FF)));
+				}
+				else
+				{
+					throw std::invalid_argument{ "unicode error" };
+				}
+			}
+			return ret;
+		}
+	}
+}
+namespace tomoto
+{
+	namespace utils
+	{
+		// Opens `filepath` read-only and maps the whole file into memory; afterwards
+		// `view` points at the mapped bytes and `len` holds the file size.
+		// Throws std::ios_base::failure when the file cannot be opened, sized or mapped.
+		// NOTE(review): on the throwing paths after a successful open nothing here
+		// explicitly closes the handle / descriptor - presumably the member types
+		// release themselves; confirm in Mmap.h.
+		MMap::MMap(const std::string& filepath)
+		{
+#ifdef _WIN32
+			// The path is converted from UTF-8 to UTF-16 for the wide-character API.
+			hFile = CreateFileW((const wchar_t*)utf8To16(filepath).c_str(), GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_READONLY, nullptr);
+			if (hFile == INVALID_HANDLE_VALUE) throw std::ios_base::failure("Cannot open '" + filepath + "'");
+			hFileMap = CreateFileMapping(hFile, nullptr, PAGE_READONLY, 0, 0, nullptr);
+			if (hFileMap == nullptr) throw std::ios_base::failure("Cannot open '" + filepath + "' Code:" + std::to_string(GetLastError()));
+			view = (const char*)MapViewOfFile(hFileMap, FILE_MAP_READ, 0, 0, 0);
+			if (!view) throw std::ios_base::failure("Cannot MapViewOfFile() Code:" + std::to_string(GetLastError()));
+			// The size is assembled from the low 32 bits (return value) and the high 32 bits.
+			// NOTE(review): the GetFileSize() result is not checked for failure here.
+			DWORD high;
+			len = GetFileSize(hFile, &high);
+			len |= (uint64_t)high << 32;
+#else
+			fd = open(filepath.c_str(), O_RDONLY);
+			if (fd == -1) throw std::ios_base::failure("Cannot open '" + filepath + "'");
+			struct stat sb;
+			if (fstat(fd, &sb) < 0) throw std::ios_base::failure("Cannot open '" + filepath + "'");
+			len = sb.st_size;
+			// Private, read-only mapping of the full file starting at offset 0.
+			view = (const char*)mmap(nullptr, len, PROT_READ, MAP_PRIVATE, fd, 0);
+			if (view == MAP_FAILED) throw std::ios_base::failure("Mapping failed");
+#endif
+		}
+#ifdef _WIN32
+		// Move constructor (Windows): copies view and len from `o`, nulls o.view,
+		// and swaps both handle members with `o`.
+		MMap::MMap(MMap&& o) noexcept
+			: view{ o.view }, len{ o.len }
+		{
+			o.view = nullptr;
+			std::swap(hFile, o.hFile);
+			std::swap(hFileMap, o.hFileMap);
+		}
+#else
+		// Move constructor (non-Windows): copies len, move-constructs fd from o.fd
+		// and swaps the view pointers.
+		// NOTE(review): `view` is not in the initializer list, so the swap relies on
+		// it having a default member initializer in the class - confirm in Mmap.h.
+		MMap::MMap(MMap&& o) noexcept
+			: len{ o.len }, fd{ std::move(o.fd) }
+		{
+			std::swap(view, o.view);
+		}
+#endif
+		// Move assignment implemented as a member-wise swap: after the call this
+		// object holds what `o` held, and `o` holds this object's previous state.
+		MMap& MMap::operator=(MMap&& o) noexcept
+		{
+			std::swap(view, o.view);
+			std::swap(len, o.len);
+#ifdef _WIN32
+			std::swap(hFile, o.hFile);
+			std::swap(hFileMap, o.hFileMap);
+#else
+			std::swap(fd, o.fd);
+#endif
+			return *this;
+		}
+		// Releases the mapped view, if any.
+		// NOTE(review): only the view is unmapped here; hFile / hFileMap / fd are not
+		// closed explicitly - presumably their types release themselves on
+		// destruction; confirm in Mmap.h.
+		MMap::~MMap()
+		{
+#ifdef _WIN32
+			// The Windows branch is guarded by the mapping handle, not by `view`.
+			if (hFileMap)
+			{
+				UnmapViewOfFile(view);
+				view = nullptr;
+			}
+#else
+			if (view)
+			{
+				munmap((void*)view, len);
+			}
+#endif
+		}
+	}
+}