tomoto 0.4.0 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/README.md +1 -1
- data/ext/tomoto/extconf.rb +4 -2
- data/lib/tomoto/version.rb +1 -1
- data/vendor/tomotopy/README.kr.rst +10 -1
- data/vendor/tomotopy/README.rst +10 -1
- data/vendor/tomotopy/src/TopicModel/CT.h +2 -2
- data/vendor/tomotopy/src/TopicModel/CTModel.cpp +5 -0
- data/vendor/tomotopy/src/TopicModel/CTModel.hpp +1 -0
- data/vendor/tomotopy/src/TopicModel/DMR.h +2 -2
- data/vendor/tomotopy/src/TopicModel/DMRModel.cpp +5 -0
- data/vendor/tomotopy/src/TopicModel/DMRModel.hpp +1 -0
- data/vendor/tomotopy/src/TopicModel/DT.h +2 -2
- data/vendor/tomotopy/src/TopicModel/DTModel.cpp +5 -0
- data/vendor/tomotopy/src/TopicModel/DTModel.hpp +3 -0
- data/vendor/tomotopy/src/TopicModel/GDMR.h +2 -2
- data/vendor/tomotopy/src/TopicModel/GDMRModel.cpp +5 -0
- data/vendor/tomotopy/src/TopicModel/GDMRModel.hpp +1 -0
- data/vendor/tomotopy/src/TopicModel/HDP.h +2 -2
- data/vendor/tomotopy/src/TopicModel/HDPModel.cpp +5 -0
- data/vendor/tomotopy/src/TopicModel/HDPModel.hpp +2 -0
- data/vendor/tomotopy/src/TopicModel/HLDA.h +2 -2
- data/vendor/tomotopy/src/TopicModel/HLDAModel.cpp +5 -0
- data/vendor/tomotopy/src/TopicModel/HLDAModel.hpp +9 -0
- data/vendor/tomotopy/src/TopicModel/HPA.h +2 -2
- data/vendor/tomotopy/src/TopicModel/HPAModel.cpp +5 -0
- data/vendor/tomotopy/src/TopicModel/HPAModel.hpp +2 -0
- data/vendor/tomotopy/src/TopicModel/LDA.h +8 -2
- data/vendor/tomotopy/src/TopicModel/LDAModel.cpp +5 -0
- data/vendor/tomotopy/src/TopicModel/LDAModel.hpp +8 -0
- data/vendor/tomotopy/src/TopicModel/LLDA.h +2 -2
- data/vendor/tomotopy/src/TopicModel/LLDAModel.cpp +5 -0
- data/vendor/tomotopy/src/TopicModel/LLDAModel.hpp +1 -0
- data/vendor/tomotopy/src/TopicModel/MGLDA.h +2 -2
- data/vendor/tomotopy/src/TopicModel/MGLDAModel.cpp +5 -0
- data/vendor/tomotopy/src/TopicModel/MGLDAModel.hpp +1 -0
- data/vendor/tomotopy/src/TopicModel/PA.h +2 -2
- data/vendor/tomotopy/src/TopicModel/PAModel.cpp +5 -0
- data/vendor/tomotopy/src/TopicModel/PAModel.hpp +2 -0
- data/vendor/tomotopy/src/TopicModel/PLDAModel.hpp +1 -0
- data/vendor/tomotopy/src/TopicModel/PT.h +3 -3
- data/vendor/tomotopy/src/TopicModel/PTModel.cpp +5 -0
- data/vendor/tomotopy/src/TopicModel/PTModel.hpp +1 -0
- data/vendor/tomotopy/src/TopicModel/SLDA.h +3 -2
- data/vendor/tomotopy/src/TopicModel/SLDAModel.cpp +5 -0
- data/vendor/tomotopy/src/TopicModel/SLDAModel.hpp +1 -0
- data/vendor/tomotopy/src/TopicModel/TopicModel.hpp +77 -3
- data/vendor/tomotopy/src/Utils/Dictionary.cpp +102 -0
- data/vendor/tomotopy/src/Utils/Dictionary.h +26 -75
- data/vendor/tomotopy/src/Utils/Mmap.cpp +146 -0
- data/vendor/tomotopy/src/Utils/Mmap.h +139 -0
- data/vendor/tomotopy/src/Utils/MultiNormalDistribution.hpp +1 -0
- data/vendor/tomotopy/src/Utils/SharedString.cpp +134 -0
- data/vendor/tomotopy/src/Utils/SharedString.h +104 -0
- data/vendor/tomotopy/src/Utils/serializer.cpp +166 -0
- data/vendor/tomotopy/src/Utils/serializer.hpp +261 -85
- metadata +9 -4
- data/vendor/tomotopy/src/Utils/SharedString.hpp +0 -206
@@ -16,8 +16,9 @@ namespace tomoto
|
|
16
16
|
ret["y"] = y;
|
17
17
|
return ret;
|
18
18
|
}
|
19
|
-
|
20
|
-
|
19
|
+
|
20
|
+
DECLARE_SERIALIZER_WITH_VERSION(0);
|
21
|
+
DECLARE_SERIALIZER_WITH_VERSION(1);
|
21
22
|
};
|
22
23
|
|
23
24
|
struct SLDAArgs;
|
@@ -2,6 +2,11 @@
|
|
2
2
|
|
3
3
|
namespace tomoto
|
4
4
|
{
|
5
|
+
DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentSLDA, BaseDocument, 0, y);
|
6
|
+
DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentSLDA, BaseDocument, 1, 0x00010001, y);
|
7
|
+
|
8
|
+
TMT_INSTANTIATE_DOC(DocumentSLDA);
|
9
|
+
|
5
10
|
ISLDAModel* ISLDAModel::create(TermWeight _weight, const SLDAArgs& args, bool scalarRng)
|
6
11
|
{
|
7
12
|
TMT_SWITCH_TW(_weight, scalarRng, SLDAModel, args);
|
@@ -348,6 +348,7 @@ namespace tomoto
|
|
348
348
|
public:
|
349
349
|
DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 0, F, responseVars, mu, nuSq);
|
350
350
|
DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, F, responseVars, mu, nuSq);
|
351
|
+
DEFINE_HASHER_AFTER_BASE(BaseClass, F, mu, nuSq);
|
351
352
|
|
352
353
|
SLDAModel(const SLDAArgs& args)
|
353
354
|
: BaseClass(args), F(args.vars.size()), varTypes(args.vars),
|
@@ -1,4 +1,4 @@
|
|
1
|
-
|
1
|
+
#pragma once
|
2
2
|
#include <numeric>
|
3
3
|
#include <unordered_set>
|
4
4
|
#include "../Utils/Utils.hpp"
|
@@ -7,7 +7,7 @@
|
|
7
7
|
#include "../Utils/ThreadPool.hpp"
|
8
8
|
#include "../Utils/serializer.hpp"
|
9
9
|
#include "../Utils/exception.h"
|
10
|
-
#include "../Utils/SharedString.
|
10
|
+
#include "../Utils/SharedString.h"
|
11
11
|
#include <EigenRand/EigenRand>
|
12
12
|
#include <mapbox/variant.hpp>
|
13
13
|
|
@@ -107,7 +107,7 @@ namespace tomoto
|
|
107
107
|
|
108
108
|
virtual operator RawDoc() const
|
109
109
|
{
|
110
|
-
RawDoc raw{ *this };
|
110
|
+
RawDoc raw{ *static_cast<const RawDocKernel*>(this) };
|
111
111
|
if (wOrder.empty())
|
112
112
|
{
|
113
113
|
raw.words.insert(raw.words.begin(), words.begin(), words.end());
|
@@ -224,6 +224,8 @@ namespace tomoto
|
|
224
224
|
virtual void loadModel(std::istream& reader,
|
225
225
|
std::vector<uint8_t>* extra_data = nullptr) = 0;
|
226
226
|
|
227
|
+
virtual std::array<uint64_t, 2> getHash() const = 0;
|
228
|
+
|
227
229
|
virtual std::unique_ptr<ITopicModel> copy() const = 0;
|
228
230
|
|
229
231
|
virtual const DocumentBase* getDoc(size_t docId) const = 0;
|
@@ -251,6 +253,7 @@ namespace tomoto
|
|
251
253
|
virtual const std::vector<uint64_t>& getVocabCf() const = 0;
|
252
254
|
virtual std::vector<double> getVocabWeightedCf() const = 0;
|
253
255
|
virtual const std::vector<uint64_t>& getVocabDf() const = 0;
|
256
|
+
virtual const std::vector<std::vector<std::pair<std::string, size_t>>>& getWordFormCnts() const = 0;
|
254
257
|
|
255
258
|
virtual int train(size_t iteration, size_t numWorkers, ParallelScheme ps = ParallelScheme::default_, bool freeze_topics = false) = 0;
|
256
259
|
virtual size_t getGlobalStep() const = 0;
|
@@ -260,6 +263,7 @@ namespace tomoto
|
|
260
263
|
virtual size_t getNumTopicsForPrior() const = 0;
|
261
264
|
virtual std::vector<Float> getWidsByTopic(size_t tid, bool normalize = true) const = 0;
|
262
265
|
virtual std::vector<std::pair<std::string, Float>> getWordsByTopicSorted(size_t tid, size_t topN) const = 0;
|
266
|
+
virtual std::vector<std::tuple<std::string, Vid, Float>> getWordIdsByTopicSorted(size_t tid, size_t topN) const = 0;
|
263
267
|
|
264
268
|
virtual std::vector<std::pair<std::string, Float>> getWordsByDocSorted(const DocumentBase* doc, size_t topN) const = 0;
|
265
269
|
|
@@ -319,6 +323,7 @@ namespace tomoto
|
|
319
323
|
size_t globalStep = 0;
|
320
324
|
_ModelState globalState, tState;
|
321
325
|
Dictionary dict;
|
326
|
+
std::vector<std::vector<std::pair<std::string, size_t>>> wordFormCnts;
|
322
327
|
uint64_t realV = 0; // vocab size after removing stopwords
|
323
328
|
uint64_t realN = 0; // total word size after removing stopwords
|
324
329
|
double weightedN = 0;
|
@@ -565,6 +570,44 @@ namespace tomoto
|
|
565
570
|
}
|
566
571
|
}
|
567
572
|
|
573
|
+
void updateWordFormCnts()
|
574
|
+
{
|
575
|
+
wordFormCnts.clear();
|
576
|
+
wordFormCnts.resize(realV);
|
577
|
+
std::vector<std::unordered_map<std::string, size_t>> cnts(realV);
|
578
|
+
for (auto& doc : docs)
|
579
|
+
{
|
580
|
+
for (size_t i = 0; i < doc.words.size(); ++i)
|
581
|
+
{
|
582
|
+
auto w = doc.words[doc.wOrder.empty() ? i : doc.wOrder[i]];
|
583
|
+
if (w >= realV) continue;
|
584
|
+
auto& cnt = cnts[w];
|
585
|
+
std::string word;
|
586
|
+
if (!doc.rawStr.empty() && i < doc.origWordPos.size())
|
587
|
+
{
|
588
|
+
word = doc.rawStr.substr(doc.origWordPos[i], doc.origWordLen[i]);
|
589
|
+
}
|
590
|
+
else
|
591
|
+
{
|
592
|
+
word = dict.toWord(w);
|
593
|
+
}
|
594
|
+
++cnt[word];
|
595
|
+
}
|
596
|
+
}
|
597
|
+
|
598
|
+
for (size_t i = 0; i < realV; ++i)
|
599
|
+
{
|
600
|
+
auto& cnt = cnts[i];
|
601
|
+
std::vector<std::pair<std::string, size_t>> v{ std::make_move_iterator(cnt.begin()), std::make_move_iterator(cnt.end()) };
|
602
|
+
std::sort(v.begin(), v.end(), [](const std::pair<std::string, size_t>& a, const std::pair<std::string, size_t>& b)
|
603
|
+
{
|
604
|
+
return a.second > b.second;
|
605
|
+
});
|
606
|
+
wordFormCnts[i] = move(v);
|
607
|
+
cnt.clear();
|
608
|
+
}
|
609
|
+
}
|
610
|
+
|
568
611
|
int restoreFromTrainingError(const exc::TrainingError& e, ThreadPool& pool, _ModelState* localData, _RandGen* rgs)
|
569
612
|
{
|
570
613
|
throw e;
|
@@ -751,11 +794,26 @@ namespace tomoto
|
|
751
794
|
return ret;
|
752
795
|
}
|
753
796
|
|
797
|
+
std::vector<std::tuple<std::string, Vid, Float>> vid2StringVid(const std::vector<std::pair<Vid, Float>>& vids) const
|
798
|
+
{
|
799
|
+
std::vector<std::tuple<std::string, Vid, Float>> ret(vids.size());
|
800
|
+
for (size_t i = 0; i < vids.size(); ++i)
|
801
|
+
{
|
802
|
+
ret[i] = std::make_tuple(dict.toWord(vids[i].first), vids[i].first, vids[i].second);
|
803
|
+
}
|
804
|
+
return ret;
|
805
|
+
}
|
806
|
+
|
754
807
|
std::vector<std::pair<std::string, Float>> getWordsByTopicSorted(size_t tid, size_t topN) const override
|
755
808
|
{
|
756
809
|
return vid2String(getWidsByTopicSorted(tid, topN));
|
757
810
|
}
|
758
811
|
|
812
|
+
std::vector<std::tuple<std::string, Vid, Float>> getWordIdsByTopicSorted(size_t tid, size_t topN) const override
|
813
|
+
{
|
814
|
+
return vid2StringVid(getWidsByTopicSorted(tid, topN));
|
815
|
+
}
|
816
|
+
|
759
817
|
std::vector<std::pair<Vid, Float>> getWidsByDocSorted(const DocumentBase* doc, size_t topN) const
|
760
818
|
{
|
761
819
|
std::vector<Float> cnt(dict.size());
|
@@ -872,6 +930,11 @@ namespace tomoto
|
|
872
930
|
return vocabDf;
|
873
931
|
}
|
874
932
|
|
933
|
+
const std::vector<std::vector<std::pair<std::string, size_t>>>& getWordFormCnts() const override
|
934
|
+
{
|
935
|
+
return wordFormCnts;
|
936
|
+
}
|
937
|
+
|
875
938
|
void saveModel(std::ostream& writer, bool fullModel, const std::vector<uint8_t>* extra_data) const override
|
876
939
|
{
|
877
940
|
static_cast<const _Derived*>(this)->_saveModel(writer, fullModel, extra_data);
|
@@ -882,6 +945,17 @@ namespace tomoto
|
|
882
945
|
static_cast<_Derived*>(this)->_loadModel(reader, extra_data);
|
883
946
|
static_cast<_Derived*>(this)->prepare(false);
|
884
947
|
}
|
948
|
+
|
949
|
+
std::array<uint64_t, 2> getHash() const override
|
950
|
+
{
|
951
|
+
std::array<uint64_t, 2> ret;
|
952
|
+
ret[0] = dict.computeHash(0);
|
953
|
+
const std::string s = static_cast<const _Derived*>(this)->tmid().str() + static_cast<const _Derived*>(this)->twid().str();
|
954
|
+
ret[0] = serializer::computeHashMany(ret[0], s, realV, globalStep, docs.size());
|
955
|
+
ret[1] = globalState.computeHash(0);
|
956
|
+
ret[1] = static_cast<const _Derived*>(this)->computeHash(ret[1]);
|
957
|
+
return ret;
|
958
|
+
}
|
885
959
|
};
|
886
960
|
|
887
961
|
}
|
@@ -0,0 +1,102 @@
|
|
1
|
+
#include "Dictionary.h"
|
2
|
+
|
3
|
+
namespace tomoto
|
4
|
+
{
|
5
|
+
Dictionary::Dictionary() = default;
|
6
|
+
Dictionary::~Dictionary() = default;
|
7
|
+
|
8
|
+
Dictionary::Dictionary(const Dictionary&) = default;
|
9
|
+
Dictionary& Dictionary::operator=(const Dictionary&) = default;
|
10
|
+
|
11
|
+
Dictionary::Dictionary(Dictionary&&) noexcept = default;
|
12
|
+
Dictionary& Dictionary::operator=(Dictionary&&) noexcept = default;
|
13
|
+
|
14
|
+
/**
 * Registers `word` and returns its id.
 * A word that is already present keeps its existing id; a new word receives the
 * current dictionary size as id and is appended to `id2word`.
 */
Vid Dictionary::add(const std::string& word)
{
	// emplace leaves the map untouched when the key already exists
	const auto inserted = dict.emplace(word, (Vid)dict.size());
	if (inserted.second)
	{
		id2word.emplace_back(word);
	}
	return inserted.first->second;
}
|
25
|
+
|
26
|
+
const std::string& Dictionary::toWord(Vid vid) const
|
27
|
+
{
|
28
|
+
assert(vid < id2word.size());
|
29
|
+
return id2word[vid];
|
30
|
+
}
|
31
|
+
|
32
|
+
/**
 * Looks up the id of `word`.
 * @return the stored id, or `non_vocab_id` when the word is not in the dictionary.
 */
Vid Dictionary::toWid(const std::string& word) const
{
	const auto found = dict.find(word);
	return found != dict.end() ? found->second : non_vocab_id;
}
|
38
|
+
|
39
|
+
void Dictionary::serializerWrite(std::ostream& writer) const
|
40
|
+
{
|
41
|
+
serializer::writeMany(writer, serializer::to_key("Dict"), id2word);
|
42
|
+
}
|
43
|
+
|
44
|
+
void Dictionary::serializerRead(std::istream& reader)
|
45
|
+
{
|
46
|
+
serializer::readMany(reader, serializer::to_key("Dict"), id2word);
|
47
|
+
for (size_t i = 0; i < id2word.size(); ++i)
|
48
|
+
{
|
49
|
+
dict.emplace(id2word[i], (Vid)i);
|
50
|
+
}
|
51
|
+
}
|
52
|
+
|
53
|
+
uint64_t Dictionary::computeHash(uint64_t seed) const
|
54
|
+
{
|
55
|
+
return serializer::computeHashMany(seed, id2word);
|
56
|
+
}
|
57
|
+
|
58
|
+
void Dictionary::swap(Dictionary& rhs)
|
59
|
+
{
|
60
|
+
std::swap(dict, rhs.dict);
|
61
|
+
std::swap(id2word, rhs.id2word);
|
62
|
+
}
|
63
|
+
|
64
|
+
void Dictionary::reorder(const std::vector<Vid>& order)
|
65
|
+
{
|
66
|
+
for (auto& p : dict)
|
67
|
+
{
|
68
|
+
p.second = order[p.second];
|
69
|
+
id2word[p.second] = p.first;
|
70
|
+
}
|
71
|
+
}
|
72
|
+
|
73
|
+
const std::vector<std::string>& Dictionary::getRaw() const
|
74
|
+
{
|
75
|
+
return id2word;
|
76
|
+
}
|
77
|
+
|
78
|
+
Vid Dictionary::mapToNewDict(Vid v, const Dictionary& newDict) const
|
79
|
+
{
|
80
|
+
return newDict.toWid(toWord(v));
|
81
|
+
}
|
82
|
+
|
83
|
+
std::vector<Vid> Dictionary::mapToNewDict(const std::vector<Vid>& v, const Dictionary& newDict) const
|
84
|
+
{
|
85
|
+
std::vector<Vid> r(v.size());
|
86
|
+
for (size_t i = 0; i < v.size(); ++i)
|
87
|
+
{
|
88
|
+
r[i] = mapToNewDict(v[i], newDict);
|
89
|
+
}
|
90
|
+
return r;
|
91
|
+
}
|
92
|
+
|
93
|
+
std::vector<Vid> Dictionary::mapToNewDictAdd(const std::vector<Vid>& v, Dictionary& newDict) const
|
94
|
+
{
|
95
|
+
std::vector<Vid> r(v.size());
|
96
|
+
for (size_t i = 0; i < v.size(); ++i)
|
97
|
+
{
|
98
|
+
r[i] = mapToNewDict(v[i], newDict);
|
99
|
+
}
|
100
|
+
return r;
|
101
|
+
}
|
102
|
+
}
|
@@ -12,8 +12,9 @@ namespace tomoto
|
|
12
12
|
{
|
13
13
|
using Vid = uint32_t;
|
14
14
|
static constexpr Vid non_vocab_id = (Vid)-1;
|
15
|
+
static constexpr Vid rm_vocab_id = (Vid)-2;
|
15
16
|
using Tid = uint16_t;
|
16
|
-
static constexpr
|
17
|
+
static constexpr Tid non_topic_id = (Tid)-1;
|
17
18
|
using Float = float;
|
18
19
|
|
19
20
|
struct VidPair : public std::pair<Vid, Vid>
|
@@ -27,91 +28,41 @@ namespace tomoto
|
|
27
28
|
std::unordered_map<std::string, Vid> dict;
|
28
29
|
std::vector<std::string> id2word;
|
29
30
|
public:
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
31
|
+
|
32
|
+
Dictionary();
|
33
|
+
~Dictionary();
|
34
|
+
|
35
|
+
Dictionary(const Dictionary&);
|
36
|
+
Dictionary& operator=(const Dictionary&);
|
37
|
+
|
38
|
+
Dictionary(Dictionary&&) noexcept;
|
39
|
+
Dictionary& operator=(Dictionary&&) noexcept;
|
40
|
+
|
41
|
+
Vid add(const std::string& word);
|
41
42
|
|
42
43
|
size_t size() const { return dict.size(); }
|
43
44
|
|
44
|
-
const std::string& toWord(Vid vid) const
|
45
|
-
{
|
46
|
-
assert(vid < id2word.size());
|
47
|
-
return id2word[vid];
|
48
|
-
}
|
45
|
+
const std::string& toWord(Vid vid) const;
|
49
46
|
|
50
|
-
Vid toWid(const std::string& word) const
|
51
|
-
{
|
52
|
-
auto it = dict.find(word);
|
53
|
-
if (it == dict.end()) return non_vocab_id;
|
54
|
-
return it->second;
|
55
|
-
}
|
47
|
+
Vid toWid(const std::string& word) const;
|
56
48
|
|
57
|
-
void serializerWrite(std::ostream& writer) const
|
58
|
-
{
|
59
|
-
serializer::writeMany(writer, serializer::to_key("Dict"), id2word);
|
60
|
-
}
|
49
|
+
void serializerWrite(std::ostream& writer) const;
|
61
50
|
|
62
|
-
void serializerRead(std::istream& reader)
|
63
|
-
{
|
64
|
-
serializer::readMany(reader, serializer::to_key("Dict"), id2word);
|
65
|
-
for (size_t i = 0; i < id2word.size(); ++i)
|
66
|
-
{
|
67
|
-
dict.emplace(id2word[i], (Vid)i);
|
68
|
-
}
|
69
|
-
}
|
51
|
+
void serializerRead(std::istream& reader);
|
70
52
|
|
71
|
-
|
72
|
-
{
|
73
|
-
std::swap(dict, rhs.dict);
|
74
|
-
std::swap(id2word, rhs.id2word);
|
75
|
-
}
|
53
|
+
uint64_t computeHash(uint64_t seed) const;
|
76
54
|
|
77
|
-
void
|
78
|
-
{
|
79
|
-
for (auto& p : dict)
|
80
|
-
{
|
81
|
-
p.second = order[p.second];
|
82
|
-
id2word[p.second] = p.first;
|
83
|
-
}
|
84
|
-
}
|
55
|
+
void swap(Dictionary& rhs);
|
85
56
|
|
86
|
-
const std::vector<
|
87
|
-
{
|
88
|
-
return id2word;
|
89
|
-
}
|
57
|
+
void reorder(const std::vector<Vid>& order);
|
90
58
|
|
91
|
-
|
92
|
-
{
|
93
|
-
return newDict.toWid(toWord(v));
|
94
|
-
}
|
59
|
+
const std::vector<std::string>& getRaw() const;
|
95
60
|
|
96
|
-
|
97
|
-
{
|
98
|
-
std::vector<Vid> r(v.size());
|
99
|
-
for (size_t i = 0; i < v.size(); ++i)
|
100
|
-
{
|
101
|
-
r[i] = mapToNewDict(v[i], newDict);
|
102
|
-
}
|
103
|
-
return r;
|
104
|
-
}
|
61
|
+
Vid mapToNewDict(Vid v, const Dictionary& newDict) const;
|
105
62
|
|
106
|
-
std::vector<Vid>
|
107
|
-
|
108
|
-
|
109
|
-
for (size_t i = 0; i < v.size(); ++i)
|
110
|
-
{
|
111
|
-
r[i] = mapToNewDict(v[i], newDict);
|
112
|
-
}
|
113
|
-
return r;
|
114
|
-
}
|
63
|
+
std::vector<Vid> mapToNewDict(const std::vector<Vid>& v, const Dictionary& newDict) const;
|
64
|
+
|
65
|
+
std::vector<Vid> mapToNewDictAdd(const std::vector<Vid>& v, Dictionary& newDict) const;
|
115
66
|
};
|
116
67
|
|
117
68
|
}
|
@@ -126,4 +77,4 @@ namespace std
|
|
126
77
|
return hash<size_t>{}(p.first) ^ hash<size_t>{}(p.second);
|
127
78
|
}
|
128
79
|
};
|
129
|
-
}
|
80
|
+
}
|
@@ -0,0 +1,146 @@
|
|
1
|
+
#include <cstdint>
|
2
|
+
#include "Mmap.h"
|
3
|
+
|
4
|
+
namespace tomoto
|
5
|
+
{
|
6
|
+
namespace utils
|
7
|
+
{
|
8
|
+
/**
 * Converts a UTF-8 encoded string to UTF-16.
 *
 * Code points below U+10000 become one code unit; code points up to and including
 * U+10FFFF become a surrogate pair.
 *
 * @param str UTF-8 input.
 * @return the UTF-16 encoding of `str`.
 * @throws std::invalid_argument "unexpected ending" when a multi-byte sequence is cut off,
 *         "unexpected trailing byte" when a continuation byte is not of the form 10xxxxxx,
 *         "unicode error" for an invalid lead byte or a code point above U+10FFFF.
 */
static std::u16string utf8To16(const std::string& str)
{
	std::u16string ret;
	for (auto it = str.begin(); it != str.end(); ++it)
	{
		const uint32_t lead = (uint8_t)*it;
		uint32_t code = 0;
		int numTrailing = 0;
		if ((lead & 0xF8) == 0xF0)
		{
			code = lead & 0x07;
			numTrailing = 3;
		}
		else if ((lead & 0xF0) == 0xE0)
		{
			code = lead & 0x0F;
			numTrailing = 2;
		}
		else if ((lead & 0xE0) == 0xC0)
		{
			code = lead & 0x1F;
			numTrailing = 1;
		}
		else if ((lead & 0x80) == 0x00)
		{
			code = lead;
		}
		else
		{
			throw std::invalid_argument{ "unicode error" };
		}

		for (int i = 0; i < numTrailing; ++i)
		{
			if (++it == str.end()) throw std::invalid_argument{ "unexpected ending" };
			// read as unsigned so that a plain (possibly signed) char is not sign-extended
			const uint32_t byte = (uint8_t)*it;
			if ((byte & 0xC0) != 0x80) throw std::invalid_argument{ "unexpected trailing byte" };
			code = (code << 6) | (byte & 0x3F);
		}

		if (code < 0x10000)
		{
			ret.push_back((char16_t)code);
		}
		else if (code <= 0x10FFFF) // U+10FFFF itself is the last valid code point
		{
			code -= 0x10000;
			ret.push_back((char16_t)(0xD800 | (code >> 10)));
			ret.push_back((char16_t)(0xDC00 | (code & 0x3FF)));
		}
		else
		{
			throw std::invalid_argument{ "unicode error" };
		}
	}
	return ret;
}
|
71
|
+
}
|
72
|
+
}
|
73
|
+
|
74
|
+
namespace tomoto
|
75
|
+
{
|
76
|
+
namespace utils
|
77
|
+
{
|
78
|
+
/**
 * Opens `filepath` read-only and maps its whole content into memory.
 * On Windows the path is converted from UTF-8 to UTF-16 before the file is opened.
 * @throws std::ios_base::failure when the file cannot be opened, inspected or mapped.
 */
MMap::MMap(const std::string& filepath)
{
#ifdef _WIN32
	const std::u16string widePath = utf8To16(filepath);
	hFile = CreateFileW((const wchar_t*)widePath.c_str(), GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_READONLY, nullptr);
	if (hFile == INVALID_HANDLE_VALUE)
	{
		throw std::ios_base::failure("Cannot open '" + filepath + "'");
	}

	hFileMap = CreateFileMapping(hFile, nullptr, PAGE_READONLY, 0, 0, nullptr);
	if (hFileMap == nullptr)
	{
		throw std::ios_base::failure("Cannot open '" + filepath + "' Code:" + std::to_string(GetLastError()));
	}

	view = (const char*)MapViewOfFile(hFileMap, FILE_MAP_READ, 0, 0, 0);
	if (!view)
	{
		throw std::ios_base::failure("Cannot MapViewOfFile() Code:" + std::to_string(GetLastError()));
	}

	// the size is reported as two 32-bit halves; combine them into len
	DWORD sizeHigh;
	len = GetFileSize(hFile, &sizeHigh);
	len |= (uint64_t)sizeHigh << 32;
#else
	fd = open(filepath.c_str(), O_RDONLY);
	if (fd == -1)
	{
		throw std::ios_base::failure("Cannot open '" + filepath + "'");
	}

	struct stat fileInfo;
	if (fstat(fd, &fileInfo) < 0)
	{
		throw std::ios_base::failure("Cannot open '" + filepath + "'");
	}
	len = fileInfo.st_size;

	view = (const char*)mmap(nullptr, len, PROT_READ, MAP_PRIVATE, fd, 0);
	if (view == MAP_FAILED)
	{
		throw std::ios_base::failure("Mapping failed");
	}
#endif
}
|
100
|
+
|
101
|
+
#ifdef _WIN32
|
102
|
+
MMap::MMap(MMap&& o) noexcept
|
103
|
+
: view{ o.view }, len{ o.len }
|
104
|
+
{
|
105
|
+
o.view = nullptr;
|
106
|
+
std::swap(hFile, o.hFile);
|
107
|
+
std::swap(hFileMap, o.hFileMap);
|
108
|
+
}
|
109
|
+
#else
|
110
|
+
MMap::MMap(MMap&& o) noexcept
|
111
|
+
: len{ o.len }, fd{ std::move(o.fd) }
|
112
|
+
{
|
113
|
+
std::swap(view, o.view);
|
114
|
+
}
|
115
|
+
#endif
|
116
|
+
|
117
|
+
MMap& MMap::operator=(MMap&& o) noexcept
|
118
|
+
{
|
119
|
+
std::swap(view, o.view);
|
120
|
+
std::swap(len, o.len);
|
121
|
+
#ifdef _WIN32
|
122
|
+
std::swap(hFile, o.hFile);
|
123
|
+
std::swap(hFileMap, o.hFileMap);
|
124
|
+
#else
|
125
|
+
std::swap(fd, o.fd);
|
126
|
+
#endif
|
127
|
+
return *this;
|
128
|
+
}
|
129
|
+
|
130
|
+
MMap::~MMap()
|
131
|
+
{
|
132
|
+
#ifdef _WIN32
|
133
|
+
if (hFileMap)
|
134
|
+
{
|
135
|
+
UnmapViewOfFile(view);
|
136
|
+
view = nullptr;
|
137
|
+
}
|
138
|
+
#else
|
139
|
+
if (view)
|
140
|
+
{
|
141
|
+
munmap((void*)view, len);
|
142
|
+
}
|
143
|
+
#endif
|
144
|
+
}
|
145
|
+
}
|
146
|
+
}
|
@@ -0,0 +1,139 @@
|
|
1
|
+
#pragma once
|
2
|
+
#include <string>
|
3
|
+
#include <iostream>
|
4
|
+
|
5
|
+
#ifdef _WIN32
|
6
|
+
#define NOMINMAX
|
7
|
+
#include <Windows.h>
|
8
|
+
namespace tomoto
|
9
|
+
{
|
10
|
+
namespace utils
|
11
|
+
{
|
12
|
+
namespace detail
|
13
|
+
{
|
14
|
+
class HandleGuard
|
15
|
+
{
|
16
|
+
HANDLE handle = nullptr;
|
17
|
+
public:
|
18
|
+
HandleGuard(HANDLE _handle = nullptr) : handle(_handle)
|
19
|
+
{
|
20
|
+
}
|
21
|
+
|
22
|
+
HandleGuard(const HandleGuard&) = delete;
|
23
|
+
HandleGuard& operator =(const HandleGuard&) = delete;
|
24
|
+
|
25
|
+
HandleGuard(HandleGuard&& o) noexcept
|
26
|
+
{
|
27
|
+
std::swap(handle, o.handle);
|
28
|
+
}
|
29
|
+
|
30
|
+
HandleGuard& operator=(HandleGuard&& o) noexcept
|
31
|
+
{
|
32
|
+
std::swap(handle, o.handle);
|
33
|
+
return *this;
|
34
|
+
}
|
35
|
+
|
36
|
+
~HandleGuard()
|
37
|
+
{
|
38
|
+
if (handle && handle != INVALID_HANDLE_VALUE)
|
39
|
+
{
|
40
|
+
CloseHandle(handle);
|
41
|
+
handle = nullptr;
|
42
|
+
}
|
43
|
+
}
|
44
|
+
|
45
|
+
operator HANDLE() const
|
46
|
+
{
|
47
|
+
return handle;
|
48
|
+
}
|
49
|
+
};
|
50
|
+
}
|
51
|
+
|
52
|
+
class MMap
|
53
|
+
{
|
54
|
+
const char* view = nullptr;
|
55
|
+
uint64_t len = 0;
|
56
|
+
detail::HandleGuard hFile, hFileMap;
|
57
|
+
public:
|
58
|
+
MMap(const std::string& filepath);
|
59
|
+
MMap(const MMap&) = delete;
|
60
|
+
MMap& operator=(const MMap&) = delete;
|
61
|
+
MMap(MMap&& o) noexcept;
|
62
|
+
MMap& operator=(MMap&& o) noexcept;
|
63
|
+
~MMap();
|
64
|
+
|
65
|
+
const char* get() const { return view; }
|
66
|
+
size_t size() const { return len; }
|
67
|
+
};
|
68
|
+
}
|
69
|
+
}
|
70
|
+
#else
|
71
|
+
#include <unistd.h>
|
72
|
+
#include <sys/types.h>
|
73
|
+
#include <sys/stat.h>
|
74
|
+
#include <fcntl.h>
|
75
|
+
#include <sys/mman.h>
|
76
|
+
|
77
|
+
namespace tomoto
|
78
|
+
{
|
79
|
+
namespace utils
|
80
|
+
{
|
81
|
+
namespace detail
|
82
|
+
{
|
83
|
+
/**
 * Move-only owner of a POSIX file descriptor.
 * The descriptor is closed when the guard is destroyed; a guard holding -1 owns nothing.
 * The guard converts implicitly from and to `int` so it can be used where a raw
 * descriptor is expected.
 */
class FDGuard
{
	// -1 marks "no descriptor": 0 is a valid descriptor and must not serve as the sentinel
	int fd = -1;
public:
	FDGuard(int _fd = -1) : fd(_fd)
	{
	}

	FDGuard(const FDGuard&) = delete;
	FDGuard& operator =(const FDGuard&) = delete;

	// Swapping with a freshly initialised guard hands the descriptor over and leaves o empty.
	FDGuard(FDGuard&& o) noexcept
	{
		std::swap(fd, o.fd);
	}

	// Swap-based: the previously held descriptor is closed when o is destroyed.
	FDGuard& operator=(FDGuard&& o) noexcept
	{
		std::swap(fd, o.fd);
		return *this;
	}

	~FDGuard()
	{
		if (fd >= 0)
		{
			close(fd);
			fd = -1;
		}
	}

	operator int() const
	{
		return fd;
	}
};
|
119
|
+
}
|
120
|
+
|
121
|
+
class MMap
|
122
|
+
{
|
123
|
+
const char* view = nullptr;
|
124
|
+
size_t len = 0;
|
125
|
+
detail::FDGuard fd;
|
126
|
+
public:
|
127
|
+
MMap(const std::string& filepath);
|
128
|
+
MMap(const MMap&) = delete;
|
129
|
+
MMap& operator=(const MMap&) = delete;
|
130
|
+
MMap(MMap&& o) noexcept;
|
131
|
+
MMap& operator=(MMap&& o) noexcept;
|
132
|
+
~MMap();
|
133
|
+
|
134
|
+
const char* get() const { return view; }
|
135
|
+
size_t size() const { return len; }
|
136
|
+
};
|
137
|
+
}
|
138
|
+
}
|
139
|
+
#endif
|