tomoto 0.4.0 → 0.4.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (59) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +5 -0
  3. data/README.md +1 -1
  4. data/ext/tomoto/extconf.rb +4 -2
  5. data/lib/tomoto/version.rb +1 -1
  6. data/vendor/tomotopy/README.kr.rst +10 -1
  7. data/vendor/tomotopy/README.rst +10 -1
  8. data/vendor/tomotopy/src/TopicModel/CT.h +2 -2
  9. data/vendor/tomotopy/src/TopicModel/CTModel.cpp +5 -0
  10. data/vendor/tomotopy/src/TopicModel/CTModel.hpp +1 -0
  11. data/vendor/tomotopy/src/TopicModel/DMR.h +2 -2
  12. data/vendor/tomotopy/src/TopicModel/DMRModel.cpp +5 -0
  13. data/vendor/tomotopy/src/TopicModel/DMRModel.hpp +1 -0
  14. data/vendor/tomotopy/src/TopicModel/DT.h +2 -2
  15. data/vendor/tomotopy/src/TopicModel/DTModel.cpp +5 -0
  16. data/vendor/tomotopy/src/TopicModel/DTModel.hpp +3 -0
  17. data/vendor/tomotopy/src/TopicModel/GDMR.h +2 -2
  18. data/vendor/tomotopy/src/TopicModel/GDMRModel.cpp +5 -0
  19. data/vendor/tomotopy/src/TopicModel/GDMRModel.hpp +1 -0
  20. data/vendor/tomotopy/src/TopicModel/HDP.h +2 -2
  21. data/vendor/tomotopy/src/TopicModel/HDPModel.cpp +5 -0
  22. data/vendor/tomotopy/src/TopicModel/HDPModel.hpp +2 -0
  23. data/vendor/tomotopy/src/TopicModel/HLDA.h +2 -2
  24. data/vendor/tomotopy/src/TopicModel/HLDAModel.cpp +5 -0
  25. data/vendor/tomotopy/src/TopicModel/HLDAModel.hpp +9 -0
  26. data/vendor/tomotopy/src/TopicModel/HPA.h +2 -2
  27. data/vendor/tomotopy/src/TopicModel/HPAModel.cpp +5 -0
  28. data/vendor/tomotopy/src/TopicModel/HPAModel.hpp +2 -0
  29. data/vendor/tomotopy/src/TopicModel/LDA.h +8 -2
  30. data/vendor/tomotopy/src/TopicModel/LDAModel.cpp +5 -0
  31. data/vendor/tomotopy/src/TopicModel/LDAModel.hpp +8 -0
  32. data/vendor/tomotopy/src/TopicModel/LLDA.h +2 -2
  33. data/vendor/tomotopy/src/TopicModel/LLDAModel.cpp +5 -0
  34. data/vendor/tomotopy/src/TopicModel/LLDAModel.hpp +1 -0
  35. data/vendor/tomotopy/src/TopicModel/MGLDA.h +2 -2
  36. data/vendor/tomotopy/src/TopicModel/MGLDAModel.cpp +5 -0
  37. data/vendor/tomotopy/src/TopicModel/MGLDAModel.hpp +1 -0
  38. data/vendor/tomotopy/src/TopicModel/PA.h +2 -2
  39. data/vendor/tomotopy/src/TopicModel/PAModel.cpp +5 -0
  40. data/vendor/tomotopy/src/TopicModel/PAModel.hpp +2 -0
  41. data/vendor/tomotopy/src/TopicModel/PLDAModel.hpp +1 -0
  42. data/vendor/tomotopy/src/TopicModel/PT.h +3 -3
  43. data/vendor/tomotopy/src/TopicModel/PTModel.cpp +5 -0
  44. data/vendor/tomotopy/src/TopicModel/PTModel.hpp +1 -0
  45. data/vendor/tomotopy/src/TopicModel/SLDA.h +3 -2
  46. data/vendor/tomotopy/src/TopicModel/SLDAModel.cpp +5 -0
  47. data/vendor/tomotopy/src/TopicModel/SLDAModel.hpp +1 -0
  48. data/vendor/tomotopy/src/TopicModel/TopicModel.hpp +77 -3
  49. data/vendor/tomotopy/src/Utils/Dictionary.cpp +102 -0
  50. data/vendor/tomotopy/src/Utils/Dictionary.h +26 -75
  51. data/vendor/tomotopy/src/Utils/Mmap.cpp +146 -0
  52. data/vendor/tomotopy/src/Utils/Mmap.h +139 -0
  53. data/vendor/tomotopy/src/Utils/MultiNormalDistribution.hpp +1 -0
  54. data/vendor/tomotopy/src/Utils/SharedString.cpp +134 -0
  55. data/vendor/tomotopy/src/Utils/SharedString.h +104 -0
  56. data/vendor/tomotopy/src/Utils/serializer.cpp +166 -0
  57. data/vendor/tomotopy/src/Utils/serializer.hpp +261 -85
  58. metadata +9 -4
  59. data/vendor/tomotopy/src/Utils/SharedString.hpp +0 -206
@@ -16,8 +16,9 @@ namespace tomoto
16
16
  ret["y"] = y;
17
17
  return ret;
18
18
  }
19
- DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 0, y);
20
- DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseDocument, 1, 0x00010001, y);
19
+
20
+ DECLARE_SERIALIZER_WITH_VERSION(0);
21
+ DECLARE_SERIALIZER_WITH_VERSION(1);
21
22
  };
22
23
 
23
24
  struct SLDAArgs;
@@ -2,6 +2,11 @@
2
2
 
3
3
  namespace tomoto
4
4
  {
5
+ DEFINE_OUT_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentSLDA, BaseDocument, 0, y);
6
+ DEFINE_OUT_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(DocumentSLDA, BaseDocument, 1, 0x00010001, y);
7
+
8
+ TMT_INSTANTIATE_DOC(DocumentSLDA);
9
+
5
10
  ISLDAModel* ISLDAModel::create(TermWeight _weight, const SLDAArgs& args, bool scalarRng)
6
11
  {
7
12
  TMT_SWITCH_TW(_weight, scalarRng, SLDAModel, args);
@@ -348,6 +348,7 @@ namespace tomoto
348
348
  public:
349
349
  DEFINE_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 0, F, responseVars, mu, nuSq);
350
350
  DEFINE_TAGGED_SERIALIZER_AFTER_BASE_WITH_VERSION(BaseClass, 1, 0x00010001, F, responseVars, mu, nuSq);
351
+ DEFINE_HASHER_AFTER_BASE(BaseClass, F, mu, nuSq);
351
352
 
352
353
  SLDAModel(const SLDAArgs& args)
353
354
  : BaseClass(args), F(args.vars.size()), varTypes(args.vars),
@@ -1,4 +1,4 @@
1
- #pragma once
1
+ #pragma once
2
2
  #include <numeric>
3
3
  #include <unordered_set>
4
4
  #include "../Utils/Utils.hpp"
@@ -7,7 +7,7 @@
7
7
  #include "../Utils/ThreadPool.hpp"
8
8
  #include "../Utils/serializer.hpp"
9
9
  #include "../Utils/exception.h"
10
- #include "../Utils/SharedString.hpp"
10
+ #include "../Utils/SharedString.h"
11
11
  #include <EigenRand/EigenRand>
12
12
  #include <mapbox/variant.hpp>
13
13
 
@@ -107,7 +107,7 @@ namespace tomoto
107
107
 
108
108
  virtual operator RawDoc() const
109
109
  {
110
- RawDoc raw{ *this };
110
+ RawDoc raw{ *static_cast<const RawDocKernel*>(this) };
111
111
  if (wOrder.empty())
112
112
  {
113
113
  raw.words.insert(raw.words.begin(), words.begin(), words.end());
@@ -224,6 +224,8 @@ namespace tomoto
224
224
  virtual void loadModel(std::istream& reader,
225
225
  std::vector<uint8_t>* extra_data = nullptr) = 0;
226
226
 
227
+ virtual std::array<uint64_t, 2> getHash() const = 0;
228
+
227
229
  virtual std::unique_ptr<ITopicModel> copy() const = 0;
228
230
 
229
231
  virtual const DocumentBase* getDoc(size_t docId) const = 0;
@@ -251,6 +253,7 @@ namespace tomoto
251
253
  virtual const std::vector<uint64_t>& getVocabCf() const = 0;
252
254
  virtual std::vector<double> getVocabWeightedCf() const = 0;
253
255
  virtual const std::vector<uint64_t>& getVocabDf() const = 0;
256
+ virtual const std::vector<std::vector<std::pair<std::string, size_t>>>& getWordFormCnts() const = 0;
254
257
 
255
258
  virtual int train(size_t iteration, size_t numWorkers, ParallelScheme ps = ParallelScheme::default_, bool freeze_topics = false) = 0;
256
259
  virtual size_t getGlobalStep() const = 0;
@@ -260,6 +263,7 @@ namespace tomoto
260
263
  virtual size_t getNumTopicsForPrior() const = 0;
261
264
  virtual std::vector<Float> getWidsByTopic(size_t tid, bool normalize = true) const = 0;
262
265
  virtual std::vector<std::pair<std::string, Float>> getWordsByTopicSorted(size_t tid, size_t topN) const = 0;
266
+ virtual std::vector<std::tuple<std::string, Vid, Float>> getWordIdsByTopicSorted(size_t tid, size_t topN) const = 0;
263
267
 
264
268
  virtual std::vector<std::pair<std::string, Float>> getWordsByDocSorted(const DocumentBase* doc, size_t topN) const = 0;
265
269
 
@@ -319,6 +323,7 @@ namespace tomoto
319
323
  size_t globalStep = 0;
320
324
  _ModelState globalState, tState;
321
325
  Dictionary dict;
326
+ std::vector<std::vector<std::pair<std::string, size_t>>> wordFormCnts;
322
327
  uint64_t realV = 0; // vocab size after removing stopwords
323
328
  uint64_t realN = 0; // total word size after removing stopwords
324
329
  double weightedN = 0;
@@ -565,6 +570,44 @@ namespace tomoto
565
570
  }
566
571
  }
567
572
 
573
+ void updateWordFormCnts()
574
+ {
575
+ wordFormCnts.clear();
576
+ wordFormCnts.resize(realV);
577
+ std::vector<std::unordered_map<std::string, size_t>> cnts(realV);
578
+ for (auto& doc : docs)
579
+ {
580
+ for (size_t i = 0; i < doc.words.size(); ++i)
581
+ {
582
+ auto w = doc.words[doc.wOrder.empty() ? i : doc.wOrder[i]];
583
+ if (w >= realV) continue;
584
+ auto& cnt = cnts[w];
585
+ std::string word;
586
+ if (!doc.rawStr.empty() && i < doc.origWordPos.size())
587
+ {
588
+ word = doc.rawStr.substr(doc.origWordPos[i], doc.origWordLen[i]);
589
+ }
590
+ else
591
+ {
592
+ word = dict.toWord(w);
593
+ }
594
+ ++cnt[word];
595
+ }
596
+ }
597
+
598
+ for (size_t i = 0; i < realV; ++i)
599
+ {
600
+ auto& cnt = cnts[i];
601
+ std::vector<std::pair<std::string, size_t>> v{ std::make_move_iterator(cnt.begin()), std::make_move_iterator(cnt.end()) };
602
+ std::sort(v.begin(), v.end(), [](const std::pair<std::string, size_t>& a, const std::pair<std::string, size_t>& b)
603
+ {
604
+ return a.second > b.second;
605
+ });
606
+ wordFormCnts[i] = move(v);
607
+ cnt.clear();
608
+ }
609
+ }
610
+
568
611
  int restoreFromTrainingError(const exc::TrainingError& e, ThreadPool& pool, _ModelState* localData, _RandGen* rgs)
569
612
  {
570
613
  throw e;
@@ -751,11 +794,26 @@ namespace tomoto
751
794
  return ret;
752
795
  }
753
796
 
797
+ std::vector<std::tuple<std::string, Vid, Float>> vid2StringVid(const std::vector<std::pair<Vid, Float>>& vids) const
798
+ {
799
+ std::vector<std::tuple<std::string, Vid, Float>> ret(vids.size());
800
+ for (size_t i = 0; i < vids.size(); ++i)
801
+ {
802
+ ret[i] = std::make_tuple(dict.toWord(vids[i].first), vids[i].first, vids[i].second);
803
+ }
804
+ return ret;
805
+ }
806
+
754
807
  std::vector<std::pair<std::string, Float>> getWordsByTopicSorted(size_t tid, size_t topN) const override
755
808
  {
756
809
  return vid2String(getWidsByTopicSorted(tid, topN));
757
810
  }
758
811
 
812
+ std::vector<std::tuple<std::string, Vid, Float>> getWordIdsByTopicSorted(size_t tid, size_t topN) const override
813
+ {
814
+ return vid2StringVid(getWidsByTopicSorted(tid, topN));
815
+ }
816
+
759
817
  std::vector<std::pair<Vid, Float>> getWidsByDocSorted(const DocumentBase* doc, size_t topN) const
760
818
  {
761
819
  std::vector<Float> cnt(dict.size());
@@ -872,6 +930,11 @@ namespace tomoto
872
930
  return vocabDf;
873
931
  }
874
932
 
933
+ const std::vector<std::vector<std::pair<std::string, size_t>>>& getWordFormCnts() const override
934
+ {
935
+ return wordFormCnts;
936
+ }
937
+
875
938
  void saveModel(std::ostream& writer, bool fullModel, const std::vector<uint8_t>* extra_data) const override
876
939
  {
877
940
  static_cast<const _Derived*>(this)->_saveModel(writer, fullModel, extra_data);
@@ -882,6 +945,17 @@ namespace tomoto
882
945
  static_cast<_Derived*>(this)->_loadModel(reader, extra_data);
883
946
  static_cast<_Derived*>(this)->prepare(false);
884
947
  }
948
+
949
+ std::array<uint64_t, 2> getHash() const override
950
+ {
951
+ std::array<uint64_t, 2> ret;
952
+ ret[0] = dict.computeHash(0);
953
+ const std::string s = static_cast<const _Derived*>(this)->tmid().str() + static_cast<const _Derived*>(this)->twid().str();
954
+ ret[0] = serializer::computeHashMany(ret[0], s, realV, globalStep, docs.size());
955
+ ret[1] = globalState.computeHash(0);
956
+ ret[1] = static_cast<const _Derived*>(this)->computeHash(ret[1]);
957
+ return ret;
958
+ }
885
959
  };
886
960
 
887
961
  }
@@ -0,0 +1,102 @@
1
+ #include "Dictionary.h"
2
+
3
+ namespace tomoto
4
+ {
5
+ Dictionary::Dictionary() = default;
6
+ Dictionary::~Dictionary() = default;
7
+
8
+ Dictionary::Dictionary(const Dictionary&) = default;
9
+ Dictionary& Dictionary::operator=(const Dictionary&) = default;
10
+
11
+ Dictionary::Dictionary(Dictionary&&) noexcept = default;
12
+ Dictionary& Dictionary::operator=(Dictionary&&) noexcept = default;
13
+
14
+ Vid Dictionary::add(const std::string& word)
15
+ {
16
+ auto it = dict.find(word);
17
+ if (it == dict.end())
18
+ {
19
+ dict.emplace(word, (Vid)dict.size());
20
+ id2word.emplace_back(word);
21
+ return (Vid)(dict.size() - 1);
22
+ }
23
+ return it->second;
24
+ }
25
+
26
+ const std::string& Dictionary::toWord(Vid vid) const
27
+ {
28
+ assert(vid < id2word.size());
29
+ return id2word[vid];
30
+ }
31
+
32
+ Vid Dictionary::toWid(const std::string& word) const
33
+ {
34
+ auto it = dict.find(word);
35
+ if (it == dict.end()) return non_vocab_id;
36
+ return it->second;
37
+ }
38
+
39
+ void Dictionary::serializerWrite(std::ostream& writer) const
40
+ {
41
+ serializer::writeMany(writer, serializer::to_key("Dict"), id2word);
42
+ }
43
+
44
+ void Dictionary::serializerRead(std::istream& reader)
45
+ {
46
+ serializer::readMany(reader, serializer::to_key("Dict"), id2word);
47
+ for (size_t i = 0; i < id2word.size(); ++i)
48
+ {
49
+ dict.emplace(id2word[i], (Vid)i);
50
+ }
51
+ }
52
+
53
+ uint64_t Dictionary::computeHash(uint64_t seed) const
54
+ {
55
+ return serializer::computeHashMany(seed, id2word);
56
+ }
57
+
58
+ void Dictionary::swap(Dictionary& rhs)
59
+ {
60
+ std::swap(dict, rhs.dict);
61
+ std::swap(id2word, rhs.id2word);
62
+ }
63
+
64
+ void Dictionary::reorder(const std::vector<Vid>& order)
65
+ {
66
+ for (auto& p : dict)
67
+ {
68
+ p.second = order[p.second];
69
+ id2word[p.second] = p.first;
70
+ }
71
+ }
72
+
73
+ const std::vector<std::string>& Dictionary::getRaw() const
74
+ {
75
+ return id2word;
76
+ }
77
+
78
+ Vid Dictionary::mapToNewDict(Vid v, const Dictionary& newDict) const
79
+ {
80
+ return newDict.toWid(toWord(v));
81
+ }
82
+
83
+ std::vector<Vid> Dictionary::mapToNewDict(const std::vector<Vid>& v, const Dictionary& newDict) const
84
+ {
85
+ std::vector<Vid> r(v.size());
86
+ for (size_t i = 0; i < v.size(); ++i)
87
+ {
88
+ r[i] = mapToNewDict(v[i], newDict);
89
+ }
90
+ return r;
91
+ }
92
+
93
+ std::vector<Vid> Dictionary::mapToNewDictAdd(const std::vector<Vid>& v, Dictionary& newDict) const
94
+ {
95
+ std::vector<Vid> r(v.size());
96
+ for (size_t i = 0; i < v.size(); ++i)
97
+ {
98
+ r[i] = mapToNewDict(v[i], newDict);
99
+ }
100
+ return r;
101
+ }
102
+ }
@@ -12,8 +12,9 @@ namespace tomoto
12
12
  {
13
13
  using Vid = uint32_t;
14
14
  static constexpr Vid non_vocab_id = (Vid)-1;
15
+ static constexpr Vid rm_vocab_id = (Vid)-2;
15
16
  using Tid = uint16_t;
16
- static constexpr Vid non_topic_id = (Tid)-1;
17
+ static constexpr Tid non_topic_id = (Tid)-1;
17
18
  using Float = float;
18
19
 
19
20
  struct VidPair : public std::pair<Vid, Vid>
@@ -27,91 +28,41 @@ namespace tomoto
27
28
  std::unordered_map<std::string, Vid> dict;
28
29
  std::vector<std::string> id2word;
29
30
  public:
30
- Vid add(const std::string& word)
31
- {
32
- auto it = dict.find(word);
33
- if (it == dict.end())
34
- {
35
- dict.emplace(word, (Vid)dict.size());
36
- id2word.emplace_back(word);
37
- return (Vid)(dict.size() - 1);
38
- }
39
- return it->second;
40
- }
31
+
32
+ Dictionary();
33
+ ~Dictionary();
34
+
35
+ Dictionary(const Dictionary&);
36
+ Dictionary& operator=(const Dictionary&);
37
+
38
+ Dictionary(Dictionary&&) noexcept;
39
+ Dictionary& operator=(Dictionary&&) noexcept;
40
+
41
+ Vid add(const std::string& word);
41
42
 
42
43
  size_t size() const { return dict.size(); }
43
44
 
44
- const std::string& toWord(Vid vid) const
45
- {
46
- assert(vid < id2word.size());
47
- return id2word[vid];
48
- }
45
+ const std::string& toWord(Vid vid) const;
49
46
 
50
- Vid toWid(const std::string& word) const
51
- {
52
- auto it = dict.find(word);
53
- if (it == dict.end()) return non_vocab_id;
54
- return it->second;
55
- }
47
+ Vid toWid(const std::string& word) const;
56
48
 
57
- void serializerWrite(std::ostream& writer) const
58
- {
59
- serializer::writeMany(writer, serializer::to_key("Dict"), id2word);
60
- }
49
+ void serializerWrite(std::ostream& writer) const;
61
50
 
62
- void serializerRead(std::istream& reader)
63
- {
64
- serializer::readMany(reader, serializer::to_key("Dict"), id2word);
65
- for (size_t i = 0; i < id2word.size(); ++i)
66
- {
67
- dict.emplace(id2word[i], (Vid)i);
68
- }
69
- }
51
+ void serializerRead(std::istream& reader);
70
52
 
71
- void swap(Dictionary& rhs)
72
- {
73
- std::swap(dict, rhs.dict);
74
- std::swap(id2word, rhs.id2word);
75
- }
53
+ uint64_t computeHash(uint64_t seed) const;
76
54
 
77
- void reorder(const std::vector<Vid>& order)
78
- {
79
- for (auto& p : dict)
80
- {
81
- p.second = order[p.second];
82
- id2word[p.second] = p.first;
83
- }
84
- }
55
+ void swap(Dictionary& rhs);
85
56
 
86
- const std::vector<std::string>& getRaw() const
87
- {
88
- return id2word;
89
- }
57
+ void reorder(const std::vector<Vid>& order);
90
58
 
91
- Vid mapToNewDict(Vid v, const Dictionary& newDict) const
92
- {
93
- return newDict.toWid(toWord(v));
94
- }
59
+ const std::vector<std::string>& getRaw() const;
95
60
 
96
- std::vector<Vid> mapToNewDict(const std::vector<Vid>& v, const Dictionary& newDict) const
97
- {
98
- std::vector<Vid> r(v.size());
99
- for (size_t i = 0; i < v.size(); ++i)
100
- {
101
- r[i] = mapToNewDict(v[i], newDict);
102
- }
103
- return r;
104
- }
61
+ Vid mapToNewDict(Vid v, const Dictionary& newDict) const;
105
62
 
106
- std::vector<Vid> mapToNewDictAdd(const std::vector<Vid>& v, Dictionary& newDict) const
107
- {
108
- std::vector<Vid> r(v.size());
109
- for (size_t i = 0; i < v.size(); ++i)
110
- {
111
- r[i] = mapToNewDict(v[i], newDict);
112
- }
113
- return r;
114
- }
63
+ std::vector<Vid> mapToNewDict(const std::vector<Vid>& v, const Dictionary& newDict) const;
64
+
65
+ std::vector<Vid> mapToNewDictAdd(const std::vector<Vid>& v, Dictionary& newDict) const;
115
66
  };
116
67
 
117
68
  }
@@ -126,4 +77,4 @@ namespace std
126
77
  return hash<size_t>{}(p.first) ^ hash<size_t>{}(p.second);
127
78
  }
128
79
  };
129
- }
80
+ }
@@ -0,0 +1,146 @@
1
+ #include <cstdint>
2
+ #include "Mmap.h"
3
+
4
+ namespace tomoto
5
+ {
6
+ namespace utils
7
+ {
8
// Converts a UTF-8 encoded string to UTF-16.
// Code points above U+FFFF are emitted as a surrogate pair.
// Throws std::invalid_argument when the input ends in the middle of a
// sequence, when a continuation byte is malformed, when a lead byte is not a
// valid 1-4 byte lead, or when the decoded value exceeds U+10FFFF.
static std::u16string utf8To16(const std::string& str)
{
	std::u16string ret;
	ret.reserve(str.size());

	auto it = str.begin();
	const auto last = str.end();

	// Advances to the next byte, which must be a continuation byte (10xxxxxx),
	// and returns its six payload bits.
	const auto nextTrailing = [&]() -> uint32_t
	{
		if (++it == last) throw std::invalid_argument{ "unexpected ending" };
		// Go through uint8_t so that a signed char is not sign-extended.
		const uint32_t b = static_cast<uint8_t>(*it);
		if ((b & 0xC0) != 0x80) throw std::invalid_argument{ "unexpected trailing byte" };
		return b & 0x3F;
	};

	for (; it != last; ++it)
	{
		const uint32_t lead = static_cast<uint8_t>(*it);
		uint32_t code = 0;
		if ((lead & 0xF8) == 0xF0) // 11110xxx: four-byte sequence
		{
			code = (lead & 0x07) << 18;
			code |= nextTrailing() << 12;
			code |= nextTrailing() << 6;
			code |= nextTrailing();
		}
		else if ((lead & 0xF0) == 0xE0) // 1110xxxx: three-byte sequence
		{
			code = (lead & 0x0F) << 12;
			code |= nextTrailing() << 6;
			code |= nextTrailing();
		}
		else if ((lead & 0xE0) == 0xC0) // 110xxxxx: two-byte sequence
		{
			code = (lead & 0x1F) << 6;
			code |= nextTrailing();
		}
		else if ((lead & 0x80) == 0x00) // 0xxxxxxx: ASCII
		{
			code = lead;
		}
		else
		{
			throw std::invalid_argument{ "unicode error" };
		}

		if (code < 0x10000)
		{
			ret.push_back(static_cast<char16_t>(code));
		}
		else if (code <= 0x10FFFF) // U+10FFFF itself is the last valid code point
		{
			code -= 0x10000;
			ret.push_back(static_cast<char16_t>(0xD800 | (code >> 10)));
			ret.push_back(static_cast<char16_t>(0xDC00 | (code & 0x3FF)));
		}
		else
		{
			throw std::invalid_argument{ "unicode error" };
		}
	}
	return ret;
}
71
+ }
72
+ }
73
+
74
+ namespace tomoto
75
+ {
76
+ namespace utils
77
+ {
78
+ MMap::MMap(const std::string& filepath)
79
+ {
80
+ #ifdef _WIN32
81
+ hFile = CreateFileW((const wchar_t*)utf8To16(filepath).c_str(), GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_READONLY, nullptr);
82
+ if (hFile == INVALID_HANDLE_VALUE) throw std::ios_base::failure("Cannot open '" + filepath + "'");
83
+ hFileMap = CreateFileMapping(hFile, nullptr, PAGE_READONLY, 0, 0, nullptr);
84
+ if (hFileMap == nullptr) throw std::ios_base::failure("Cannot open '" + filepath + "' Code:" + std::to_string(GetLastError()));
85
+ view = (const char*)MapViewOfFile(hFileMap, FILE_MAP_READ, 0, 0, 0);
86
+ if (!view) throw std::ios_base::failure("Cannot MapViewOfFile() Code:" + std::to_string(GetLastError()));
87
+ DWORD high;
88
+ len = GetFileSize(hFile, &high);
89
+ len |= (uint64_t)high << 32;
90
+ #else
91
+ fd = open(filepath.c_str(), O_RDONLY);
92
+ if (fd == -1) throw std::ios_base::failure("Cannot open '" + filepath + "'");
93
+ struct stat sb;
94
+ if (fstat(fd, &sb) < 0) throw std::ios_base::failure("Cannot open '" + filepath + "'");
95
+ len = sb.st_size;
96
+ view = (const char*)mmap(nullptr, len, PROT_READ, MAP_PRIVATE, fd, 0);
97
+ if (view == MAP_FAILED) throw std::ios_base::failure("Mapping failed");
98
+ #endif
99
+ }
100
+
101
+ #ifdef _WIN32
102
+ MMap::MMap(MMap&& o) noexcept
103
+ : view{ o.view }, len{ o.len }
104
+ {
105
+ o.view = nullptr;
106
+ std::swap(hFile, o.hFile);
107
+ std::swap(hFileMap, o.hFileMap);
108
+ }
109
+ #else
110
+ MMap::MMap(MMap&& o) noexcept
111
+ : len{ o.len }, fd{ std::move(o.fd) }
112
+ {
113
+ std::swap(view, o.view);
114
+ }
115
+ #endif
116
+
117
+ MMap& MMap::operator=(MMap&& o) noexcept
118
+ {
119
+ std::swap(view, o.view);
120
+ std::swap(len, o.len);
121
+ #ifdef _WIN32
122
+ std::swap(hFile, o.hFile);
123
+ std::swap(hFileMap, o.hFileMap);
124
+ #else
125
+ std::swap(fd, o.fd);
126
+ #endif
127
+ return *this;
128
+ }
129
+
130
+ MMap::~MMap()
131
+ {
132
+ #ifdef _WIN32
133
+ if (hFileMap)
134
+ {
135
+ UnmapViewOfFile(view);
136
+ view = nullptr;
137
+ }
138
+ #else
139
+ if (view)
140
+ {
141
+ munmap((void*)view, len);
142
+ }
143
+ #endif
144
+ }
145
+ }
146
+ }
@@ -0,0 +1,139 @@
1
+ #pragma once
2
+ #include <string>
3
+ #include <iostream>
4
+
5
+ #ifdef _WIN32
6
+ #define NOMINMAX
7
+ #include <Windows.h>
8
+ namespace tomoto
9
+ {
10
+ namespace utils
11
+ {
12
+ namespace detail
13
+ {
14
+ class HandleGuard
15
+ {
16
+ HANDLE handle = nullptr;
17
+ public:
18
+ HandleGuard(HANDLE _handle = nullptr) : handle(_handle)
19
+ {
20
+ }
21
+
22
+ HandleGuard(const HandleGuard&) = delete;
23
+ HandleGuard& operator =(const HandleGuard&) = delete;
24
+
25
+ HandleGuard(HandleGuard&& o) noexcept
26
+ {
27
+ std::swap(handle, o.handle);
28
+ }
29
+
30
+ HandleGuard& operator=(HandleGuard&& o) noexcept
31
+ {
32
+ std::swap(handle, o.handle);
33
+ return *this;
34
+ }
35
+
36
+ ~HandleGuard()
37
+ {
38
+ if (handle && handle != INVALID_HANDLE_VALUE)
39
+ {
40
+ CloseHandle(handle);
41
+ handle = nullptr;
42
+ }
43
+ }
44
+
45
+ operator HANDLE() const
46
+ {
47
+ return handle;
48
+ }
49
+ };
50
+ }
51
+
52
+ class MMap
53
+ {
54
+ const char* view = nullptr;
55
+ uint64_t len = 0;
56
+ detail::HandleGuard hFile, hFileMap;
57
+ public:
58
+ MMap(const std::string& filepath);
59
+ MMap(const MMap&) = delete;
60
+ MMap& operator=(const MMap&) = delete;
61
+ MMap(MMap&& o) noexcept;
62
+ MMap& operator=(MMap&& o) noexcept;
63
+ ~MMap();
64
+
65
+ const char* get() const { return view; }
66
+ size_t size() const { return len; }
67
+ };
68
+ }
69
+ }
70
+ #else
71
+ #include <unistd.h>
72
+ #include <sys/types.h>
73
+ #include <sys/stat.h>
74
+ #include <fcntl.h>
75
+ #include <sys/mman.h>
76
+
77
+ namespace tomoto
78
+ {
79
+ namespace utils
80
+ {
81
+ namespace detail
82
+ {
83
// Move-only owner of a POSIX file descriptor; the descriptor is closed when
// the guard is destroyed. -1 (the value open() returns on failure) marks an
// empty guard, so descriptor 0 is treated as a real descriptor and closed
// like any other.
class FDGuard
{
	int fd = -1;
public:
	FDGuard(int _fd = -1) noexcept : fd(_fd)
	{
	}

	FDGuard(const FDGuard&) = delete;
	FDGuard& operator =(const FDGuard&) = delete;

	// Takes over the descriptor and leaves `o` empty.
	FDGuard(FDGuard&& o) noexcept : fd(o.fd)
	{
		o.fd = -1;
	}

	// Exchanges descriptors; the one previously held here is closed when `o` is destroyed.
	FDGuard& operator=(FDGuard&& o) noexcept
	{
		std::swap(fd, o.fd);
		return *this;
	}

	~FDGuard()
	{
		if (fd != -1)
		{
			close(fd);
			fd = -1;
		}
	}

	// Returns the raw descriptor, or -1 when the guard is empty.
	operator int() const noexcept
	{
		return fd;
	}
};
119
+ }
120
+
121
+ class MMap
122
+ {
123
+ const char* view = nullptr;
124
+ size_t len = 0;
125
+ detail::FDGuard fd;
126
+ public:
127
+ MMap(const std::string& filepath);
128
+ MMap(const MMap&) = delete;
129
+ MMap& operator=(const MMap&) = delete;
130
+ MMap(MMap&& o) noexcept;
131
+ MMap& operator=(MMap&& o) noexcept;
132
+ ~MMap();
133
+
134
+ const char* get() const { return view; }
135
+ size_t size() const { return len; }
136
+ };
137
+ }
138
+ }
139
+ #endif