ooxml_crypt 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (264) hide show
  1. checksums.yaml +7 -0
  2. data/Gemfile +4 -0
  3. data/LICENSE.txt +21 -0
  4. data/README.md +58 -0
  5. data/Rakefile +12 -0
  6. data/bin/console +15 -0
  7. data/bin/setup +8 -0
  8. data/ext/ooxml_crypt/extconf.rb +18 -0
  9. data/ext/ooxml_crypt/ooxml_crypt.c +27 -0
  10. data/ext/ooxml_crypt/ooxml_crypt.h +7 -0
  11. data/lib/ooxml_crypt/version.rb +5 -0
  12. data/lib/ooxml_crypt.rb +75 -0
  13. data/vendor/cybozulib/.github/workflows/main.yml +12 -0
  14. data/vendor/cybozulib/.gitignore +5 -0
  15. data/vendor/cybozulib/CMakeLists.txt +6 -0
  16. data/vendor/cybozulib/COPYRIGHT +27 -0
  17. data/vendor/cybozulib/Makefile +26 -0
  18. data/vendor/cybozulib/bin/libeay32.dll +0 -0
  19. data/vendor/cybozulib/bin/libmecab.dll +0 -0
  20. data/vendor/cybozulib/bin/ssleay32.dll +0 -0
  21. data/vendor/cybozulib/common.mk +116 -0
  22. data/vendor/cybozulib/common.props +25 -0
  23. data/vendor/cybozulib/cybozulib.sln +286 -0
  24. data/vendor/cybozulib/debug.props +14 -0
  25. data/vendor/cybozulib/include/cybozu/array.hpp +197 -0
  26. data/vendor/cybozulib/include/cybozu/atoi.hpp +238 -0
  27. data/vendor/cybozulib/include/cybozu/atomic.hpp +146 -0
  28. data/vendor/cybozulib/include/cybozu/base64.hpp +210 -0
  29. data/vendor/cybozulib/include/cybozu/benchmark.hpp +212 -0
  30. data/vendor/cybozulib/include/cybozu/bfd.hpp +105 -0
  31. data/vendor/cybozulib/include/cybozu/bit_operation.hpp +139 -0
  32. data/vendor/cybozulib/include/cybozu/bitvector.hpp +358 -0
  33. data/vendor/cybozulib/include/cybozu/condition_variable.hpp +113 -0
  34. data/vendor/cybozulib/include/cybozu/condition_variable_cs.hpp +74 -0
  35. data/vendor/cybozulib/include/cybozu/config.hpp +392 -0
  36. data/vendor/cybozulib/include/cybozu/critical_section.hpp +60 -0
  37. data/vendor/cybozulib/include/cybozu/crypto.hpp +321 -0
  38. data/vendor/cybozulib/include/cybozu/csucvector.hpp +624 -0
  39. data/vendor/cybozulib/include/cybozu/csv.hpp +294 -0
  40. data/vendor/cybozulib/include/cybozu/data_type.hpp +27 -0
  41. data/vendor/cybozulib/include/cybozu/endian.hpp +224 -0
  42. data/vendor/cybozulib/include/cybozu/env.hpp +63 -0
  43. data/vendor/cybozulib/include/cybozu/event.hpp +122 -0
  44. data/vendor/cybozulib/include/cybozu/exception.hpp +253 -0
  45. data/vendor/cybozulib/include/cybozu/file.hpp +626 -0
  46. data/vendor/cybozulib/include/cybozu/fmindex.hpp +291 -0
  47. data/vendor/cybozulib/include/cybozu/format.hpp +93 -0
  48. data/vendor/cybozulib/include/cybozu/frequency.hpp +264 -0
  49. data/vendor/cybozulib/include/cybozu/hash.hpp +67 -0
  50. data/vendor/cybozulib/include/cybozu/inttype.hpp +174 -0
  51. data/vendor/cybozulib/include/cybozu/itoa.hpp +336 -0
  52. data/vendor/cybozulib/include/cybozu/json.hpp +120 -0
  53. data/vendor/cybozulib/include/cybozu/line_stream.hpp +149 -0
  54. data/vendor/cybozulib/include/cybozu/link_libeay32.hpp +21 -0
  55. data/vendor/cybozulib/include/cybozu/link_mpir.hpp +18 -0
  56. data/vendor/cybozulib/include/cybozu/link_ssleay32.hpp +19 -0
  57. data/vendor/cybozulib/include/cybozu/log.hpp +237 -0
  58. data/vendor/cybozulib/include/cybozu/minixml.hpp +452 -0
  59. data/vendor/cybozulib/include/cybozu/mmap.hpp +143 -0
  60. data/vendor/cybozulib/include/cybozu/mutex.hpp +144 -0
  61. data/vendor/cybozulib/include/cybozu/nlp/mecab.hpp +96 -0
  62. data/vendor/cybozulib/include/cybozu/nlp/plsi.hpp +315 -0
  63. data/vendor/cybozulib/include/cybozu/nlp/random.hpp +74 -0
  64. data/vendor/cybozulib/include/cybozu/nlp/sparse.hpp +529 -0
  65. data/vendor/cybozulib/include/cybozu/nlp/svd.hpp +486 -0
  66. data/vendor/cybozulib/include/cybozu/nlp/tfidf.hpp +226 -0
  67. data/vendor/cybozulib/include/cybozu/nlp/top_score.hpp +75 -0
  68. data/vendor/cybozulib/include/cybozu/option.hpp +743 -0
  69. data/vendor/cybozulib/include/cybozu/parallel.hpp +88 -0
  70. data/vendor/cybozulib/include/cybozu/pcg.hpp +72 -0
  71. data/vendor/cybozulib/include/cybozu/process.hpp +324 -0
  72. data/vendor/cybozulib/include/cybozu/quit_signal_handler.hpp +66 -0
  73. data/vendor/cybozulib/include/cybozu/random_generator.hpp +144 -0
  74. data/vendor/cybozulib/include/cybozu/regex.hpp +463 -0
  75. data/vendor/cybozulib/include/cybozu/select8.hpp +279 -0
  76. data/vendor/cybozulib/include/cybozu/serializer.hpp +363 -0
  77. data/vendor/cybozulib/include/cybozu/sha1.hpp +209 -0
  78. data/vendor/cybozulib/include/cybozu/sha2.hpp +506 -0
  79. data/vendor/cybozulib/include/cybozu/siphash.hpp +105 -0
  80. data/vendor/cybozulib/include/cybozu/socket.hpp +785 -0
  81. data/vendor/cybozulib/include/cybozu/ssl.hpp +203 -0
  82. data/vendor/cybozulib/include/cybozu/stacktrace.hpp +291 -0
  83. data/vendor/cybozulib/include/cybozu/stream.hpp +269 -0
  84. data/vendor/cybozulib/include/cybozu/string.hpp +1746 -0
  85. data/vendor/cybozulib/include/cybozu/string_operation.hpp +365 -0
  86. data/vendor/cybozulib/include/cybozu/sucvector.hpp +378 -0
  87. data/vendor/cybozulib/include/cybozu/test.hpp +373 -0
  88. data/vendor/cybozulib/include/cybozu/thread.hpp +229 -0
  89. data/vendor/cybozulib/include/cybozu/time.hpp +281 -0
  90. data/vendor/cybozulib/include/cybozu/tls.hpp +115 -0
  91. data/vendor/cybozulib/include/cybozu/unordered_map.hpp +13 -0
  92. data/vendor/cybozulib/include/cybozu/unordered_set.hpp +13 -0
  93. data/vendor/cybozulib/include/cybozu/v128.hpp +376 -0
  94. data/vendor/cybozulib/include/cybozu/wavelet_matrix.hpp +345 -0
  95. data/vendor/cybozulib/include/cybozu/xorshift.hpp +189 -0
  96. data/vendor/cybozulib/include/cybozu/zlib.hpp +325 -0
  97. data/vendor/cybozulib/include/sais.hxx +364 -0
  98. data/vendor/cybozulib/misc/make_select8tbl.cpp +26 -0
  99. data/vendor/cybozulib/mk.bat +37 -0
  100. data/vendor/cybozulib/readme.md +29 -0
  101. data/vendor/cybozulib/release.props +12 -0
  102. data/vendor/cybozulib/sample/Makefile +30 -0
  103. data/vendor/cybozulib/sample/csucvector_smpl.cpp +42 -0
  104. data/vendor/cybozulib/sample/data/svd/org/test1.S +4 -0
  105. data/vendor/cybozulib/sample/data/svd/org/test1.U +4 -0
  106. data/vendor/cybozulib/sample/data/svd/org/test1.V +6 -0
  107. data/vendor/cybozulib/sample/data/svd/test1 +4 -0
  108. data/vendor/cybozulib/sample/data/svd/test2 +4 -0
  109. data/vendor/cybozulib/sample/desymbol.cpp +127 -0
  110. data/vendor/cybozulib/sample/exception_smpl.cpp +46 -0
  111. data/vendor/cybozulib/sample/fmindex_smpl.cpp +231 -0
  112. data/vendor/cybozulib/sample/log_smpl.cpp +19 -0
  113. data/vendor/cybozulib/sample/mecab_smpl.cpp +37 -0
  114. data/vendor/cybozulib/sample/option2_smpl.cpp +68 -0
  115. data/vendor/cybozulib/sample/option_smpl.cpp +42 -0
  116. data/vendor/cybozulib/sample/plsi_smpl.cpp +207 -0
  117. data/vendor/cybozulib/sample/proj/exception_smpl.vcproj +184 -0
  118. data/vendor/cybozulib/sample/proj/mecab_smpl.vcproj +184 -0
  119. data/vendor/cybozulib/sample/proj/ssl_smpl/ssl_smpl.vcxproj +85 -0
  120. data/vendor/cybozulib/sample/proj/ssl_smpl.vcproj +347 -0
  121. data/vendor/cybozulib/sample/proj/stacktrace_smpl/stacktrace_smpl.vcxproj +85 -0
  122. data/vendor/cybozulib/sample/proj/svd_smpl.vcproj +184 -0
  123. data/vendor/cybozulib/sample/quit_signal_handler.cpp +30 -0
  124. data/vendor/cybozulib/sample/serializer_smpl.cpp +196 -0
  125. data/vendor/cybozulib/sample/socket_smpl.cpp +82 -0
  126. data/vendor/cybozulib/sample/ssl_smpl.cpp +39 -0
  127. data/vendor/cybozulib/sample/stacktrace_smpl.cpp +52 -0
  128. data/vendor/cybozulib/sample/svd_bench_smpl.cpp +143 -0
  129. data/vendor/cybozulib/sample/svd_smpl.cpp +94 -0
  130. data/vendor/cybozulib/sample/wm_bench_smpl.cpp +182 -0
  131. data/vendor/cybozulib/sample/zlib_smpl.cpp +41 -0
  132. data/vendor/cybozulib/src/Makefile +8 -0
  133. data/vendor/cybozulib/src/base/Makefile +19 -0
  134. data/vendor/cybozulib/test/Makefile +12 -0
  135. data/vendor/cybozulib/test/base/Makefile +37 -0
  136. data/vendor/cybozulib/test/base/array_test.cpp +173 -0
  137. data/vendor/cybozulib/test/base/atoi_test.cpp +774 -0
  138. data/vendor/cybozulib/test/base/atomic_test.cpp +49 -0
  139. data/vendor/cybozulib/test/base/base64_test.cpp +113 -0
  140. data/vendor/cybozulib/test/base/bit_operation_test.cpp +134 -0
  141. data/vendor/cybozulib/test/base/bitvector_test.cpp +204 -0
  142. data/vendor/cybozulib/test/base/condition_variable_cs_test.cpp +92 -0
  143. data/vendor/cybozulib/test/base/condition_variable_test.cpp +88 -0
  144. data/vendor/cybozulib/test/base/config_test.cpp +236 -0
  145. data/vendor/cybozulib/test/base/crypto_test.cpp +122 -0
  146. data/vendor/cybozulib/test/base/csucvector_test.cpp +63 -0
  147. data/vendor/cybozulib/test/base/csv_test.cpp +182 -0
  148. data/vendor/cybozulib/test/base/data/a.xml +26 -0
  149. data/vendor/cybozulib/test/base/endian_test.cpp +56 -0
  150. data/vendor/cybozulib/test/base/env_test.cpp +22 -0
  151. data/vendor/cybozulib/test/base/event_test.cpp +41 -0
  152. data/vendor/cybozulib/test/base/file_test.cpp +233 -0
  153. data/vendor/cybozulib/test/base/fmindex_test.cpp +118 -0
  154. data/vendor/cybozulib/test/base/format_test.cpp +12 -0
  155. data/vendor/cybozulib/test/base/frequency_test.cpp +104 -0
  156. data/vendor/cybozulib/test/base/itoa_test.cpp +522 -0
  157. data/vendor/cybozulib/test/base/line_stream_test.cpp +208 -0
  158. data/vendor/cybozulib/test/base/mecab_test.cpp +41 -0
  159. data/vendor/cybozulib/test/base/minixml_test.cpp +103 -0
  160. data/vendor/cybozulib/test/base/mmap_test.cpp +15 -0
  161. data/vendor/cybozulib/test/base/option_test.cpp +487 -0
  162. data/vendor/cybozulib/test/base/parallel_test.cpp +48 -0
  163. data/vendor/cybozulib/test/base/proj/array_test/array_test.vcxproj +86 -0
  164. data/vendor/cybozulib/test/base/proj/atoi_test/atoi_test.vcxproj +86 -0
  165. data/vendor/cybozulib/test/base/proj/atomic_test/atomic_test.vcxproj +86 -0
  166. data/vendor/cybozulib/test/base/proj/base64_test/base64_test.vcxproj +86 -0
  167. data/vendor/cybozulib/test/base/proj/condition_variable_cs_test/condition_variable_cs_test.vcxproj +86 -0
  168. data/vendor/cybozulib/test/base/proj/condition_variable_test/condition_variable_test.vcxproj +86 -0
  169. data/vendor/cybozulib/test/base/proj/config_test/config_test.vcxproj +86 -0
  170. data/vendor/cybozulib/test/base/proj/csv_test/csv_test.vcxproj +86 -0
  171. data/vendor/cybozulib/test/base/proj/endian_test/endian_test.vcxproj +86 -0
  172. data/vendor/cybozulib/test/base/proj/env_test/env_test.vcxproj +86 -0
  173. data/vendor/cybozulib/test/base/proj/event_test/event_test.vcxproj +86 -0
  174. data/vendor/cybozulib/test/base/proj/file_test/file_test.vcxproj +86 -0
  175. data/vendor/cybozulib/test/base/proj/itoa_test/itoa_test.vcxproj +86 -0
  176. data/vendor/cybozulib/test/base/proj/mecab_test/mecab_test.vcxproj +88 -0
  177. data/vendor/cybozulib/test/base/proj/minixml_test/minixml_test.vcxproj +86 -0
  178. data/vendor/cybozulib/test/base/proj/mmap_test/mmap_test.vcxproj +86 -0
  179. data/vendor/cybozulib/test/base/proj/serializer_test/serializer_test.vcxproj +86 -0
  180. data/vendor/cybozulib/test/base/proj/sha1_test/sha1_test.vcxproj +86 -0
  181. data/vendor/cybozulib/test/base/proj/stream_test/stream_test.vcxproj +86 -0
  182. data/vendor/cybozulib/test/base/proj/string_operation_test/string_operation_test.vcxproj +86 -0
  183. data/vendor/cybozulib/test/base/proj/string_test/string_test.vcxproj +86 -0
  184. data/vendor/cybozulib/test/base/proj/thread_test/thread_test.vcxproj +86 -0
  185. data/vendor/cybozulib/test/base/proj/time_test/time_test.vcxproj +86 -0
  186. data/vendor/cybozulib/test/base/proj/tls_test/tls_test.vcxproj +86 -0
  187. data/vendor/cybozulib/test/base/proj/zlib_test/zlib_test.vcxproj +86 -0
  188. data/vendor/cybozulib/test/base/random_generator_test.cpp +28 -0
  189. data/vendor/cybozulib/test/base/regex_test.cpp +74 -0
  190. data/vendor/cybozulib/test/base/serializer_test.cpp +483 -0
  191. data/vendor/cybozulib/test/base/sha1_test.cpp +61 -0
  192. data/vendor/cybozulib/test/base/sha2_test.cpp +191 -0
  193. data/vendor/cybozulib/test/base/siphash_test.cpp +33 -0
  194. data/vendor/cybozulib/test/base/socket_test.cpp +76 -0
  195. data/vendor/cybozulib/test/base/stream_test.cpp +101 -0
  196. data/vendor/cybozulib/test/base/string_operation_test.cpp +340 -0
  197. data/vendor/cybozulib/test/base/string_test.cpp +1705 -0
  198. data/vendor/cybozulib/test/base/sucvector_test.cpp +312 -0
  199. data/vendor/cybozulib/test/base/thread_test.cpp +62 -0
  200. data/vendor/cybozulib/test/base/time_test.cpp +164 -0
  201. data/vendor/cybozulib/test/base/tls_test.cpp +50 -0
  202. data/vendor/cybozulib/test/base/wavelet_matrix_test.cpp +145 -0
  203. data/vendor/cybozulib/test/base/zlib_test.cpp +371 -0
  204. data/vendor/cybozulib/test/nlp/Makefile +27 -0
  205. data/vendor/cybozulib/test/nlp/proj/random_test.vcproj +184 -0
  206. data/vendor/cybozulib/test/nlp/proj/sparse_test.vcproj +184 -0
  207. data/vendor/cybozulib/test/nlp/proj/svd_test.vcproj +184 -0
  208. data/vendor/cybozulib/test/nlp/random_test.cpp +62 -0
  209. data/vendor/cybozulib/test/nlp/sparse_test.cpp +347 -0
  210. data/vendor/cybozulib/test/nlp/svd_test.cpp +234 -0
  211. data/vendor/cybozulib/test/nlp/top_score_test.cpp +40 -0
  212. data/vendor/cybozulib/tool/create_vcproj.py +186 -0
  213. data/vendor/cybozulib/tool/vcproj_tmpl.py +185 -0
  214. data/vendor/msoffice/COPYRIGHT +27 -0
  215. data/vendor/msoffice/Makefile +29 -0
  216. data/vendor/msoffice/bin/64/msoc.dll +0 -0
  217. data/vendor/msoffice/bin/64/msocsample.exe +0 -0
  218. data/vendor/msoffice/bin/64/msoffice-crypt.exe +0 -0
  219. data/vendor/msoffice/bin/msoc.dll +0 -0
  220. data/vendor/msoffice/bin/msocsample.exe +0 -0
  221. data/vendor/msoffice/bin/msoffice-crypt.exe +0 -0
  222. data/vendor/msoffice/common.mk +71 -0
  223. data/vendor/msoffice/common.props +26 -0
  224. data/vendor/msoffice/debug.props +14 -0
  225. data/vendor/msoffice/include/attack.hpp +211 -0
  226. data/vendor/msoffice/include/cfb.hpp +777 -0
  227. data/vendor/msoffice/include/crypto_util.hpp +450 -0
  228. data/vendor/msoffice/include/custom_sha1.hpp +342 -0
  229. data/vendor/msoffice/include/decode.hpp +240 -0
  230. data/vendor/msoffice/include/encode.hpp +221 -0
  231. data/vendor/msoffice/include/make_dataspace.hpp +316 -0
  232. data/vendor/msoffice/include/msoc.h +129 -0
  233. data/vendor/msoffice/include/resource.hpp +7 -0
  234. data/vendor/msoffice/include/standard_encryption.hpp +145 -0
  235. data/vendor/msoffice/include/uint32vec.hpp +179 -0
  236. data/vendor/msoffice/include/util.hpp +212 -0
  237. data/vendor/msoffice/lib/.emptydir +0 -0
  238. data/vendor/msoffice/misc/decrypt-xls.vbs +46 -0
  239. data/vendor/msoffice/mk.bat +1 -0
  240. data/vendor/msoffice/mkdll.bat +3 -0
  241. data/vendor/msoffice/msoc.def +13 -0
  242. data/vendor/msoffice/msocsample.py +178 -0
  243. data/vendor/msoffice/msoffice12.sln +31 -0
  244. data/vendor/msoffice/readme.md +110 -0
  245. data/vendor/msoffice/release.props +28 -0
  246. data/vendor/msoffice/src/Makefile +19 -0
  247. data/vendor/msoffice/src/attack.cpp +124 -0
  248. data/vendor/msoffice/src/cfb_test.cpp +77 -0
  249. data/vendor/msoffice/src/minisample.c +54 -0
  250. data/vendor/msoffice/src/msocdll.cpp +276 -0
  251. data/vendor/msoffice/src/msocsample.c +136 -0
  252. data/vendor/msoffice/src/msoffice-crypt.cpp +219 -0
  253. data/vendor/msoffice/src/proj/attack/attack.vcxproj +88 -0
  254. data/vendor/msoffice/src/proj/main/msoffice-crypt.vcxproj +88 -0
  255. data/vendor/msoffice/src/sha1.cpp +234 -0
  256. data/vendor/msoffice/test/Makefile +20 -0
  257. data/vendor/msoffice/test/cfb_test.cpp +74 -0
  258. data/vendor/msoffice/test/hash_test.cpp +59 -0
  259. data/vendor/msoffice/test/proj/cfb/cfb_test.vcxproj +90 -0
  260. data/vendor/msoffice/test/proj/hash/hash_test.vcxproj +90 -0
  261. data/vendor/msoffice/test/sampl.bat +8 -0
  262. data/vendor/msoffice/test_all.py +46 -0
  263. data/vendor/update +4 -0
  264. metadata +351 -0
@@ -0,0 +1,291 @@
1
+ #pragma once
2
+ /**
3
+ @file
4
+ @brief FM-index
5
+ @author MITSUNARI Shigeo(@herumi)
6
+ @license modified new BSD license
7
+ http://opensource.org/licenses/BSD-3-Clause
8
+ */
9
+ #include <map>
10
+ #include <vector>
11
+ #include <fstream>
12
+ #include <stdio.h>
13
+ #ifdef CYBOZU_FMINDEX_USE_CSUCVECTOR
14
+ #include <cybozu/csucvector.hpp>
15
+ #endif
16
+ #include <cybozu/wavelet_matrix.hpp>
17
+ #include <cybozu/bitvector.hpp>
18
+ #include <cybozu/frequency.hpp>
19
+
20
+ #ifdef _MSC_VER
21
+ #pragma warning(push)
22
+ #pragma warning(disable:4244)
23
+ #pragma warning(disable:4389)
24
+ #pragma warning(disable:4018)
25
+ #endif
26
+ #include "sais.hxx"
27
+ #ifdef _MSC_VER
28
+ #pragma warning(pop)
29
+ #endif
30
+
31
+ #ifdef _MSC_VER
32
+ #pragma warning(push)
33
+ #pragma warning(disable:4127) // constant condition
34
+ #endif
35
+
36
+ namespace cybozu {
37
+ /*
38
+ T : type of alphabet
39
+ isRawData : deal with input data as is
40
+ T must be uint8_t or uint16_t if isRawData
41
+ */
42
+ template<class T, bool isRawData = false>
43
+ class FMindexT {
44
+ public:
45
+ static const size_t maxCharNum = size_t(1) << (sizeof(T) * 8);
46
+ typedef std::vector<uint32_t> Vec32;
47
+ typedef std::vector<T> Vec;
48
+ #ifdef CYBOZU_FMINDEX_USE_CSUCVECTOR
49
+ typedef cybozu::CSucVector SucVector;
50
+ #else
51
+ typedef cybozu::SucVectorT<uint32_t, false> SucVector;
52
+ #endif
53
+ typedef cybozu::WaveletMatrixT<false, SucVector> WaveletMatrix;
54
+ Vec32 cf;
55
+ WaveletMatrix wm;
56
+ Vec32 alignedSa;
57
+ SucVector alignedPos;
58
+ cybozu::Frequency<T, uint32_t> freq;
59
+ int skip_;
60
+ size_t charNum_;
61
+
62
+ /*
63
+ setup freq, cf by [begin, end)
64
+ */
65
+ template<class Iter>
66
+ void initCf(Vec& v, Iter begin, Iter end)
67
+ {
68
+ const size_t size = std::distance(begin, end);
69
+ if (size >= (uint64_t(1) << 32) - 1) {
70
+ throw cybozu::Exception("FMindexT:initCf:too large dataSize") << size;
71
+ }
72
+ v.resize(size + 1); // add NUL at the end of data
73
+ if (isRawData) {
74
+ assert(sizeof(T) <= 16);
75
+ charNum_ = size_t(1) << (sizeof(T) * 8);
76
+ std::vector<uint32_t> charNumTbl(charNum_);
77
+ charNumTbl[0] = 1;
78
+ for (size_t i = 0; i < size; i++) {
79
+ T c = *begin++;
80
+ if (c <= 0) throw cybozu::Exception("FMindext:initCf:zero alphabet") << c;
81
+ v[i] = c;
82
+ charNumTbl[c]++;
83
+ }
84
+ cf.resize(charNum_);
85
+ uint32_t sum = 0;
86
+ for (size_t i = 0; i < charNum_; i++) {
87
+ cf[i] = sum;
88
+ sum += charNumTbl[i];
89
+ }
90
+ } else {
91
+ freq.init(begin, end);
92
+ charNum_ = freq.size() + 1; // +1 means last zero
93
+ if (charNum_ > maxCharNum) throw cybozu::Exception("FMindexT:initCf:too many alphabet");
94
+ for (size_t i = 0; i < size; i++) {
95
+ v[i] = static_cast<T>(freq.getIndex(*begin++) + 1);
96
+ }
97
+ cf.resize(charNum_);
98
+ cf[0] = 0;
99
+ uint32_t sum = 1;
100
+ for (size_t i = 1; i < charNum_; i++) {
101
+ cf[i] = sum;
102
+ sum += freq.getFrequency(freq.getElement(i - 1));
103
+ }
104
+ }
105
+ }
106
+ void initBwt(Vec& bwt, const Vec& s, const Vec32& sa) const
107
+ {
108
+ const size_t size = sa.size();
109
+ bwt.resize(size);
110
+ for (size_t i = 0; i < size; i++) {
111
+ if (sa[i] > 0) {
112
+ bwt[i] = s[sa[i] - 1];
113
+ } else {
114
+ bwt[i] = s[size - 1];
115
+ }
116
+ }
117
+ }
118
+ size_t getBitLen(size_t x) const
119
+ {
120
+ if (x == 0) return 1;
121
+ size_t ret = 0;
122
+ while (x > 0) {
123
+ x >>= 1;
124
+ ret++;
125
+ }
126
+ return ret;
127
+ }
128
+ public:
129
+ FMindexT()
130
+ : skip_(8)
131
+ , charNum_(0)
132
+ {
133
+ }
134
+
135
+ /*
136
+ [begin, end)
137
+ replace '\0' in [begin, end) with space
138
+ append '\0' at the end of [begin, end)
139
+ */
140
+ template<class Iter>
141
+ void init(Iter begin, Iter end, int skip = 8)
142
+ {
143
+ if (skip <= 0) {
144
+ throw cybozu::Exception("FMindexT:buildFMindex:skip is positive") << skip;
145
+ }
146
+ skip_ = skip;
147
+ Vec v;
148
+ initCf(v, begin, end);
149
+ const size_t dataSize = v.size();
150
+
151
+ Vec32 sa;
152
+ sa.resize(dataSize);
153
+ if (saisxx(&v[0], &sa[0], (int)dataSize, (int)charNum_) == -1) {
154
+ throw cybozu::Exception("FMindexT:init:saisxx");
155
+ }
156
+ Vec bwt;
157
+ initBwt(bwt, v, sa);
158
+ wm.init(bwt, getBitLen(charNum_));
159
+
160
+ #if 1
161
+ cybozu::BitVector bv;
162
+ bv.resize(dataSize);
163
+ for (size_t i = 0; i < dataSize; i++) {
164
+ if ((sa[i] % skip) == 0) {
165
+ bv.set(i);
166
+ alignedSa.push_back(sa[i]);
167
+ }
168
+ }
169
+ alignedPos.init(bv.getBlock(), bv.size());
170
+ #else
171
+ alignedPos.resize(dataSize);
172
+ for (size_t i = 0; i < dataSize; i++) {
173
+ if ((sa[i] % skip) == 0) {
174
+ alignedPos.set(i);
175
+ alignedSa.push_back(sa[i]);
176
+ }
177
+ }
178
+ alignedPos.ready();
179
+ #endif
180
+ }
181
+
182
+ /*
183
+ get range of bwt for key
184
+ */
185
+ template<class Int, class Key>
186
+ bool getRange(Int* pbegin, Int* pend, const Key& _key) const
187
+ {
188
+ if (_key.empty()) return false;
189
+ const size_t keySize = _key.size();
190
+ const typename Key::value_type *key;
191
+ Key cvtKey;
192
+ if (isRawData) {
193
+ key = &_key[0];
194
+ } else {
195
+ cvtKey.resize(keySize);
196
+ for (size_t i = 0; i < keySize; i++) {
197
+ if (freq.getFrequency(_key[i]) == 0) return false;
198
+ cvtKey[i] = typename Key::value_type(freq.getIndex(_key[i]) + 1);
199
+ }
200
+ key = &cvtKey[0];
201
+ }
202
+ size_t i = keySize - 1;
203
+ size_t begin = 0;
204
+ size_t end = wm.size();
205
+ while (begin < end) {
206
+ const T c = key[i];
207
+ const uint32_t cfc = cf[c];
208
+ begin = cfc + wm.rank(c, begin);
209
+ end = cfc + wm.rank(c, end);
210
+ if (i == 0) break;
211
+ i--;
212
+ }
213
+
214
+ if (begin < end) {
215
+ *pbegin = Int(begin);
216
+ *pend = Int(end);
217
+ return true;
218
+ }
219
+ return false;
220
+ }
221
+ template<class Int>
222
+ bool getRange(Int* pbegin, Int* pend, const char *key) const
223
+ {
224
+ return getRange(pbegin, pend, std::string(key));
225
+ }
226
+ size_t convertPosition(size_t bwtPos) const
227
+ {
228
+ size_t t = 0;
229
+ while (!alignedPos.get(bwtPos)) {
230
+ T c;
231
+ bwtPos = wm.get(&c, bwtPos);
232
+ bwtPos += cf[c];
233
+ t++;
234
+ }
235
+ return t + alignedSa[alignedPos.rank1(bwtPos)];
236
+ }
237
+ /*
238
+ get previous string at pos
239
+ @note assume T is vector or std::string
240
+ */
241
+ template<class Str>
242
+ void getPrevString(Str& str, size_t bwtPos, size_t len) const
243
+ {
244
+ str.resize(len);
245
+ T c;
246
+ while (len > 0) {
247
+ bwtPos = wm.get(&c, bwtPos);
248
+ bwtPos += cf[c];
249
+ if (c == 0) {
250
+ str.erase(str.begin(), str.begin() + len);
251
+ return;
252
+ }
253
+ len--;
254
+ str[len] = isRawData ? c : freq.getElement(c - 1);
255
+ }
256
+ }
257
+
258
+ template<class OutputStream>
259
+ void save(OutputStream& os) const
260
+ {
261
+ cybozu::save(os, skip_);
262
+ cybozu::savePodVec(os, cf);
263
+ wm.save(os);
264
+ cybozu::savePodVec(os, alignedSa);
265
+ alignedPos.save(os);
266
+ if (!isRawData) freq.save(os);
267
+ }
268
+ template<class InputStream>
269
+ void load(InputStream& is)
270
+ {
271
+ cybozu::load(skip_, is);
272
+ cybozu::loadPodVec(cf, is);
273
+ wm.load(is);
274
+ cybozu::loadPodVec(alignedSa, is);
275
+ alignedPos.load(is);
276
+ if (isRawData) {
277
+ charNum_ = size_t(1) << (sizeof(T) * 8);
278
+ } else {
279
+ freq.load(is);
280
+ charNum_ = freq.size();
281
+ }
282
+ }
283
+ };
284
+
285
+ typedef FMindexT<uint8_t> FMindex;
286
+
287
+ } // cybozu
288
+
289
+ #ifdef _MSC_VER
290
+ #pragma warning(pop)
291
+ #endif
@@ -0,0 +1,93 @@
1
+ #pragma once
2
+ /**
3
+ @file
4
+ @brief format string
5
+ @author MITSUNARI Shigeo(@herumi)
6
+ */
7
#include <string>
#include <limits.h>
#include <locale.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <cybozu/exception.hpp>
12
+
13
// -Wformat-nonliteral is silenced for this header on clang and on GCC 4.4 or later.
// The version is compared as a (major, minor) pair; testing the minor number alone
// would wrongly exclude releases such as GCC 5.0 - 5.3.
#if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4)))
	#define CYBOZU_FORMAT_DISABLE_WARNING
	#pragma GCC diagnostic push
	#pragma GCC diagnostic ignored "-Wformat-nonliteral"
#endif
18
+
19
+ namespace cybozu {
20
+
21
+ inline void vformat(std::string& str, const char *format, va_list args)
22
+ {
23
+ #ifdef _MSC_VER
24
+ _locale_t curLoc = _get_current_locale();
25
+ int size = _vscprintf_l(format, curLoc, args);
26
+ if (size < 0 || size >= INT_MAX) throw cybozu::Exception("vformat:_vscprintf_l");
27
+
28
+ str.resize(size + 1);
29
+
30
+ int ret = _vsprintf_s_l(&str[0], size + 1, format, curLoc, args);
31
+ if (ret < 0) throw cybozu::Exception("vformat:_vsprintf_s_l");
32
+ str.resize(size);
33
+ #else
34
+ #if 1
35
+ char *p;
36
+ int ret = vasprintf(&p, format, args);
37
+ if (ret < 0) throw cybozu::Exception("vformat:vasnprintf");
38
+ try {
39
+ str.assign(p, ret);
40
+ free(p);
41
+ } catch (...) {
42
+ free(p);
43
+ throw std::bad_alloc();
44
+ }
45
+ #else
46
+ // slow
47
+ va_list keep;
48
+ va_copy(keep, args);
49
+ int len = vsnprintf(0, 0, format, args); // len excludes the null byte
50
+ if (len < 0) throw cybozu::Exception("vformat:vasnprintf err1");
51
+ str.resize(len + 1);
52
+ len = vsnprintf(&str[0], str.size(), format, keep); // len incluedes the null byte
53
+ if (len < 0) throw cybozu::Exception("vformat:vasnprintf err2");
54
+ str.resize(len);
55
+ #endif
56
+ #endif
57
+ }
58
+
59
+ #ifdef _MSC_VER
60
+ #define CYBOZU_FORMAT_PRINTF _Printf_format_string_
61
+ #else
62
+ #define CYBOZU_FORMAT_PRINTF
63
+ #endif
64
+
65
+ #ifdef __GNUC__
66
+ __attribute__((format(printf, 2, 3)))
67
+ #endif
68
+ inline void format(std::string& str, CYBOZU_FORMAT_PRINTF const char *format, ...)
69
+ {
70
+ va_list args;
71
+ va_start(args, format);
72
+ cybozu::vformat(str, format, args);
73
+ va_end(args);
74
+ }
75
+
76
+ #ifdef __GNUC__
77
+ __attribute__((format(printf, 1, 2)))
78
+ #endif
79
+ inline std::string format(CYBOZU_FORMAT_PRINTF const char *format, ...)
80
+ {
81
+ std::string str;
82
+ va_list args;
83
+ va_start(args, format);
84
+ cybozu::vformat(str, format, args);
85
+ va_end(args);
86
+ return str;
87
+ }
88
+
89
+ } // cybozu
90
+
91
// restore the diagnostic state saved at the top of this header, so that the
// -Wformat-nonliteral suppression does not leak into files that include it
#ifdef CYBOZU_FORMAT_DISABLE_WARNING
	#pragma GCC diagnostic pop
#endif
@@ -0,0 +1,264 @@
1
+ #pragma once
2
+ /**
3
+ @file
4
+ @brief frequency of elements in a sequence
5
+ @author MITSUNARI Shigeo(@herumi)
6
+ @license modified new BSD license
7
+ http://opensource.org/licenses/BSD-3-Clause
8
+ */
9
+ #include <assert.h>
10
+ #include <vector>
11
+ #include <algorithm>
12
+ #include <functional>
13
+ #include <iostream>
14
+ #include <cybozu/exception.hpp>
15
+ #include <cybozu/unordered_map.hpp>
16
+ #include <cybozu/serializer.hpp>
17
+
18
+ namespace cybozu {
19
+
20
+ namespace freq_local {
21
+
22
+ template<class Element, class Int = size_t>
23
+ class FrequencyVec {
24
+ static const size_t N = size_t(1) << (sizeof(Element) * 8);
25
+ size_t size_;
26
+ Int freqTbl_[N];
27
+ uint8_t char2idx_[N];
28
+ uint8_t idx2char_[N];
29
+ struct Greater {
30
+ const Int *p_;
31
+ explicit Greater(const Int *p) : p_(p) {}
32
+ bool operator()(uint8_t lhs, uint8_t rhs) const
33
+ {
34
+ Int a = p_[lhs];
35
+ Int b = p_[rhs];
36
+ if (a > b) return true;
37
+ if (a < b) return false;
38
+ return a > b;
39
+ }
40
+ };
41
+ public:
42
+ typedef Element value_type;
43
+ typedef Int size_type;
44
+
45
+ FrequencyVec() { clear(); }
46
+ template<class Iter>
47
+ FrequencyVec(Iter begin, Iter end)
48
+ {
49
+ clear();
50
+ init(begin, end);
51
+ }
52
+ void clear()
53
+ {
54
+ size_ = 0;
55
+ memset(freqTbl_, 0, sizeof(freqTbl_));
56
+ }
57
+ template<class Iter>
58
+ void init(Iter begin, Iter end)
59
+ {
60
+ while (begin != end) {
61
+ append(*begin);
62
+ ++begin;
63
+ }
64
+ ready();
65
+ }
66
+ void append(const Element e)
67
+ {
68
+ freqTbl_[uint8_t(e)]++;
69
+ }
70
+ void ready()
71
+ {
72
+ for (size_t i = 0; i < N; i++) idx2char_[i] = uint8_t(i);
73
+ Greater greater(freqTbl_);
74
+ std::sort(idx2char_, idx2char_ + N, greater);
75
+ size_ = 0;
76
+ for (size_t i = 0; i < N; i++) {
77
+ uint8_t c = idx2char_[i];
78
+ char2idx_[c] = (uint8_t)i;
79
+ if (freqTbl_[c]) size_++;
80
+ }
81
+ }
82
+ /*
83
+ element -> freq
84
+ */
85
+ Int getFrequency(Element e) const { return freqTbl_[uint8_t(e)]; }
86
+ /*
87
+ element -> idx
88
+ */
89
+ Int getIndex(Element e) const { return char2idx_[uint8_t(e)]; }
90
+ /*
91
+ idx -> element
92
+ */
93
+ Element getElement(size_t idx) const
94
+ {
95
+ // if (idx >= N) throw cybozu::Exception("Frequency:getElement:bad idx") << idx;
96
+ assert(idx < N);
97
+ return Element(idx2char_[idx]);
98
+ }
99
+ size_t size() const { return size_; }
100
+ template<class InputStream>
101
+ void load(InputStream& is)
102
+ {
103
+ cybozu::load(size_, is);
104
+ cybozu::loadRange(freqTbl_, N, is);
105
+ cybozu::loadRange(char2idx_, N, is);
106
+ cybozu::loadRange(idx2char_, N, is);
107
+ }
108
+ void save(std::ostream& os) const
109
+ {
110
+ cybozu::save(os, size_);
111
+ cybozu::saveRange(os, freqTbl_, N);
112
+ cybozu::saveRange(os, char2idx_, N);
113
+ cybozu::saveRange(os, idx2char_, N);
114
+ }
115
+ void put() const
116
+ {
117
+ for (size_t i = 0; i < size_; i++) {
118
+ uint8_t c = idx2char_[i];
119
+ printf("%d %d %d\n", (int)i, c, freqTbl_[c]);
120
+ }
121
+ }
122
+ };
123
+
124
+ } // cybozu::freq_local
125
+
126
+ /*
127
+ count Element
128
+ Element : type of element
129
+ Int : type of counter
130
+ */
131
+ template<class Element, class Int = size_t>
132
+ class Frequency {
133
+ struct FreqIdx {
134
+ Int freq;
135
+ mutable Int idx;
136
+ template<class InputStream>
137
+ void load(InputStream& is)
138
+ {
139
+ cybozu::load(freq, is);
140
+ cybozu::load(idx, is);
141
+ }
142
+ template<class OutputStream>
143
+ void save(OutputStream& os) const
144
+ {
145
+ cybozu::save(os, freq);
146
+ cybozu::save(os, idx);
147
+ }
148
+ };
149
+ typedef CYBOZU_NAMESPACE_STD::unordered_map<Element, FreqIdx> Map;
150
+ typedef Element value_type;
151
+ typedef Int size_type;
152
+ typedef std::vector<typename Map::const_iterator> Idx2Ref;
153
+ static inline bool greater(typename Map::const_iterator i, typename Map::const_iterator j)
154
+ {
155
+ const Int a = i->second.freq;
156
+ const Int b = j->second.freq;
157
+ if (a > b) return true;
158
+ if (a < b) return false;
159
+ return i->first > j->first;
160
+ }
161
+ Map m_;
162
+ Idx2Ref idx2ref_;
163
+ void initIdx2Ref()
164
+ {
165
+ idx2ref_.resize(m_.size());
166
+ size_t pos = 0;
167
+ for (typename Map::const_iterator i = m_.begin(), ie = m_.end(); i != ie; ++i) {
168
+ idx2ref_[pos++] = i;
169
+ }
170
+ std::sort(idx2ref_.begin(), idx2ref_.end(), greater);
171
+ }
172
+ public:
173
+ Frequency(){ clear(); }
174
+ template<class Iter>
175
+ Frequency(Iter begin, Iter end)
176
+ {
177
+ clear();
178
+ init(begin, end);
179
+ }
180
+ void clear()
181
+ {
182
+ m_.clear();
183
+ idx2ref_.clear();
184
+ }
185
+ template<class Iter>
186
+ void init(Iter begin, Iter end)
187
+ {
188
+ while (begin != end) {
189
+ append(*begin);
190
+ ++begin;
191
+ }
192
+ ready();
193
+ }
194
+ void append(const Element& e)
195
+ {
196
+ m_[e].freq++;
197
+ }
198
+ void ready()
199
+ {
200
+ initIdx2Ref();
201
+ for (size_t i = 0, ie = idx2ref_.size(); i < ie; i++) {
202
+ idx2ref_[i]->second.idx = (Int)i;
203
+ }
204
+ }
205
+ /*
206
+ element -> freq
207
+ */
208
+ Int getFrequency(const Element& e) const
209
+ {
210
+ typename Map::const_iterator i = m_.find(e);
211
+ return (i != m_.end()) ? i->second.freq : 0;
212
+ }
213
+ /*
214
+ element -> idx
215
+ */
216
+ Int getIndex(const Element& e) const
217
+ {
218
+ typename Map::const_iterator i = m_.find(e);
219
+ if (i == m_.end()) throw cybozu::Exception("Frequency:getIndex:not found") << e;
220
+ return i->second.idx;
221
+ }
222
+ /*
223
+ idx -> element
224
+ */
225
+ const Element& getElement(size_t idx) const
226
+ {
227
+ if (idx >= idx2ref_.size()) throw cybozu::Exception("Frequency:getElement:bad idx") << idx;
228
+ return idx2ref_[idx]->first;
229
+ }
230
+ size_t size() const { return idx2ref_.size(); }
231
+ template<class InputStream>
232
+ void load(InputStream& is)
233
+ {
234
+ cybozu::load(m_, is);
235
+ initIdx2Ref();
236
+ }
237
+ template<class OutputStream>
238
+ void save(OutputStream& os) const
239
+ {
240
+ cybozu::save(os, m_);
241
+ }
242
+ void put() const
243
+ {
244
+ for (size_t i = 0, n = idx2ref_.size(); i < n; i++) {
245
+ typename Map::const_iterator j = idx2ref_[i];
246
+ std::cout << i << ' ' << j->first << ' ' << j->second.freq << std::endl;
247
+ }
248
+ }
249
+ };
250
+
251
+ template<class Int>
252
+ struct Frequency<uint8_t, Int> : freq_local::FrequencyVec<uint8_t, Int> {
253
+ Frequency() {}
254
+ template<class Iterator>
255
+ Frequency(Iterator begin, Iterator end) : freq_local::FrequencyVec<uint8_t, Int>(begin, end) {}
256
+ };
257
+ template<class Int>
258
+ struct Frequency<char, Int> : freq_local::FrequencyVec<char, Int> {
259
+ Frequency() {}
260
+ template<class Iterator>
261
+ Frequency(Iterator begin, Iterator end) : freq_local::FrequencyVec<char, Int>(begin, end) {}
262
+ };
263
+
264
+ } // cybozu
@@ -0,0 +1,67 @@
1
+ #pragma once
2
+ #include <cybozu/inttype.hpp>
3
+
4
+ namespace cybozu {
5
+
6
/*
	32-bit hash of [begin, end) (xor each element, then multiply by 16777619)
	@param v initial state; 0 selects the default start value 2166136261
	@return v (or the default start value) if the range is empty
*/
template<class Iter>
uint32_t hash32(Iter begin, Iter end, uint32_t v = 0)
{
	const uint32_t defaultStart = 2166136261U;
	const uint32_t multiplier = 16777619;
	uint32_t h = (v != 0) ? v : defaultStart;
	for (; begin != end; ++begin) {
		h ^= *begin;
		h *= multiplier;
	}
	return h;
}
16
/*
	64-bit hash of [begin, end) (xor each element, then multiply by 1099511628211);
	the upper 32 bits are finally folded into the lower 32 bits
	@param v initial state; 0 selects the default start value 14695981039346656037
*/
template<class Iter>
uint64_t hash64(Iter begin, Iter end, uint64_t v = 0)
{
	const uint64_t defaultStart = 14695981039346656037ULL;
	const uint64_t multiplier = 1099511628211ULL;
	uint64_t h = (v != 0) ? v : defaultStart;
	for (; begin != end; ++begin) {
		h ^= *begin;
		h *= multiplier;
	}
	return h ^ (h >> 32);
}
27
/*
	32-bit hash of the n elements starting at x
	(forwards to the iterator version of hash32)
*/
template<class T>
uint32_t hash32(const T *x, size_t n, uint32_t v = 0)
{
	const T *const last = x + n;
	return hash32(x, last, v);
}
32
/*
	64-bit hash of the n elements starting at x
	(forwards to the iterator version of hash64)
*/
template<class T>
uint64_t hash64(const T *x, size_t n, uint64_t v = 0)
{
	const T *const last = x + n;
	return hash64(x, last, v);
}
37
+
38
+ } // cybozu
39
+
40
+ namespace boost {
41
+
42
+ template<class T>
43
+ struct hash;
44
+
45
+ } // boost
46
+
47
+ #if CYBOZU_CPP_VERSION >= CYBOZU_CPP_VERSION_CPP11
48
+ #include <functional>
49
+ #else
50
+
51
+ namespace std { CYBOZU_NAMESPACE_TR1_BEGIN
52
+
53
+ #ifdef _MSC_VER
54
+ #pragma warning(push)
55
+ #pragma warning(disable : 4099) // mismatch between class and struct
56
+ #endif
57
+ #if !(defined(__APPLE__) && defined(__clang__))
58
+ template<class T>
59
+ struct hash;
60
+ #endif
61
+ #ifdef _MSC_VER
62
+ #pragma warning(pop)
63
+ #endif
64
+
65
+ CYBOZU_NAMESPACE_TR1_END } // std
66
+
67
+ #endif