ooxml_crypt 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (264) hide show
  1. checksums.yaml +7 -0
  2. data/Gemfile +4 -0
  3. data/LICENSE.txt +21 -0
  4. data/README.md +58 -0
  5. data/Rakefile +12 -0
  6. data/bin/console +15 -0
  7. data/bin/setup +8 -0
  8. data/ext/ooxml_crypt/extconf.rb +18 -0
  9. data/ext/ooxml_crypt/ooxml_crypt.c +27 -0
  10. data/ext/ooxml_crypt/ooxml_crypt.h +7 -0
  11. data/lib/ooxml_crypt/version.rb +5 -0
  12. data/lib/ooxml_crypt.rb +75 -0
  13. data/vendor/cybozulib/.github/workflows/main.yml +12 -0
  14. data/vendor/cybozulib/.gitignore +5 -0
  15. data/vendor/cybozulib/CMakeLists.txt +6 -0
  16. data/vendor/cybozulib/COPYRIGHT +27 -0
  17. data/vendor/cybozulib/Makefile +26 -0
  18. data/vendor/cybozulib/bin/libeay32.dll +0 -0
  19. data/vendor/cybozulib/bin/libmecab.dll +0 -0
  20. data/vendor/cybozulib/bin/ssleay32.dll +0 -0
  21. data/vendor/cybozulib/common.mk +116 -0
  22. data/vendor/cybozulib/common.props +25 -0
  23. data/vendor/cybozulib/cybozulib.sln +286 -0
  24. data/vendor/cybozulib/debug.props +14 -0
  25. data/vendor/cybozulib/include/cybozu/array.hpp +197 -0
  26. data/vendor/cybozulib/include/cybozu/atoi.hpp +238 -0
  27. data/vendor/cybozulib/include/cybozu/atomic.hpp +146 -0
  28. data/vendor/cybozulib/include/cybozu/base64.hpp +210 -0
  29. data/vendor/cybozulib/include/cybozu/benchmark.hpp +212 -0
  30. data/vendor/cybozulib/include/cybozu/bfd.hpp +105 -0
  31. data/vendor/cybozulib/include/cybozu/bit_operation.hpp +139 -0
  32. data/vendor/cybozulib/include/cybozu/bitvector.hpp +358 -0
  33. data/vendor/cybozulib/include/cybozu/condition_variable.hpp +113 -0
  34. data/vendor/cybozulib/include/cybozu/condition_variable_cs.hpp +74 -0
  35. data/vendor/cybozulib/include/cybozu/config.hpp +392 -0
  36. data/vendor/cybozulib/include/cybozu/critical_section.hpp +60 -0
  37. data/vendor/cybozulib/include/cybozu/crypto.hpp +321 -0
  38. data/vendor/cybozulib/include/cybozu/csucvector.hpp +624 -0
  39. data/vendor/cybozulib/include/cybozu/csv.hpp +294 -0
  40. data/vendor/cybozulib/include/cybozu/data_type.hpp +27 -0
  41. data/vendor/cybozulib/include/cybozu/endian.hpp +224 -0
  42. data/vendor/cybozulib/include/cybozu/env.hpp +63 -0
  43. data/vendor/cybozulib/include/cybozu/event.hpp +122 -0
  44. data/vendor/cybozulib/include/cybozu/exception.hpp +253 -0
  45. data/vendor/cybozulib/include/cybozu/file.hpp +626 -0
  46. data/vendor/cybozulib/include/cybozu/fmindex.hpp +291 -0
  47. data/vendor/cybozulib/include/cybozu/format.hpp +93 -0
  48. data/vendor/cybozulib/include/cybozu/frequency.hpp +264 -0
  49. data/vendor/cybozulib/include/cybozu/hash.hpp +67 -0
  50. data/vendor/cybozulib/include/cybozu/inttype.hpp +174 -0
  51. data/vendor/cybozulib/include/cybozu/itoa.hpp +336 -0
  52. data/vendor/cybozulib/include/cybozu/json.hpp +120 -0
  53. data/vendor/cybozulib/include/cybozu/line_stream.hpp +149 -0
  54. data/vendor/cybozulib/include/cybozu/link_libeay32.hpp +21 -0
  55. data/vendor/cybozulib/include/cybozu/link_mpir.hpp +18 -0
  56. data/vendor/cybozulib/include/cybozu/link_ssleay32.hpp +19 -0
  57. data/vendor/cybozulib/include/cybozu/log.hpp +237 -0
  58. data/vendor/cybozulib/include/cybozu/minixml.hpp +452 -0
  59. data/vendor/cybozulib/include/cybozu/mmap.hpp +143 -0
  60. data/vendor/cybozulib/include/cybozu/mutex.hpp +144 -0
  61. data/vendor/cybozulib/include/cybozu/nlp/mecab.hpp +96 -0
  62. data/vendor/cybozulib/include/cybozu/nlp/plsi.hpp +315 -0
  63. data/vendor/cybozulib/include/cybozu/nlp/random.hpp +74 -0
  64. data/vendor/cybozulib/include/cybozu/nlp/sparse.hpp +529 -0
  65. data/vendor/cybozulib/include/cybozu/nlp/svd.hpp +486 -0
  66. data/vendor/cybozulib/include/cybozu/nlp/tfidf.hpp +226 -0
  67. data/vendor/cybozulib/include/cybozu/nlp/top_score.hpp +75 -0
  68. data/vendor/cybozulib/include/cybozu/option.hpp +743 -0
  69. data/vendor/cybozulib/include/cybozu/parallel.hpp +88 -0
  70. data/vendor/cybozulib/include/cybozu/pcg.hpp +72 -0
  71. data/vendor/cybozulib/include/cybozu/process.hpp +324 -0
  72. data/vendor/cybozulib/include/cybozu/quit_signal_handler.hpp +66 -0
  73. data/vendor/cybozulib/include/cybozu/random_generator.hpp +144 -0
  74. data/vendor/cybozulib/include/cybozu/regex.hpp +463 -0
  75. data/vendor/cybozulib/include/cybozu/select8.hpp +279 -0
  76. data/vendor/cybozulib/include/cybozu/serializer.hpp +363 -0
  77. data/vendor/cybozulib/include/cybozu/sha1.hpp +209 -0
  78. data/vendor/cybozulib/include/cybozu/sha2.hpp +506 -0
  79. data/vendor/cybozulib/include/cybozu/siphash.hpp +105 -0
  80. data/vendor/cybozulib/include/cybozu/socket.hpp +785 -0
  81. data/vendor/cybozulib/include/cybozu/ssl.hpp +203 -0
  82. data/vendor/cybozulib/include/cybozu/stacktrace.hpp +291 -0
  83. data/vendor/cybozulib/include/cybozu/stream.hpp +269 -0
  84. data/vendor/cybozulib/include/cybozu/string.hpp +1746 -0
  85. data/vendor/cybozulib/include/cybozu/string_operation.hpp +365 -0
  86. data/vendor/cybozulib/include/cybozu/sucvector.hpp +378 -0
  87. data/vendor/cybozulib/include/cybozu/test.hpp +373 -0
  88. data/vendor/cybozulib/include/cybozu/thread.hpp +229 -0
  89. data/vendor/cybozulib/include/cybozu/time.hpp +281 -0
  90. data/vendor/cybozulib/include/cybozu/tls.hpp +115 -0
  91. data/vendor/cybozulib/include/cybozu/unordered_map.hpp +13 -0
  92. data/vendor/cybozulib/include/cybozu/unordered_set.hpp +13 -0
  93. data/vendor/cybozulib/include/cybozu/v128.hpp +376 -0
  94. data/vendor/cybozulib/include/cybozu/wavelet_matrix.hpp +345 -0
  95. data/vendor/cybozulib/include/cybozu/xorshift.hpp +189 -0
  96. data/vendor/cybozulib/include/cybozu/zlib.hpp +325 -0
  97. data/vendor/cybozulib/include/sais.hxx +364 -0
  98. data/vendor/cybozulib/misc/make_select8tbl.cpp +26 -0
  99. data/vendor/cybozulib/mk.bat +37 -0
  100. data/vendor/cybozulib/readme.md +29 -0
  101. data/vendor/cybozulib/release.props +12 -0
  102. data/vendor/cybozulib/sample/Makefile +30 -0
  103. data/vendor/cybozulib/sample/csucvector_smpl.cpp +42 -0
  104. data/vendor/cybozulib/sample/data/svd/org/test1.S +4 -0
  105. data/vendor/cybozulib/sample/data/svd/org/test1.U +4 -0
  106. data/vendor/cybozulib/sample/data/svd/org/test1.V +6 -0
  107. data/vendor/cybozulib/sample/data/svd/test1 +4 -0
  108. data/vendor/cybozulib/sample/data/svd/test2 +4 -0
  109. data/vendor/cybozulib/sample/desymbol.cpp +127 -0
  110. data/vendor/cybozulib/sample/exception_smpl.cpp +46 -0
  111. data/vendor/cybozulib/sample/fmindex_smpl.cpp +231 -0
  112. data/vendor/cybozulib/sample/log_smpl.cpp +19 -0
  113. data/vendor/cybozulib/sample/mecab_smpl.cpp +37 -0
  114. data/vendor/cybozulib/sample/option2_smpl.cpp +68 -0
  115. data/vendor/cybozulib/sample/option_smpl.cpp +42 -0
  116. data/vendor/cybozulib/sample/plsi_smpl.cpp +207 -0
  117. data/vendor/cybozulib/sample/proj/exception_smpl.vcproj +184 -0
  118. data/vendor/cybozulib/sample/proj/mecab_smpl.vcproj +184 -0
  119. data/vendor/cybozulib/sample/proj/ssl_smpl/ssl_smpl.vcxproj +85 -0
  120. data/vendor/cybozulib/sample/proj/ssl_smpl.vcproj +347 -0
  121. data/vendor/cybozulib/sample/proj/stacktrace_smpl/stacktrace_smpl.vcxproj +85 -0
  122. data/vendor/cybozulib/sample/proj/svd_smpl.vcproj +184 -0
  123. data/vendor/cybozulib/sample/quit_signal_handler.cpp +30 -0
  124. data/vendor/cybozulib/sample/serializer_smpl.cpp +196 -0
  125. data/vendor/cybozulib/sample/socket_smpl.cpp +82 -0
  126. data/vendor/cybozulib/sample/ssl_smpl.cpp +39 -0
  127. data/vendor/cybozulib/sample/stacktrace_smpl.cpp +52 -0
  128. data/vendor/cybozulib/sample/svd_bench_smpl.cpp +143 -0
  129. data/vendor/cybozulib/sample/svd_smpl.cpp +94 -0
  130. data/vendor/cybozulib/sample/wm_bench_smpl.cpp +182 -0
  131. data/vendor/cybozulib/sample/zlib_smpl.cpp +41 -0
  132. data/vendor/cybozulib/src/Makefile +8 -0
  133. data/vendor/cybozulib/src/base/Makefile +19 -0
  134. data/vendor/cybozulib/test/Makefile +12 -0
  135. data/vendor/cybozulib/test/base/Makefile +37 -0
  136. data/vendor/cybozulib/test/base/array_test.cpp +173 -0
  137. data/vendor/cybozulib/test/base/atoi_test.cpp +774 -0
  138. data/vendor/cybozulib/test/base/atomic_test.cpp +49 -0
  139. data/vendor/cybozulib/test/base/base64_test.cpp +113 -0
  140. data/vendor/cybozulib/test/base/bit_operation_test.cpp +134 -0
  141. data/vendor/cybozulib/test/base/bitvector_test.cpp +204 -0
  142. data/vendor/cybozulib/test/base/condition_variable_cs_test.cpp +92 -0
  143. data/vendor/cybozulib/test/base/condition_variable_test.cpp +88 -0
  144. data/vendor/cybozulib/test/base/config_test.cpp +236 -0
  145. data/vendor/cybozulib/test/base/crypto_test.cpp +122 -0
  146. data/vendor/cybozulib/test/base/csucvector_test.cpp +63 -0
  147. data/vendor/cybozulib/test/base/csv_test.cpp +182 -0
  148. data/vendor/cybozulib/test/base/data/a.xml +26 -0
  149. data/vendor/cybozulib/test/base/endian_test.cpp +56 -0
  150. data/vendor/cybozulib/test/base/env_test.cpp +22 -0
  151. data/vendor/cybozulib/test/base/event_test.cpp +41 -0
  152. data/vendor/cybozulib/test/base/file_test.cpp +233 -0
  153. data/vendor/cybozulib/test/base/fmindex_test.cpp +118 -0
  154. data/vendor/cybozulib/test/base/format_test.cpp +12 -0
  155. data/vendor/cybozulib/test/base/frequency_test.cpp +104 -0
  156. data/vendor/cybozulib/test/base/itoa_test.cpp +522 -0
  157. data/vendor/cybozulib/test/base/line_stream_test.cpp +208 -0
  158. data/vendor/cybozulib/test/base/mecab_test.cpp +41 -0
  159. data/vendor/cybozulib/test/base/minixml_test.cpp +103 -0
  160. data/vendor/cybozulib/test/base/mmap_test.cpp +15 -0
  161. data/vendor/cybozulib/test/base/option_test.cpp +487 -0
  162. data/vendor/cybozulib/test/base/parallel_test.cpp +48 -0
  163. data/vendor/cybozulib/test/base/proj/array_test/array_test.vcxproj +86 -0
  164. data/vendor/cybozulib/test/base/proj/atoi_test/atoi_test.vcxproj +86 -0
  165. data/vendor/cybozulib/test/base/proj/atomic_test/atomic_test.vcxproj +86 -0
  166. data/vendor/cybozulib/test/base/proj/base64_test/base64_test.vcxproj +86 -0
  167. data/vendor/cybozulib/test/base/proj/condition_variable_cs_test/condition_variable_cs_test.vcxproj +86 -0
  168. data/vendor/cybozulib/test/base/proj/condition_variable_test/condition_variable_test.vcxproj +86 -0
  169. data/vendor/cybozulib/test/base/proj/config_test/config_test.vcxproj +86 -0
  170. data/vendor/cybozulib/test/base/proj/csv_test/csv_test.vcxproj +86 -0
  171. data/vendor/cybozulib/test/base/proj/endian_test/endian_test.vcxproj +86 -0
  172. data/vendor/cybozulib/test/base/proj/env_test/env_test.vcxproj +86 -0
  173. data/vendor/cybozulib/test/base/proj/event_test/event_test.vcxproj +86 -0
  174. data/vendor/cybozulib/test/base/proj/file_test/file_test.vcxproj +86 -0
  175. data/vendor/cybozulib/test/base/proj/itoa_test/itoa_test.vcxproj +86 -0
  176. data/vendor/cybozulib/test/base/proj/mecab_test/mecab_test.vcxproj +88 -0
  177. data/vendor/cybozulib/test/base/proj/minixml_test/minixml_test.vcxproj +86 -0
  178. data/vendor/cybozulib/test/base/proj/mmap_test/mmap_test.vcxproj +86 -0
  179. data/vendor/cybozulib/test/base/proj/serializer_test/serializer_test.vcxproj +86 -0
  180. data/vendor/cybozulib/test/base/proj/sha1_test/sha1_test.vcxproj +86 -0
  181. data/vendor/cybozulib/test/base/proj/stream_test/stream_test.vcxproj +86 -0
  182. data/vendor/cybozulib/test/base/proj/string_operation_test/string_operation_test.vcxproj +86 -0
  183. data/vendor/cybozulib/test/base/proj/string_test/string_test.vcxproj +86 -0
  184. data/vendor/cybozulib/test/base/proj/thread_test/thread_test.vcxproj +86 -0
  185. data/vendor/cybozulib/test/base/proj/time_test/time_test.vcxproj +86 -0
  186. data/vendor/cybozulib/test/base/proj/tls_test/tls_test.vcxproj +86 -0
  187. data/vendor/cybozulib/test/base/proj/zlib_test/zlib_test.vcxproj +86 -0
  188. data/vendor/cybozulib/test/base/random_generator_test.cpp +28 -0
  189. data/vendor/cybozulib/test/base/regex_test.cpp +74 -0
  190. data/vendor/cybozulib/test/base/serializer_test.cpp +483 -0
  191. data/vendor/cybozulib/test/base/sha1_test.cpp +61 -0
  192. data/vendor/cybozulib/test/base/sha2_test.cpp +191 -0
  193. data/vendor/cybozulib/test/base/siphash_test.cpp +33 -0
  194. data/vendor/cybozulib/test/base/socket_test.cpp +76 -0
  195. data/vendor/cybozulib/test/base/stream_test.cpp +101 -0
  196. data/vendor/cybozulib/test/base/string_operation_test.cpp +340 -0
  197. data/vendor/cybozulib/test/base/string_test.cpp +1705 -0
  198. data/vendor/cybozulib/test/base/sucvector_test.cpp +312 -0
  199. data/vendor/cybozulib/test/base/thread_test.cpp +62 -0
  200. data/vendor/cybozulib/test/base/time_test.cpp +164 -0
  201. data/vendor/cybozulib/test/base/tls_test.cpp +50 -0
  202. data/vendor/cybozulib/test/base/wavelet_matrix_test.cpp +145 -0
  203. data/vendor/cybozulib/test/base/zlib_test.cpp +371 -0
  204. data/vendor/cybozulib/test/nlp/Makefile +27 -0
  205. data/vendor/cybozulib/test/nlp/proj/random_test.vcproj +184 -0
  206. data/vendor/cybozulib/test/nlp/proj/sparse_test.vcproj +184 -0
  207. data/vendor/cybozulib/test/nlp/proj/svd_test.vcproj +184 -0
  208. data/vendor/cybozulib/test/nlp/random_test.cpp +62 -0
  209. data/vendor/cybozulib/test/nlp/sparse_test.cpp +347 -0
  210. data/vendor/cybozulib/test/nlp/svd_test.cpp +234 -0
  211. data/vendor/cybozulib/test/nlp/top_score_test.cpp +40 -0
  212. data/vendor/cybozulib/tool/create_vcproj.py +186 -0
  213. data/vendor/cybozulib/tool/vcproj_tmpl.py +185 -0
  214. data/vendor/msoffice/COPYRIGHT +27 -0
  215. data/vendor/msoffice/Makefile +29 -0
  216. data/vendor/msoffice/bin/64/msoc.dll +0 -0
  217. data/vendor/msoffice/bin/64/msocsample.exe +0 -0
  218. data/vendor/msoffice/bin/64/msoffice-crypt.exe +0 -0
  219. data/vendor/msoffice/bin/msoc.dll +0 -0
  220. data/vendor/msoffice/bin/msocsample.exe +0 -0
  221. data/vendor/msoffice/bin/msoffice-crypt.exe +0 -0
  222. data/vendor/msoffice/common.mk +71 -0
  223. data/vendor/msoffice/common.props +26 -0
  224. data/vendor/msoffice/debug.props +14 -0
  225. data/vendor/msoffice/include/attack.hpp +211 -0
  226. data/vendor/msoffice/include/cfb.hpp +777 -0
  227. data/vendor/msoffice/include/crypto_util.hpp +450 -0
  228. data/vendor/msoffice/include/custom_sha1.hpp +342 -0
  229. data/vendor/msoffice/include/decode.hpp +240 -0
  230. data/vendor/msoffice/include/encode.hpp +221 -0
  231. data/vendor/msoffice/include/make_dataspace.hpp +316 -0
  232. data/vendor/msoffice/include/msoc.h +129 -0
  233. data/vendor/msoffice/include/resource.hpp +7 -0
  234. data/vendor/msoffice/include/standard_encryption.hpp +145 -0
  235. data/vendor/msoffice/include/uint32vec.hpp +179 -0
  236. data/vendor/msoffice/include/util.hpp +212 -0
  237. data/vendor/msoffice/lib/.emptydir +0 -0
  238. data/vendor/msoffice/misc/decrypt-xls.vbs +46 -0
  239. data/vendor/msoffice/mk.bat +1 -0
  240. data/vendor/msoffice/mkdll.bat +3 -0
  241. data/vendor/msoffice/msoc.def +13 -0
  242. data/vendor/msoffice/msocsample.py +178 -0
  243. data/vendor/msoffice/msoffice12.sln +31 -0
  244. data/vendor/msoffice/readme.md +110 -0
  245. data/vendor/msoffice/release.props +28 -0
  246. data/vendor/msoffice/src/Makefile +19 -0
  247. data/vendor/msoffice/src/attack.cpp +124 -0
  248. data/vendor/msoffice/src/cfb_test.cpp +77 -0
  249. data/vendor/msoffice/src/minisample.c +54 -0
  250. data/vendor/msoffice/src/msocdll.cpp +276 -0
  251. data/vendor/msoffice/src/msocsample.c +136 -0
  252. data/vendor/msoffice/src/msoffice-crypt.cpp +219 -0
  253. data/vendor/msoffice/src/proj/attack/attack.vcxproj +88 -0
  254. data/vendor/msoffice/src/proj/main/msoffice-crypt.vcxproj +88 -0
  255. data/vendor/msoffice/src/sha1.cpp +234 -0
  256. data/vendor/msoffice/test/Makefile +20 -0
  257. data/vendor/msoffice/test/cfb_test.cpp +74 -0
  258. data/vendor/msoffice/test/hash_test.cpp +59 -0
  259. data/vendor/msoffice/test/proj/cfb/cfb_test.vcxproj +90 -0
  260. data/vendor/msoffice/test/proj/hash/hash_test.vcxproj +90 -0
  261. data/vendor/msoffice/test/sampl.bat +8 -0
  262. data/vendor/msoffice/test_all.py +46 -0
  263. data/vendor/update +4 -0
  264. metadata +351 -0
@@ -0,0 +1,291 @@
1
+ #pragma once
2
+ /**
3
+ @file
4
+ @brief FM-index
5
+ @author MITSUNARI Shigeo(@herumi)
6
+ @license modified new BSD license
7
+ http://opensource.org/licenses/BSD-3-Clause
8
+ */
9
+ #include <map>
10
+ #include <vector>
11
+ #include <fstream>
12
+ #include <stdio.h>
13
+ #ifdef CYBOZU_FMINDEX_USE_CSUCVECTOR
14
+ #include <cybozu/csucvector.hpp>
15
+ #endif
16
+ #include <cybozu/wavelet_matrix.hpp>
17
+ #include <cybozu/bitvector.hpp>
18
+ #include <cybozu/frequency.hpp>
19
+
20
+ #ifdef _MSC_VER
21
+ #pragma warning(push)
22
+ #pragma warning(disable:4244)
23
+ #pragma warning(disable:4389)
24
+ #pragma warning(disable:4018)
25
+ #endif
26
+ #include "sais.hxx"
27
+ #ifdef _MSC_VER
28
+ #pragma warning(pop)
29
+ #endif
30
+
31
+ #ifdef _MSC_VER
32
+ #pragma warning(push)
33
+ #pragma warning(disable:4127) // constant condition
34
+ #endif
35
+
36
+ namespace cybozu {
37
+ /*
38
+ T : type of alphabet
39
+ isRawData : deal with input data as is
40
+ T must be uint8_t or uint16_t if isRawData
41
+ */
42
+ template<class T, bool isRawData = false>
43
+ class FMindexT {
44
+ public:
45
+ static const size_t maxCharNum = size_t(1) << (sizeof(T) * 8);
46
+ typedef std::vector<uint32_t> Vec32;
47
+ typedef std::vector<T> Vec;
48
+ #ifdef CYBOZU_FMINDEX_USE_CSUCVECTOR
49
+ typedef cybozu::CSucVector SucVector;
50
+ #else
51
+ typedef cybozu::SucVectorT<uint32_t, false> SucVector;
52
+ #endif
53
+ typedef cybozu::WaveletMatrixT<false, SucVector> WaveletMatrix;
54
+ Vec32 cf;
55
+ WaveletMatrix wm;
56
+ Vec32 alignedSa;
57
+ SucVector alignedPos;
58
+ cybozu::Frequency<T, uint32_t> freq;
59
+ int skip_;
60
+ size_t charNum_;
61
+
62
+ /*
63
+ setup freq, cf by [begin, end)
64
+ */
65
+ template<class Iter>
66
+ void initCf(Vec& v, Iter begin, Iter end)
67
+ {
68
+ const size_t size = std::distance(begin, end);
69
+ if (size >= (uint64_t(1) << 32) - 1) {
70
+ throw cybozu::Exception("FMindexT:initCf:too large dataSize") << size;
71
+ }
72
+ v.resize(size + 1); // add NUL at the end of data
73
+ if (isRawData) {
74
+ assert(sizeof(T) <= 16);
75
+ charNum_ = size_t(1) << (sizeof(T) * 8);
76
+ std::vector<uint32_t> charNumTbl(charNum_);
77
+ charNumTbl[0] = 1;
78
+ for (size_t i = 0; i < size; i++) {
79
+ T c = *begin++;
80
+ if (c <= 0) throw cybozu::Exception("FMindext:initCf:zero alphabet") << c;
81
+ v[i] = c;
82
+ charNumTbl[c]++;
83
+ }
84
+ cf.resize(charNum_);
85
+ uint32_t sum = 0;
86
+ for (size_t i = 0; i < charNum_; i++) {
87
+ cf[i] = sum;
88
+ sum += charNumTbl[i];
89
+ }
90
+ } else {
91
+ freq.init(begin, end);
92
+ charNum_ = freq.size() + 1; // +1 means last zero
93
+ if (charNum_ > maxCharNum) throw cybozu::Exception("FMindexT:initCf:too many alphabet");
94
+ for (size_t i = 0; i < size; i++) {
95
+ v[i] = static_cast<T>(freq.getIndex(*begin++) + 1);
96
+ }
97
+ cf.resize(charNum_);
98
+ cf[0] = 0;
99
+ uint32_t sum = 1;
100
+ for (size_t i = 1; i < charNum_; i++) {
101
+ cf[i] = sum;
102
+ sum += freq.getFrequency(freq.getElement(i - 1));
103
+ }
104
+ }
105
+ }
106
+ void initBwt(Vec& bwt, const Vec& s, const Vec32& sa) const
107
+ {
108
+ const size_t size = sa.size();
109
+ bwt.resize(size);
110
+ for (size_t i = 0; i < size; i++) {
111
+ if (sa[i] > 0) {
112
+ bwt[i] = s[sa[i] - 1];
113
+ } else {
114
+ bwt[i] = s[size - 1];
115
+ }
116
+ }
117
+ }
118
+ size_t getBitLen(size_t x) const
119
+ {
120
+ if (x == 0) return 1;
121
+ size_t ret = 0;
122
+ while (x > 0) {
123
+ x >>= 1;
124
+ ret++;
125
+ }
126
+ return ret;
127
+ }
128
+ public:
129
+ FMindexT()
130
+ : skip_(8)
131
+ , charNum_(0)
132
+ {
133
+ }
134
+
135
+ /*
136
+ [begin, end)
137
+ replace '\0' in [begin, end) with space
138
+ append '\0' at the end of [begin, end)
139
+ */
140
+ template<class Iter>
141
+ void init(Iter begin, Iter end, int skip = 8)
142
+ {
143
+ if (skip <= 0) {
144
+ throw cybozu::Exception("FMindexT:buildFMindex:skip is positive") << skip;
145
+ }
146
+ skip_ = skip;
147
+ Vec v;
148
+ initCf(v, begin, end);
149
+ const size_t dataSize = v.size();
150
+
151
+ Vec32 sa;
152
+ sa.resize(dataSize);
153
+ if (saisxx(&v[0], &sa[0], (int)dataSize, (int)charNum_) == -1) {
154
+ throw cybozu::Exception("FMindexT:init:saisxx");
155
+ }
156
+ Vec bwt;
157
+ initBwt(bwt, v, sa);
158
+ wm.init(bwt, getBitLen(charNum_));
159
+
160
+ #if 1
161
+ cybozu::BitVector bv;
162
+ bv.resize(dataSize);
163
+ for (size_t i = 0; i < dataSize; i++) {
164
+ if ((sa[i] % skip) == 0) {
165
+ bv.set(i);
166
+ alignedSa.push_back(sa[i]);
167
+ }
168
+ }
169
+ alignedPos.init(bv.getBlock(), bv.size());
170
+ #else
171
+ alignedPos.resize(dataSize);
172
+ for (size_t i = 0; i < dataSize; i++) {
173
+ if ((sa[i] % skip) == 0) {
174
+ alignedPos.set(i);
175
+ alignedSa.push_back(sa[i]);
176
+ }
177
+ }
178
+ alignedPos.ready();
179
+ #endif
180
+ }
181
+
182
+ /*
183
+ get range of bwt for key
184
+ */
185
+ template<class Int, class Key>
186
+ bool getRange(Int* pbegin, Int* pend, const Key& _key) const
187
+ {
188
+ if (_key.empty()) return false;
189
+ const size_t keySize = _key.size();
190
+ const typename Key::value_type *key;
191
+ Key cvtKey;
192
+ if (isRawData) {
193
+ key = &_key[0];
194
+ } else {
195
+ cvtKey.resize(keySize);
196
+ for (size_t i = 0; i < keySize; i++) {
197
+ if (freq.getFrequency(_key[i]) == 0) return false;
198
+ cvtKey[i] = typename Key::value_type(freq.getIndex(_key[i]) + 1);
199
+ }
200
+ key = &cvtKey[0];
201
+ }
202
+ size_t i = keySize - 1;
203
+ size_t begin = 0;
204
+ size_t end = wm.size();
205
+ while (begin < end) {
206
+ const T c = key[i];
207
+ const uint32_t cfc = cf[c];
208
+ begin = cfc + wm.rank(c, begin);
209
+ end = cfc + wm.rank(c, end);
210
+ if (i == 0) break;
211
+ i--;
212
+ }
213
+
214
+ if (begin < end) {
215
+ *pbegin = Int(begin);
216
+ *pend = Int(end);
217
+ return true;
218
+ }
219
+ return false;
220
+ }
221
+ template<class Int>
222
+ bool getRange(Int* pbegin, Int* pend, const char *key) const
223
+ {
224
+ return getRange(pbegin, pend, std::string(key));
225
+ }
226
+ size_t convertPosition(size_t bwtPos) const
227
+ {
228
+ size_t t = 0;
229
+ while (!alignedPos.get(bwtPos)) {
230
+ T c;
231
+ bwtPos = wm.get(&c, bwtPos);
232
+ bwtPos += cf[c];
233
+ t++;
234
+ }
235
+ return t + alignedSa[alignedPos.rank1(bwtPos)];
236
+ }
237
+ /*
238
+ get previous string at pos
239
+ @note assume T is vector or std::string
240
+ */
241
+ template<class Str>
242
+ void getPrevString(Str& str, size_t bwtPos, size_t len) const
243
+ {
244
+ str.resize(len);
245
+ T c;
246
+ while (len > 0) {
247
+ bwtPos = wm.get(&c, bwtPos);
248
+ bwtPos += cf[c];
249
+ if (c == 0) {
250
+ str.erase(str.begin(), str.begin() + len);
251
+ return;
252
+ }
253
+ len--;
254
+ str[len] = isRawData ? c : freq.getElement(c - 1);
255
+ }
256
+ }
257
+
258
+ template<class OutputStream>
259
+ void save(OutputStream& os) const
260
+ {
261
+ cybozu::save(os, skip_);
262
+ cybozu::savePodVec(os, cf);
263
+ wm.save(os);
264
+ cybozu::savePodVec(os, alignedSa);
265
+ alignedPos.save(os);
266
+ if (!isRawData) freq.save(os);
267
+ }
268
+ template<class InputStream>
269
+ void load(InputStream& is)
270
+ {
271
+ cybozu::load(skip_, is);
272
+ cybozu::loadPodVec(cf, is);
273
+ wm.load(is);
274
+ cybozu::loadPodVec(alignedSa, is);
275
+ alignedPos.load(is);
276
+ if (isRawData) {
277
+ charNum_ = size_t(1) << (sizeof(T) * 8);
278
+ } else {
279
+ freq.load(is);
280
+ charNum_ = freq.size();
281
+ }
282
+ }
283
+ };
284
+
285
+ typedef FMindexT<uint8_t> FMindex;
286
+
287
+ } // cybozu
288
+
289
+ #ifdef _MSC_VER
290
+ #pragma warning(pop)
291
+ #endif
@@ -0,0 +1,93 @@
1
+ #pragma once
2
+ /**
3
+ @file
4
+ @brief format string
5
+ @author MITSUNARI Shigeo(@herumi)
6
+ */
7
+ #include <string>
8
+ #include <stdio.h>
9
+ #include <stdarg.h>
10
+ #include <stdlib.h>
11
+ #include <cybozu/exception.hpp>
12
+
13
// Silence -Wformat-nonliteral for this header on clang and on GCC 4.4 or later.
// The version must be compared as a (major, minor) pair: testing the minor
// number alone would exclude releases such as GCC 5.1 or 13.2.
#if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4)))
	#define CYBOZU_FORMAT_DISABLE_WARNING
	#pragma GCC diagnostic push
	#pragma GCC diagnostic ignored "-Wformat-nonliteral"
#endif
18
+
19
+ namespace cybozu {
20
+
21
+ inline void vformat(std::string& str, const char *format, va_list args)
22
+ {
23
+ #ifdef _MSC_VER
24
+ _locale_t curLoc = _get_current_locale();
25
+ int size = _vscprintf_l(format, curLoc, args);
26
+ if (size < 0 || size >= INT_MAX) throw cybozu::Exception("vformat:_vscprintf_l");
27
+
28
+ str.resize(size + 1);
29
+
30
+ int ret = _vsprintf_s_l(&str[0], size + 1, format, curLoc, args);
31
+ if (ret < 0) throw cybozu::Exception("vformat:_vsprintf_s_l");
32
+ str.resize(size);
33
+ #else
34
+ #if 1
35
+ char *p;
36
+ int ret = vasprintf(&p, format, args);
37
+ if (ret < 0) throw cybozu::Exception("vformat:vasnprintf");
38
+ try {
39
+ str.assign(p, ret);
40
+ free(p);
41
+ } catch (...) {
42
+ free(p);
43
+ throw std::bad_alloc();
44
+ }
45
+ #else
46
+ // slow
47
+ va_list keep;
48
+ va_copy(keep, args);
49
+ int len = vsnprintf(0, 0, format, args); // len excludes the null byte
50
+ if (len < 0) throw cybozu::Exception("vformat:vasnprintf err1");
51
+ str.resize(len + 1);
52
+ len = vsnprintf(&str[0], str.size(), format, keep); // len incluedes the null byte
53
+ if (len < 0) throw cybozu::Exception("vformat:vasnprintf err2");
54
+ str.resize(len);
55
+ #endif
56
+ #endif
57
+ }
58
+
59
+ #ifdef _MSC_VER
60
+ #define CYBOZU_FORMAT_PRINTF _Printf_format_string_
61
+ #else
62
+ #define CYBOZU_FORMAT_PRINTF
63
+ #endif
64
+
65
+ #ifdef __GNUC__
66
+ __attribute__((format(printf, 2, 3)))
67
+ #endif
68
+ inline void format(std::string& str, CYBOZU_FORMAT_PRINTF const char *format, ...)
69
+ {
70
+ va_list args;
71
+ va_start(args, format);
72
+ cybozu::vformat(str, format, args);
73
+ va_end(args);
74
+ }
75
+
76
+ #ifdef __GNUC__
77
+ __attribute__((format(printf, 1, 2)))
78
+ #endif
79
+ inline std::string format(CYBOZU_FORMAT_PRINTF const char *format, ...)
80
+ {
81
+ std::string str;
82
+ va_list args;
83
+ va_start(args, format);
84
+ cybozu::vformat(str, format, args);
85
+ va_end(args);
86
+ return str;
87
+ }
88
+
89
+ } // cybozu
90
+
91
// restore the diagnostic state saved at the top of this header;
// without the pop, -Wformat-nonliteral stays disabled in every file including it
#ifdef CYBOZU_FORMAT_DISABLE_WARNING
	#pragma GCC diagnostic pop
#endif
@@ -0,0 +1,264 @@
1
+ #pragma once
2
+ /**
3
+ @file
4
+ @brief frequency of elements in a sequence
5
+ @author MITSUNARI Shigeo(@herumi)
6
+ @license modified new BSD license
7
+ http://opensource.org/licenses/BSD-3-Clause
8
+ */
9
+ #include <assert.h>
10
+ #include <vector>
11
+ #include <algorithm>
12
+ #include <functional>
13
+ #include <iostream>
14
+ #include <cybozu/exception.hpp>
15
+ #include <cybozu/unordered_map.hpp>
16
+ #include <cybozu/serializer.hpp>
17
+
18
+ namespace cybozu {
19
+
20
+ namespace freq_local {
21
+
22
+ template<class Element, class Int = size_t>
23
+ class FrequencyVec {
24
+ static const size_t N = size_t(1) << (sizeof(Element) * 8);
25
+ size_t size_;
26
+ Int freqTbl_[N];
27
+ uint8_t char2idx_[N];
28
+ uint8_t idx2char_[N];
29
+ struct Greater {
30
+ const Int *p_;
31
+ explicit Greater(const Int *p) : p_(p) {}
32
+ bool operator()(uint8_t lhs, uint8_t rhs) const
33
+ {
34
+ Int a = p_[lhs];
35
+ Int b = p_[rhs];
36
+ if (a > b) return true;
37
+ if (a < b) return false;
38
+ return a > b;
39
+ }
40
+ };
41
+ public:
42
+ typedef Element value_type;
43
+ typedef Int size_type;
44
+
45
+ FrequencyVec() { clear(); }
46
+ template<class Iter>
47
+ FrequencyVec(Iter begin, Iter end)
48
+ {
49
+ clear();
50
+ init(begin, end);
51
+ }
52
+ void clear()
53
+ {
54
+ size_ = 0;
55
+ memset(freqTbl_, 0, sizeof(freqTbl_));
56
+ }
57
+ template<class Iter>
58
+ void init(Iter begin, Iter end)
59
+ {
60
+ while (begin != end) {
61
+ append(*begin);
62
+ ++begin;
63
+ }
64
+ ready();
65
+ }
66
+ void append(const Element e)
67
+ {
68
+ freqTbl_[uint8_t(e)]++;
69
+ }
70
+ void ready()
71
+ {
72
+ for (size_t i = 0; i < N; i++) idx2char_[i] = uint8_t(i);
73
+ Greater greater(freqTbl_);
74
+ std::sort(idx2char_, idx2char_ + N, greater);
75
+ size_ = 0;
76
+ for (size_t i = 0; i < N; i++) {
77
+ uint8_t c = idx2char_[i];
78
+ char2idx_[c] = (uint8_t)i;
79
+ if (freqTbl_[c]) size_++;
80
+ }
81
+ }
82
+ /*
83
+ element -> freq
84
+ */
85
+ Int getFrequency(Element e) const { return freqTbl_[uint8_t(e)]; }
86
+ /*
87
+ element -> idx
88
+ */
89
+ Int getIndex(Element e) const { return char2idx_[uint8_t(e)]; }
90
+ /*
91
+ idx -> element
92
+ */
93
+ Element getElement(size_t idx) const
94
+ {
95
+ // if (idx >= N) throw cybozu::Exception("Frequency:getElement:bad idx") << idx;
96
+ assert(idx < N);
97
+ return Element(idx2char_[idx]);
98
+ }
99
+ size_t size() const { return size_; }
100
+ template<class InputStream>
101
+ void load(InputStream& is)
102
+ {
103
+ cybozu::load(size_, is);
104
+ cybozu::loadRange(freqTbl_, N, is);
105
+ cybozu::loadRange(char2idx_, N, is);
106
+ cybozu::loadRange(idx2char_, N, is);
107
+ }
108
+ void save(std::ostream& os) const
109
+ {
110
+ cybozu::save(os, size_);
111
+ cybozu::saveRange(os, freqTbl_, N);
112
+ cybozu::saveRange(os, char2idx_, N);
113
+ cybozu::saveRange(os, idx2char_, N);
114
+ }
115
+ void put() const
116
+ {
117
+ for (size_t i = 0; i < size_; i++) {
118
+ uint8_t c = idx2char_[i];
119
+ printf("%d %d %d\n", (int)i, c, freqTbl_[c]);
120
+ }
121
+ }
122
+ };
123
+
124
+ } // cybozu::freq_local
125
+
126
+ /*
127
+ count Element
128
+ Element : type of element
129
+ Int : type of counter
130
+ */
131
+ template<class Element, class Int = size_t>
132
+ class Frequency {
133
+ struct FreqIdx {
134
+ Int freq;
135
+ mutable Int idx;
136
+ template<class InputStream>
137
+ void load(InputStream& is)
138
+ {
139
+ cybozu::load(freq, is);
140
+ cybozu::load(idx, is);
141
+ }
142
+ template<class OutputStream>
143
+ void save(OutputStream& os) const
144
+ {
145
+ cybozu::save(os, freq);
146
+ cybozu::save(os, idx);
147
+ }
148
+ };
149
+ typedef CYBOZU_NAMESPACE_STD::unordered_map<Element, FreqIdx> Map;
150
+ typedef Element value_type;
151
+ typedef Int size_type;
152
+ typedef std::vector<typename Map::const_iterator> Idx2Ref;
153
+ static inline bool greater(typename Map::const_iterator i, typename Map::const_iterator j)
154
+ {
155
+ const Int a = i->second.freq;
156
+ const Int b = j->second.freq;
157
+ if (a > b) return true;
158
+ if (a < b) return false;
159
+ return i->first > j->first;
160
+ }
161
+ Map m_;
162
+ Idx2Ref idx2ref_;
163
+ void initIdx2Ref()
164
+ {
165
+ idx2ref_.resize(m_.size());
166
+ size_t pos = 0;
167
+ for (typename Map::const_iterator i = m_.begin(), ie = m_.end(); i != ie; ++i) {
168
+ idx2ref_[pos++] = i;
169
+ }
170
+ std::sort(idx2ref_.begin(), idx2ref_.end(), greater);
171
+ }
172
+ public:
173
+ Frequency(){ clear(); }
174
+ template<class Iter>
175
+ Frequency(Iter begin, Iter end)
176
+ {
177
+ clear();
178
+ init(begin, end);
179
+ }
180
+ void clear()
181
+ {
182
+ m_.clear();
183
+ idx2ref_.clear();
184
+ }
185
+ template<class Iter>
186
+ void init(Iter begin, Iter end)
187
+ {
188
+ while (begin != end) {
189
+ append(*begin);
190
+ ++begin;
191
+ }
192
+ ready();
193
+ }
194
+ void append(const Element& e)
195
+ {
196
+ m_[e].freq++;
197
+ }
198
+ void ready()
199
+ {
200
+ initIdx2Ref();
201
+ for (size_t i = 0, ie = idx2ref_.size(); i < ie; i++) {
202
+ idx2ref_[i]->second.idx = (Int)i;
203
+ }
204
+ }
205
+ /*
206
+ element -> freq
207
+ */
208
+ Int getFrequency(const Element& e) const
209
+ {
210
+ typename Map::const_iterator i = m_.find(e);
211
+ return (i != m_.end()) ? i->second.freq : 0;
212
+ }
213
+ /*
214
+ element -> idx
215
+ */
216
+ Int getIndex(const Element& e) const
217
+ {
218
+ typename Map::const_iterator i = m_.find(e);
219
+ if (i == m_.end()) throw cybozu::Exception("Frequency:getIndex:not found") << e;
220
+ return i->second.idx;
221
+ }
222
+ /*
223
+ idx -> element
224
+ */
225
+ const Element& getElement(size_t idx) const
226
+ {
227
+ if (idx >= idx2ref_.size()) throw cybozu::Exception("Frequency:getElement:bad idx") << idx;
228
+ return idx2ref_[idx]->first;
229
+ }
230
+ size_t size() const { return idx2ref_.size(); }
231
+ template<class InputStream>
232
+ void load(InputStream& is)
233
+ {
234
+ cybozu::load(m_, is);
235
+ initIdx2Ref();
236
+ }
237
+ template<class OutputStream>
238
+ void save(OutputStream& os) const
239
+ {
240
+ cybozu::save(os, m_);
241
+ }
242
+ void put() const
243
+ {
244
+ for (size_t i = 0, n = idx2ref_.size(); i < n; i++) {
245
+ typename Map::const_iterator j = idx2ref_[i];
246
+ std::cout << i << ' ' << j->first << ' ' << j->second.freq << std::endl;
247
+ }
248
+ }
249
+ };
250
+
251
+ template<class Int>
252
+ struct Frequency<uint8_t, Int> : freq_local::FrequencyVec<uint8_t, Int> {
253
+ Frequency() {}
254
+ template<class Iterator>
255
+ Frequency(Iterator begin, Iterator end) : freq_local::FrequencyVec<uint8_t, Int>(begin, end) {}
256
+ };
257
+ template<class Int>
258
+ struct Frequency<char, Int> : freq_local::FrequencyVec<char, Int> {
259
+ Frequency() {}
260
+ template<class Iterator>
261
+ Frequency(Iterator begin, Iterator end) : freq_local::FrequencyVec<char, Int>(begin, end) {}
262
+ };
263
+
264
+ } // cybozu
@@ -0,0 +1,67 @@
1
+ #pragma once
2
+ #include <cybozu/inttype.hpp>
3
+
4
namespace cybozu {

/*
	32-bit hash of [begin, end)
	every element is xor-ed into the state, which is then multiplied by 16777619
	v : initial state; 0 selects 2166136261
*/
template<class Iter>
uint32_t hash32(Iter begin, Iter end, uint32_t v = 0)
{
	uint32_t state = (v != 0) ? v : 2166136261U;
	for (; begin != end; ++begin) {
		state ^= *begin;
		state *= 16777619;
	}
	return state;
}

/*
	64-bit hash of [begin, end)
	every element is xor-ed into the state, which is then multiplied by 1099511628211;
	the upper 32 bits are finally xor-ed into the lower ones
	v : initial state; 0 selects 14695981039346656037
*/
template<class Iter>
uint64_t hash64(Iter begin, Iter end, uint64_t v = 0)
{
	uint64_t state = (v != 0) ? v : 14695981039346656037ULL;
	for (; begin != end; ++begin) {
		state ^= *begin;
		state *= 1099511628211ULL;
	}
	return state ^ (state >> 32);
}

/*
	32-bit hash of the n elements starting at x
*/
template<class T>
uint32_t hash32(const T *x, size_t n, uint32_t v = 0)
{
	const T *const last = x + n;
	return hash32(x, last, v);
}

/*
	64-bit hash of the n elements starting at x
*/
template<class T>
uint64_t hash64(const T *x, size_t n, uint64_t v = 0)
{
	const T *const last = x + n;
	return hash64(x, last, v);
}

} // cybozu
39
+
40
// forward declaration of boost::hash (no Boost header is included here)
namespace boost {

template<class T> struct hash;

} // boost
46
+
47
#if CYBOZU_CPP_VERSION < CYBOZU_CPP_VERSION_CPP11

// older than CYBOZU_CPP_VERSION_CPP11 : forward declare the hash template only
namespace std { CYBOZU_NAMESPACE_TR1_BEGIN

#ifdef _MSC_VER
	#pragma warning(push)
	#pragma warning(disable : 4099) // missmatch class and struct
#endif
#if !defined(__APPLE__) || !defined(__clang__)
template<class T> struct hash;
#endif
#ifdef _MSC_VER
	#pragma warning(pop)
#endif

CYBOZU_NAMESPACE_TR1_END } // std

#else

// the standard header provides the hash template
#include <functional>

#endif