ooxml_crypt 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (264) hide show
  1. checksums.yaml +7 -0
  2. data/Gemfile +4 -0
  3. data/LICENSE.txt +21 -0
  4. data/README.md +58 -0
  5. data/Rakefile +12 -0
  6. data/bin/console +15 -0
  7. data/bin/setup +8 -0
  8. data/ext/ooxml_crypt/extconf.rb +18 -0
  9. data/ext/ooxml_crypt/ooxml_crypt.c +27 -0
  10. data/ext/ooxml_crypt/ooxml_crypt.h +7 -0
  11. data/lib/ooxml_crypt/version.rb +5 -0
  12. data/lib/ooxml_crypt.rb +75 -0
  13. data/vendor/cybozulib/.github/workflows/main.yml +12 -0
  14. data/vendor/cybozulib/.gitignore +5 -0
  15. data/vendor/cybozulib/CMakeLists.txt +6 -0
  16. data/vendor/cybozulib/COPYRIGHT +27 -0
  17. data/vendor/cybozulib/Makefile +26 -0
  18. data/vendor/cybozulib/bin/libeay32.dll +0 -0
  19. data/vendor/cybozulib/bin/libmecab.dll +0 -0
  20. data/vendor/cybozulib/bin/ssleay32.dll +0 -0
  21. data/vendor/cybozulib/common.mk +116 -0
  22. data/vendor/cybozulib/common.props +25 -0
  23. data/vendor/cybozulib/cybozulib.sln +286 -0
  24. data/vendor/cybozulib/debug.props +14 -0
  25. data/vendor/cybozulib/include/cybozu/array.hpp +197 -0
  26. data/vendor/cybozulib/include/cybozu/atoi.hpp +238 -0
  27. data/vendor/cybozulib/include/cybozu/atomic.hpp +146 -0
  28. data/vendor/cybozulib/include/cybozu/base64.hpp +210 -0
  29. data/vendor/cybozulib/include/cybozu/benchmark.hpp +212 -0
  30. data/vendor/cybozulib/include/cybozu/bfd.hpp +105 -0
  31. data/vendor/cybozulib/include/cybozu/bit_operation.hpp +139 -0
  32. data/vendor/cybozulib/include/cybozu/bitvector.hpp +358 -0
  33. data/vendor/cybozulib/include/cybozu/condition_variable.hpp +113 -0
  34. data/vendor/cybozulib/include/cybozu/condition_variable_cs.hpp +74 -0
  35. data/vendor/cybozulib/include/cybozu/config.hpp +392 -0
  36. data/vendor/cybozulib/include/cybozu/critical_section.hpp +60 -0
  37. data/vendor/cybozulib/include/cybozu/crypto.hpp +321 -0
  38. data/vendor/cybozulib/include/cybozu/csucvector.hpp +624 -0
  39. data/vendor/cybozulib/include/cybozu/csv.hpp +294 -0
  40. data/vendor/cybozulib/include/cybozu/data_type.hpp +27 -0
  41. data/vendor/cybozulib/include/cybozu/endian.hpp +224 -0
  42. data/vendor/cybozulib/include/cybozu/env.hpp +63 -0
  43. data/vendor/cybozulib/include/cybozu/event.hpp +122 -0
  44. data/vendor/cybozulib/include/cybozu/exception.hpp +253 -0
  45. data/vendor/cybozulib/include/cybozu/file.hpp +626 -0
  46. data/vendor/cybozulib/include/cybozu/fmindex.hpp +291 -0
  47. data/vendor/cybozulib/include/cybozu/format.hpp +93 -0
  48. data/vendor/cybozulib/include/cybozu/frequency.hpp +264 -0
  49. data/vendor/cybozulib/include/cybozu/hash.hpp +67 -0
  50. data/vendor/cybozulib/include/cybozu/inttype.hpp +174 -0
  51. data/vendor/cybozulib/include/cybozu/itoa.hpp +336 -0
  52. data/vendor/cybozulib/include/cybozu/json.hpp +120 -0
  53. data/vendor/cybozulib/include/cybozu/line_stream.hpp +149 -0
  54. data/vendor/cybozulib/include/cybozu/link_libeay32.hpp +21 -0
  55. data/vendor/cybozulib/include/cybozu/link_mpir.hpp +18 -0
  56. data/vendor/cybozulib/include/cybozu/link_ssleay32.hpp +19 -0
  57. data/vendor/cybozulib/include/cybozu/log.hpp +237 -0
  58. data/vendor/cybozulib/include/cybozu/minixml.hpp +452 -0
  59. data/vendor/cybozulib/include/cybozu/mmap.hpp +143 -0
  60. data/vendor/cybozulib/include/cybozu/mutex.hpp +144 -0
  61. data/vendor/cybozulib/include/cybozu/nlp/mecab.hpp +96 -0
  62. data/vendor/cybozulib/include/cybozu/nlp/plsi.hpp +315 -0
  63. data/vendor/cybozulib/include/cybozu/nlp/random.hpp +74 -0
  64. data/vendor/cybozulib/include/cybozu/nlp/sparse.hpp +529 -0
  65. data/vendor/cybozulib/include/cybozu/nlp/svd.hpp +486 -0
  66. data/vendor/cybozulib/include/cybozu/nlp/tfidf.hpp +226 -0
  67. data/vendor/cybozulib/include/cybozu/nlp/top_score.hpp +75 -0
  68. data/vendor/cybozulib/include/cybozu/option.hpp +743 -0
  69. data/vendor/cybozulib/include/cybozu/parallel.hpp +88 -0
  70. data/vendor/cybozulib/include/cybozu/pcg.hpp +72 -0
  71. data/vendor/cybozulib/include/cybozu/process.hpp +324 -0
  72. data/vendor/cybozulib/include/cybozu/quit_signal_handler.hpp +66 -0
  73. data/vendor/cybozulib/include/cybozu/random_generator.hpp +144 -0
  74. data/vendor/cybozulib/include/cybozu/regex.hpp +463 -0
  75. data/vendor/cybozulib/include/cybozu/select8.hpp +279 -0
  76. data/vendor/cybozulib/include/cybozu/serializer.hpp +363 -0
  77. data/vendor/cybozulib/include/cybozu/sha1.hpp +209 -0
  78. data/vendor/cybozulib/include/cybozu/sha2.hpp +506 -0
  79. data/vendor/cybozulib/include/cybozu/siphash.hpp +105 -0
  80. data/vendor/cybozulib/include/cybozu/socket.hpp +785 -0
  81. data/vendor/cybozulib/include/cybozu/ssl.hpp +203 -0
  82. data/vendor/cybozulib/include/cybozu/stacktrace.hpp +291 -0
  83. data/vendor/cybozulib/include/cybozu/stream.hpp +269 -0
  84. data/vendor/cybozulib/include/cybozu/string.hpp +1746 -0
  85. data/vendor/cybozulib/include/cybozu/string_operation.hpp +365 -0
  86. data/vendor/cybozulib/include/cybozu/sucvector.hpp +378 -0
  87. data/vendor/cybozulib/include/cybozu/test.hpp +373 -0
  88. data/vendor/cybozulib/include/cybozu/thread.hpp +229 -0
  89. data/vendor/cybozulib/include/cybozu/time.hpp +281 -0
  90. data/vendor/cybozulib/include/cybozu/tls.hpp +115 -0
  91. data/vendor/cybozulib/include/cybozu/unordered_map.hpp +13 -0
  92. data/vendor/cybozulib/include/cybozu/unordered_set.hpp +13 -0
  93. data/vendor/cybozulib/include/cybozu/v128.hpp +376 -0
  94. data/vendor/cybozulib/include/cybozu/wavelet_matrix.hpp +345 -0
  95. data/vendor/cybozulib/include/cybozu/xorshift.hpp +189 -0
  96. data/vendor/cybozulib/include/cybozu/zlib.hpp +325 -0
  97. data/vendor/cybozulib/include/sais.hxx +364 -0
  98. data/vendor/cybozulib/misc/make_select8tbl.cpp +26 -0
  99. data/vendor/cybozulib/mk.bat +37 -0
  100. data/vendor/cybozulib/readme.md +29 -0
  101. data/vendor/cybozulib/release.props +12 -0
  102. data/vendor/cybozulib/sample/Makefile +30 -0
  103. data/vendor/cybozulib/sample/csucvector_smpl.cpp +42 -0
  104. data/vendor/cybozulib/sample/data/svd/org/test1.S +4 -0
  105. data/vendor/cybozulib/sample/data/svd/org/test1.U +4 -0
  106. data/vendor/cybozulib/sample/data/svd/org/test1.V +6 -0
  107. data/vendor/cybozulib/sample/data/svd/test1 +4 -0
  108. data/vendor/cybozulib/sample/data/svd/test2 +4 -0
  109. data/vendor/cybozulib/sample/desymbol.cpp +127 -0
  110. data/vendor/cybozulib/sample/exception_smpl.cpp +46 -0
  111. data/vendor/cybozulib/sample/fmindex_smpl.cpp +231 -0
  112. data/vendor/cybozulib/sample/log_smpl.cpp +19 -0
  113. data/vendor/cybozulib/sample/mecab_smpl.cpp +37 -0
  114. data/vendor/cybozulib/sample/option2_smpl.cpp +68 -0
  115. data/vendor/cybozulib/sample/option_smpl.cpp +42 -0
  116. data/vendor/cybozulib/sample/plsi_smpl.cpp +207 -0
  117. data/vendor/cybozulib/sample/proj/exception_smpl.vcproj +184 -0
  118. data/vendor/cybozulib/sample/proj/mecab_smpl.vcproj +184 -0
  119. data/vendor/cybozulib/sample/proj/ssl_smpl/ssl_smpl.vcxproj +85 -0
  120. data/vendor/cybozulib/sample/proj/ssl_smpl.vcproj +347 -0
  121. data/vendor/cybozulib/sample/proj/stacktrace_smpl/stacktrace_smpl.vcxproj +85 -0
  122. data/vendor/cybozulib/sample/proj/svd_smpl.vcproj +184 -0
  123. data/vendor/cybozulib/sample/quit_signal_handler.cpp +30 -0
  124. data/vendor/cybozulib/sample/serializer_smpl.cpp +196 -0
  125. data/vendor/cybozulib/sample/socket_smpl.cpp +82 -0
  126. data/vendor/cybozulib/sample/ssl_smpl.cpp +39 -0
  127. data/vendor/cybozulib/sample/stacktrace_smpl.cpp +52 -0
  128. data/vendor/cybozulib/sample/svd_bench_smpl.cpp +143 -0
  129. data/vendor/cybozulib/sample/svd_smpl.cpp +94 -0
  130. data/vendor/cybozulib/sample/wm_bench_smpl.cpp +182 -0
  131. data/vendor/cybozulib/sample/zlib_smpl.cpp +41 -0
  132. data/vendor/cybozulib/src/Makefile +8 -0
  133. data/vendor/cybozulib/src/base/Makefile +19 -0
  134. data/vendor/cybozulib/test/Makefile +12 -0
  135. data/vendor/cybozulib/test/base/Makefile +37 -0
  136. data/vendor/cybozulib/test/base/array_test.cpp +173 -0
  137. data/vendor/cybozulib/test/base/atoi_test.cpp +774 -0
  138. data/vendor/cybozulib/test/base/atomic_test.cpp +49 -0
  139. data/vendor/cybozulib/test/base/base64_test.cpp +113 -0
  140. data/vendor/cybozulib/test/base/bit_operation_test.cpp +134 -0
  141. data/vendor/cybozulib/test/base/bitvector_test.cpp +204 -0
  142. data/vendor/cybozulib/test/base/condition_variable_cs_test.cpp +92 -0
  143. data/vendor/cybozulib/test/base/condition_variable_test.cpp +88 -0
  144. data/vendor/cybozulib/test/base/config_test.cpp +236 -0
  145. data/vendor/cybozulib/test/base/crypto_test.cpp +122 -0
  146. data/vendor/cybozulib/test/base/csucvector_test.cpp +63 -0
  147. data/vendor/cybozulib/test/base/csv_test.cpp +182 -0
  148. data/vendor/cybozulib/test/base/data/a.xml +26 -0
  149. data/vendor/cybozulib/test/base/endian_test.cpp +56 -0
  150. data/vendor/cybozulib/test/base/env_test.cpp +22 -0
  151. data/vendor/cybozulib/test/base/event_test.cpp +41 -0
  152. data/vendor/cybozulib/test/base/file_test.cpp +233 -0
  153. data/vendor/cybozulib/test/base/fmindex_test.cpp +118 -0
  154. data/vendor/cybozulib/test/base/format_test.cpp +12 -0
  155. data/vendor/cybozulib/test/base/frequency_test.cpp +104 -0
  156. data/vendor/cybozulib/test/base/itoa_test.cpp +522 -0
  157. data/vendor/cybozulib/test/base/line_stream_test.cpp +208 -0
  158. data/vendor/cybozulib/test/base/mecab_test.cpp +41 -0
  159. data/vendor/cybozulib/test/base/minixml_test.cpp +103 -0
  160. data/vendor/cybozulib/test/base/mmap_test.cpp +15 -0
  161. data/vendor/cybozulib/test/base/option_test.cpp +487 -0
  162. data/vendor/cybozulib/test/base/parallel_test.cpp +48 -0
  163. data/vendor/cybozulib/test/base/proj/array_test/array_test.vcxproj +86 -0
  164. data/vendor/cybozulib/test/base/proj/atoi_test/atoi_test.vcxproj +86 -0
  165. data/vendor/cybozulib/test/base/proj/atomic_test/atomic_test.vcxproj +86 -0
  166. data/vendor/cybozulib/test/base/proj/base64_test/base64_test.vcxproj +86 -0
  167. data/vendor/cybozulib/test/base/proj/condition_variable_cs_test/condition_variable_cs_test.vcxproj +86 -0
  168. data/vendor/cybozulib/test/base/proj/condition_variable_test/condition_variable_test.vcxproj +86 -0
  169. data/vendor/cybozulib/test/base/proj/config_test/config_test.vcxproj +86 -0
  170. data/vendor/cybozulib/test/base/proj/csv_test/csv_test.vcxproj +86 -0
  171. data/vendor/cybozulib/test/base/proj/endian_test/endian_test.vcxproj +86 -0
  172. data/vendor/cybozulib/test/base/proj/env_test/env_test.vcxproj +86 -0
  173. data/vendor/cybozulib/test/base/proj/event_test/event_test.vcxproj +86 -0
  174. data/vendor/cybozulib/test/base/proj/file_test/file_test.vcxproj +86 -0
  175. data/vendor/cybozulib/test/base/proj/itoa_test/itoa_test.vcxproj +86 -0
  176. data/vendor/cybozulib/test/base/proj/mecab_test/mecab_test.vcxproj +88 -0
  177. data/vendor/cybozulib/test/base/proj/minixml_test/minixml_test.vcxproj +86 -0
  178. data/vendor/cybozulib/test/base/proj/mmap_test/mmap_test.vcxproj +86 -0
  179. data/vendor/cybozulib/test/base/proj/serializer_test/serializer_test.vcxproj +86 -0
  180. data/vendor/cybozulib/test/base/proj/sha1_test/sha1_test.vcxproj +86 -0
  181. data/vendor/cybozulib/test/base/proj/stream_test/stream_test.vcxproj +86 -0
  182. data/vendor/cybozulib/test/base/proj/string_operation_test/string_operation_test.vcxproj +86 -0
  183. data/vendor/cybozulib/test/base/proj/string_test/string_test.vcxproj +86 -0
  184. data/vendor/cybozulib/test/base/proj/thread_test/thread_test.vcxproj +86 -0
  185. data/vendor/cybozulib/test/base/proj/time_test/time_test.vcxproj +86 -0
  186. data/vendor/cybozulib/test/base/proj/tls_test/tls_test.vcxproj +86 -0
  187. data/vendor/cybozulib/test/base/proj/zlib_test/zlib_test.vcxproj +86 -0
  188. data/vendor/cybozulib/test/base/random_generator_test.cpp +28 -0
  189. data/vendor/cybozulib/test/base/regex_test.cpp +74 -0
  190. data/vendor/cybozulib/test/base/serializer_test.cpp +483 -0
  191. data/vendor/cybozulib/test/base/sha1_test.cpp +61 -0
  192. data/vendor/cybozulib/test/base/sha2_test.cpp +191 -0
  193. data/vendor/cybozulib/test/base/siphash_test.cpp +33 -0
  194. data/vendor/cybozulib/test/base/socket_test.cpp +76 -0
  195. data/vendor/cybozulib/test/base/stream_test.cpp +101 -0
  196. data/vendor/cybozulib/test/base/string_operation_test.cpp +340 -0
  197. data/vendor/cybozulib/test/base/string_test.cpp +1705 -0
  198. data/vendor/cybozulib/test/base/sucvector_test.cpp +312 -0
  199. data/vendor/cybozulib/test/base/thread_test.cpp +62 -0
  200. data/vendor/cybozulib/test/base/time_test.cpp +164 -0
  201. data/vendor/cybozulib/test/base/tls_test.cpp +50 -0
  202. data/vendor/cybozulib/test/base/wavelet_matrix_test.cpp +145 -0
  203. data/vendor/cybozulib/test/base/zlib_test.cpp +371 -0
  204. data/vendor/cybozulib/test/nlp/Makefile +27 -0
  205. data/vendor/cybozulib/test/nlp/proj/random_test.vcproj +184 -0
  206. data/vendor/cybozulib/test/nlp/proj/sparse_test.vcproj +184 -0
  207. data/vendor/cybozulib/test/nlp/proj/svd_test.vcproj +184 -0
  208. data/vendor/cybozulib/test/nlp/random_test.cpp +62 -0
  209. data/vendor/cybozulib/test/nlp/sparse_test.cpp +347 -0
  210. data/vendor/cybozulib/test/nlp/svd_test.cpp +234 -0
  211. data/vendor/cybozulib/test/nlp/top_score_test.cpp +40 -0
  212. data/vendor/cybozulib/tool/create_vcproj.py +186 -0
  213. data/vendor/cybozulib/tool/vcproj_tmpl.py +185 -0
  214. data/vendor/msoffice/COPYRIGHT +27 -0
  215. data/vendor/msoffice/Makefile +29 -0
  216. data/vendor/msoffice/bin/64/msoc.dll +0 -0
  217. data/vendor/msoffice/bin/64/msocsample.exe +0 -0
  218. data/vendor/msoffice/bin/64/msoffice-crypt.exe +0 -0
  219. data/vendor/msoffice/bin/msoc.dll +0 -0
  220. data/vendor/msoffice/bin/msocsample.exe +0 -0
  221. data/vendor/msoffice/bin/msoffice-crypt.exe +0 -0
  222. data/vendor/msoffice/common.mk +71 -0
  223. data/vendor/msoffice/common.props +26 -0
  224. data/vendor/msoffice/debug.props +14 -0
  225. data/vendor/msoffice/include/attack.hpp +211 -0
  226. data/vendor/msoffice/include/cfb.hpp +777 -0
  227. data/vendor/msoffice/include/crypto_util.hpp +450 -0
  228. data/vendor/msoffice/include/custom_sha1.hpp +342 -0
  229. data/vendor/msoffice/include/decode.hpp +240 -0
  230. data/vendor/msoffice/include/encode.hpp +221 -0
  231. data/vendor/msoffice/include/make_dataspace.hpp +316 -0
  232. data/vendor/msoffice/include/msoc.h +129 -0
  233. data/vendor/msoffice/include/resource.hpp +7 -0
  234. data/vendor/msoffice/include/standard_encryption.hpp +145 -0
  235. data/vendor/msoffice/include/uint32vec.hpp +179 -0
  236. data/vendor/msoffice/include/util.hpp +212 -0
  237. data/vendor/msoffice/lib/.emptydir +0 -0
  238. data/vendor/msoffice/misc/decrypt-xls.vbs +46 -0
  239. data/vendor/msoffice/mk.bat +1 -0
  240. data/vendor/msoffice/mkdll.bat +3 -0
  241. data/vendor/msoffice/msoc.def +13 -0
  242. data/vendor/msoffice/msocsample.py +178 -0
  243. data/vendor/msoffice/msoffice12.sln +31 -0
  244. data/vendor/msoffice/readme.md +110 -0
  245. data/vendor/msoffice/release.props +28 -0
  246. data/vendor/msoffice/src/Makefile +19 -0
  247. data/vendor/msoffice/src/attack.cpp +124 -0
  248. data/vendor/msoffice/src/cfb_test.cpp +77 -0
  249. data/vendor/msoffice/src/minisample.c +54 -0
  250. data/vendor/msoffice/src/msocdll.cpp +276 -0
  251. data/vendor/msoffice/src/msocsample.c +136 -0
  252. data/vendor/msoffice/src/msoffice-crypt.cpp +219 -0
  253. data/vendor/msoffice/src/proj/attack/attack.vcxproj +88 -0
  254. data/vendor/msoffice/src/proj/main/msoffice-crypt.vcxproj +88 -0
  255. data/vendor/msoffice/src/sha1.cpp +234 -0
  256. data/vendor/msoffice/test/Makefile +20 -0
  257. data/vendor/msoffice/test/cfb_test.cpp +74 -0
  258. data/vendor/msoffice/test/hash_test.cpp +59 -0
  259. data/vendor/msoffice/test/proj/cfb/cfb_test.vcxproj +90 -0
  260. data/vendor/msoffice/test/proj/hash/hash_test.vcxproj +90 -0
  261. data/vendor/msoffice/test/sampl.bat +8 -0
  262. data/vendor/msoffice/test_all.py +46 -0
  263. data/vendor/update +4 -0
  264. metadata +351 -0
@@ -0,0 +1,624 @@
1
+ #pragma once
2
+ /**
3
+ @file
4
+ @brief compressed succinct vector
5
+ @author MITSUNARI Shigeo(@herumi)
6
+ @license modified new BSD license
7
+ http://opensource.org/licenses/BSD-3-Clause
8
+ @note use -msse4.2 option for popcnt
9
+ */
10
+ #include <cybozu/sucvector.hpp>
11
+ #include <cybozu/bitvector.hpp>
12
+ #include <cybozu/serializer.hpp>
13
+ #include <vector>
14
+ #include <iosfwd>
15
+ #include <map>
16
+
17
+ #ifdef _MSC_VER
18
+ #pragma warning(push)
19
+ #pragma warning(disable : 4351) // init buf in cstr
20
+ #endif
21
+
22
+ //#define USE_CLK
23
+ #ifdef USE_CLK
24
+ #include <cybozu/benchmark.hpp>
25
+ #endif
26
+
27
+ namespace cybozu {
28
+
29
+ namespace csucvector_util {
30
+
31
+ static const size_t tblBitLen = 8;
32
+ static const size_t maxTblSize = size_t(1) << tblBitLen;
33
+ static const uint64_t all1 = uint64_t(-1);
34
+
35
/**
	Build a mask whose low `pos` bits are set.
	@param pos number of low bits to keep (0..64)
	@return (1 << pos) - 1 for pos < 64, all ones for pos == 64
	@note a 64-bit shift by 64 is undefined in C++, so the full-width case is
	answered explicitly instead of being left to the shift
*/
inline uint64_t getMask(size_t pos)
{
	assert(pos <= 64);
	if (pos >= 64) return ~uint64_t(0);
	return (uint64_t(1) << pos) - 1;
}
40
+
41
+ struct Encoding {
42
+ uint64_t v;
43
+ uint32_t len;
44
+ uint32_t rk;
45
+ Encoding(uint64_t v = 0, uint32_t len = 0)
46
+ : v(v)
47
+ , len(len)
48
+ , rk(len <= 64 ? cybozu::popcnt<uint64_t>(v) : v == 0 ? 0 : len) { }
49
+ bool operator<(const Encoding& rhs) const
50
+ {
51
+ if (len > rhs.len) return true;
52
+ if (len < rhs.len) return false;
53
+ return v > rhs.v;
54
+ }
55
+ };
56
+
57
+ struct InputStream {
58
+ const uint64_t *block_;
59
+ size_t bitSize_;
60
+ size_t blockSize_;
61
+ size_t cur_;
62
+ InputStream(const uint64_t *block, size_t bitSize)
63
+ : block_(block), bitSize_(bitSize), blockSize_((bitSize + 63) / 64), cur_(0)
64
+ {
65
+ }
66
+ uint64_t peek(size_t offset = 0) const
67
+ {
68
+ const size_t q = (cur_ + offset) / 64;
69
+ const size_t r = (cur_ + offset) & 63;
70
+ if (q >= blockSize_) return 0;
71
+ if (r == 0) return block_[q];
72
+ uint64_t L = block_[q];
73
+ uint64_t H = q < blockSize_ - 1 ? block_[q + 1] : 0;
74
+ return ((L >> r) & getMask(64 - r)) | (H << (64 - r));
75
+ }
76
+ void consume(size_t size)
77
+ {
78
+ if (!empty()) cur_ += size;
79
+ }
80
+ bool empty() const { return cur_ >= bitSize_; }
81
+ };
82
+
83
+ struct Bigram {
84
+ struct Pair {
85
+ uint32_t prev;
86
+ uint32_t cur;
87
+ Pair(uint32_t prev = 0, uint32_t cur = 0) : prev(prev), cur(cur) {}
88
+ };
89
+ typedef std::multimap<uint32_t, Pair, std::greater<uint32_t> > PairMap;
90
+ const std::vector<Encoding>& encTbl_;
91
+ uint32_t tblNum;
92
+ std::vector<std::vector<uint32_t> > tbl;
93
+ size_t prev;
94
+ explicit Bigram(const std::vector<Encoding>& encTbl)
95
+ : encTbl_(encTbl)
96
+ , tblNum((uint32_t)encTbl.size())
97
+ , tbl()
98
+ , prev(tblNum) // first special value
99
+ {
100
+ tbl.resize(tblNum);
101
+ for (uint32_t i = 0; i < tblNum; i++) {
102
+ tbl[i].resize(tblNum);
103
+ }
104
+ }
105
+ // ~Bigram(){ put(); }
106
+ void append(uint32_t v)
107
+ {
108
+ if (v >= tblNum) throw cybozu::Exception("CSucVector:Bigram:bad v") << v;
109
+ if (prev == tblNum) {
110
+ prev = v;
111
+ return;
112
+ }
113
+ tbl[prev][v]++;
114
+ prev = v;
115
+ }
116
+ void getPairMap(PairMap& m) const
117
+ {
118
+ for (uint32_t i = 0; i < tblNum; i++) {
119
+ for (uint32_t j = 0; j < tblNum; j++) {
120
+ m.insert(PairMap::value_type(tbl[i][j], Pair(i, j)));
121
+ }
122
+ }
123
+ }
124
+ bool isAll1(uint64_t x, size_t len) const
125
+ {
126
+ if (len >= 64) {
127
+ return x == all1;
128
+ }
129
+ const uint64_t mask = getMask(len);
130
+ return (x & mask) == (all1 & mask);
131
+ }
132
+ bool concatPair(uint64_t& v, uint32_t& len, const Pair& pair) const
133
+ {
134
+ const uint64_t L = encTbl_[pair.prev].v;
135
+ const uint32_t Ln = encTbl_[pair.prev].len;
136
+ const uint64_t H = encTbl_[pair.cur].v;
137
+ const uint32_t Hn = encTbl_[pair.cur].len;
138
+ if (L == 0 && H == 0) {
139
+ v = 0;
140
+ len = Ln + Hn;
141
+ return true;
142
+ }
143
+ if (isAll1(L, Ln) && isAll1(H, Hn)) {
144
+ len = Ln + Hn;
145
+ v = len >= 64 ? all1 : getMask(len);
146
+ return true;
147
+ }
148
+ if (Ln + Hn <= 64) {
149
+ v = (H << Ln) | L;
150
+ len = Ln + Hn;
151
+ return true;
152
+ }
153
+ return false;
154
+ }
155
+ bool getTopEncoding(uint64_t& v, uint32_t& len) const
156
+ {
157
+ PairMap m;
158
+ getPairMap(m);
159
+ return concatPair(v, len, m.begin()->second);
160
+ }
161
+ void put() const
162
+ {
163
+ PairMap m;
164
+ getPairMap(m);
165
+ int n = 0;
166
+ for (PairMap::const_iterator i = m.begin(), ie = m.end(); i != ie; ++i) {
167
+ if (i->first > 0) {
168
+ printf("%u (%u, %u) ", i->first, i->second.prev, i->second.cur);
169
+ uint64_t v;
170
+ uint32_t len;
171
+ if (concatPair(v, len, i->second)) {
172
+ printf(" { 0x%llx, %u }\n", (long long)v, len);
173
+ } else {
174
+ printf("over prev=%u cur=%u\n", i->second.prev, i->second.cur);
175
+ }
176
+ n++;
177
+ if (n == 10) break;
178
+ }
179
+ }
180
+ }
181
+ private:
182
+ Bigram(const Bigram&);
183
+ void operator=(const Bigram&);
184
+ };
185
+
186
+ } // cybozu::csucvector_util
187
+
188
+ struct CSucVector {
189
#ifdef USE_CLK
	// Profiling counters for get() and rank1(); mutable because those are const.
	mutable cybozu::CpuClock clkGet;
	mutable cybozu::CpuClock clkRank;
	// Print the average clock per call and the call count; silent if never called.
	void putClkSub(const char *msg, const cybozu::CpuClock& clk) const
	{
		if (clk.getCount() != 0) {
			printf("%s:%6.2f %d\n", msg, clk.getClock() / double(clk.getCount()), clk.getCount());
		}
	}
	// Print both counters followed by an empty line.
	void putClk() const
	{
		putClkSub("get ", clkGet);
		putClkSub("rank ", clkRank);
		puts("");
	}
#endif
204
+
205
+ struct Block {
206
+ uint32_t orgPos;
207
+ uint32_t vecPos;
208
+ uint32_t rk;
209
+ Block(uint32_t orgPos = 0, uint32_t vecPos = 0, uint32_t rk = 0) : orgPos(orgPos), vecPos(vecPos), rk(rk) {}
210
+ };
211
+ static const uint32_t skip = 1024;
212
+ typedef std::vector<Block> BlockVec;
213
+ typedef std::vector<csucvector_util::Encoding> EncodingTbl;
214
+ typedef std::vector<uint32_t> Vec32;
215
+ typedef std::vector<uint8_t> Vec8;
216
+ EncodingTbl encTbl;
217
+ uint32_t bitSize_;
218
+ Vec8 vec;
219
+ BlockVec blkVec;
220
+ uint32_t rk_;
221
+ Vec32 freqTbl;
222
+
223
+ struct OutputStream {
224
+ Vec32& freqTbl; // output
225
+ Vec8& vec; // output
226
+ uint32_t& rk; // output
227
+ csucvector_util::Bigram bi; // output
228
+ const EncodingTbl& encTbl; // in
229
+ OutputStream(Vec32& freqTbl, Vec8& vec, uint32_t& rk, const uint64_t *buf, uint32_t bitSize, const EncodingTbl& encTbl)
230
+ : freqTbl(freqTbl)
231
+ , vec(vec)
232
+ , rk(rk)
233
+ , bi(encTbl)
234
+ , encTbl(encTbl)
235
+ {
236
+ csucvector_util::InputStream is(buf, bitSize);
237
+ freqTbl.clear();
238
+ freqTbl.resize(encTbl.size());
239
+ vec.clear();
240
+ rk = 0;
241
+ for (;;) {
242
+ uint32_t s = append(is);
243
+ is.consume(s);
244
+ if (is.empty()) break;
245
+ }
246
+ printf("bitSize=%u\n",bitSize);
247
+ }
248
+ uint32_t append(const csucvector_util::InputStream& is)
249
+ {
250
+ uint64_t v = is.peek();
251
+ for (size_t i = 0; i < encTbl.size(); i++) {
252
+ const uint32_t len = encTbl[i].len;
253
+ bool found = false;
254
+ if (len >= 64) {
255
+ const size_t q = len / 64;
256
+ const size_t r = len % 64;
257
+ const uint64_t target = encTbl[i].v;
258
+ if (v == target) {
259
+ found = true;
260
+ for (size_t j = 1; j < q; j++) {
261
+ if (is.peek(j * 64) != target) {
262
+ found = false;
263
+ break;
264
+ }
265
+ }
266
+ if (found && r > 0) {
267
+ const uint64_t mask = csucvector_util::getMask(r);
268
+ if ((is.peek(q * 64) & mask) != (target & mask)) {
269
+ found = false;
270
+ }
271
+ }
272
+ }
273
+ } else {
274
+ const uint64_t mask = csucvector_util::getMask(len);
275
+ found = (v & mask) == encTbl[i].v;
276
+ }
277
+ if (found) {
278
+ bi.append((uint8_t)i);
279
+ freqTbl[i]++;
280
+ rk += encTbl[i].rk;
281
+ vec.push_back(uint8_t(i));
282
+ return len;
283
+ }
284
+ }
285
+ printf("NOT HERE!!! in debug mode\n");
286
+ for (size_t i = 0; i < 4; i++) {
287
+ printf("of=%d %llx\n", (int)i, (long long)is.peek(i * 64));
288
+ }
289
+ exit(1);
290
+ }
291
+ };
292
+ void initTable()
293
+ {
294
+ static const struct {
295
+ uint64_t v;
296
+ uint32_t len;
297
+ } tbl[] = {
298
+ #if 1
299
+ { 0x0, 16384 },
300
+ { 0xffffffffffffffff, 8192 },
301
+ { 0x0, 8192 },
302
+ { 0xffffffffffffffff, 4096 },
303
+ { 0x0, 4096 },
304
+ { 0xffffffffffffffff, 2048 },
305
+ { 0x0, 2048 },
306
+ { 0xffffffffffffffff, 1024 },
307
+ { 0x0, 1024 },
308
+ { 0xffffffffffffffff, 512 },
309
+ { 0x0, 512 },
310
+ { 0x0, 384 },
311
+ { 0xffffffffffffffff, 256 },
312
+ { 0x0, 256 },
313
+ { 0x0, 224 },
314
+ { 0xffffffffffffffff, 192 },
315
+ { 0xffffffffffffffff, 128 },
316
+ { 0x0, 128 },
317
+ { 0x0, 96 },
318
+ { 0x0, 85 },
319
+ { 0xffffffffffffffff, 64 },
320
+ { 0x0, 64 },
321
+ { 0x1fffffffffffff, 53 },
322
+ { 0x0, 53 },
323
+ { 0x3fffffffffff, 46 },
324
+ { 0x0, 46 },
325
+ { 0x7fffffffff, 39 },
326
+ { 0x4000000, 35 },
327
+ { 0x2000000, 35 },
328
+ { 0x1000000, 35 },
329
+ { 0x800000, 35 },
330
+ { 0x400000, 35 },
331
+ { 0x200000, 35 },
332
+ { 0xffffffff, 32 },
333
+ { 0x0, 32 },
334
+ { 0xfffffff, 28 },
335
+ { 0x8000000, 28 },
336
+ { 0x7ffffff, 28 },
337
+ { 0x4000000, 28 },
338
+ { 0x0, 28 },
339
+ { 0x1fffff, 21 },
340
+ { 0x1fff7f, 21 },
341
+ { 0x1dffff, 21 },
342
+ { 0x1bffff, 21 },
343
+ { 0x180000, 21 },
344
+ { 0x17ffff, 21 },
345
+ { 0x100000, 21 },
346
+ { 0xfffff, 21 },
347
+ { 0x80000, 21 },
348
+ { 0x40000, 21 },
349
+ { 0x20000, 21 },
350
+ { 0x10000, 21 },
351
+ { 0x8000, 21 },
352
+ { 0x4000, 21 },
353
+ { 0x81, 21 },
354
+ { 0x0, 21 },
355
+ { 0x3fff, 14 },
356
+ { 0x3ffe, 14 },
357
+ { 0x3f7f, 14 },
358
+ { 0x3eff, 14 },
359
+ { 0x3dff, 14 },
360
+ { 0x3c00, 14 },
361
+ { 0x3bff, 14 },
362
+ { 0x3800, 14 },
363
+ { 0x37ff, 14 },
364
+ { 0x3000, 14 },
365
+ { 0x2fff, 14 },
366
+ { 0x2800, 14 },
367
+ { 0x2400, 14 },
368
+ { 0x2200, 14 },
369
+ { 0x2100, 14 },
370
+ { 0x2080, 14 },
371
+ { 0x2040, 14 },
372
+ { 0x2020, 14 },
373
+ { 0x2010, 14 },
374
+ { 0x2008, 14 },
375
+ { 0x2004, 14 },
376
+ { 0x2002, 14 },
377
+ { 0x2001, 14 },
378
+ { 0x2000, 14 },
379
+ { 0x1fff, 14 },
380
+ { 0x1800, 14 },
381
+ { 0x1400, 14 },
382
+ { 0x1200, 14 },
383
+ { 0x1100, 14 },
384
+ { 0x1080, 14 },
385
+ { 0x1020, 14 },
386
+ { 0x1010, 14 },
387
+ { 0x1008, 14 },
388
+ { 0x1004, 14 },
389
+ { 0x1002, 14 },
390
+ { 0x1001, 14 },
391
+ { 0x1000, 14 },
392
+ { 0xfff, 14 },
393
+ { 0xc00, 14 },
394
+ { 0xa00, 14 },
395
+ { 0x900, 14 },
396
+ { 0x880, 14 },
397
+ { 0x808, 14 },
398
+ { 0x804, 14 },
399
+ { 0x802, 14 },
400
+ { 0x801, 14 },
401
+ { 0x800, 14 },
402
+ { 0x7ff, 14 },
403
+ { 0x600, 14 },
404
+ { 0x500, 14 },
405
+ { 0x480, 14 },
406
+ { 0x401, 14 },
407
+ { 0x400, 14 },
408
+ { 0x300, 14 },
409
+ { 0x280, 14 },
410
+ { 0x208, 14 },
411
+ { 0x202, 14 },
412
+ { 0x201, 14 },
413
+ { 0x200, 14 },
414
+ { 0x180, 14 },
415
+ { 0x102, 14 },
416
+ { 0x101, 14 },
417
+ { 0x100, 14 },
418
+ { 0x80, 14 },
419
+ { 0x40, 14 },
420
+ { 0x20, 14 },
421
+ { 0x10, 14 },
422
+ { 0x8, 14 },
423
+ { 0x4, 14 },
424
+ { 0x2, 14 },
425
+ { 0x1, 14 },
426
+ { 0x0, 14 },
427
+ #else
428
+ { 0, 64 * 32 },
429
+ { uint64_t(-1), 64 * 16 },
430
+ { uint64_t(-1), 256 },
431
+ { 0, 256 },
432
+ { 0, 32 },
433
+ { 0xffffffff, 32 },
434
+ #endif
435
+ };
436
+ encTbl.clear();
437
+ for (size_t i = 0; i < CYBOZU_NUM_OF_ARRAY(tbl); i++) {
438
+ encTbl.push_back(csucvector_util::Encoding(tbl[i].v, tbl[i].len));
439
+ }
440
+ for (int i = 0; i < 128; i++) {
441
+ encTbl.push_back(csucvector_util::Encoding(i, 7));
442
+ }
443
+ std::sort(encTbl.begin(), encTbl.end());
444
+ if (encTbl.size() > csucvector_util::maxTblSize) {
445
+ throw cybozu::Exception("CSucVector:initTable:bad size") << encTbl.size();
446
+ }
447
+ }
448
+
449
+ CSucVector() { clear(); }
450
+ ~CSucVector()
451
+ {
452
+ // put();
453
+ #ifdef USE_CLK
454
+ putClk();
455
+ #endif
456
+ }
457
+ CSucVector(const uint64_t *buf, uint64_t bitSize)
458
+ {
459
+ clear();
460
+ init(buf, bitSize);
461
+ }
462
+ void clear()
463
+ {
464
+ bitSize_ = 0;
465
+ rk_ = 0;
466
+ }
467
+ void init(const uint64_t *buf, uint64_t bitSize)
468
+ {
469
+ if (bitSize >= (uint64_t(1) << 32)) throw cybozu::Exception("CSucVector:init:big bitSize") << bitSize;
470
+ bitSize_ = (uint32_t)bitSize;
471
+ initTable();
472
+ for (;;) {
473
+ OutputStream os(freqTbl, vec, rk_, buf, bitSize_, encTbl);
474
+ // os.bi.put();
475
+ if (encTbl.size() == csucvector_util::maxTblSize) break;
476
+ uint64_t v;
477
+ uint32_t len;
478
+ if (!os.bi.getTopEncoding(v, len)) {
479
+ printf("ERR getTopEncoding\n");
480
+ os.bi.put();
481
+ putEncTbl();
482
+ exit(1);
483
+ }
484
+ printf("append v=%llx, len=%u tblSize=%u\n", (long long)v, len, (uint32_t)encTbl.size());
485
+ encTbl.push_back(csucvector_util::Encoding(v, len));
486
+ std::sort(encTbl.begin(), encTbl.end());
487
+ }
488
+ // putEncTbl();
489
+ initBlockVec();
490
+ }
491
+ void initBlockVec()
492
+ {
493
+ blkVec.reserve(bitSize_ / skip + 16);
494
+ uint32_t orgPos = 0;
495
+ uint32_t rk = 0;
496
+ uint32_t samplingPos = 0;
497
+ for (size_t vecPos = 0, n = vec.size(); vecPos < n; vecPos++) {
498
+ uint8_t v = vec[vecPos];
499
+ uint32_t next = orgPos + encTbl[v].len;
500
+
501
+ while (samplingPos < next) {
502
+ blkVec.push_back(Block(orgPos, (uint32_t)vecPos, rk));
503
+ samplingPos += skip;
504
+ }
505
+ orgPos = next;
506
+ rk += encTbl[v].rk;
507
+ }
508
+ }
509
+ void putEncTbl() const
510
+ {
511
+ for (size_t i = 0; i < encTbl.size(); i++) {
512
+ printf("%2d : { 0x%llx, %u },\n", (int)i, (long long)encTbl[i].v, encTbl[i].len);
513
+ }
514
+ }
515
+ void putSub() const
516
+ {
517
+ const uint32_t inSize = bitSize_ / 8;
518
+ if (inSize == 0) return;
519
+ const uint32_t compSize = (uint32_t)vec.size();
520
+ const uint32_t idxSize = (uint32_t)(blkVec.size() * sizeof(blkVec[0]));
521
+ const double cr = compSize * 100.0 / inSize;
522
+ const double ir = idxSize * 100.0 / inSize;
523
+ printf("in Size= %9d, rank=%u\n", inSize, rk_);
524
+ printf("comp Size= %9u\n", compSize);
525
+ printf("idx Size= %9u(blkVec.size=%7u)\n", idxSize, (uint32_t)blkVec.size());
526
+ printf("totalSize= %9u\n", compSize + idxSize);
527
+ printf("rate=%5.2f%%(%5.2f%% + %5.2f%%)\n", cr + ir, cr, ir);
528
+ }
529
+ void put() const
530
+ {
531
+ putSub();
532
+ if (freqTbl.empty()) return;
533
+ const uint32_t compSize = (uint32_t)vec.size();
534
+ for (size_t i = 0; i < freqTbl.size(); i++) {
535
+ printf("freqTbl[%2d] = %8d(%5.2f%%, %5.2f%%)\n", (int)i, freqTbl[i], freqTbl[i] * 100.0 / compSize, freqTbl[i] * encTbl[i].len * 100.0 / bitSize_);
536
+ }
537
+ }
538
+ bool get(size_t pos) const
539
+ {
540
+ if (pos >= bitSize_) throw cybozu::Exception("CSucVector:get:bad pos") << pos;
541
+ #ifdef USE_CLK
542
+ clkGet.begin();
543
+ #endif
544
+ const uint32_t cur = blkVec[pos / skip].orgPos;
545
+ uint32_t vecPos = blkVec[pos / skip].vecPos;
546
+ pos -= cur;
547
+ uint8_t v;
548
+ for (;;) {
549
+ v = vec[vecPos++];
550
+ uint32_t len = encTbl[v].len;
551
+ if (len > pos) break;
552
+ pos -= len;
553
+ }
554
+ const bool b = (pos >= 64) ? encTbl[v].v != 0 : (encTbl[v].v & (size_t(1) << pos)) != 0;
555
+ #ifdef USE_CLK
556
+ clkGet.end();
557
+ #endif
558
+ return b;
559
+ }
560
+ size_t rank1(size_t pos) const
561
+ {
562
+ if (pos >= bitSize_) return rk_;
563
+ #ifdef USE_CLK
564
+ clkRank.begin();
565
+ #endif
566
+ const uint32_t cur = blkVec[pos / skip].orgPos;
567
+ uint32_t vecPos = blkVec[pos / skip].vecPos;
568
+ size_t rk = blkVec[pos / skip].rk;
569
+ pos -= cur;
570
+ uint8_t v;
571
+ for (;;) {
572
+ v = vec[vecPos++];
573
+ size_t len = encTbl[v].len;
574
+ if (len > pos) break;
575
+ pos -= len;
576
+ rk += encTbl[v].rk;
577
+ }
578
+ size_t adj = 0;
579
+ if (pos >= 64) {
580
+ if (encTbl[v].v != 0) adj = pos;
581
+ } else {
582
+ uint64_t x = encTbl[v].v & csucvector_util::getMask(pos);
583
+ adj = cybozu::popcnt<uint64_t>(x);
584
+ }
585
+ rk += adj;
586
+ #ifdef USE_CLK
587
+ clkRank.end();
588
+ #endif
589
+ return rk;
590
+ }
591
+ size_t rank0(size_t pos) const
592
+ {
593
+ return pos - rank1(pos);
594
+ }
595
+ size_t rank(bool b, size_t pos) const
596
+ {
597
+ if (b) return rank1(pos);
598
+ return rank0(pos);
599
+ }
600
+ template<class OutputStream>
601
+ void save(OutputStream& os) const
602
+ {
603
+ cybozu::save(os, bitSize_);
604
+ cybozu::savePodVec(os, vec);
605
+ cybozu::savePodVec(os, blkVec);
606
+ cybozu::save(os, rk_);
607
+ cybozu::savePodVec(os, encTbl);
608
+ }
609
+ template<class InputStream>
610
+ void load(InputStream& is)
611
+ {
612
+ cybozu::load(bitSize_, is);
613
+ cybozu::loadPodVec(vec, is);
614
+ cybozu::loadPodVec(blkVec, is);
615
+ cybozu::load(rk_, is);
616
+ cybozu::loadPodVec(encTbl, is);
617
+ }
618
+ };
619
+
620
+ } // cybozu
621
+
622
// Must use the same guard as the matching `#pragma warning(push)` at the top
// of this header (_MSC_VER); _WIN32 is also defined by non-MSVC compilers on
// Windows, where the push is never issued.
#ifdef _MSC_VER
	#pragma warning(pop)
#endif