ooxml_crypt 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (264):
  1. checksums.yaml +7 -0
  2. data/Gemfile +4 -0
  3. data/LICENSE.txt +21 -0
  4. data/README.md +58 -0
  5. data/Rakefile +12 -0
  6. data/bin/console +15 -0
  7. data/bin/setup +8 -0
  8. data/ext/ooxml_crypt/extconf.rb +18 -0
  9. data/ext/ooxml_crypt/ooxml_crypt.c +27 -0
  10. data/ext/ooxml_crypt/ooxml_crypt.h +7 -0
  11. data/lib/ooxml_crypt/version.rb +5 -0
  12. data/lib/ooxml_crypt.rb +75 -0
  13. data/vendor/cybozulib/.github/workflows/main.yml +12 -0
  14. data/vendor/cybozulib/.gitignore +5 -0
  15. data/vendor/cybozulib/CMakeLists.txt +6 -0
  16. data/vendor/cybozulib/COPYRIGHT +27 -0
  17. data/vendor/cybozulib/Makefile +26 -0
  18. data/vendor/cybozulib/bin/libeay32.dll +0 -0
  19. data/vendor/cybozulib/bin/libmecab.dll +0 -0
  20. data/vendor/cybozulib/bin/ssleay32.dll +0 -0
  21. data/vendor/cybozulib/common.mk +116 -0
  22. data/vendor/cybozulib/common.props +25 -0
  23. data/vendor/cybozulib/cybozulib.sln +286 -0
  24. data/vendor/cybozulib/debug.props +14 -0
  25. data/vendor/cybozulib/include/cybozu/array.hpp +197 -0
  26. data/vendor/cybozulib/include/cybozu/atoi.hpp +238 -0
  27. data/vendor/cybozulib/include/cybozu/atomic.hpp +146 -0
  28. data/vendor/cybozulib/include/cybozu/base64.hpp +210 -0
  29. data/vendor/cybozulib/include/cybozu/benchmark.hpp +212 -0
  30. data/vendor/cybozulib/include/cybozu/bfd.hpp +105 -0
  31. data/vendor/cybozulib/include/cybozu/bit_operation.hpp +139 -0
  32. data/vendor/cybozulib/include/cybozu/bitvector.hpp +358 -0
  33. data/vendor/cybozulib/include/cybozu/condition_variable.hpp +113 -0
  34. data/vendor/cybozulib/include/cybozu/condition_variable_cs.hpp +74 -0
  35. data/vendor/cybozulib/include/cybozu/config.hpp +392 -0
  36. data/vendor/cybozulib/include/cybozu/critical_section.hpp +60 -0
  37. data/vendor/cybozulib/include/cybozu/crypto.hpp +321 -0
  38. data/vendor/cybozulib/include/cybozu/csucvector.hpp +624 -0
  39. data/vendor/cybozulib/include/cybozu/csv.hpp +294 -0
  40. data/vendor/cybozulib/include/cybozu/data_type.hpp +27 -0
  41. data/vendor/cybozulib/include/cybozu/endian.hpp +224 -0
  42. data/vendor/cybozulib/include/cybozu/env.hpp +63 -0
  43. data/vendor/cybozulib/include/cybozu/event.hpp +122 -0
  44. data/vendor/cybozulib/include/cybozu/exception.hpp +253 -0
  45. data/vendor/cybozulib/include/cybozu/file.hpp +626 -0
  46. data/vendor/cybozulib/include/cybozu/fmindex.hpp +291 -0
  47. data/vendor/cybozulib/include/cybozu/format.hpp +93 -0
  48. data/vendor/cybozulib/include/cybozu/frequency.hpp +264 -0
  49. data/vendor/cybozulib/include/cybozu/hash.hpp +67 -0
  50. data/vendor/cybozulib/include/cybozu/inttype.hpp +174 -0
  51. data/vendor/cybozulib/include/cybozu/itoa.hpp +336 -0
  52. data/vendor/cybozulib/include/cybozu/json.hpp +120 -0
  53. data/vendor/cybozulib/include/cybozu/line_stream.hpp +149 -0
  54. data/vendor/cybozulib/include/cybozu/link_libeay32.hpp +21 -0
  55. data/vendor/cybozulib/include/cybozu/link_mpir.hpp +18 -0
  56. data/vendor/cybozulib/include/cybozu/link_ssleay32.hpp +19 -0
  57. data/vendor/cybozulib/include/cybozu/log.hpp +237 -0
  58. data/vendor/cybozulib/include/cybozu/minixml.hpp +452 -0
  59. data/vendor/cybozulib/include/cybozu/mmap.hpp +143 -0
  60. data/vendor/cybozulib/include/cybozu/mutex.hpp +144 -0
  61. data/vendor/cybozulib/include/cybozu/nlp/mecab.hpp +96 -0
  62. data/vendor/cybozulib/include/cybozu/nlp/plsi.hpp +315 -0
  63. data/vendor/cybozulib/include/cybozu/nlp/random.hpp +74 -0
  64. data/vendor/cybozulib/include/cybozu/nlp/sparse.hpp +529 -0
  65. data/vendor/cybozulib/include/cybozu/nlp/svd.hpp +486 -0
  66. data/vendor/cybozulib/include/cybozu/nlp/tfidf.hpp +226 -0
  67. data/vendor/cybozulib/include/cybozu/nlp/top_score.hpp +75 -0
  68. data/vendor/cybozulib/include/cybozu/option.hpp +743 -0
  69. data/vendor/cybozulib/include/cybozu/parallel.hpp +88 -0
  70. data/vendor/cybozulib/include/cybozu/pcg.hpp +72 -0
  71. data/vendor/cybozulib/include/cybozu/process.hpp +324 -0
  72. data/vendor/cybozulib/include/cybozu/quit_signal_handler.hpp +66 -0
  73. data/vendor/cybozulib/include/cybozu/random_generator.hpp +144 -0
  74. data/vendor/cybozulib/include/cybozu/regex.hpp +463 -0
  75. data/vendor/cybozulib/include/cybozu/select8.hpp +279 -0
  76. data/vendor/cybozulib/include/cybozu/serializer.hpp +363 -0
  77. data/vendor/cybozulib/include/cybozu/sha1.hpp +209 -0
  78. data/vendor/cybozulib/include/cybozu/sha2.hpp +506 -0
  79. data/vendor/cybozulib/include/cybozu/siphash.hpp +105 -0
  80. data/vendor/cybozulib/include/cybozu/socket.hpp +785 -0
  81. data/vendor/cybozulib/include/cybozu/ssl.hpp +203 -0
  82. data/vendor/cybozulib/include/cybozu/stacktrace.hpp +291 -0
  83. data/vendor/cybozulib/include/cybozu/stream.hpp +269 -0
  84. data/vendor/cybozulib/include/cybozu/string.hpp +1746 -0
  85. data/vendor/cybozulib/include/cybozu/string_operation.hpp +365 -0
  86. data/vendor/cybozulib/include/cybozu/sucvector.hpp +378 -0
  87. data/vendor/cybozulib/include/cybozu/test.hpp +373 -0
  88. data/vendor/cybozulib/include/cybozu/thread.hpp +229 -0
  89. data/vendor/cybozulib/include/cybozu/time.hpp +281 -0
  90. data/vendor/cybozulib/include/cybozu/tls.hpp +115 -0
  91. data/vendor/cybozulib/include/cybozu/unordered_map.hpp +13 -0
  92. data/vendor/cybozulib/include/cybozu/unordered_set.hpp +13 -0
  93. data/vendor/cybozulib/include/cybozu/v128.hpp +376 -0
  94. data/vendor/cybozulib/include/cybozu/wavelet_matrix.hpp +345 -0
  95. data/vendor/cybozulib/include/cybozu/xorshift.hpp +189 -0
  96. data/vendor/cybozulib/include/cybozu/zlib.hpp +325 -0
  97. data/vendor/cybozulib/include/sais.hxx +364 -0
  98. data/vendor/cybozulib/misc/make_select8tbl.cpp +26 -0
  99. data/vendor/cybozulib/mk.bat +37 -0
  100. data/vendor/cybozulib/readme.md +29 -0
  101. data/vendor/cybozulib/release.props +12 -0
  102. data/vendor/cybozulib/sample/Makefile +30 -0
  103. data/vendor/cybozulib/sample/csucvector_smpl.cpp +42 -0
  104. data/vendor/cybozulib/sample/data/svd/org/test1.S +4 -0
  105. data/vendor/cybozulib/sample/data/svd/org/test1.U +4 -0
  106. data/vendor/cybozulib/sample/data/svd/org/test1.V +6 -0
  107. data/vendor/cybozulib/sample/data/svd/test1 +4 -0
  108. data/vendor/cybozulib/sample/data/svd/test2 +4 -0
  109. data/vendor/cybozulib/sample/desymbol.cpp +127 -0
  110. data/vendor/cybozulib/sample/exception_smpl.cpp +46 -0
  111. data/vendor/cybozulib/sample/fmindex_smpl.cpp +231 -0
  112. data/vendor/cybozulib/sample/log_smpl.cpp +19 -0
  113. data/vendor/cybozulib/sample/mecab_smpl.cpp +37 -0
  114. data/vendor/cybozulib/sample/option2_smpl.cpp +68 -0
  115. data/vendor/cybozulib/sample/option_smpl.cpp +42 -0
  116. data/vendor/cybozulib/sample/plsi_smpl.cpp +207 -0
  117. data/vendor/cybozulib/sample/proj/exception_smpl.vcproj +184 -0
  118. data/vendor/cybozulib/sample/proj/mecab_smpl.vcproj +184 -0
  119. data/vendor/cybozulib/sample/proj/ssl_smpl/ssl_smpl.vcxproj +85 -0
  120. data/vendor/cybozulib/sample/proj/ssl_smpl.vcproj +347 -0
  121. data/vendor/cybozulib/sample/proj/stacktrace_smpl/stacktrace_smpl.vcxproj +85 -0
  122. data/vendor/cybozulib/sample/proj/svd_smpl.vcproj +184 -0
  123. data/vendor/cybozulib/sample/quit_signal_handler.cpp +30 -0
  124. data/vendor/cybozulib/sample/serializer_smpl.cpp +196 -0
  125. data/vendor/cybozulib/sample/socket_smpl.cpp +82 -0
  126. data/vendor/cybozulib/sample/ssl_smpl.cpp +39 -0
  127. data/vendor/cybozulib/sample/stacktrace_smpl.cpp +52 -0
  128. data/vendor/cybozulib/sample/svd_bench_smpl.cpp +143 -0
  129. data/vendor/cybozulib/sample/svd_smpl.cpp +94 -0
  130. data/vendor/cybozulib/sample/wm_bench_smpl.cpp +182 -0
  131. data/vendor/cybozulib/sample/zlib_smpl.cpp +41 -0
  132. data/vendor/cybozulib/src/Makefile +8 -0
  133. data/vendor/cybozulib/src/base/Makefile +19 -0
  134. data/vendor/cybozulib/test/Makefile +12 -0
  135. data/vendor/cybozulib/test/base/Makefile +37 -0
  136. data/vendor/cybozulib/test/base/array_test.cpp +173 -0
  137. data/vendor/cybozulib/test/base/atoi_test.cpp +774 -0
  138. data/vendor/cybozulib/test/base/atomic_test.cpp +49 -0
  139. data/vendor/cybozulib/test/base/base64_test.cpp +113 -0
  140. data/vendor/cybozulib/test/base/bit_operation_test.cpp +134 -0
  141. data/vendor/cybozulib/test/base/bitvector_test.cpp +204 -0
  142. data/vendor/cybozulib/test/base/condition_variable_cs_test.cpp +92 -0
  143. data/vendor/cybozulib/test/base/condition_variable_test.cpp +88 -0
  144. data/vendor/cybozulib/test/base/config_test.cpp +236 -0
  145. data/vendor/cybozulib/test/base/crypto_test.cpp +122 -0
  146. data/vendor/cybozulib/test/base/csucvector_test.cpp +63 -0
  147. data/vendor/cybozulib/test/base/csv_test.cpp +182 -0
  148. data/vendor/cybozulib/test/base/data/a.xml +26 -0
  149. data/vendor/cybozulib/test/base/endian_test.cpp +56 -0
  150. data/vendor/cybozulib/test/base/env_test.cpp +22 -0
  151. data/vendor/cybozulib/test/base/event_test.cpp +41 -0
  152. data/vendor/cybozulib/test/base/file_test.cpp +233 -0
  153. data/vendor/cybozulib/test/base/fmindex_test.cpp +118 -0
  154. data/vendor/cybozulib/test/base/format_test.cpp +12 -0
  155. data/vendor/cybozulib/test/base/frequency_test.cpp +104 -0
  156. data/vendor/cybozulib/test/base/itoa_test.cpp +522 -0
  157. data/vendor/cybozulib/test/base/line_stream_test.cpp +208 -0
  158. data/vendor/cybozulib/test/base/mecab_test.cpp +41 -0
  159. data/vendor/cybozulib/test/base/minixml_test.cpp +103 -0
  160. data/vendor/cybozulib/test/base/mmap_test.cpp +15 -0
  161. data/vendor/cybozulib/test/base/option_test.cpp +487 -0
  162. data/vendor/cybozulib/test/base/parallel_test.cpp +48 -0
  163. data/vendor/cybozulib/test/base/proj/array_test/array_test.vcxproj +86 -0
  164. data/vendor/cybozulib/test/base/proj/atoi_test/atoi_test.vcxproj +86 -0
  165. data/vendor/cybozulib/test/base/proj/atomic_test/atomic_test.vcxproj +86 -0
  166. data/vendor/cybozulib/test/base/proj/base64_test/base64_test.vcxproj +86 -0
  167. data/vendor/cybozulib/test/base/proj/condition_variable_cs_test/condition_variable_cs_test.vcxproj +86 -0
  168. data/vendor/cybozulib/test/base/proj/condition_variable_test/condition_variable_test.vcxproj +86 -0
  169. data/vendor/cybozulib/test/base/proj/config_test/config_test.vcxproj +86 -0
  170. data/vendor/cybozulib/test/base/proj/csv_test/csv_test.vcxproj +86 -0
  171. data/vendor/cybozulib/test/base/proj/endian_test/endian_test.vcxproj +86 -0
  172. data/vendor/cybozulib/test/base/proj/env_test/env_test.vcxproj +86 -0
  173. data/vendor/cybozulib/test/base/proj/event_test/event_test.vcxproj +86 -0
  174. data/vendor/cybozulib/test/base/proj/file_test/file_test.vcxproj +86 -0
  175. data/vendor/cybozulib/test/base/proj/itoa_test/itoa_test.vcxproj +86 -0
  176. data/vendor/cybozulib/test/base/proj/mecab_test/mecab_test.vcxproj +88 -0
  177. data/vendor/cybozulib/test/base/proj/minixml_test/minixml_test.vcxproj +86 -0
  178. data/vendor/cybozulib/test/base/proj/mmap_test/mmap_test.vcxproj +86 -0
  179. data/vendor/cybozulib/test/base/proj/serializer_test/serializer_test.vcxproj +86 -0
  180. data/vendor/cybozulib/test/base/proj/sha1_test/sha1_test.vcxproj +86 -0
  181. data/vendor/cybozulib/test/base/proj/stream_test/stream_test.vcxproj +86 -0
  182. data/vendor/cybozulib/test/base/proj/string_operation_test/string_operation_test.vcxproj +86 -0
  183. data/vendor/cybozulib/test/base/proj/string_test/string_test.vcxproj +86 -0
  184. data/vendor/cybozulib/test/base/proj/thread_test/thread_test.vcxproj +86 -0
  185. data/vendor/cybozulib/test/base/proj/time_test/time_test.vcxproj +86 -0
  186. data/vendor/cybozulib/test/base/proj/tls_test/tls_test.vcxproj +86 -0
  187. data/vendor/cybozulib/test/base/proj/zlib_test/zlib_test.vcxproj +86 -0
  188. data/vendor/cybozulib/test/base/random_generator_test.cpp +28 -0
  189. data/vendor/cybozulib/test/base/regex_test.cpp +74 -0
  190. data/vendor/cybozulib/test/base/serializer_test.cpp +483 -0
  191. data/vendor/cybozulib/test/base/sha1_test.cpp +61 -0
  192. data/vendor/cybozulib/test/base/sha2_test.cpp +191 -0
  193. data/vendor/cybozulib/test/base/siphash_test.cpp +33 -0
  194. data/vendor/cybozulib/test/base/socket_test.cpp +76 -0
  195. data/vendor/cybozulib/test/base/stream_test.cpp +101 -0
  196. data/vendor/cybozulib/test/base/string_operation_test.cpp +340 -0
  197. data/vendor/cybozulib/test/base/string_test.cpp +1705 -0
  198. data/vendor/cybozulib/test/base/sucvector_test.cpp +312 -0
  199. data/vendor/cybozulib/test/base/thread_test.cpp +62 -0
  200. data/vendor/cybozulib/test/base/time_test.cpp +164 -0
  201. data/vendor/cybozulib/test/base/tls_test.cpp +50 -0
  202. data/vendor/cybozulib/test/base/wavelet_matrix_test.cpp +145 -0
  203. data/vendor/cybozulib/test/base/zlib_test.cpp +371 -0
  204. data/vendor/cybozulib/test/nlp/Makefile +27 -0
  205. data/vendor/cybozulib/test/nlp/proj/random_test.vcproj +184 -0
  206. data/vendor/cybozulib/test/nlp/proj/sparse_test.vcproj +184 -0
  207. data/vendor/cybozulib/test/nlp/proj/svd_test.vcproj +184 -0
  208. data/vendor/cybozulib/test/nlp/random_test.cpp +62 -0
  209. data/vendor/cybozulib/test/nlp/sparse_test.cpp +347 -0
  210. data/vendor/cybozulib/test/nlp/svd_test.cpp +234 -0
  211. data/vendor/cybozulib/test/nlp/top_score_test.cpp +40 -0
  212. data/vendor/cybozulib/tool/create_vcproj.py +186 -0
  213. data/vendor/cybozulib/tool/vcproj_tmpl.py +185 -0
  214. data/vendor/msoffice/COPYRIGHT +27 -0
  215. data/vendor/msoffice/Makefile +29 -0
  216. data/vendor/msoffice/bin/64/msoc.dll +0 -0
  217. data/vendor/msoffice/bin/64/msocsample.exe +0 -0
  218. data/vendor/msoffice/bin/64/msoffice-crypt.exe +0 -0
  219. data/vendor/msoffice/bin/msoc.dll +0 -0
  220. data/vendor/msoffice/bin/msocsample.exe +0 -0
  221. data/vendor/msoffice/bin/msoffice-crypt.exe +0 -0
  222. data/vendor/msoffice/common.mk +71 -0
  223. data/vendor/msoffice/common.props +26 -0
  224. data/vendor/msoffice/debug.props +14 -0
  225. data/vendor/msoffice/include/attack.hpp +211 -0
  226. data/vendor/msoffice/include/cfb.hpp +777 -0
  227. data/vendor/msoffice/include/crypto_util.hpp +450 -0
  228. data/vendor/msoffice/include/custom_sha1.hpp +342 -0
  229. data/vendor/msoffice/include/decode.hpp +240 -0
  230. data/vendor/msoffice/include/encode.hpp +221 -0
  231. data/vendor/msoffice/include/make_dataspace.hpp +316 -0
  232. data/vendor/msoffice/include/msoc.h +129 -0
  233. data/vendor/msoffice/include/resource.hpp +7 -0
  234. data/vendor/msoffice/include/standard_encryption.hpp +145 -0
  235. data/vendor/msoffice/include/uint32vec.hpp +179 -0
  236. data/vendor/msoffice/include/util.hpp +212 -0
  237. data/vendor/msoffice/lib/.emptydir +0 -0
  238. data/vendor/msoffice/misc/decrypt-xls.vbs +46 -0
  239. data/vendor/msoffice/mk.bat +1 -0
  240. data/vendor/msoffice/mkdll.bat +3 -0
  241. data/vendor/msoffice/msoc.def +13 -0
  242. data/vendor/msoffice/msocsample.py +178 -0
  243. data/vendor/msoffice/msoffice12.sln +31 -0
  244. data/vendor/msoffice/readme.md +110 -0
  245. data/vendor/msoffice/release.props +28 -0
  246. data/vendor/msoffice/src/Makefile +19 -0
  247. data/vendor/msoffice/src/attack.cpp +124 -0
  248. data/vendor/msoffice/src/cfb_test.cpp +77 -0
  249. data/vendor/msoffice/src/minisample.c +54 -0
  250. data/vendor/msoffice/src/msocdll.cpp +276 -0
  251. data/vendor/msoffice/src/msocsample.c +136 -0
  252. data/vendor/msoffice/src/msoffice-crypt.cpp +219 -0
  253. data/vendor/msoffice/src/proj/attack/attack.vcxproj +88 -0
  254. data/vendor/msoffice/src/proj/main/msoffice-crypt.vcxproj +88 -0
  255. data/vendor/msoffice/src/sha1.cpp +234 -0
  256. data/vendor/msoffice/test/Makefile +20 -0
  257. data/vendor/msoffice/test/cfb_test.cpp +74 -0
  258. data/vendor/msoffice/test/hash_test.cpp +59 -0
  259. data/vendor/msoffice/test/proj/cfb/cfb_test.vcxproj +90 -0
  260. data/vendor/msoffice/test/proj/hash/hash_test.vcxproj +90 -0
  261. data/vendor/msoffice/test/sampl.bat +8 -0
  262. data/vendor/msoffice/test_all.py +46 -0
  263. data/vendor/update +4 -0
  264. metadata +351 -0
@@ -0,0 +1,624 @@
1
+ #pragma once
2
+ /**
3
+ @file
4
+ @brief compressed succinct vector
5
+ @author MITSUNARI Shigeo(@herumi)
6
+ @license modified new BSD license
7
+ http://opensource.org/licenses/BSD-3-Clause
8
+ @note use -msse4.2 option for popcnt
9
+ */
10
+ #include <cybozu/sucvector.hpp>
11
+ #include <cybozu/bitvector.hpp>
12
+ #include <cybozu/serializer.hpp>
13
+ #include <vector>
14
+ #include <iosfwd>
15
+ #include <map>
16
+
17
+ #ifdef _MSC_VER
18
+ #pragma warning(push)
19
+ #pragma warning(disable : 4351) // init buf in cstr
20
+ #endif
21
+
22
+ //#define USE_CLK
23
+ #ifdef USE_CLK
24
+ #include <cybozu/benchmark.hpp>
25
+ #endif
26
+
27
+ namespace cybozu {
28
+
29
+ namespace csucvector_util {
30
+
31
// bit width of an index into the encoding table
static const size_t tblBitLen = 8;
// largest number of entries the encoding table may hold (2^tblBitLen)
static const size_t maxTblSize = size_t(1) << tblBitLen;
// 64-bit word whose bits are all 1
static const uint64_t all1 = ~uint64_t(0);
34
+
35
/**
	make a mask whose lowest pos bits are 1
	@param pos [in] number of low bits to set; must be less than 64 (checked by assert)
	@return 2^pos - 1
*/
inline uint64_t getMask(size_t pos)
{
	assert(pos < 64);
	const uint64_t topBit = uint64_t(1) << pos;
	return topBit - 1;
}
40
+
41
+ struct Encoding {
42
+ uint64_t v;
43
+ uint32_t len;
44
+ uint32_t rk;
45
+ Encoding(uint64_t v = 0, uint32_t len = 0)
46
+ : v(v)
47
+ , len(len)
48
+ , rk(len <= 64 ? cybozu::popcnt<uint64_t>(v) : v == 0 ? 0 : len) { }
49
+ bool operator<(const Encoding& rhs) const
50
+ {
51
+ if (len > rhs.len) return true;
52
+ if (len < rhs.len) return false;
53
+ return v > rhs.v;
54
+ }
55
+ };
56
+
57
/**
	read cursor over a bit sequence stored in an array of 64-bit words
	block_     : the words (not owned)
	bitSize_   : number of bits in the sequence
	blockSize_ : number of words needed for bitSize_ bits
	cur_       : current bit position
*/
struct InputStream {
	const uint64_t *block_;
	size_t bitSize_;
	size_t blockSize_;
	size_t cur_;
	InputStream(const uint64_t *block, size_t bitSize)
		: block_(block)
		, bitSize_(bitSize)
		, blockSize_((bitSize + 63) / 64)
		, cur_(0)
	{
	}
	/**
		return the 64 bits that start at bit position cur_ + offset; the cursor does not move
		words at index blockSize_ or above are read as 0
		@note bits of the last word beyond bitSize_ are returned as stored (no masking)
	*/
	uint64_t peek(size_t offset = 0) const
	{
		const size_t bitPos = cur_ + offset;
		const size_t idx = bitPos / 64;
		const size_t shift = bitPos % 64;
		if (idx >= blockSize_) return 0;
		const uint64_t lo = block_[idx];
		if (shift == 0) return lo;
		const uint64_t hi = (idx + 1 < blockSize_) ? block_[idx + 1] : 0;
		// an unsigned right shift already clears the top 'shift' bits, so lo needs no mask
		return (lo >> shift) | (hi << (64 - shift));
	}
	// advance the cursor by size bits; does nothing once the stream is empty
	void consume(size_t size)
	{
		if (empty()) return;
		cur_ += size;
	}
	// true when the cursor has reached or passed bitSize_
	bool empty() const { return bitSize_ <= cur_; }
};
82
+
83
+ struct Bigram {
84
+ struct Pair {
85
+ uint32_t prev;
86
+ uint32_t cur;
87
+ Pair(uint32_t prev = 0, uint32_t cur = 0) : prev(prev), cur(cur) {}
88
+ };
89
+ typedef std::multimap<uint32_t, Pair, std::greater<uint32_t> > PairMap;
90
+ const std::vector<Encoding>& encTbl_;
91
+ uint32_t tblNum;
92
+ std::vector<std::vector<uint32_t> > tbl;
93
+ size_t prev;
94
+ explicit Bigram(const std::vector<Encoding>& encTbl)
95
+ : encTbl_(encTbl)
96
+ , tblNum((uint32_t)encTbl.size())
97
+ , tbl()
98
+ , prev(tblNum) // first special value
99
+ {
100
+ tbl.resize(tblNum);
101
+ for (uint32_t i = 0; i < tblNum; i++) {
102
+ tbl[i].resize(tblNum);
103
+ }
104
+ }
105
+ // ~Bigram(){ put(); }
106
+ void append(uint32_t v)
107
+ {
108
+ if (v >= tblNum) throw cybozu::Exception("CSucVector:Bigram:bad v") << v;
109
+ if (prev == tblNum) {
110
+ prev = v;
111
+ return;
112
+ }
113
+ tbl[prev][v]++;
114
+ prev = v;
115
+ }
116
+ void getPairMap(PairMap& m) const
117
+ {
118
+ for (uint32_t i = 0; i < tblNum; i++) {
119
+ for (uint32_t j = 0; j < tblNum; j++) {
120
+ m.insert(PairMap::value_type(tbl[i][j], Pair(i, j)));
121
+ }
122
+ }
123
+ }
124
+ bool isAll1(uint64_t x, size_t len) const
125
+ {
126
+ if (len >= 64) {
127
+ return x == all1;
128
+ }
129
+ const uint64_t mask = getMask(len);
130
+ return (x & mask) == (all1 & mask);
131
+ }
132
+ bool concatPair(uint64_t& v, uint32_t& len, const Pair& pair) const
133
+ {
134
+ const uint64_t L = encTbl_[pair.prev].v;
135
+ const uint32_t Ln = encTbl_[pair.prev].len;
136
+ const uint64_t H = encTbl_[pair.cur].v;
137
+ const uint32_t Hn = encTbl_[pair.cur].len;
138
+ if (L == 0 && H == 0) {
139
+ v = 0;
140
+ len = Ln + Hn;
141
+ return true;
142
+ }
143
+ if (isAll1(L, Ln) && isAll1(H, Hn)) {
144
+ len = Ln + Hn;
145
+ v = len >= 64 ? all1 : getMask(len);
146
+ return true;
147
+ }
148
+ if (Ln + Hn <= 64) {
149
+ v = (H << Ln) | L;
150
+ len = Ln + Hn;
151
+ return true;
152
+ }
153
+ return false;
154
+ }
155
+ bool getTopEncoding(uint64_t& v, uint32_t& len) const
156
+ {
157
+ PairMap m;
158
+ getPairMap(m);
159
+ return concatPair(v, len, m.begin()->second);
160
+ }
161
+ void put() const
162
+ {
163
+ PairMap m;
164
+ getPairMap(m);
165
+ int n = 0;
166
+ for (PairMap::const_iterator i = m.begin(), ie = m.end(); i != ie; ++i) {
167
+ if (i->first > 0) {
168
+ printf("%u (%u, %u) ", i->first, i->second.prev, i->second.cur);
169
+ uint64_t v;
170
+ uint32_t len;
171
+ if (concatPair(v, len, i->second)) {
172
+ printf(" { 0x%llx, %u }\n", (long long)v, len);
173
+ } else {
174
+ printf("over prev=%u cur=%u\n", i->second.prev, i->second.cur);
175
+ }
176
+ n++;
177
+ if (n == 10) break;
178
+ }
179
+ }
180
+ }
181
+ private:
182
+ Bigram(const Bigram&);
183
+ void operator=(const Bigram&);
184
+ };
185
+
186
+ } // cybozu::csucvector_util
187
+
188
+ struct CSucVector {
189
#ifdef USE_CLK
	// clock counters used by get() and rank1(); present only when USE_CLK is defined
	mutable cybozu::CpuClock clkGet;
	mutable cybozu::CpuClock clkRank;
	// print getClock() divided by getCount() and the count itself; silent when the count is 0
	void putClkSub(const char *msg, const cybozu::CpuClock& clk) const
	{
		if (clk.getCount() != 0) {
			printf("%s:%6.2f %d\n", msg, clk.getClock() / double(clk.getCount()), clk.getCount());
		}
	}
	// print both counters followed by an empty line
	void putClk() const
	{
		putClkSub("get ", clkGet);
		putClkSub("rank ", clkRank);
		puts("");
	}
#endif
204
+
205
/*
	sampling point recorded by initBlockVec()
	orgPos : bit position in the original sequence where the symbol vec[vecPos] starts
	vecPos : index into vec
	rk     : sum of the rk of all symbols before vecPos
*/
struct Block {
	uint32_t orgPos;
	uint32_t vecPos;
	uint32_t rk;
	Block(uint32_t orgPos = 0, uint32_t vecPos = 0, uint32_t rk = 0)
		: orgPos(orgPos)
		, vecPos(vecPos)
		, rk(rk)
	{
	}
};
211
+ static const uint32_t skip = 1024;
212
+ typedef std::vector<Block> BlockVec;
213
+ typedef std::vector<csucvector_util::Encoding> EncodingTbl;
214
+ typedef std::vector<uint32_t> Vec32;
215
+ typedef std::vector<uint8_t> Vec8;
216
+ EncodingTbl encTbl;
217
+ uint32_t bitSize_;
218
+ Vec8 vec;
219
+ BlockVec blkVec;
220
+ uint32_t rk_;
221
+ Vec32 freqTbl;
222
+
223
+ struct OutputStream {
224
+ Vec32& freqTbl; // output
225
+ Vec8& vec; // output
226
+ uint32_t& rk; // output
227
+ csucvector_util::Bigram bi; // output
228
+ const EncodingTbl& encTbl; // in
229
+ OutputStream(Vec32& freqTbl, Vec8& vec, uint32_t& rk, const uint64_t *buf, uint32_t bitSize, const EncodingTbl& encTbl)
230
+ : freqTbl(freqTbl)
231
+ , vec(vec)
232
+ , rk(rk)
233
+ , bi(encTbl)
234
+ , encTbl(encTbl)
235
+ {
236
+ csucvector_util::InputStream is(buf, bitSize);
237
+ freqTbl.clear();
238
+ freqTbl.resize(encTbl.size());
239
+ vec.clear();
240
+ rk = 0;
241
+ for (;;) {
242
+ uint32_t s = append(is);
243
+ is.consume(s);
244
+ if (is.empty()) break;
245
+ }
246
+ printf("bitSize=%u\n",bitSize);
247
+ }
248
+ uint32_t append(const csucvector_util::InputStream& is)
249
+ {
250
+ uint64_t v = is.peek();
251
+ for (size_t i = 0; i < encTbl.size(); i++) {
252
+ const uint32_t len = encTbl[i].len;
253
+ bool found = false;
254
+ if (len >= 64) {
255
+ const size_t q = len / 64;
256
+ const size_t r = len % 64;
257
+ const uint64_t target = encTbl[i].v;
258
+ if (v == target) {
259
+ found = true;
260
+ for (size_t j = 1; j < q; j++) {
261
+ if (is.peek(j * 64) != target) {
262
+ found = false;
263
+ break;
264
+ }
265
+ }
266
+ if (found && r > 0) {
267
+ const uint64_t mask = csucvector_util::getMask(r);
268
+ if ((is.peek(q * 64) & mask) != (target & mask)) {
269
+ found = false;
270
+ }
271
+ }
272
+ }
273
+ } else {
274
+ const uint64_t mask = csucvector_util::getMask(len);
275
+ found = (v & mask) == encTbl[i].v;
276
+ }
277
+ if (found) {
278
+ bi.append((uint8_t)i);
279
+ freqTbl[i]++;
280
+ rk += encTbl[i].rk;
281
+ vec.push_back(uint8_t(i));
282
+ return len;
283
+ }
284
+ }
285
+ printf("NOT HERE!!! in debug mode\n");
286
+ for (size_t i = 0; i < 4; i++) {
287
+ printf("of=%d %llx\n", (int)i, (long long)is.peek(i * 64));
288
+ }
289
+ exit(1);
290
+ }
291
+ };
292
+ void initTable()
293
+ {
294
+ static const struct {
295
+ uint64_t v;
296
+ uint32_t len;
297
+ } tbl[] = {
298
+ #if 1
299
+ { 0x0, 16384 },
300
+ { 0xffffffffffffffff, 8192 },
301
+ { 0x0, 8192 },
302
+ { 0xffffffffffffffff, 4096 },
303
+ { 0x0, 4096 },
304
+ { 0xffffffffffffffff, 2048 },
305
+ { 0x0, 2048 },
306
+ { 0xffffffffffffffff, 1024 },
307
+ { 0x0, 1024 },
308
+ { 0xffffffffffffffff, 512 },
309
+ { 0x0, 512 },
310
+ { 0x0, 384 },
311
+ { 0xffffffffffffffff, 256 },
312
+ { 0x0, 256 },
313
+ { 0x0, 224 },
314
+ { 0xffffffffffffffff, 192 },
315
+ { 0xffffffffffffffff, 128 },
316
+ { 0x0, 128 },
317
+ { 0x0, 96 },
318
+ { 0x0, 85 },
319
+ { 0xffffffffffffffff, 64 },
320
+ { 0x0, 64 },
321
+ { 0x1fffffffffffff, 53 },
322
+ { 0x0, 53 },
323
+ { 0x3fffffffffff, 46 },
324
+ { 0x0, 46 },
325
+ { 0x7fffffffff, 39 },
326
+ { 0x4000000, 35 },
327
+ { 0x2000000, 35 },
328
+ { 0x1000000, 35 },
329
+ { 0x800000, 35 },
330
+ { 0x400000, 35 },
331
+ { 0x200000, 35 },
332
+ { 0xffffffff, 32 },
333
+ { 0x0, 32 },
334
+ { 0xfffffff, 28 },
335
+ { 0x8000000, 28 },
336
+ { 0x7ffffff, 28 },
337
+ { 0x4000000, 28 },
338
+ { 0x0, 28 },
339
+ { 0x1fffff, 21 },
340
+ { 0x1fff7f, 21 },
341
+ { 0x1dffff, 21 },
342
+ { 0x1bffff, 21 },
343
+ { 0x180000, 21 },
344
+ { 0x17ffff, 21 },
345
+ { 0x100000, 21 },
346
+ { 0xfffff, 21 },
347
+ { 0x80000, 21 },
348
+ { 0x40000, 21 },
349
+ { 0x20000, 21 },
350
+ { 0x10000, 21 },
351
+ { 0x8000, 21 },
352
+ { 0x4000, 21 },
353
+ { 0x81, 21 },
354
+ { 0x0, 21 },
355
+ { 0x3fff, 14 },
356
+ { 0x3ffe, 14 },
357
+ { 0x3f7f, 14 },
358
+ { 0x3eff, 14 },
359
+ { 0x3dff, 14 },
360
+ { 0x3c00, 14 },
361
+ { 0x3bff, 14 },
362
+ { 0x3800, 14 },
363
+ { 0x37ff, 14 },
364
+ { 0x3000, 14 },
365
+ { 0x2fff, 14 },
366
+ { 0x2800, 14 },
367
+ { 0x2400, 14 },
368
+ { 0x2200, 14 },
369
+ { 0x2100, 14 },
370
+ { 0x2080, 14 },
371
+ { 0x2040, 14 },
372
+ { 0x2020, 14 },
373
+ { 0x2010, 14 },
374
+ { 0x2008, 14 },
375
+ { 0x2004, 14 },
376
+ { 0x2002, 14 },
377
+ { 0x2001, 14 },
378
+ { 0x2000, 14 },
379
+ { 0x1fff, 14 },
380
+ { 0x1800, 14 },
381
+ { 0x1400, 14 },
382
+ { 0x1200, 14 },
383
+ { 0x1100, 14 },
384
+ { 0x1080, 14 },
385
+ { 0x1020, 14 },
386
+ { 0x1010, 14 },
387
+ { 0x1008, 14 },
388
+ { 0x1004, 14 },
389
+ { 0x1002, 14 },
390
+ { 0x1001, 14 },
391
+ { 0x1000, 14 },
392
+ { 0xfff, 14 },
393
+ { 0xc00, 14 },
394
+ { 0xa00, 14 },
395
+ { 0x900, 14 },
396
+ { 0x880, 14 },
397
+ { 0x808, 14 },
398
+ { 0x804, 14 },
399
+ { 0x802, 14 },
400
+ { 0x801, 14 },
401
+ { 0x800, 14 },
402
+ { 0x7ff, 14 },
403
+ { 0x600, 14 },
404
+ { 0x500, 14 },
405
+ { 0x480, 14 },
406
+ { 0x401, 14 },
407
+ { 0x400, 14 },
408
+ { 0x300, 14 },
409
+ { 0x280, 14 },
410
+ { 0x208, 14 },
411
+ { 0x202, 14 },
412
+ { 0x201, 14 },
413
+ { 0x200, 14 },
414
+ { 0x180, 14 },
415
+ { 0x102, 14 },
416
+ { 0x101, 14 },
417
+ { 0x100, 14 },
418
+ { 0x80, 14 },
419
+ { 0x40, 14 },
420
+ { 0x20, 14 },
421
+ { 0x10, 14 },
422
+ { 0x8, 14 },
423
+ { 0x4, 14 },
424
+ { 0x2, 14 },
425
+ { 0x1, 14 },
426
+ { 0x0, 14 },
427
+ #else
428
+ { 0, 64 * 32 },
429
+ { uint64_t(-1), 64 * 16 },
430
+ { uint64_t(-1), 256 },
431
+ { 0, 256 },
432
+ { 0, 32 },
433
+ { 0xffffffff, 32 },
434
+ #endif
435
+ };
436
+ encTbl.clear();
437
+ for (size_t i = 0; i < CYBOZU_NUM_OF_ARRAY(tbl); i++) {
438
+ encTbl.push_back(csucvector_util::Encoding(tbl[i].v, tbl[i].len));
439
+ }
440
+ for (int i = 0; i < 128; i++) {
441
+ encTbl.push_back(csucvector_util::Encoding(i, 7));
442
+ }
443
+ std::sort(encTbl.begin(), encTbl.end());
444
+ if (encTbl.size() > csucvector_util::maxTblSize) {
445
+ throw cybozu::Exception("CSucVector:initTable:bad size") << encTbl.size();
446
+ }
447
+ }
448
+
449
+ CSucVector() { clear(); }
450
+ ~CSucVector()
451
+ {
452
+ // put();
453
+ #ifdef USE_CLK
454
+ putClk();
455
+ #endif
456
+ }
457
+ CSucVector(const uint64_t *buf, uint64_t bitSize)
458
+ {
459
+ clear();
460
+ init(buf, bitSize);
461
+ }
462
+ void clear()
463
+ {
464
+ bitSize_ = 0;
465
+ rk_ = 0;
466
+ }
467
+ void init(const uint64_t *buf, uint64_t bitSize)
468
+ {
469
+ if (bitSize >= (uint64_t(1) << 32)) throw cybozu::Exception("CSucVector:init:big bitSize") << bitSize;
470
+ bitSize_ = (uint32_t)bitSize;
471
+ initTable();
472
+ for (;;) {
473
+ OutputStream os(freqTbl, vec, rk_, buf, bitSize_, encTbl);
474
+ // os.bi.put();
475
+ if (encTbl.size() == csucvector_util::maxTblSize) break;
476
+ uint64_t v;
477
+ uint32_t len;
478
+ if (!os.bi.getTopEncoding(v, len)) {
479
+ printf("ERR getTopEncoding\n");
480
+ os.bi.put();
481
+ putEncTbl();
482
+ exit(1);
483
+ }
484
+ printf("append v=%llx, len=%u tblSize=%u\n", (long long)v, len, (uint32_t)encTbl.size());
485
+ encTbl.push_back(csucvector_util::Encoding(v, len));
486
+ std::sort(encTbl.begin(), encTbl.end());
487
+ }
488
+ // putEncTbl();
489
+ initBlockVec();
490
+ }
491
+ void initBlockVec()
492
+ {
493
+ blkVec.reserve(bitSize_ / skip + 16);
494
+ uint32_t orgPos = 0;
495
+ uint32_t rk = 0;
496
+ uint32_t samplingPos = 0;
497
+ for (size_t vecPos = 0, n = vec.size(); vecPos < n; vecPos++) {
498
+ uint8_t v = vec[vecPos];
499
+ uint32_t next = orgPos + encTbl[v].len;
500
+
501
+ while (samplingPos < next) {
502
+ blkVec.push_back(Block(orgPos, (uint32_t)vecPos, rk));
503
+ samplingPos += skip;
504
+ }
505
+ orgPos = next;
506
+ rk += encTbl[v].rk;
507
+ }
508
+ }
509
+ void putEncTbl() const
510
+ {
511
+ for (size_t i = 0; i < encTbl.size(); i++) {
512
+ printf("%2d : { 0x%llx, %u },\n", (int)i, (long long)encTbl[i].v, encTbl[i].len);
513
+ }
514
+ }
515
+ void putSub() const
516
+ {
517
+ const uint32_t inSize = bitSize_ / 8;
518
+ if (inSize == 0) return;
519
+ const uint32_t compSize = (uint32_t)vec.size();
520
+ const uint32_t idxSize = (uint32_t)(blkVec.size() * sizeof(blkVec[0]));
521
+ const double cr = compSize * 100.0 / inSize;
522
+ const double ir = idxSize * 100.0 / inSize;
523
+ printf("in Size= %9d, rank=%u\n", inSize, rk_);
524
+ printf("comp Size= %9u\n", compSize);
525
+ printf("idx Size= %9u(blkVec.size=%7u)\n", idxSize, (uint32_t)blkVec.size());
526
+ printf("totalSize= %9u\n", compSize + idxSize);
527
+ printf("rate=%5.2f%%(%5.2f%% + %5.2f%%)\n", cr + ir, cr, ir);
528
+ }
529
+ void put() const
530
+ {
531
+ putSub();
532
+ if (freqTbl.empty()) return;
533
+ const uint32_t compSize = (uint32_t)vec.size();
534
+ for (size_t i = 0; i < freqTbl.size(); i++) {
535
+ printf("freqTbl[%2d] = %8d(%5.2f%%, %5.2f%%)\n", (int)i, freqTbl[i], freqTbl[i] * 100.0 / compSize, freqTbl[i] * encTbl[i].len * 100.0 / bitSize_);
536
+ }
537
+ }
538
+ bool get(size_t pos) const
539
+ {
540
+ if (pos >= bitSize_) throw cybozu::Exception("CSucVector:get:bad pos") << pos;
541
+ #ifdef USE_CLK
542
+ clkGet.begin();
543
+ #endif
544
+ const uint32_t cur = blkVec[pos / skip].orgPos;
545
+ uint32_t vecPos = blkVec[pos / skip].vecPos;
546
+ pos -= cur;
547
+ uint8_t v;
548
+ for (;;) {
549
+ v = vec[vecPos++];
550
+ uint32_t len = encTbl[v].len;
551
+ if (len > pos) break;
552
+ pos -= len;
553
+ }
554
+ const bool b = (pos >= 64) ? encTbl[v].v != 0 : (encTbl[v].v & (size_t(1) << pos)) != 0;
555
+ #ifdef USE_CLK
556
+ clkGet.end();
557
+ #endif
558
+ return b;
559
+ }
560
+ size_t rank1(size_t pos) const
561
+ {
562
+ if (pos >= bitSize_) return rk_;
563
+ #ifdef USE_CLK
564
+ clkRank.begin();
565
+ #endif
566
+ const uint32_t cur = blkVec[pos / skip].orgPos;
567
+ uint32_t vecPos = blkVec[pos / skip].vecPos;
568
+ size_t rk = blkVec[pos / skip].rk;
569
+ pos -= cur;
570
+ uint8_t v;
571
+ for (;;) {
572
+ v = vec[vecPos++];
573
+ size_t len = encTbl[v].len;
574
+ if (len > pos) break;
575
+ pos -= len;
576
+ rk += encTbl[v].rk;
577
+ }
578
+ size_t adj = 0;
579
+ if (pos >= 64) {
580
+ if (encTbl[v].v != 0) adj = pos;
581
+ } else {
582
+ uint64_t x = encTbl[v].v & csucvector_util::getMask(pos);
583
+ adj = cybozu::popcnt<uint64_t>(x);
584
+ }
585
+ rk += adj;
586
+ #ifdef USE_CLK
587
+ clkRank.end();
588
+ #endif
589
+ return rk;
590
+ }
591
+ size_t rank0(size_t pos) const
592
+ {
593
+ return pos - rank1(pos);
594
+ }
595
+ size_t rank(bool b, size_t pos) const
596
+ {
597
+ if (b) return rank1(pos);
598
+ return rank0(pos);
599
+ }
600
+ template<class OutputStream>
601
+ void save(OutputStream& os) const
602
+ {
603
+ cybozu::save(os, bitSize_);
604
+ cybozu::savePodVec(os, vec);
605
+ cybozu::savePodVec(os, blkVec);
606
+ cybozu::save(os, rk_);
607
+ cybozu::savePodVec(os, encTbl);
608
+ }
609
+ template<class InputStream>
610
+ void load(InputStream& is)
611
+ {
612
+ cybozu::load(bitSize_, is);
613
+ cybozu::loadPodVec(vec, is);
614
+ cybozu::loadPodVec(blkVec, is);
615
+ cybozu::load(rk_, is);
616
+ cybozu::loadPodVec(encTbl, is);
617
+ }
618
+ };
619
+
620
+ } // cybozu
621
+
622
// restore the warning state; guarded by the same macro as the matching push
#ifdef _MSC_VER
	#pragma warning(pop)
#endif