ooxml_crypt 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (264) hide show
  1. checksums.yaml +7 -0
  2. data/Gemfile +4 -0
  3. data/LICENSE.txt +21 -0
  4. data/README.md +58 -0
  5. data/Rakefile +12 -0
  6. data/bin/console +15 -0
  7. data/bin/setup +8 -0
  8. data/ext/ooxml_crypt/extconf.rb +18 -0
  9. data/ext/ooxml_crypt/ooxml_crypt.c +27 -0
  10. data/ext/ooxml_crypt/ooxml_crypt.h +7 -0
  11. data/lib/ooxml_crypt/version.rb +5 -0
  12. data/lib/ooxml_crypt.rb +75 -0
  13. data/vendor/cybozulib/.github/workflows/main.yml +12 -0
  14. data/vendor/cybozulib/.gitignore +5 -0
  15. data/vendor/cybozulib/CMakeLists.txt +6 -0
  16. data/vendor/cybozulib/COPYRIGHT +27 -0
  17. data/vendor/cybozulib/Makefile +26 -0
  18. data/vendor/cybozulib/bin/libeay32.dll +0 -0
  19. data/vendor/cybozulib/bin/libmecab.dll +0 -0
  20. data/vendor/cybozulib/bin/ssleay32.dll +0 -0
  21. data/vendor/cybozulib/common.mk +116 -0
  22. data/vendor/cybozulib/common.props +25 -0
  23. data/vendor/cybozulib/cybozulib.sln +286 -0
  24. data/vendor/cybozulib/debug.props +14 -0
  25. data/vendor/cybozulib/include/cybozu/array.hpp +197 -0
  26. data/vendor/cybozulib/include/cybozu/atoi.hpp +238 -0
  27. data/vendor/cybozulib/include/cybozu/atomic.hpp +146 -0
  28. data/vendor/cybozulib/include/cybozu/base64.hpp +210 -0
  29. data/vendor/cybozulib/include/cybozu/benchmark.hpp +212 -0
  30. data/vendor/cybozulib/include/cybozu/bfd.hpp +105 -0
  31. data/vendor/cybozulib/include/cybozu/bit_operation.hpp +139 -0
  32. data/vendor/cybozulib/include/cybozu/bitvector.hpp +358 -0
  33. data/vendor/cybozulib/include/cybozu/condition_variable.hpp +113 -0
  34. data/vendor/cybozulib/include/cybozu/condition_variable_cs.hpp +74 -0
  35. data/vendor/cybozulib/include/cybozu/config.hpp +392 -0
  36. data/vendor/cybozulib/include/cybozu/critical_section.hpp +60 -0
  37. data/vendor/cybozulib/include/cybozu/crypto.hpp +321 -0
  38. data/vendor/cybozulib/include/cybozu/csucvector.hpp +624 -0
  39. data/vendor/cybozulib/include/cybozu/csv.hpp +294 -0
  40. data/vendor/cybozulib/include/cybozu/data_type.hpp +27 -0
  41. data/vendor/cybozulib/include/cybozu/endian.hpp +224 -0
  42. data/vendor/cybozulib/include/cybozu/env.hpp +63 -0
  43. data/vendor/cybozulib/include/cybozu/event.hpp +122 -0
  44. data/vendor/cybozulib/include/cybozu/exception.hpp +253 -0
  45. data/vendor/cybozulib/include/cybozu/file.hpp +626 -0
  46. data/vendor/cybozulib/include/cybozu/fmindex.hpp +291 -0
  47. data/vendor/cybozulib/include/cybozu/format.hpp +93 -0
  48. data/vendor/cybozulib/include/cybozu/frequency.hpp +264 -0
  49. data/vendor/cybozulib/include/cybozu/hash.hpp +67 -0
  50. data/vendor/cybozulib/include/cybozu/inttype.hpp +174 -0
  51. data/vendor/cybozulib/include/cybozu/itoa.hpp +336 -0
  52. data/vendor/cybozulib/include/cybozu/json.hpp +120 -0
  53. data/vendor/cybozulib/include/cybozu/line_stream.hpp +149 -0
  54. data/vendor/cybozulib/include/cybozu/link_libeay32.hpp +21 -0
  55. data/vendor/cybozulib/include/cybozu/link_mpir.hpp +18 -0
  56. data/vendor/cybozulib/include/cybozu/link_ssleay32.hpp +19 -0
  57. data/vendor/cybozulib/include/cybozu/log.hpp +237 -0
  58. data/vendor/cybozulib/include/cybozu/minixml.hpp +452 -0
  59. data/vendor/cybozulib/include/cybozu/mmap.hpp +143 -0
  60. data/vendor/cybozulib/include/cybozu/mutex.hpp +144 -0
  61. data/vendor/cybozulib/include/cybozu/nlp/mecab.hpp +96 -0
  62. data/vendor/cybozulib/include/cybozu/nlp/plsi.hpp +315 -0
  63. data/vendor/cybozulib/include/cybozu/nlp/random.hpp +74 -0
  64. data/vendor/cybozulib/include/cybozu/nlp/sparse.hpp +529 -0
  65. data/vendor/cybozulib/include/cybozu/nlp/svd.hpp +486 -0
  66. data/vendor/cybozulib/include/cybozu/nlp/tfidf.hpp +226 -0
  67. data/vendor/cybozulib/include/cybozu/nlp/top_score.hpp +75 -0
  68. data/vendor/cybozulib/include/cybozu/option.hpp +743 -0
  69. data/vendor/cybozulib/include/cybozu/parallel.hpp +88 -0
  70. data/vendor/cybozulib/include/cybozu/pcg.hpp +72 -0
  71. data/vendor/cybozulib/include/cybozu/process.hpp +324 -0
  72. data/vendor/cybozulib/include/cybozu/quit_signal_handler.hpp +66 -0
  73. data/vendor/cybozulib/include/cybozu/random_generator.hpp +144 -0
  74. data/vendor/cybozulib/include/cybozu/regex.hpp +463 -0
  75. data/vendor/cybozulib/include/cybozu/select8.hpp +279 -0
  76. data/vendor/cybozulib/include/cybozu/serializer.hpp +363 -0
  77. data/vendor/cybozulib/include/cybozu/sha1.hpp +209 -0
  78. data/vendor/cybozulib/include/cybozu/sha2.hpp +506 -0
  79. data/vendor/cybozulib/include/cybozu/siphash.hpp +105 -0
  80. data/vendor/cybozulib/include/cybozu/socket.hpp +785 -0
  81. data/vendor/cybozulib/include/cybozu/ssl.hpp +203 -0
  82. data/vendor/cybozulib/include/cybozu/stacktrace.hpp +291 -0
  83. data/vendor/cybozulib/include/cybozu/stream.hpp +269 -0
  84. data/vendor/cybozulib/include/cybozu/string.hpp +1746 -0
  85. data/vendor/cybozulib/include/cybozu/string_operation.hpp +365 -0
  86. data/vendor/cybozulib/include/cybozu/sucvector.hpp +378 -0
  87. data/vendor/cybozulib/include/cybozu/test.hpp +373 -0
  88. data/vendor/cybozulib/include/cybozu/thread.hpp +229 -0
  89. data/vendor/cybozulib/include/cybozu/time.hpp +281 -0
  90. data/vendor/cybozulib/include/cybozu/tls.hpp +115 -0
  91. data/vendor/cybozulib/include/cybozu/unordered_map.hpp +13 -0
  92. data/vendor/cybozulib/include/cybozu/unordered_set.hpp +13 -0
  93. data/vendor/cybozulib/include/cybozu/v128.hpp +376 -0
  94. data/vendor/cybozulib/include/cybozu/wavelet_matrix.hpp +345 -0
  95. data/vendor/cybozulib/include/cybozu/xorshift.hpp +189 -0
  96. data/vendor/cybozulib/include/cybozu/zlib.hpp +325 -0
  97. data/vendor/cybozulib/include/sais.hxx +364 -0
  98. data/vendor/cybozulib/misc/make_select8tbl.cpp +26 -0
  99. data/vendor/cybozulib/mk.bat +37 -0
  100. data/vendor/cybozulib/readme.md +29 -0
  101. data/vendor/cybozulib/release.props +12 -0
  102. data/vendor/cybozulib/sample/Makefile +30 -0
  103. data/vendor/cybozulib/sample/csucvector_smpl.cpp +42 -0
  104. data/vendor/cybozulib/sample/data/svd/org/test1.S +4 -0
  105. data/vendor/cybozulib/sample/data/svd/org/test1.U +4 -0
  106. data/vendor/cybozulib/sample/data/svd/org/test1.V +6 -0
  107. data/vendor/cybozulib/sample/data/svd/test1 +4 -0
  108. data/vendor/cybozulib/sample/data/svd/test2 +4 -0
  109. data/vendor/cybozulib/sample/desymbol.cpp +127 -0
  110. data/vendor/cybozulib/sample/exception_smpl.cpp +46 -0
  111. data/vendor/cybozulib/sample/fmindex_smpl.cpp +231 -0
  112. data/vendor/cybozulib/sample/log_smpl.cpp +19 -0
  113. data/vendor/cybozulib/sample/mecab_smpl.cpp +37 -0
  114. data/vendor/cybozulib/sample/option2_smpl.cpp +68 -0
  115. data/vendor/cybozulib/sample/option_smpl.cpp +42 -0
  116. data/vendor/cybozulib/sample/plsi_smpl.cpp +207 -0
  117. data/vendor/cybozulib/sample/proj/exception_smpl.vcproj +184 -0
  118. data/vendor/cybozulib/sample/proj/mecab_smpl.vcproj +184 -0
  119. data/vendor/cybozulib/sample/proj/ssl_smpl/ssl_smpl.vcxproj +85 -0
  120. data/vendor/cybozulib/sample/proj/ssl_smpl.vcproj +347 -0
  121. data/vendor/cybozulib/sample/proj/stacktrace_smpl/stacktrace_smpl.vcxproj +85 -0
  122. data/vendor/cybozulib/sample/proj/svd_smpl.vcproj +184 -0
  123. data/vendor/cybozulib/sample/quit_signal_handler.cpp +30 -0
  124. data/vendor/cybozulib/sample/serializer_smpl.cpp +196 -0
  125. data/vendor/cybozulib/sample/socket_smpl.cpp +82 -0
  126. data/vendor/cybozulib/sample/ssl_smpl.cpp +39 -0
  127. data/vendor/cybozulib/sample/stacktrace_smpl.cpp +52 -0
  128. data/vendor/cybozulib/sample/svd_bench_smpl.cpp +143 -0
  129. data/vendor/cybozulib/sample/svd_smpl.cpp +94 -0
  130. data/vendor/cybozulib/sample/wm_bench_smpl.cpp +182 -0
  131. data/vendor/cybozulib/sample/zlib_smpl.cpp +41 -0
  132. data/vendor/cybozulib/src/Makefile +8 -0
  133. data/vendor/cybozulib/src/base/Makefile +19 -0
  134. data/vendor/cybozulib/test/Makefile +12 -0
  135. data/vendor/cybozulib/test/base/Makefile +37 -0
  136. data/vendor/cybozulib/test/base/array_test.cpp +173 -0
  137. data/vendor/cybozulib/test/base/atoi_test.cpp +774 -0
  138. data/vendor/cybozulib/test/base/atomic_test.cpp +49 -0
  139. data/vendor/cybozulib/test/base/base64_test.cpp +113 -0
  140. data/vendor/cybozulib/test/base/bit_operation_test.cpp +134 -0
  141. data/vendor/cybozulib/test/base/bitvector_test.cpp +204 -0
  142. data/vendor/cybozulib/test/base/condition_variable_cs_test.cpp +92 -0
  143. data/vendor/cybozulib/test/base/condition_variable_test.cpp +88 -0
  144. data/vendor/cybozulib/test/base/config_test.cpp +236 -0
  145. data/vendor/cybozulib/test/base/crypto_test.cpp +122 -0
  146. data/vendor/cybozulib/test/base/csucvector_test.cpp +63 -0
  147. data/vendor/cybozulib/test/base/csv_test.cpp +182 -0
  148. data/vendor/cybozulib/test/base/data/a.xml +26 -0
  149. data/vendor/cybozulib/test/base/endian_test.cpp +56 -0
  150. data/vendor/cybozulib/test/base/env_test.cpp +22 -0
  151. data/vendor/cybozulib/test/base/event_test.cpp +41 -0
  152. data/vendor/cybozulib/test/base/file_test.cpp +233 -0
  153. data/vendor/cybozulib/test/base/fmindex_test.cpp +118 -0
  154. data/vendor/cybozulib/test/base/format_test.cpp +12 -0
  155. data/vendor/cybozulib/test/base/frequency_test.cpp +104 -0
  156. data/vendor/cybozulib/test/base/itoa_test.cpp +522 -0
  157. data/vendor/cybozulib/test/base/line_stream_test.cpp +208 -0
  158. data/vendor/cybozulib/test/base/mecab_test.cpp +41 -0
  159. data/vendor/cybozulib/test/base/minixml_test.cpp +103 -0
  160. data/vendor/cybozulib/test/base/mmap_test.cpp +15 -0
  161. data/vendor/cybozulib/test/base/option_test.cpp +487 -0
  162. data/vendor/cybozulib/test/base/parallel_test.cpp +48 -0
  163. data/vendor/cybozulib/test/base/proj/array_test/array_test.vcxproj +86 -0
  164. data/vendor/cybozulib/test/base/proj/atoi_test/atoi_test.vcxproj +86 -0
  165. data/vendor/cybozulib/test/base/proj/atomic_test/atomic_test.vcxproj +86 -0
  166. data/vendor/cybozulib/test/base/proj/base64_test/base64_test.vcxproj +86 -0
  167. data/vendor/cybozulib/test/base/proj/condition_variable_cs_test/condition_variable_cs_test.vcxproj +86 -0
  168. data/vendor/cybozulib/test/base/proj/condition_variable_test/condition_variable_test.vcxproj +86 -0
  169. data/vendor/cybozulib/test/base/proj/config_test/config_test.vcxproj +86 -0
  170. data/vendor/cybozulib/test/base/proj/csv_test/csv_test.vcxproj +86 -0
  171. data/vendor/cybozulib/test/base/proj/endian_test/endian_test.vcxproj +86 -0
  172. data/vendor/cybozulib/test/base/proj/env_test/env_test.vcxproj +86 -0
  173. data/vendor/cybozulib/test/base/proj/event_test/event_test.vcxproj +86 -0
  174. data/vendor/cybozulib/test/base/proj/file_test/file_test.vcxproj +86 -0
  175. data/vendor/cybozulib/test/base/proj/itoa_test/itoa_test.vcxproj +86 -0
  176. data/vendor/cybozulib/test/base/proj/mecab_test/mecab_test.vcxproj +88 -0
  177. data/vendor/cybozulib/test/base/proj/minixml_test/minixml_test.vcxproj +86 -0
  178. data/vendor/cybozulib/test/base/proj/mmap_test/mmap_test.vcxproj +86 -0
  179. data/vendor/cybozulib/test/base/proj/serializer_test/serializer_test.vcxproj +86 -0
  180. data/vendor/cybozulib/test/base/proj/sha1_test/sha1_test.vcxproj +86 -0
  181. data/vendor/cybozulib/test/base/proj/stream_test/stream_test.vcxproj +86 -0
  182. data/vendor/cybozulib/test/base/proj/string_operation_test/string_operation_test.vcxproj +86 -0
  183. data/vendor/cybozulib/test/base/proj/string_test/string_test.vcxproj +86 -0
  184. data/vendor/cybozulib/test/base/proj/thread_test/thread_test.vcxproj +86 -0
  185. data/vendor/cybozulib/test/base/proj/time_test/time_test.vcxproj +86 -0
  186. data/vendor/cybozulib/test/base/proj/tls_test/tls_test.vcxproj +86 -0
  187. data/vendor/cybozulib/test/base/proj/zlib_test/zlib_test.vcxproj +86 -0
  188. data/vendor/cybozulib/test/base/random_generator_test.cpp +28 -0
  189. data/vendor/cybozulib/test/base/regex_test.cpp +74 -0
  190. data/vendor/cybozulib/test/base/serializer_test.cpp +483 -0
  191. data/vendor/cybozulib/test/base/sha1_test.cpp +61 -0
  192. data/vendor/cybozulib/test/base/sha2_test.cpp +191 -0
  193. data/vendor/cybozulib/test/base/siphash_test.cpp +33 -0
  194. data/vendor/cybozulib/test/base/socket_test.cpp +76 -0
  195. data/vendor/cybozulib/test/base/stream_test.cpp +101 -0
  196. data/vendor/cybozulib/test/base/string_operation_test.cpp +340 -0
  197. data/vendor/cybozulib/test/base/string_test.cpp +1705 -0
  198. data/vendor/cybozulib/test/base/sucvector_test.cpp +312 -0
  199. data/vendor/cybozulib/test/base/thread_test.cpp +62 -0
  200. data/vendor/cybozulib/test/base/time_test.cpp +164 -0
  201. data/vendor/cybozulib/test/base/tls_test.cpp +50 -0
  202. data/vendor/cybozulib/test/base/wavelet_matrix_test.cpp +145 -0
  203. data/vendor/cybozulib/test/base/zlib_test.cpp +371 -0
  204. data/vendor/cybozulib/test/nlp/Makefile +27 -0
  205. data/vendor/cybozulib/test/nlp/proj/random_test.vcproj +184 -0
  206. data/vendor/cybozulib/test/nlp/proj/sparse_test.vcproj +184 -0
  207. data/vendor/cybozulib/test/nlp/proj/svd_test.vcproj +184 -0
  208. data/vendor/cybozulib/test/nlp/random_test.cpp +62 -0
  209. data/vendor/cybozulib/test/nlp/sparse_test.cpp +347 -0
  210. data/vendor/cybozulib/test/nlp/svd_test.cpp +234 -0
  211. data/vendor/cybozulib/test/nlp/top_score_test.cpp +40 -0
  212. data/vendor/cybozulib/tool/create_vcproj.py +186 -0
  213. data/vendor/cybozulib/tool/vcproj_tmpl.py +185 -0
  214. data/vendor/msoffice/COPYRIGHT +27 -0
  215. data/vendor/msoffice/Makefile +29 -0
  216. data/vendor/msoffice/bin/64/msoc.dll +0 -0
  217. data/vendor/msoffice/bin/64/msocsample.exe +0 -0
  218. data/vendor/msoffice/bin/64/msoffice-crypt.exe +0 -0
  219. data/vendor/msoffice/bin/msoc.dll +0 -0
  220. data/vendor/msoffice/bin/msocsample.exe +0 -0
  221. data/vendor/msoffice/bin/msoffice-crypt.exe +0 -0
  222. data/vendor/msoffice/common.mk +71 -0
  223. data/vendor/msoffice/common.props +26 -0
  224. data/vendor/msoffice/debug.props +14 -0
  225. data/vendor/msoffice/include/attack.hpp +211 -0
  226. data/vendor/msoffice/include/cfb.hpp +777 -0
  227. data/vendor/msoffice/include/crypto_util.hpp +450 -0
  228. data/vendor/msoffice/include/custom_sha1.hpp +342 -0
  229. data/vendor/msoffice/include/decode.hpp +240 -0
  230. data/vendor/msoffice/include/encode.hpp +221 -0
  231. data/vendor/msoffice/include/make_dataspace.hpp +316 -0
  232. data/vendor/msoffice/include/msoc.h +129 -0
  233. data/vendor/msoffice/include/resource.hpp +7 -0
  234. data/vendor/msoffice/include/standard_encryption.hpp +145 -0
  235. data/vendor/msoffice/include/uint32vec.hpp +179 -0
  236. data/vendor/msoffice/include/util.hpp +212 -0
  237. data/vendor/msoffice/lib/.emptydir +0 -0
  238. data/vendor/msoffice/misc/decrypt-xls.vbs +46 -0
  239. data/vendor/msoffice/mk.bat +1 -0
  240. data/vendor/msoffice/mkdll.bat +3 -0
  241. data/vendor/msoffice/msoc.def +13 -0
  242. data/vendor/msoffice/msocsample.py +178 -0
  243. data/vendor/msoffice/msoffice12.sln +31 -0
  244. data/vendor/msoffice/readme.md +110 -0
  245. data/vendor/msoffice/release.props +28 -0
  246. data/vendor/msoffice/src/Makefile +19 -0
  247. data/vendor/msoffice/src/attack.cpp +124 -0
  248. data/vendor/msoffice/src/cfb_test.cpp +77 -0
  249. data/vendor/msoffice/src/minisample.c +54 -0
  250. data/vendor/msoffice/src/msocdll.cpp +276 -0
  251. data/vendor/msoffice/src/msocsample.c +136 -0
  252. data/vendor/msoffice/src/msoffice-crypt.cpp +219 -0
  253. data/vendor/msoffice/src/proj/attack/attack.vcxproj +88 -0
  254. data/vendor/msoffice/src/proj/main/msoffice-crypt.vcxproj +88 -0
  255. data/vendor/msoffice/src/sha1.cpp +234 -0
  256. data/vendor/msoffice/test/Makefile +20 -0
  257. data/vendor/msoffice/test/cfb_test.cpp +74 -0
  258. data/vendor/msoffice/test/hash_test.cpp +59 -0
  259. data/vendor/msoffice/test/proj/cfb/cfb_test.vcxproj +90 -0
  260. data/vendor/msoffice/test/proj/hash/hash_test.vcxproj +90 -0
  261. data/vendor/msoffice/test/sampl.bat +8 -0
  262. data/vendor/msoffice/test_all.py +46 -0
  263. data/vendor/update +4 -0
  264. metadata +351 -0
@@ -0,0 +1,46 @@
1
+ #include <cybozu/exception.hpp>
2
+ #include <cybozu/string.hpp>
3
+ #include <cybozu/atoi.hpp>
4
+ #include <iostream>
5
+ #include <assert.h>
6
+ #include <cybozu/stacktrace.hpp>
7
+
8
+ struct MailException : cybozu::Exception {
9
+ MailException() : cybozu::Exception("mail") { }
10
+ };
11
+
12
+ void f2()
13
+ {
14
+ const char *msg = "HTTP/...";
15
+ std::string abc = "abc";
16
+ char c = 'x';
17
+ int port = 80;
18
+ unsigned int s = 90;
19
+ MailException e;
20
+ e << "can't send" << msg << abc << c << port << s << '\n';
21
+ cybozu::StackTrace st;
22
+ e << st;
23
+ throw e;
24
+ }
25
+
26
+ void f1()
27
+ {
28
+ f2();
29
+ }
30
+
31
+ void f0()
32
+ {
33
+ f1();
34
+ }
35
+
36
+ int main()
37
+ {
38
+ try {
39
+ f0();
40
+ } catch (cybozu::Exception &e) {
41
+ std::cout << "for user" << std::endl;
42
+ std::cout << e.toString() << std::endl;
43
+ } catch (...) {
44
+ std::cout << "Error!" << std::endl;
45
+ }
46
+ }
@@ -0,0 +1,231 @@
1
+ #include <fstream>
2
+ #include <cybozu/time.hpp>
3
+ #include <cybozu/fmindex.hpp>
4
+ #include <cybozu/mmap.hpp>
5
+ #include <cybozu/string.hpp>
6
+ #include <cybozu/hash.hpp>
7
+ #include <cybozu/benchmark.hpp>
8
+ #include <set>
9
+
10
+ #ifdef USE_UTF32
11
+ typedef cybozu::FMindexT<cybozu::Char> FMindex;
12
+ typedef cybozu::String String;
13
+ #else
14
+ typedef cybozu::FMindex FMindex;
15
+ typedef std::string String;
16
+ #endif
17
+
18
+ typedef std::set<int> Set;
19
+
20
+ void putSet(const Set& set)
21
+ {
22
+ for (Set::const_iterator i = set.begin(), ie = set.end(); i != ie; ++i) {
23
+ std::cout << *i << ' ';
24
+ }
25
+ std::cout << std::endl;
26
+ }
27
+
28
+ template<class STRING>
29
+ void simpleSearch(const std::string& inName, const std::string& queryFile, bool putHash)
30
+ {
31
+ cybozu::Mmap m(inName);
32
+ STRING text(m.get(), m.size());
33
+
34
+ double beginTime = cybozu::GetCurrentTimeSec();
35
+
36
+ std::ifstream qs(queryFile.c_str(), std::ios::binary);
37
+ STRING key;
38
+ uint64_t hash = 0;
39
+ while (qs >> key) {
40
+ if (!putHash) std::cout << "query " << key << std::endl;
41
+ size_t p = 0;
42
+ Set set;
43
+ for (;;) {
44
+ size_t q = text.find(key, p);
45
+ if (q == std::string::npos) break;
46
+ set.insert((int)q);
47
+ p = q + 1;
48
+ }
49
+ if (putHash) {
50
+ hash = cybozu::hash64(set.begin(), set.end(), hash);
51
+ } else {
52
+ putSet(set);
53
+ }
54
+ }
55
+ if (putHash) printf("hash=%llx\n", (long long)hash);
56
+
57
+ double endTime = cybozu::GetCurrentTimeSec();
58
+ fprintf(stderr, "time: %gsec\n", endTime - beginTime);
59
+ }
60
+
61
+ template<class FMINDEX, class STRING>
62
+ void recover(const std::string& inName, const std::string& outName)
63
+ {
64
+ std::ifstream is(inName.c_str(), std::ios::binary);
65
+ FMINDEX f;
66
+ f.load(is);
67
+
68
+ double beginTime = cybozu::GetCurrentTimeSec();
69
+
70
+ STRING str;
71
+ f.getPrevString(str, 0, f.wm.size() - 1);
72
+ double endTime = cybozu::GetCurrentTimeSec();
73
+ fprintf(stderr, "time: %gsec\n", endTime - beginTime);
74
+ std::ofstream os(outName.c_str(), std::ios::binary);
75
+ os << str;
76
+ }
77
+
78
+ template<class FMINDEX, class STRING>
79
+ void search(const std::string& inName, const std::string& queryFile, bool putHash, bool bench)
80
+ {
81
+ std::ifstream is(inName.c_str(), std::ios::binary);
82
+ FMINDEX f;
83
+ f.load(is);
84
+
85
+ double beginTime = cybozu::GetCurrentTimeSec();
86
+
87
+ std::ifstream qs(queryFile.c_str(), std::ios::binary);
88
+ STRING key;
89
+ uint64_t hash = 0;
90
+ cybozu::CpuClock clkRange;
91
+ cybozu::CpuClock clkPos;
92
+ while (qs >> key) {
93
+ if (!putHash) std::cout << "query " << key << std::endl;
94
+ size_t begin, end = 0;
95
+ if (bench) clkRange.begin();
96
+ bool found = f.getRange(&begin, &end, key);
97
+ if (bench) clkRange.end();
98
+ Set set;
99
+ if (found) {
100
+ while (begin != end) {
101
+ if (bench) clkPos.begin();
102
+ int pos = (int)f.convertPosition(begin);
103
+ if (bench) clkPos.end();
104
+ set.insert(pos);
105
+ begin++;
106
+ }
107
+ }
108
+ if (putHash) {
109
+ hash = cybozu::hash64(set.begin(), set.end(), hash);
110
+ } else {
111
+ putSet(set);
112
+ }
113
+ }
114
+ if (putHash) printf("hash=%llx\n", (long long)hash);
115
+
116
+ double endTime = cybozu::GetCurrentTimeSec();
117
+ fprintf(stderr, "time: %gsec\n", endTime - beginTime);
118
+ if (bench) {
119
+ int rangeNum = (int)clkRange.getCount();
120
+ int posNum = (int)clkPos.getCount();
121
+ fprintf(stderr, "getRange %.2f(%d) pos %.2f(%d)\n", clkRange.getClock() / double(rangeNum), rangeNum, clkPos.getClock() / double(posNum), posNum);
122
+ }
123
+ }
124
+
125
+ template<class FMINDEX, class STRING>
126
+ static void create(const std::string& inName, const std::string& outName, int skip)
127
+ {
128
+ fprintf(stderr, "inName=%s, outName=%s, skip=%d\n", inName.c_str(), outName.c_str(), skip);
129
+
130
+ double beginTime = cybozu::GetCurrentTimeSec();
131
+
132
+ cybozu::Mmap m(inName);
133
+ FMINDEX f;
134
+ STRING text(m.get(), m.get() + m.size());
135
+ f.init(text.begin(), text.end(), skip);
136
+
137
+ double endTime = cybozu::GetCurrentTimeSec();
138
+ fprintf(stderr, "create time %gsec\n", endTime - beginTime);
139
+ std::ofstream os(outName.c_str(), std::ios::binary);
140
+ f.save(os);
141
+ }
142
+
143
/**
	print the command line help to stdout and terminate with exit status 1
*/
void usage()
{
	static const char *const helpTbl[] = {
		"fmindex_smpl.exe (-c|-s|-r|-ss) file1 file2 [-skip skip][-hash][-time]\n",
		" -c : create index file\n",
		" file1 : any UTF-8 string file\n",
		" file2 : output index file\n",
		" -skip skip : skip to sampling(default 8)\n",
		" -hash : put position hash\n",
		" -time : benchmark\n",
		" -s : search mode\n",
		" file1 : index file\n",
		" file2 : query string file\n",
		" -r : recover mode\n",
		" file1 : index file\n",
		" file2 : org index file\n",
		" -ss: simple search\n",
		" file1 : any UTF-8 string file\n",
		" file2 : query string file\n",
	};
	const size_t lineNum = sizeof(helpTbl) / sizeof(helpTbl[0]);
	for (size_t i = 0; i < lineNum; i++) {
		printf("%s", helpTbl[i]);
	}
	exit(1);
}
163
+
164
+ int main(int argc, char* argv[])
165
+ try
166
+ {
167
+ argc--, argv++;
168
+ std::string fName1;
169
+ std::string fName2;
170
+ std::string mode;
171
+ int skip = 8;
172
+ bool putHash = false;
173
+ bool bench = false;
174
+
175
+ while (argc > 0) {
176
+ if (strcmp(*argv, "-c") == 0) {
177
+ mode = *argv;
178
+ } else
179
+ if (strcmp(*argv, "-s") == 0) {
180
+ mode = *argv;
181
+ } else
182
+ if (strcmp(*argv, "-r") == 0) {
183
+ mode = *argv;
184
+ } else
185
+ if (strcmp(*argv, "-ss") == 0) {
186
+ mode = *argv;
187
+ } else
188
+ if (argc > 1 && strcmp(*argv, "-skip") == 0) {
189
+ argc--, argv++;
190
+ skip = atoi(*argv);
191
+ } else
192
+ if (strcmp(*argv, "-hash") == 0) {
193
+ putHash = true;
194
+ } else
195
+ if (strcmp(*argv, "-time") == 0) {
196
+ bench = true;
197
+ } else
198
+ if (**argv != '-' && fName1.empty()) {
199
+ fName1 = *argv;
200
+ } else
201
+ if (**argv != '-' && fName2.empty()) {
202
+ fName2 = *argv;
203
+ } else
204
+ {
205
+ usage();
206
+ }
207
+ argc--, argv++;
208
+ }
209
+ if (fName1.empty() || fName2.empty() || mode.empty()) {
210
+ usage();
211
+ }
212
+ if (mode == "-c") {
213
+ create<FMindex, String>(fName1, fName2, skip);
214
+ } else
215
+ if (mode == "-s") {
216
+ search<FMindex, String>(fName1, fName2, putHash, bench);
217
+ } else
218
+ if (mode == "-r") {
219
+ recover<FMindex, String>(fName1, fName2);
220
+ } else
221
+ if (mode == "-ss") {
222
+ simpleSearch<String>(fName1, fName2, putHash);
223
+ } else
224
+ {
225
+ usage();
226
+ }
227
+ } catch (std::exception& e) {
228
+ printf("ERR %s\n", e.what());
229
+ return 1;
230
+ }
231
+
@@ -0,0 +1,19 @@
1
+ #include <stdio.h>
2
+ #include <cybozu/log.hpp>
3
+
4
+ int main()
5
+ {
6
+ cybozu::PutLog(cybozu::LogInfo, "this is a pen1");
7
+ cybozu::useSyslog(false);
8
+ cybozu::SetLogUseMsec();
9
+ cybozu::PutLog(cybozu::LogInfo, "this is a pen2");
10
+ cybozu::OpenLogFile("test.log");
11
+ cybozu::PutLog(cybozu::LogInfo, "this is a pen3");
12
+ cybozu::useSyslog(true);
13
+ cybozu::PutLog(cybozu::LogInfo, "this is a pen4");
14
+
15
+ cybozu::PutLog(cybozu::LogInfo, "AAtest");
16
+ cybozu::SetLogPriority(cybozu::LogInfo);
17
+ cybozu::PutLog(cybozu::LogInfo, "AAtest2");
18
+ cybozu::PutLog(cybozu::LogDebug, "not print");
19
+ }
@@ -0,0 +1,37 @@
1
+ #include <vector>
2
+ #include <stdio.h>
3
+ #include <cybozu/nlp/mecab.hpp>
4
+ #include <cybozu/mmap.hpp>
5
+
6
+ int main(int argc, char *argv[])
7
+ {
8
+ argc--, argv++;
9
+ if (argc == 0) {
10
+ fprintf(stderr, "mecab_smpl filename\n");
11
+ return 1;
12
+ }
13
+ try {
14
+ const std::string fileName = argv[0];
15
+ cybozu::Mmap mmap(fileName);
16
+ if (mmap.size() > (1 << 30)) {
17
+ fprintf(stderr, "file is too large %lld\n", (long long)mmap.size());
18
+ return 1;
19
+ }
20
+
21
+ cybozu::nlp::Mecab mecab;
22
+ typedef std::vector<std::string> StrVec;
23
+ StrVec sv;
24
+ if (mecab.parse(sv, mmap.get(), (int)mmap.size())) {
25
+ for (size_t i = 0, n = sv.size(); i < n; i++) {
26
+ printf("%s ", sv[i].c_str());
27
+ }
28
+ printf("\n");
29
+ }
30
+ return 0;
31
+ } catch (std::exception& e) {
32
+ fprintf(stderr, "exception %s\n", e.what());
33
+ } catch (...) {
34
+ fprintf(stderr, "unknown exception\n");
35
+ }
36
+ return 1;
37
+ }
@@ -0,0 +1,68 @@
1
+ /*
2
+ how to use two step option parser
3
+ */
4
+ #include <stdio.h>
5
+ #include <cybozu/option.hpp>
6
+ #include <vector>
7
+
8
+ struct Opt {
9
+ // common option
10
+ int x;
11
+ cybozu::Option opt1;
12
+
13
+ // cmd option
14
+ std::string cmd;
15
+
16
+ std::string init_s;
17
+ double run_d;
18
+ char status_c;
19
+ cybozu::Option opt2;
20
+
21
+ int parse1(int argc, char *argv[])
22
+ {
23
+ opt1.appendOpt(&x, 5, "x", " :value");
24
+ opt1.appendDelimiter("init");
25
+ opt1.appendDelimiter("run");
26
+ opt1.appendDelimiter("status");
27
+ opt1.appendHelp("h");
28
+ opt1.setUsage("option2 [opt] (init|run|status)", true);
29
+
30
+ if (!opt1.parse(argc, argv)) return false;
31
+ const int pos = opt1.getNextPositionOfDelimiter();
32
+ if (pos == 0) return 0;
33
+ cmd = argv[pos - 1];
34
+ if (cmd == "init") {
35
+ opt2.appendOpt(&init_s, "abc", "s", " :string");
36
+ } else if (cmd == "run") {
37
+ opt2.appendOpt(&run_d, 1.2, "d", " :double");
38
+ } else if (cmd == "status") {
39
+ opt2.appendOpt(&status_c, 'X', "c", " :char");
40
+ } else {
41
+ return 0;
42
+ }
43
+ opt2.appendHelp("h");
44
+ return pos;
45
+ }
46
+ void parse(int argc, char *argv[])
47
+ {
48
+ int pos = parse1(argc, argv);
49
+ if (pos == 0) {
50
+ opt1.usage();
51
+ exit(1);
52
+ }
53
+ if (!opt2.parse(argc, argv, pos)) {
54
+ opt2.usage();
55
+ exit(1);
56
+ }
57
+ puts("common");
58
+ opt1.put();
59
+ printf("opt for %s\n", cmd.c_str());
60
+ opt2.put();
61
+ }
62
+ };
63
+
64
+ int main(int argc, char *argv[])
65
+ {
66
+ Opt opt;
67
+ opt.parse(argc, argv);
68
+ }
@@ -0,0 +1,42 @@
1
+ /*
2
+ how to use
3
+ */
4
+ #include <stdio.h>
5
+ #include <cybozu/option.hpp>
6
+ #include <vector>
7
+
8
+ int main(int argc, char *argv[])
9
+ try
10
+ {
11
+ int x;
12
+ bool b;
13
+ double d;
14
+ std::string y;
15
+ std::vector<int> z;
16
+ std::vector<std::string> w;
17
+ std::string inName;
18
+ std::vector<std::string> r;
19
+ std::vector<std::string> vi;
20
+ uint64_t u;
21
+
22
+ cybozu::Option opt;
23
+
24
+ opt.appendOpt(&x, 5, "x", "int");
25
+ opt.appendBoolOpt(&b, "b", "bool");
26
+ opt.appendMust(&d, "d", "double");
27
+ opt.appendMust(&y, "y", "string");
28
+ opt.appendVec(&z, "z", "int int int ...");
29
+ opt.appendVec(&w, "w", "str str str ...");
30
+ opt.appendOpt(&u, 0, "u", "uint64 val");
31
+ opt.appendParam(&inName, "input-file", "text file");
32
+ opt.appendParamVec(&vi, "remains", "sss");
33
+ opt.appendHelp("h");
34
+
35
+ if (opt.parse(argc, argv)) {
36
+ opt.put();
37
+ } else {
38
+ opt.usage();
39
+ }
40
+ } catch (std::exception& e) {
41
+ printf("ERR %s\n", e.what());
42
+ }
@@ -0,0 +1,207 @@
1
+ /**
2
+ pLSI(probabilistic latent semantic indexing)
3
+ @author MITSUNARI Shigeo(@herumi)
4
+ */
5
+
6
+ #include <stdio.h>
7
+ #include <map>
8
+ #include <cybozu/file.hpp>
9
+ #include <cybozu/csv.hpp>
10
+ #include <cybozu/nlp/plsi.hpp>
11
+ #include <cybozu/string_operation.hpp>
12
+ #include <cybozu/time.hpp>
13
+ #include <iostream>
14
+
15
+ void load(cybozu::nlp::Plsi& plsi, const std::string& filepath)
16
+ {
17
+ cybozu::CsvReader csv(filepath, ' ');
18
+ std::vector<std::string> line;
19
+ while (csv.read(line)) {
20
+ cybozu::nlp::Plsi::ITEM_TYPE item_key = cybozu::atoi(line[0]);
21
+ size_t size = line.size();
22
+ if (size < 2) continue;
23
+ std::map<size_t, bool> map;
24
+ for (size_t i = 1; i < line.size(); ++i) {
25
+ cybozu::nlp::Plsi::USER_TYPE user_key = cybozu::atoi(line[i]);
26
+ map[plsi.get_user_id(user_key)] = true;
27
+ }
28
+ plsi.getItem(item_key).set(map);
29
+ }
30
+ }
31
+
32
/**
	print the command line help to stdout and terminate with exit status 1
	the listed options are the ones main() actually accepts (-d, -k, -i)
*/
void usage()
{
	printf("usage: plsi [option]\n");
	printf(" -d [dir] : directory of atnd-event.txt, atnd-user.txt and atnd-user-matrix.txt\n");
	printf(" -k [num] : # of latent classes (default 20)\n");
	printf(" -i [num] : # of iterations (default 100)\n");
	exit(1);
}
39
+
40
/**
	@brief one record of an Atnd list (a user or an event)
*/
struct AtndData {
	/// formatted date; filled in only for event records
	std::string date;
	/// name of the user or of the event
	std::string name;
};
47
+
48
+ /**
49
+ @brief Atnd Information (Users / Events)
50
+ */
51
+ struct AtndInfo {
52
+ typedef std::map<int, AtndData> Int2Data;
53
+ typedef std::map<std::string, int> Str2Int;
54
+ Int2Data int2data_;
55
+ Str2Int name2id_;
56
+ /**
57
+ @brief load list of Atnd Users / Events
58
+ @param[in] name filename of list
59
+ @param[in] isEvent Is it an event list?
60
+ */
61
+ bool loadList(const std::string& name, bool isEvent)
62
+ {
63
+ std::ifstream ifs(name.c_str(), std::ios::binary);
64
+ if (!ifs) return false;
65
+ for (;;) {
66
+ AtndData t;
67
+ int id;
68
+ if (!(ifs >> id)) break;
69
+ if (isEvent) {
70
+ std::string str;
71
+ ifs >> str;
72
+ if (str.empty()) return false;
73
+ if (str.size() < 6) {
74
+ fprintf(stderr, "bad format %s\n", str.c_str());
75
+ return false;
76
+ }
77
+ str = str.substr(0, str.size() - 6); // "+09:00"
78
+ cybozu::Time time(str);
79
+ time.setTime(time.getTime() + 9 * 3600);
80
+ time.toString(t.date, "%Y/%m/%d", false);
81
+ }
82
+ std::getline(ifs, t.name);
83
+ cybozu::Trim(t.name);
84
+ if (!ifs) break;
85
+ int2data_[id] = t;
86
+ name2id_[t.name] = id;
87
+ }
88
+ return true;
89
+ }
90
+ /**
91
+ @brief load list of Atnd Users / Events. (generates filename from isEvent parameter)
92
+ @param[in] dir directory name where list exists
93
+ @param[in] isEvent Is it an event list?
94
+ */
95
+ bool load(const std::string& dir, bool isEvent)
96
+ {
97
+ const std::string key = isEvent ? "event" : "user";
98
+ std::string name;
99
+ name = dir + "/atnd-" + key + ".txt";
100
+ if (!loadList(name, isEvent)) {
101
+ fprintf(stderr, "can't read %s (%d)\n", name.c_str(), isEvent);
102
+ return false;
103
+ }
104
+ return true;
105
+ }
106
+ };
107
+
108
+ int main(int argc, char** argv)
109
+ {
110
+ std::string data_dir = cybozu::GetExePath() + "../sample/data/plsi/";
111
+
112
+ int K = 20;
113
+ int Iter = 100;
114
+ argc--, argv++;
115
+ while (argc > 0) {
116
+ if (argc > 1 && strcmp(*argv, "-d") == 0) {
117
+ argc--, argv++;
118
+ data_dir = *argv;
119
+ } else if (argc > 1 && strcmp(*argv, "-k") == 0) {
120
+ argc--, argv++;
121
+ K = cybozu::atoi(*argv);
122
+ } else if (argc > 1 && strcmp(*argv, "-i") == 0) {
123
+ argc--, argv++;
124
+ Iter = cybozu::atoi(*argv);
125
+ } else {
126
+ usage();
127
+ }
128
+ argc--, argv++;
129
+ }
130
+ const std::string name = data_dir + "/atnd-user-matrix.txt";
131
+
132
+ cybozu::nlp::Plsi plsi;
133
+ try {
134
+ AtndInfo event_master, user_master;
135
+ event_master.load(data_dir, true);
136
+ user_master.load(data_dir, false);
137
+
138
+ load(plsi, name);
139
+ plsi.startLearning(K);
140
+ {
141
+ puts("learning");
142
+ double pre_likelihood = -1e30;
143
+ double beta = 1;
144
+ for (int i = 0; i < Iter; ++i) {
145
+ double likelihood = plsi.step();
146
+ printf("%d : %.3f %.3f %.3f\n", i, beta, likelihood, likelihood - pre_likelihood);
147
+ if (likelihood - pre_likelihood < 1) {
148
+ beta *= 0.9;
149
+ if (beta < 0.01) break;
150
+ }
151
+ pre_likelihood = likelihood;
152
+ }
153
+ }
154
+
155
+ int mode = 0;
156
+ cybozu::nlp::Plsi::SEARCH_TYPE search_type = cybozu::nlp::Plsi::JOINT;
157
+
158
+ for(;;) {
159
+ std::string st;
160
+ std::cin >> st;
161
+ if (st == "") break;
162
+ if (st == "ui") {
163
+ mode = 0;
164
+ printf("user => items\n");
165
+ continue;
166
+ }
167
+ if (st == "ii") {
168
+ mode = 1;
169
+ printf("item => items\n");
170
+ continue;
171
+ }
172
+ if (st == "sj") {
173
+ search_type = cybozu::nlp::Plsi::JOINT;
174
+ printf("search type: JOINT probability\n");
175
+ continue;
176
+ }
177
+ if (st == "sc") {
178
+ search_type = cybozu::nlp::Plsi::CONDITIONAL;
179
+ printf("search type: CONDITIONAL probability\n");
180
+ continue;
181
+ }
182
+ if (st == "sp") {
183
+ search_type = cybozu::nlp::Plsi::POSTERIOR;
184
+ printf("search type: POSTERIOR probability\n");
185
+ continue;
186
+ }
187
+
188
+ cybozu::nlp::TopScore<size_t>::Table tbl;
189
+ switch(mode) {
190
+ case 0:
191
+ tbl = plsi.search_items(cybozu::atoi(st), 10);
192
+ break;
193
+ case 1:
194
+ tbl = plsi.similar_items(cybozu::atoi(st), search_type, 10);
195
+ break;
196
+ }
197
+
198
+ for (size_t i = 0; i < tbl.size(); i++) {
199
+ cybozu::nlp::Plsi::ITEM_TYPE key = plsi.get_item_key(tbl[i].idx);
200
+ printf("%1.3f %d:%s\n", log(tbl[i].score), key, event_master.int2data_[key].name.c_str());
201
+ }
202
+ }
203
+
204
+ } catch (std::exception& e) {
205
+ printf("error : %s\n", e.what());
206
+ }
207
+ }