ooxml_crypt 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (264) hide show
  1. checksums.yaml +7 -0
  2. data/Gemfile +4 -0
  3. data/LICENSE.txt +21 -0
  4. data/README.md +58 -0
  5. data/Rakefile +12 -0
  6. data/bin/console +15 -0
  7. data/bin/setup +8 -0
  8. data/ext/ooxml_crypt/extconf.rb +18 -0
  9. data/ext/ooxml_crypt/ooxml_crypt.c +27 -0
  10. data/ext/ooxml_crypt/ooxml_crypt.h +7 -0
  11. data/lib/ooxml_crypt/version.rb +5 -0
  12. data/lib/ooxml_crypt.rb +75 -0
  13. data/vendor/cybozulib/.github/workflows/main.yml +12 -0
  14. data/vendor/cybozulib/.gitignore +5 -0
  15. data/vendor/cybozulib/CMakeLists.txt +6 -0
  16. data/vendor/cybozulib/COPYRIGHT +27 -0
  17. data/vendor/cybozulib/Makefile +26 -0
  18. data/vendor/cybozulib/bin/libeay32.dll +0 -0
  19. data/vendor/cybozulib/bin/libmecab.dll +0 -0
  20. data/vendor/cybozulib/bin/ssleay32.dll +0 -0
  21. data/vendor/cybozulib/common.mk +116 -0
  22. data/vendor/cybozulib/common.props +25 -0
  23. data/vendor/cybozulib/cybozulib.sln +286 -0
  24. data/vendor/cybozulib/debug.props +14 -0
  25. data/vendor/cybozulib/include/cybozu/array.hpp +197 -0
  26. data/vendor/cybozulib/include/cybozu/atoi.hpp +238 -0
  27. data/vendor/cybozulib/include/cybozu/atomic.hpp +146 -0
  28. data/vendor/cybozulib/include/cybozu/base64.hpp +210 -0
  29. data/vendor/cybozulib/include/cybozu/benchmark.hpp +212 -0
  30. data/vendor/cybozulib/include/cybozu/bfd.hpp +105 -0
  31. data/vendor/cybozulib/include/cybozu/bit_operation.hpp +139 -0
  32. data/vendor/cybozulib/include/cybozu/bitvector.hpp +358 -0
  33. data/vendor/cybozulib/include/cybozu/condition_variable.hpp +113 -0
  34. data/vendor/cybozulib/include/cybozu/condition_variable_cs.hpp +74 -0
  35. data/vendor/cybozulib/include/cybozu/config.hpp +392 -0
  36. data/vendor/cybozulib/include/cybozu/critical_section.hpp +60 -0
  37. data/vendor/cybozulib/include/cybozu/crypto.hpp +321 -0
  38. data/vendor/cybozulib/include/cybozu/csucvector.hpp +624 -0
  39. data/vendor/cybozulib/include/cybozu/csv.hpp +294 -0
  40. data/vendor/cybozulib/include/cybozu/data_type.hpp +27 -0
  41. data/vendor/cybozulib/include/cybozu/endian.hpp +224 -0
  42. data/vendor/cybozulib/include/cybozu/env.hpp +63 -0
  43. data/vendor/cybozulib/include/cybozu/event.hpp +122 -0
  44. data/vendor/cybozulib/include/cybozu/exception.hpp +253 -0
  45. data/vendor/cybozulib/include/cybozu/file.hpp +626 -0
  46. data/vendor/cybozulib/include/cybozu/fmindex.hpp +291 -0
  47. data/vendor/cybozulib/include/cybozu/format.hpp +93 -0
  48. data/vendor/cybozulib/include/cybozu/frequency.hpp +264 -0
  49. data/vendor/cybozulib/include/cybozu/hash.hpp +67 -0
  50. data/vendor/cybozulib/include/cybozu/inttype.hpp +174 -0
  51. data/vendor/cybozulib/include/cybozu/itoa.hpp +336 -0
  52. data/vendor/cybozulib/include/cybozu/json.hpp +120 -0
  53. data/vendor/cybozulib/include/cybozu/line_stream.hpp +149 -0
  54. data/vendor/cybozulib/include/cybozu/link_libeay32.hpp +21 -0
  55. data/vendor/cybozulib/include/cybozu/link_mpir.hpp +18 -0
  56. data/vendor/cybozulib/include/cybozu/link_ssleay32.hpp +19 -0
  57. data/vendor/cybozulib/include/cybozu/log.hpp +237 -0
  58. data/vendor/cybozulib/include/cybozu/minixml.hpp +452 -0
  59. data/vendor/cybozulib/include/cybozu/mmap.hpp +143 -0
  60. data/vendor/cybozulib/include/cybozu/mutex.hpp +144 -0
  61. data/vendor/cybozulib/include/cybozu/nlp/mecab.hpp +96 -0
  62. data/vendor/cybozulib/include/cybozu/nlp/plsi.hpp +315 -0
  63. data/vendor/cybozulib/include/cybozu/nlp/random.hpp +74 -0
  64. data/vendor/cybozulib/include/cybozu/nlp/sparse.hpp +529 -0
  65. data/vendor/cybozulib/include/cybozu/nlp/svd.hpp +486 -0
  66. data/vendor/cybozulib/include/cybozu/nlp/tfidf.hpp +226 -0
  67. data/vendor/cybozulib/include/cybozu/nlp/top_score.hpp +75 -0
  68. data/vendor/cybozulib/include/cybozu/option.hpp +743 -0
  69. data/vendor/cybozulib/include/cybozu/parallel.hpp +88 -0
  70. data/vendor/cybozulib/include/cybozu/pcg.hpp +72 -0
  71. data/vendor/cybozulib/include/cybozu/process.hpp +324 -0
  72. data/vendor/cybozulib/include/cybozu/quit_signal_handler.hpp +66 -0
  73. data/vendor/cybozulib/include/cybozu/random_generator.hpp +144 -0
  74. data/vendor/cybozulib/include/cybozu/regex.hpp +463 -0
  75. data/vendor/cybozulib/include/cybozu/select8.hpp +279 -0
  76. data/vendor/cybozulib/include/cybozu/serializer.hpp +363 -0
  77. data/vendor/cybozulib/include/cybozu/sha1.hpp +209 -0
  78. data/vendor/cybozulib/include/cybozu/sha2.hpp +506 -0
  79. data/vendor/cybozulib/include/cybozu/siphash.hpp +105 -0
  80. data/vendor/cybozulib/include/cybozu/socket.hpp +785 -0
  81. data/vendor/cybozulib/include/cybozu/ssl.hpp +203 -0
  82. data/vendor/cybozulib/include/cybozu/stacktrace.hpp +291 -0
  83. data/vendor/cybozulib/include/cybozu/stream.hpp +269 -0
  84. data/vendor/cybozulib/include/cybozu/string.hpp +1746 -0
  85. data/vendor/cybozulib/include/cybozu/string_operation.hpp +365 -0
  86. data/vendor/cybozulib/include/cybozu/sucvector.hpp +378 -0
  87. data/vendor/cybozulib/include/cybozu/test.hpp +373 -0
  88. data/vendor/cybozulib/include/cybozu/thread.hpp +229 -0
  89. data/vendor/cybozulib/include/cybozu/time.hpp +281 -0
  90. data/vendor/cybozulib/include/cybozu/tls.hpp +115 -0
  91. data/vendor/cybozulib/include/cybozu/unordered_map.hpp +13 -0
  92. data/vendor/cybozulib/include/cybozu/unordered_set.hpp +13 -0
  93. data/vendor/cybozulib/include/cybozu/v128.hpp +376 -0
  94. data/vendor/cybozulib/include/cybozu/wavelet_matrix.hpp +345 -0
  95. data/vendor/cybozulib/include/cybozu/xorshift.hpp +189 -0
  96. data/vendor/cybozulib/include/cybozu/zlib.hpp +325 -0
  97. data/vendor/cybozulib/include/sais.hxx +364 -0
  98. data/vendor/cybozulib/misc/make_select8tbl.cpp +26 -0
  99. data/vendor/cybozulib/mk.bat +37 -0
  100. data/vendor/cybozulib/readme.md +29 -0
  101. data/vendor/cybozulib/release.props +12 -0
  102. data/vendor/cybozulib/sample/Makefile +30 -0
  103. data/vendor/cybozulib/sample/csucvector_smpl.cpp +42 -0
  104. data/vendor/cybozulib/sample/data/svd/org/test1.S +4 -0
  105. data/vendor/cybozulib/sample/data/svd/org/test1.U +4 -0
  106. data/vendor/cybozulib/sample/data/svd/org/test1.V +6 -0
  107. data/vendor/cybozulib/sample/data/svd/test1 +4 -0
  108. data/vendor/cybozulib/sample/data/svd/test2 +4 -0
  109. data/vendor/cybozulib/sample/desymbol.cpp +127 -0
  110. data/vendor/cybozulib/sample/exception_smpl.cpp +46 -0
  111. data/vendor/cybozulib/sample/fmindex_smpl.cpp +231 -0
  112. data/vendor/cybozulib/sample/log_smpl.cpp +19 -0
  113. data/vendor/cybozulib/sample/mecab_smpl.cpp +37 -0
  114. data/vendor/cybozulib/sample/option2_smpl.cpp +68 -0
  115. data/vendor/cybozulib/sample/option_smpl.cpp +42 -0
  116. data/vendor/cybozulib/sample/plsi_smpl.cpp +207 -0
  117. data/vendor/cybozulib/sample/proj/exception_smpl.vcproj +184 -0
  118. data/vendor/cybozulib/sample/proj/mecab_smpl.vcproj +184 -0
  119. data/vendor/cybozulib/sample/proj/ssl_smpl/ssl_smpl.vcxproj +85 -0
  120. data/vendor/cybozulib/sample/proj/ssl_smpl.vcproj +347 -0
  121. data/vendor/cybozulib/sample/proj/stacktrace_smpl/stacktrace_smpl.vcxproj +85 -0
  122. data/vendor/cybozulib/sample/proj/svd_smpl.vcproj +184 -0
  123. data/vendor/cybozulib/sample/quit_signal_handler.cpp +30 -0
  124. data/vendor/cybozulib/sample/serializer_smpl.cpp +196 -0
  125. data/vendor/cybozulib/sample/socket_smpl.cpp +82 -0
  126. data/vendor/cybozulib/sample/ssl_smpl.cpp +39 -0
  127. data/vendor/cybozulib/sample/stacktrace_smpl.cpp +52 -0
  128. data/vendor/cybozulib/sample/svd_bench_smpl.cpp +143 -0
  129. data/vendor/cybozulib/sample/svd_smpl.cpp +94 -0
  130. data/vendor/cybozulib/sample/wm_bench_smpl.cpp +182 -0
  131. data/vendor/cybozulib/sample/zlib_smpl.cpp +41 -0
  132. data/vendor/cybozulib/src/Makefile +8 -0
  133. data/vendor/cybozulib/src/base/Makefile +19 -0
  134. data/vendor/cybozulib/test/Makefile +12 -0
  135. data/vendor/cybozulib/test/base/Makefile +37 -0
  136. data/vendor/cybozulib/test/base/array_test.cpp +173 -0
  137. data/vendor/cybozulib/test/base/atoi_test.cpp +774 -0
  138. data/vendor/cybozulib/test/base/atomic_test.cpp +49 -0
  139. data/vendor/cybozulib/test/base/base64_test.cpp +113 -0
  140. data/vendor/cybozulib/test/base/bit_operation_test.cpp +134 -0
  141. data/vendor/cybozulib/test/base/bitvector_test.cpp +204 -0
  142. data/vendor/cybozulib/test/base/condition_variable_cs_test.cpp +92 -0
  143. data/vendor/cybozulib/test/base/condition_variable_test.cpp +88 -0
  144. data/vendor/cybozulib/test/base/config_test.cpp +236 -0
  145. data/vendor/cybozulib/test/base/crypto_test.cpp +122 -0
  146. data/vendor/cybozulib/test/base/csucvector_test.cpp +63 -0
  147. data/vendor/cybozulib/test/base/csv_test.cpp +182 -0
  148. data/vendor/cybozulib/test/base/data/a.xml +26 -0
  149. data/vendor/cybozulib/test/base/endian_test.cpp +56 -0
  150. data/vendor/cybozulib/test/base/env_test.cpp +22 -0
  151. data/vendor/cybozulib/test/base/event_test.cpp +41 -0
  152. data/vendor/cybozulib/test/base/file_test.cpp +233 -0
  153. data/vendor/cybozulib/test/base/fmindex_test.cpp +118 -0
  154. data/vendor/cybozulib/test/base/format_test.cpp +12 -0
  155. data/vendor/cybozulib/test/base/frequency_test.cpp +104 -0
  156. data/vendor/cybozulib/test/base/itoa_test.cpp +522 -0
  157. data/vendor/cybozulib/test/base/line_stream_test.cpp +208 -0
  158. data/vendor/cybozulib/test/base/mecab_test.cpp +41 -0
  159. data/vendor/cybozulib/test/base/minixml_test.cpp +103 -0
  160. data/vendor/cybozulib/test/base/mmap_test.cpp +15 -0
  161. data/vendor/cybozulib/test/base/option_test.cpp +487 -0
  162. data/vendor/cybozulib/test/base/parallel_test.cpp +48 -0
  163. data/vendor/cybozulib/test/base/proj/array_test/array_test.vcxproj +86 -0
  164. data/vendor/cybozulib/test/base/proj/atoi_test/atoi_test.vcxproj +86 -0
  165. data/vendor/cybozulib/test/base/proj/atomic_test/atomic_test.vcxproj +86 -0
  166. data/vendor/cybozulib/test/base/proj/base64_test/base64_test.vcxproj +86 -0
  167. data/vendor/cybozulib/test/base/proj/condition_variable_cs_test/condition_variable_cs_test.vcxproj +86 -0
  168. data/vendor/cybozulib/test/base/proj/condition_variable_test/condition_variable_test.vcxproj +86 -0
  169. data/vendor/cybozulib/test/base/proj/config_test/config_test.vcxproj +86 -0
  170. data/vendor/cybozulib/test/base/proj/csv_test/csv_test.vcxproj +86 -0
  171. data/vendor/cybozulib/test/base/proj/endian_test/endian_test.vcxproj +86 -0
  172. data/vendor/cybozulib/test/base/proj/env_test/env_test.vcxproj +86 -0
  173. data/vendor/cybozulib/test/base/proj/event_test/event_test.vcxproj +86 -0
  174. data/vendor/cybozulib/test/base/proj/file_test/file_test.vcxproj +86 -0
  175. data/vendor/cybozulib/test/base/proj/itoa_test/itoa_test.vcxproj +86 -0
  176. data/vendor/cybozulib/test/base/proj/mecab_test/mecab_test.vcxproj +88 -0
  177. data/vendor/cybozulib/test/base/proj/minixml_test/minixml_test.vcxproj +86 -0
  178. data/vendor/cybozulib/test/base/proj/mmap_test/mmap_test.vcxproj +86 -0
  179. data/vendor/cybozulib/test/base/proj/serializer_test/serializer_test.vcxproj +86 -0
  180. data/vendor/cybozulib/test/base/proj/sha1_test/sha1_test.vcxproj +86 -0
  181. data/vendor/cybozulib/test/base/proj/stream_test/stream_test.vcxproj +86 -0
  182. data/vendor/cybozulib/test/base/proj/string_operation_test/string_operation_test.vcxproj +86 -0
  183. data/vendor/cybozulib/test/base/proj/string_test/string_test.vcxproj +86 -0
  184. data/vendor/cybozulib/test/base/proj/thread_test/thread_test.vcxproj +86 -0
  185. data/vendor/cybozulib/test/base/proj/time_test/time_test.vcxproj +86 -0
  186. data/vendor/cybozulib/test/base/proj/tls_test/tls_test.vcxproj +86 -0
  187. data/vendor/cybozulib/test/base/proj/zlib_test/zlib_test.vcxproj +86 -0
  188. data/vendor/cybozulib/test/base/random_generator_test.cpp +28 -0
  189. data/vendor/cybozulib/test/base/regex_test.cpp +74 -0
  190. data/vendor/cybozulib/test/base/serializer_test.cpp +483 -0
  191. data/vendor/cybozulib/test/base/sha1_test.cpp +61 -0
  192. data/vendor/cybozulib/test/base/sha2_test.cpp +191 -0
  193. data/vendor/cybozulib/test/base/siphash_test.cpp +33 -0
  194. data/vendor/cybozulib/test/base/socket_test.cpp +76 -0
  195. data/vendor/cybozulib/test/base/stream_test.cpp +101 -0
  196. data/vendor/cybozulib/test/base/string_operation_test.cpp +340 -0
  197. data/vendor/cybozulib/test/base/string_test.cpp +1705 -0
  198. data/vendor/cybozulib/test/base/sucvector_test.cpp +312 -0
  199. data/vendor/cybozulib/test/base/thread_test.cpp +62 -0
  200. data/vendor/cybozulib/test/base/time_test.cpp +164 -0
  201. data/vendor/cybozulib/test/base/tls_test.cpp +50 -0
  202. data/vendor/cybozulib/test/base/wavelet_matrix_test.cpp +145 -0
  203. data/vendor/cybozulib/test/base/zlib_test.cpp +371 -0
  204. data/vendor/cybozulib/test/nlp/Makefile +27 -0
  205. data/vendor/cybozulib/test/nlp/proj/random_test.vcproj +184 -0
  206. data/vendor/cybozulib/test/nlp/proj/sparse_test.vcproj +184 -0
  207. data/vendor/cybozulib/test/nlp/proj/svd_test.vcproj +184 -0
  208. data/vendor/cybozulib/test/nlp/random_test.cpp +62 -0
  209. data/vendor/cybozulib/test/nlp/sparse_test.cpp +347 -0
  210. data/vendor/cybozulib/test/nlp/svd_test.cpp +234 -0
  211. data/vendor/cybozulib/test/nlp/top_score_test.cpp +40 -0
  212. data/vendor/cybozulib/tool/create_vcproj.py +186 -0
  213. data/vendor/cybozulib/tool/vcproj_tmpl.py +185 -0
  214. data/vendor/msoffice/COPYRIGHT +27 -0
  215. data/vendor/msoffice/Makefile +29 -0
  216. data/vendor/msoffice/bin/64/msoc.dll +0 -0
  217. data/vendor/msoffice/bin/64/msocsample.exe +0 -0
  218. data/vendor/msoffice/bin/64/msoffice-crypt.exe +0 -0
  219. data/vendor/msoffice/bin/msoc.dll +0 -0
  220. data/vendor/msoffice/bin/msocsample.exe +0 -0
  221. data/vendor/msoffice/bin/msoffice-crypt.exe +0 -0
  222. data/vendor/msoffice/common.mk +71 -0
  223. data/vendor/msoffice/common.props +26 -0
  224. data/vendor/msoffice/debug.props +14 -0
  225. data/vendor/msoffice/include/attack.hpp +211 -0
  226. data/vendor/msoffice/include/cfb.hpp +777 -0
  227. data/vendor/msoffice/include/crypto_util.hpp +450 -0
  228. data/vendor/msoffice/include/custom_sha1.hpp +342 -0
  229. data/vendor/msoffice/include/decode.hpp +240 -0
  230. data/vendor/msoffice/include/encode.hpp +221 -0
  231. data/vendor/msoffice/include/make_dataspace.hpp +316 -0
  232. data/vendor/msoffice/include/msoc.h +129 -0
  233. data/vendor/msoffice/include/resource.hpp +7 -0
  234. data/vendor/msoffice/include/standard_encryption.hpp +145 -0
  235. data/vendor/msoffice/include/uint32vec.hpp +179 -0
  236. data/vendor/msoffice/include/util.hpp +212 -0
  237. data/vendor/msoffice/lib/.emptydir +0 -0
  238. data/vendor/msoffice/misc/decrypt-xls.vbs +46 -0
  239. data/vendor/msoffice/mk.bat +1 -0
  240. data/vendor/msoffice/mkdll.bat +3 -0
  241. data/vendor/msoffice/msoc.def +13 -0
  242. data/vendor/msoffice/msocsample.py +178 -0
  243. data/vendor/msoffice/msoffice12.sln +31 -0
  244. data/vendor/msoffice/readme.md +110 -0
  245. data/vendor/msoffice/release.props +28 -0
  246. data/vendor/msoffice/src/Makefile +19 -0
  247. data/vendor/msoffice/src/attack.cpp +124 -0
  248. data/vendor/msoffice/src/cfb_test.cpp +77 -0
  249. data/vendor/msoffice/src/minisample.c +54 -0
  250. data/vendor/msoffice/src/msocdll.cpp +276 -0
  251. data/vendor/msoffice/src/msocsample.c +136 -0
  252. data/vendor/msoffice/src/msoffice-crypt.cpp +219 -0
  253. data/vendor/msoffice/src/proj/attack/attack.vcxproj +88 -0
  254. data/vendor/msoffice/src/proj/main/msoffice-crypt.vcxproj +88 -0
  255. data/vendor/msoffice/src/sha1.cpp +234 -0
  256. data/vendor/msoffice/test/Makefile +20 -0
  257. data/vendor/msoffice/test/cfb_test.cpp +74 -0
  258. data/vendor/msoffice/test/hash_test.cpp +59 -0
  259. data/vendor/msoffice/test/proj/cfb/cfb_test.vcxproj +90 -0
  260. data/vendor/msoffice/test/proj/hash/hash_test.vcxproj +90 -0
  261. data/vendor/msoffice/test/sampl.bat +8 -0
  262. data/vendor/msoffice/test_all.py +46 -0
  263. data/vendor/update +4 -0
  264. metadata +351 -0
@@ -0,0 +1,486 @@
1
+ #pragma once
2
+ /**
3
+ @file
4
+ @brief fast non-probabilistic SVD
5
+
6
+ @author MITSUNARI Shigeo(@herumi)
7
+ @author MITSUNARI Shigeo
8
+ */
9
+ #include <assert.h>
10
+ #include <vector>
11
+ #include <string>
12
+ #include <fstream>
13
+ #include <sstream>
14
+ #include <iomanip>
15
+ //#define CYBOZU_NLP_SVD_USE_RANDOM
16
+ #ifdef CYBOZU_NLP_SVD_USE_RANDOM
17
+ #include <cybozu/nlp/random.hpp>
18
+ #endif
19
+ #ifdef _MSC_VER
20
+ #pragma warning(push)
21
+ #pragma warning(disable : 4714) // force inline
22
+ #endif
23
+ #define EIGEN_YES_I_KNOW_SPARSE_MODULE_IS_NOT_STABLE_YET
24
+ #include <eigen3/Eigen/Sparse>
25
+ #include <eigen3/Eigen/Dense>
26
+ #include <eigen3/Eigen/Eigenvalues>
27
+ #ifdef _MSC_VER
28
+ // #pragma warning(pop)
29
+ #endif
30
+
31
+ /***
32
+ text format
33
+
34
+ Matrix(dense)
35
+ ---
36
+ # M D <row> <col>
37
+ data1_1 data1_2 data1_3 ...
38
+ data2_1 data2_2 ...
39
+ ....
40
+ ---
41
+
42
+ Matrix(sparse)
43
+ ---
44
+ # M S <row> <col>
45
+ c1:data1_c1 c2:data1_c2 c3:data1_c3 ...
46
+ c1:data2_c1 c2:data2_c2 c3:data2_c3 ...
47
+ ....
48
+ ---
49
+
50
+ ex.
51
+ M = (1.0 2.0 3.0)
52
+ (1.2 2.4 3.5)
53
+ ---
54
+ # M D 2 3
55
+ 1.0 2.0 3.0
56
+ 1.2 2.4 3.5
57
+ ---
58
+
59
+ M = (1.0 0 3.0)
60
+ (0 4.2 0 )
61
+ ---
62
+ # M S 2 3
63
+ 0:1.0 2:3.0
64
+ 1:4.2
65
+ ---
66
+ */
67
+ namespace cybozu { namespace nlp {
68
+
69
+ namespace svd {
70
+
71
#ifdef CYBOZU_NLP_SVD_USE_RANDOM
/**
	overwrite every element of M with a value taken from
	cybozu::nlp::NormalRandomGenerator::get()
	elements are visited row by row, so the generator is consumed in that order
	@param M [inout] matrix to fill; its size is not changed
*/
template<class Matrix>
void InitRandomMatrix(Matrix& M)
{
	typedef typename Matrix::Scalar Scalar;
	cybozu::nlp::NormalRandomGenerator gen;
	for (int rowIdx = 0; rowIdx < M.rows(); rowIdx++) {
		for (int colIdx = 0; colIdx < M.cols(); colIdx++) {
			M(rowIdx, colIdx) = Scalar(gen.get());
		}
	}
}
#endif
83
+
84
/**
	make M a "compressed unit matrix": all zero except exactly one 1 per row
	row i gets its 1 in column floor(i * col / row), so consecutive rows are
	grouped onto the same column
	@param M [inout] matrix to overwrite; its size is kept and must satisfy cols() <= rows()
	@note the column index is computed in 64 bit because i * col does not fit
	      in int once rows() * cols() exceeds INT_MAX
*/
template<class Matrix>
void InitUnitMatrix(Matrix& M)
{
	M.setZero();
	const int row = static_cast<int>(M.rows());
	const int col = static_cast<int>(M.cols());
	assert(col <= row);
#if 1
	const int adj = 0;//(col & 1) ? row/2 : 0;
	for (int i = 0; i < row; i++) {
		// widen before multiplying to avoid signed overflow on large matrices
		const long long scaled = static_cast<long long>(i) * col + adj;
		M(i, static_cast<int>(scaled / row)) = 1;
	}
#else
	// disabled alternative: spreads fractional weights over neighbouring columns
	typedef typename Matrix::Scalar Double;
	const int q0 = row / col;
	const int r0 = row % col;
	const double rcol = 1.0 / col;
	int b = 0;
	int q = q0;
	int e = r0;
	int rowIdx = 0;
	int colIdx = 0;
	for (;;) {
		if (b > 0) {
			M(rowIdx, colIdx) = Double(b * rcol);
			rowIdx++;
		}
		for (int j = 0; j < q; j++) {
			M(rowIdx, colIdx) = 1;
			rowIdx++;
		}
		if (e > 0) {
			M(rowIdx, colIdx) = Double(e * rcol);
		}
		if (colIdx == col - 1) break;
		b = e == 0 ? 0 : col - e;
		e = r0 - b;
		if (e < 0) {
			q = q0 - 1;
			e += col;
		} else {
			q = q0;
		}
		colIdx++;
	}
	assert(rowIdx == row);
#endif
}
132
/*
	m(row, col) => out(row, r)
	r <= col
	the columns of m are split into r consecutive groups and every group is
	summed into one column of out; group j ends at min(ceil((j + 1) * col / r), col)
*/
template<class Matrix1, class Matrix2>
void CompressCol(Matrix1& out, const Matrix2& m, int r)
{
	typedef typename Matrix1::Scalar Double;
	const int row = m.rows();
	const int col = m.cols();
	assert(r <= col);
	out.resize(row, r);
#if 1
	int groupBegin = 0;
	int group = 0;
	while (group < r) {
		const int ceilEnd = ((group + 1) * col + r - 1) / r;
		const int groupEnd = std::min(ceilEnd, col);
		for (int i = 0; i < row; i++) {
			// accumulate in double regardless of the output scalar type
			double sum = 0;
			for (int k = groupBegin; k < groupEnd; k++) {
				sum += m(i, k);
			}
			out(i, group) = Double(sum);
		}
		groupBegin = groupEnd;
		group++;
	}
#else
	// disabled alternative: splits boundary columns fractionally between groups
	const int q0 = col / r;
	const int r0 = col % r;
	const double rr = 1.0 / r;
	int b = 0;
	int q = q0;
	int e = r0;
	int colIdx = 0;
	int rIdx = 0;
	for (;;) {
		for (int i = 0; i < row; i++) {
			double x = 0;
			int k = colIdx;
			if (b > 0) {
				x += m(i, k) * b * rr;
				k++;
			}
			for (int j = 0; j < q; j++) {
				x += m(i, k);
				k++;
			}
			if (e > 0) {
				x += m(i, k) * e * rr;
			}
			out(i, rIdx) = Double(x);
		}
		if (b > 0) colIdx++;
		colIdx += q;
		if (rIdx == r - 1) break;
		b = e == 0 ? 0 : r - e;
		e = r0 - b;
		if (e < 0) {
			q = q0 - 1;
			e += r;
		} else {
			q = q0;
		}
		rIdx++;
	}
	assert(colIdx == col);
#endif
}
200
+
201
/**
	orthonormalize the columns of M in place, from left to right
	each column is scaled to unit norm and its component is then removed from
	every later column; a column whose norm is below 1e-5 is set to zero
	and is not used to update the later columns
	@param M [inout] matrix whose columns are rewritten
*/
template<class Matrix>
void OrthonormalizeMatrix(Matrix& M)
{
	typedef typename Matrix::Scalar Double;
	const double eps = 1e-5;
	for (int i = 0; i < M.cols(); i++) {
		const double norm = M.col(i).norm();
		if (norm < eps) {
			// (numerically) dependent column
			M.col(i).setZero();
			continue;
		}
		M.col(i) *= Double(1.0 / norm);
		for (int j = i + 1; j < M.cols(); j++) {
			const Double proj = M.col(i).dot(M.col(j));
			M.col(j) -= M.col(i) * proj;
		}
	}
}
220
+
221
/**
	open input and parse its first line, which must be
	'# (M|V) (D|S) <row> <col>' (<col> is not needed for V)
	@param isMatrix [out] true for M(matrix), false for V(vector)
	@param isSparse [out] true for S(sparse), false for D(dense)
	@param row [out] number of rows (> 0 on success)
	@param col [out] number of columns (> 0 for a matrix, 1 for a vector)
	@param ifs [inout] stream to open; on success it is positioned just after the header line
	@param input [in] path of the file to open
	@return true if the header is valid; otherwise false after reporting the reason to stderr
	@note the parsed fields are initialized before extraction so that a blank or
	      truncated header line is rejected instead of comparing indeterminate values
*/
inline bool LoadHeader(bool *isMatrix, bool *isSparse, int *row, int *col, std::ifstream& ifs, const std::string& input)
{
	ifs.open(input.c_str(), std::ios::binary);
	if (!ifs) {
		fprintf(stderr, "can't open %s\n", input.c_str());
		return false;
	}
	std::string line;
	if (std::getline(ifs, line)) {
		std::istringstream is(line);
		char c = '\0', vec = '\0', type = '\0';
		// a failed extraction leaves char targets (and, before C++11, int targets) untouched
		*row = 0;
		*col = 0;
		is >> c >> vec >> type >> *row >> *col;
		if (c != '#') {
			fprintf(stderr, "top char is #(%c)\n", c);
			goto ERR;
		}
		if (*row <= 0) {
			fprintf(stderr, "row(%d) should be positive\n", *row);
			goto ERR;
		}
		if (type != 'S' && type != 'D') {
			fprintf(stderr, "type is D(dense) or S(sparse) (%c)\n", type);
			goto ERR;
		}
		*isSparse = type == 'S';
		switch (vec) {
		case 'M':
			if (*col <= 0) {
				fprintf(stderr, "col(%d) should be positive\n", *col);
				goto ERR;
			}
			*isMatrix = true;
			break;
		case 'V':
			// a vector header carries no column count
			*col = 1;
			*isMatrix = false;
			break;
		default:
			fprintf(stderr, "vec is M(matrix) or V(vector) (%c)\n", vec);
			goto ERR;
		}
		fprintf(stderr, "input (%c, %c, %d, %d)\n", vec, type, *row, *col);
		return true;
	}
ERR:
	fprintf(stderr, "bad format top line must be '# (M|V) (D|S) <row> <col>'\n");
	return false;
}
269
+
270
+ template<class Matrix>
271
+ bool LoadMatrix(Matrix& M, const std::string& input)
272
+ {
273
+ std::ifstream ifs;
274
+ bool isMatrix = false;
275
+ bool isSparse = false;
276
+ int row = 0, col = 0;
277
+ if (!LoadHeader(&isMatrix, &isSparse, &row, &col, ifs, input) || !isMatrix) {
278
+ return false;
279
+ }
280
+ M.resize(row, col);
281
+ if (isSparse) {
282
+ for (int i = 0; i < row; i++) {
283
+ M.row(i).setZero();
284
+ std::string line;
285
+ if (!std::getline(ifs, line)) {
286
+ fprintf(stderr, "can't read %d line\n", i);
287
+ return false;
288
+ }
289
+ std::istringstream is(line);
290
+ for (;;) {
291
+ int idx;
292
+ char sep;
293
+ double v;
294
+ is >> idx >> sep >> v;
295
+ if (!is) break;
296
+ if (sep != ':' || idx < 0 || idx >= col) {
297
+ fprintf(stderr, "can't read %s\n", line.c_str());
298
+ return false;
299
+ }
300
+ M(i, idx) = typename Matrix::Scalar(v);
301
+ }
302
+ }
303
+ } else {
304
+ for (int i = 0; i < row; i++) {
305
+ for (int j = 0; j < col; j++) {
306
+ double v;
307
+ ifs >> v;
308
+ if (!ifs) {
309
+ fprintf(stderr, "can't read (%d,%d)\n", i, j);
310
+ return false;
311
+ }
312
+ M(i, j) = typename Matrix::Scalar(v);
313
+ }
314
+ }
315
+ }
316
+ return true;
317
+ }
318
+
319
+ template<class Matrix>
320
+ bool LoadSparseMatrix(Matrix& M, const std::string& input)
321
+ {
322
+ std::ifstream ifs;
323
+ bool isMatrix = false;
324
+ bool isSparse = false;
325
+ int row = 0, col = 0;
326
+ if (!LoadHeader(&isMatrix, &isSparse, &row, &col, ifs, input) || !isMatrix) {
327
+ return false;
328
+ }
329
+ if (!isSparse) {
330
+ fprintf(stderr, "ERR not sparse\n");
331
+ return false;
332
+ }
333
+ M.resize(row, col);
334
+ for (int i = 0; i < row; i++) {
335
+ std::string line;
336
+ if (!std::getline(ifs, line)) {
337
+ fprintf(stderr, "can't read %d line\n", i);
338
+ return false;
339
+ }
340
+ std::istringstream is(line);
341
+ M.startVec(i);
342
+ for (;;) {
343
+ int idx;
344
+ char sep;
345
+ double v;
346
+ is >> idx >> sep >> v;
347
+ if (!is) break;
348
+ if (sep != ':' || idx < 0 || idx >= col) {
349
+ fprintf(stderr, "can't read %s\n", line.c_str());
350
+ return false;
351
+ }
352
+ M.insertBack(i, idx) = typename Matrix::Scalar(v);
353
+ }
354
+ }
355
+ M.finalize();
356
+ return true;
357
+ }
358
+
359
+ template<class Vector>
360
+ bool LoadVector(Vector& V, const std::string& input)
361
+ {
362
+ std::ifstream ifs;
363
+ bool isMatrix = false;
364
+ bool isSparse = false;
365
+ int row = 0, col = 0;
366
+ if (!LoadHeader(&isMatrix, &isSparse, &row, &col, ifs, input) || isMatrix) {
367
+ return false;
368
+ }
369
+ V.resize(row, 1);
370
+ for (int i = 0; i < row; i++) {
371
+ double v;
372
+ ifs >> v;
373
+ if (!ifs) {
374
+ fprintf(stderr, "can't read (%d)\n", i);
375
+ return false;
376
+ }
377
+ V(i) = typename Vector::Scalar(v);
378
+ }
379
+ return true;
380
+ }
381
+
382
/**
	write M as a dense matrix in the text format
	('# M D <row> <col>' followed by one space separated line per row, precision 8)
	@param outName [in] path of the file to create
	@param M [in] matrix to write
	@return state of the output stream (good()) after the last write
*/
template<class Matrix>
bool SaveMatrix(const std::string& outName, const Matrix& M)
{
	std::ofstream out(outName.c_str(), std::ios::binary);
	out << std::setprecision(8);

	out << "# M D " << M.rows() << " " << M.cols() << std::endl;
	for (int r = 0; r < M.rows(); r++) {
		for (int c = 0; c < M.cols(); c++) {
			// separator goes between values, not after the last one
			if (c != 0) {
				out << ' ';
			}
			out << M(r, c);
		}
		out << std::endl;
	}
	const bool ok = out.good();
	return ok;
}
398
+
399
/**
	write M as a sparse matrix in the text format
	('# M S <row> <col>' followed by one line of '<col>:<value>' entries per
	outer vector, precision 8)
	@param outName [in] path of the file to create
	@param M [in] sparse matrix, traversed with Matrix::InnerIterator over outerSize() vectors
	@return state of the output stream (good()) after the last write
	@note presumably M is row-major so that each outer vector is one row - TODO confirm
*/
template<class Matrix>
bool SaveSparseMatrix(const std::string& outName, const Matrix& M)
{
	std::ofstream out(outName.c_str(), std::ios::binary);
	out << std::setprecision(8);

	out << "# M S " << M.rows() << " " << M.cols() << std::endl;
	for (int outer = 0; outer < M.outerSize(); outer++) {
		bool needSep = false;
		for (typename Matrix::InnerIterator it(M, outer); it; ++it) {
			if (needSep) {
				out << ' ';
			}
			needSep = true;
			out << it.col() << ':' << it.value();
		}
		out << std::endl;
	}
	const bool ok = out.good();
	return ok;
}
420
+
421
/**
	write V as a dense vector in the text format
	('# V D <row>' followed by one value per line, precision 8)
	@param outName [in] path of the file to create
	@param V [in] vector to write
	@return state of the output stream (good()) after the last write
*/
template<class Vector>
bool SaveVector(const std::string& outName, const Vector& V)
{
	std::ofstream out(outName.c_str(), std::ios::binary);
	out << std::setprecision(8);
	out << "# V D " << V.rows() << std::endl;
	int pos = 0;
	while (pos < V.rows()) {
		out << V(pos) << std::endl;
		pos++;
	}
	const bool ok = out.good();
	return ok;
}
432
+
433
+ } // svd
434
+
435
+ /*
436
+ approximate singular value decomposition
437
+ A = U S t(V) with rank r
438
+
439
+ t(M) : transpose of M
440
+ t(U) U = I
441
+ t(V) V = I
442
+
443
+ R : compressed unit matrix
444
+ Y = t(A) R
445
+ Y = orthonormalize(Y) ; t(Y) Y = I
446
+ B = A Y
447
+ Z = orthonormalize(B) ; t(Z) Z = I
448
+ C = t(Z) B
449
+ C = U' S t(V')
450
+ A \simeq A Y t(Y)
451
+ = B t(Y)
452
+ \simeq Z t(Z) B t(Y)
453
+ = Z C t(Y)
454
+ = Z U' S t(V') t(Y)
455
+ = (Z U') S t(YV')
456
+ = U S t(V)
457
+ */
458
+ template<class Matrix, class Matrix2, class Vector>
459
+ bool ComputeSVD(Matrix& U, Vector& S, Matrix& V, const Matrix2& A, int rank)
460
+ {
461
+ const int r = std::min<int>(static_cast<int>(std::min(A.cols(), A.rows())), rank);
462
+ if (r <= 0) return false;
463
+
464
+ #if 1
465
+ Matrix R(A.rows(), r);
466
+ // svd::InitRandomMatrix(R);
467
+ svd::InitUnitMatrix(R);
468
+ Matrix Y = A.transpose() * R;
469
+ #else
470
+ Matrix Y;
471
+ svd::CompressCol(Y, A.transpose(), r);
472
+ #endif
473
+ svd::OrthonormalizeMatrix(Y);
474
+ const Matrix B = A * Y;
475
+ Matrix Z = B;
476
+ svd::OrthonormalizeMatrix(Z);
477
+ const Matrix C = Z.transpose() * B;
478
+ const Eigen::JacobiSVD<Matrix> svd(C, Eigen::ComputeThinU | Eigen::ComputeThinV);
479
+ U = Z * svd.matrixU();
480
+ S = svd.singularValues();
481
+ V = Y * svd.matrixV();
482
+ return true;
483
+ }
484
+
485
+ } } // cybozu::nlp
486
+
@@ -0,0 +1,226 @@
1
+ #pragma once
2
+ /**
3
+ @file
4
+ @brief TF-IDF
5
+
6
+ @author MITSUNARI Shigeo(@herumi)
7
+ */
8
+ #include <set>
9
+ #include <map>
10
+ #include <string>
11
+ #include <stdio.h>
12
+ #include <cybozu/string_operation.hpp>
13
+ #include <cybozu/nlp/sparse.hpp>
14
+
15
+ namespace cybozu { namespace nlp {
16
+
17
/**
	map from word to integer id with a debug printer
*/
struct Str2Int : std::map<std::string, int> {
	/// print every entry as "<word>:<id>" to stdout, one per line, in key order
	void put() const
	{
		const_iterator cur = begin();
		const const_iterator last = end();
		while (cur != last) {
			printf("%s:%d\n", cur->first.c_str(), cur->second);
			++cur;
		}
	}
};
25
/**
	map from integer to integer with a debug printer
*/
struct Int2Int : std::map<int, int> {
	/// print every entry as "<key>:<value> " on a single line to stdout, then a newline
	void put() const
	{
		const_iterator cur = begin();
		const const_iterator last = end();
		while (cur != last) {
			printf("%d:%d ", cur->first, cur->second);
			++cur;
		}
		printf("\n");
	}
};
34
/**
	vector of strings with a debug printer
*/
struct StrVec : std::vector<std::string> {
	/// print every element as "<index>:<string>" to stdout, one per line
	void put() const
	{
		const size_t num = size();
		size_t pos = 0;
		while (pos < num) {
			printf("%d:%s\n", (int)pos, (*this)[pos].c_str());
			pos++;
		}
	}
};
42
+ typedef std::vector<double> DoubleVec;
43
+ typedef std::vector<Int2Int> Int2IntVec;
44
+ typedef std::set<std::string> StrSet;
45
+ typedef cybozu::nlp::SparseVector<double> DoubleSvec;
46
+ typedef std::vector<DoubleSvec> DoubleSvecVec;
47
+ typedef std::vector<int> IntVec;
48
+
49
+ struct Df {
50
+ struct Pair {
51
+ int id;
52
+ int freq;
53
+ Pair(int _id = 0, int _freq = 0) : id(_id), freq(_freq) { }
54
+ bool operator<(const Pair& rhs) const { return freq < rhs.freq; }
55
+ };
56
+ typedef std::vector<Pair> PairVec;
57
+ int docNum_;
58
+ Str2Int word2id_;
59
+ StrVec id2word_;
60
+ IntVec df_;
61
+ StrSet set_; // for one doc
62
+ PairVec pv_;
63
+ Df()
64
+ : docNum_(0)
65
+ {
66
+ }
67
+ void append(const std::string& word)
68
+ {
69
+ std::string lower;
70
+ cybozu::ToLower(lower, word);
71
+ std::pair<Str2Int::iterator, bool> ret = word2id_.insert(Str2Int::value_type(lower, (int)id2word_.size()));
72
+ //printf("word=%s, id=%d, ret=%d\n", ret.first->first.c_str(), ret.first->second, ret.second);
73
+ if (ret.second) {
74
+ id2word_.push_back(lower);
75
+ df_.resize(id2word_.size());
76
+ }
77
+ if (set_.insert(word).second) {
78
+ df_[ret.first->second]++;
79
+ }
80
+ }
81
+ void endDoc()
82
+ {
83
+ docNum_++;
84
+ set_.clear();
85
+ }
86
+ // sort freq order
87
+ void term(int lowerLimit = 3, double upperRateLimit = 0.98)
88
+ {
89
+ fprintf(stderr, "#doc=%d, #word=%d\n", docNum_, (int)df_.size());
90
+ for (size_t i = 0, n = id2word_.size(); i < n; i++) {
91
+ const int freq = df_[i];
92
+ if (freq <= lowerLimit) continue;
93
+ pv_.push_back(Pair(i, freq));
94
+ }
95
+ int pvNum = (int)(pv_.size() * upperRateLimit);
96
+ fprintf(stderr, "shrink %d -> %d\n", (int)pv_.size(), pvNum);
97
+ std::partial_sort(pv_.begin(), pv_.begin() + pvNum, pv_.end());
98
+ pv_.resize(pvNum);
99
+ }
100
+ };
101
+
102
+ inline std::ostream& operator<<(std::ostream& os, const Df& df)
103
+ {
104
+ const double logN = log(double(df.docNum_));
105
+ for (size_t i = 0, n = df.pv_.size(); i < n; i++) {
106
+ int freq = df.pv_[i].freq;
107
+ double idf = logN - log(double(freq));
108
+ os << df.id2word_[df.pv_[i].id] << '\t' << freq << '\t' << idf << std::endl;
109
+ }
110
+ return os;
111
+ }
112
+
113
+ struct TfIdf {
114
+ Str2Int word2id_;
115
+ StrVec id2word_;
116
+ IntVec df_;
117
+ Int2IntVec tf_;
118
+
119
+ DoubleVec idf_;
120
+ DoubleSvecVec sv_;
121
+
122
+ // work area
123
+ Int2Int *curTf_;
124
+ StrSet set_; // for one doc
125
+
126
+ TfIdf()
127
+ : curTf_(0)
128
+ {
129
+ }
130
+ bool loadKeywordFile(const std::string& keyFile)
131
+ {
132
+ std::ifstream ifs(keyFile.c_str(), std::ios::binary);
133
+ if (!ifs) return false;
134
+ std::string word;
135
+ while (std::getline(ifs, word)) {
136
+ size_t pos = word.find('\t');
137
+ if (pos == std::string::npos) break;
138
+ word.resize(pos);
139
+ std::pair<Str2Int::iterator, bool> ret = word2id_.insert(Str2Int::value_type(word, (int)id2word_.size()));
140
+ if (ret.second) {
141
+ id2word_.push_back(word);
142
+ } else {
143
+ fprintf(stderr, "ERR already set %s\n", word.c_str());
144
+ }
145
+ }
146
+ df_.resize(id2word_.size());
147
+ fprintf(stderr, "#word = %d\n", (int)df_.size());
148
+ return true;
149
+ }
150
+
151
+ void append(const std::string& word)
152
+ {
153
+ std::string lower;
154
+ cybozu::ToLower(lower, word);
155
+ Str2Int::const_iterator i = word2id_.find(lower);
156
+ if (i == word2id_.end()) return;
157
+ const int id = i->second;
158
+ if (curTf_ == 0) {
159
+ tf_.push_back(Int2Int());
160
+ curTf_ = &tf_.back();
161
+ }
162
+ (*curTf_)[id]++;
163
+ if (set_.insert(lower).second) {
164
+ df_[id]++;
165
+ }
166
+ }
167
+ void endDoc()
168
+ {
169
+ curTf_ = 0;
170
+ set_.clear();
171
+ }
172
+ void put() const
173
+ {
174
+ printf("docNum=%d\n", (int)tf_.size());
175
+ for (size_t i = 0, n = tf_.size(); i < n; i++) {
176
+ printf("%d ", (int)i);
177
+ tf_[i].put();
178
+ }
179
+ puts("word:idx");
180
+ word2id_.put();
181
+ }
182
+
183
+ void term()
184
+ {
185
+ const double logN = log(double(tf_.size()));
186
+ idf_.resize(df_.size());
187
+ for (size_t i = 0, n = df_.size(); i < n; i++) {
188
+ idf_[i] = logN - log(double(df_[i]));
189
+ }
190
+ for (size_t i = 0, n = df_.size(); i < n; i++) {
191
+ const Int2Int& iv = tf_[i];
192
+ DoubleSvec v;
193
+ for (Int2Int::const_iterator j = iv.begin(), je = iv.end(); j != je; ++j) {
194
+ v.push_back(j->first, j->second * idf_[j->first]);
195
+ }
196
+ sv_.push_back(v);
197
+ }
198
+ }
199
+ void put(int maxNum = 0x7fffffff) const
200
+ {
201
+ printf("docNum=%d, wordNum=%d\n", (int)tf_.size(), (int)df_.size());
202
+ for (int i = 0, n = std::min(maxNum, (int)sv_.size()); i < n; i++) {
203
+ const DoubleSvec& v = sv_[i];
204
+ for (DoubleSvec::const_iterator j = v.begin(), je = v.end(); j != je; ++j) {
205
+ printf("%d:%f ", (int)j->pos(), j->val());
206
+ }
207
+ printf("\n");
208
+ }
209
+ }
210
+ };
211
+
212
+ inline std::ostream& operator<<(std::ostream& os, const TfIdf& /*tfIdf*/)
213
+ {
214
+ #if 0
215
+ int num = 0;
216
+ for (TfIdf::Rank::const_iterator i = tfIdf.rank_.begin(), ie = tfIdf.rank_.end(); i != ie; ++i) {
217
+ TfIdf::Counter::const_iterator c = tfIdf.counter_.find(i->second);
218
+ assert(c != tfIdf.counter_.end());
219
+ os << i->first << ' ' << c->second.tf_ << ' ' << c->second.df_ << ' ' << i->second << std::endl;
220
+ num++;
221
+ }
222
+ #endif
223
+ return os;
224
+ }
225
+
226
+ } } // cybozu::nlp