ooxml_crypt 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (264) hide show
  1. checksums.yaml +7 -0
  2. data/Gemfile +4 -0
  3. data/LICENSE.txt +21 -0
  4. data/README.md +58 -0
  5. data/Rakefile +12 -0
  6. data/bin/console +15 -0
  7. data/bin/setup +8 -0
  8. data/ext/ooxml_crypt/extconf.rb +18 -0
  9. data/ext/ooxml_crypt/ooxml_crypt.c +27 -0
  10. data/ext/ooxml_crypt/ooxml_crypt.h +7 -0
  11. data/lib/ooxml_crypt/version.rb +5 -0
  12. data/lib/ooxml_crypt.rb +75 -0
  13. data/vendor/cybozulib/.github/workflows/main.yml +12 -0
  14. data/vendor/cybozulib/.gitignore +5 -0
  15. data/vendor/cybozulib/CMakeLists.txt +6 -0
  16. data/vendor/cybozulib/COPYRIGHT +27 -0
  17. data/vendor/cybozulib/Makefile +26 -0
  18. data/vendor/cybozulib/bin/libeay32.dll +0 -0
  19. data/vendor/cybozulib/bin/libmecab.dll +0 -0
  20. data/vendor/cybozulib/bin/ssleay32.dll +0 -0
  21. data/vendor/cybozulib/common.mk +116 -0
  22. data/vendor/cybozulib/common.props +25 -0
  23. data/vendor/cybozulib/cybozulib.sln +286 -0
  24. data/vendor/cybozulib/debug.props +14 -0
  25. data/vendor/cybozulib/include/cybozu/array.hpp +197 -0
  26. data/vendor/cybozulib/include/cybozu/atoi.hpp +238 -0
  27. data/vendor/cybozulib/include/cybozu/atomic.hpp +146 -0
  28. data/vendor/cybozulib/include/cybozu/base64.hpp +210 -0
  29. data/vendor/cybozulib/include/cybozu/benchmark.hpp +212 -0
  30. data/vendor/cybozulib/include/cybozu/bfd.hpp +105 -0
  31. data/vendor/cybozulib/include/cybozu/bit_operation.hpp +139 -0
  32. data/vendor/cybozulib/include/cybozu/bitvector.hpp +358 -0
  33. data/vendor/cybozulib/include/cybozu/condition_variable.hpp +113 -0
  34. data/vendor/cybozulib/include/cybozu/condition_variable_cs.hpp +74 -0
  35. data/vendor/cybozulib/include/cybozu/config.hpp +392 -0
  36. data/vendor/cybozulib/include/cybozu/critical_section.hpp +60 -0
  37. data/vendor/cybozulib/include/cybozu/crypto.hpp +321 -0
  38. data/vendor/cybozulib/include/cybozu/csucvector.hpp +624 -0
  39. data/vendor/cybozulib/include/cybozu/csv.hpp +294 -0
  40. data/vendor/cybozulib/include/cybozu/data_type.hpp +27 -0
  41. data/vendor/cybozulib/include/cybozu/endian.hpp +224 -0
  42. data/vendor/cybozulib/include/cybozu/env.hpp +63 -0
  43. data/vendor/cybozulib/include/cybozu/event.hpp +122 -0
  44. data/vendor/cybozulib/include/cybozu/exception.hpp +253 -0
  45. data/vendor/cybozulib/include/cybozu/file.hpp +626 -0
  46. data/vendor/cybozulib/include/cybozu/fmindex.hpp +291 -0
  47. data/vendor/cybozulib/include/cybozu/format.hpp +93 -0
  48. data/vendor/cybozulib/include/cybozu/frequency.hpp +264 -0
  49. data/vendor/cybozulib/include/cybozu/hash.hpp +67 -0
  50. data/vendor/cybozulib/include/cybozu/inttype.hpp +174 -0
  51. data/vendor/cybozulib/include/cybozu/itoa.hpp +336 -0
  52. data/vendor/cybozulib/include/cybozu/json.hpp +120 -0
  53. data/vendor/cybozulib/include/cybozu/line_stream.hpp +149 -0
  54. data/vendor/cybozulib/include/cybozu/link_libeay32.hpp +21 -0
  55. data/vendor/cybozulib/include/cybozu/link_mpir.hpp +18 -0
  56. data/vendor/cybozulib/include/cybozu/link_ssleay32.hpp +19 -0
  57. data/vendor/cybozulib/include/cybozu/log.hpp +237 -0
  58. data/vendor/cybozulib/include/cybozu/minixml.hpp +452 -0
  59. data/vendor/cybozulib/include/cybozu/mmap.hpp +143 -0
  60. data/vendor/cybozulib/include/cybozu/mutex.hpp +144 -0
  61. data/vendor/cybozulib/include/cybozu/nlp/mecab.hpp +96 -0
  62. data/vendor/cybozulib/include/cybozu/nlp/plsi.hpp +315 -0
  63. data/vendor/cybozulib/include/cybozu/nlp/random.hpp +74 -0
  64. data/vendor/cybozulib/include/cybozu/nlp/sparse.hpp +529 -0
  65. data/vendor/cybozulib/include/cybozu/nlp/svd.hpp +486 -0
  66. data/vendor/cybozulib/include/cybozu/nlp/tfidf.hpp +226 -0
  67. data/vendor/cybozulib/include/cybozu/nlp/top_score.hpp +75 -0
  68. data/vendor/cybozulib/include/cybozu/option.hpp +743 -0
  69. data/vendor/cybozulib/include/cybozu/parallel.hpp +88 -0
  70. data/vendor/cybozulib/include/cybozu/pcg.hpp +72 -0
  71. data/vendor/cybozulib/include/cybozu/process.hpp +324 -0
  72. data/vendor/cybozulib/include/cybozu/quit_signal_handler.hpp +66 -0
  73. data/vendor/cybozulib/include/cybozu/random_generator.hpp +144 -0
  74. data/vendor/cybozulib/include/cybozu/regex.hpp +463 -0
  75. data/vendor/cybozulib/include/cybozu/select8.hpp +279 -0
  76. data/vendor/cybozulib/include/cybozu/serializer.hpp +363 -0
  77. data/vendor/cybozulib/include/cybozu/sha1.hpp +209 -0
  78. data/vendor/cybozulib/include/cybozu/sha2.hpp +506 -0
  79. data/vendor/cybozulib/include/cybozu/siphash.hpp +105 -0
  80. data/vendor/cybozulib/include/cybozu/socket.hpp +785 -0
  81. data/vendor/cybozulib/include/cybozu/ssl.hpp +203 -0
  82. data/vendor/cybozulib/include/cybozu/stacktrace.hpp +291 -0
  83. data/vendor/cybozulib/include/cybozu/stream.hpp +269 -0
  84. data/vendor/cybozulib/include/cybozu/string.hpp +1746 -0
  85. data/vendor/cybozulib/include/cybozu/string_operation.hpp +365 -0
  86. data/vendor/cybozulib/include/cybozu/sucvector.hpp +378 -0
  87. data/vendor/cybozulib/include/cybozu/test.hpp +373 -0
  88. data/vendor/cybozulib/include/cybozu/thread.hpp +229 -0
  89. data/vendor/cybozulib/include/cybozu/time.hpp +281 -0
  90. data/vendor/cybozulib/include/cybozu/tls.hpp +115 -0
  91. data/vendor/cybozulib/include/cybozu/unordered_map.hpp +13 -0
  92. data/vendor/cybozulib/include/cybozu/unordered_set.hpp +13 -0
  93. data/vendor/cybozulib/include/cybozu/v128.hpp +376 -0
  94. data/vendor/cybozulib/include/cybozu/wavelet_matrix.hpp +345 -0
  95. data/vendor/cybozulib/include/cybozu/xorshift.hpp +189 -0
  96. data/vendor/cybozulib/include/cybozu/zlib.hpp +325 -0
  97. data/vendor/cybozulib/include/sais.hxx +364 -0
  98. data/vendor/cybozulib/misc/make_select8tbl.cpp +26 -0
  99. data/vendor/cybozulib/mk.bat +37 -0
  100. data/vendor/cybozulib/readme.md +29 -0
  101. data/vendor/cybozulib/release.props +12 -0
  102. data/vendor/cybozulib/sample/Makefile +30 -0
  103. data/vendor/cybozulib/sample/csucvector_smpl.cpp +42 -0
  104. data/vendor/cybozulib/sample/data/svd/org/test1.S +4 -0
  105. data/vendor/cybozulib/sample/data/svd/org/test1.U +4 -0
  106. data/vendor/cybozulib/sample/data/svd/org/test1.V +6 -0
  107. data/vendor/cybozulib/sample/data/svd/test1 +4 -0
  108. data/vendor/cybozulib/sample/data/svd/test2 +4 -0
  109. data/vendor/cybozulib/sample/desymbol.cpp +127 -0
  110. data/vendor/cybozulib/sample/exception_smpl.cpp +46 -0
  111. data/vendor/cybozulib/sample/fmindex_smpl.cpp +231 -0
  112. data/vendor/cybozulib/sample/log_smpl.cpp +19 -0
  113. data/vendor/cybozulib/sample/mecab_smpl.cpp +37 -0
  114. data/vendor/cybozulib/sample/option2_smpl.cpp +68 -0
  115. data/vendor/cybozulib/sample/option_smpl.cpp +42 -0
  116. data/vendor/cybozulib/sample/plsi_smpl.cpp +207 -0
  117. data/vendor/cybozulib/sample/proj/exception_smpl.vcproj +184 -0
  118. data/vendor/cybozulib/sample/proj/mecab_smpl.vcproj +184 -0
  119. data/vendor/cybozulib/sample/proj/ssl_smpl/ssl_smpl.vcxproj +85 -0
  120. data/vendor/cybozulib/sample/proj/ssl_smpl.vcproj +347 -0
  121. data/vendor/cybozulib/sample/proj/stacktrace_smpl/stacktrace_smpl.vcxproj +85 -0
  122. data/vendor/cybozulib/sample/proj/svd_smpl.vcproj +184 -0
  123. data/vendor/cybozulib/sample/quit_signal_handler.cpp +30 -0
  124. data/vendor/cybozulib/sample/serializer_smpl.cpp +196 -0
  125. data/vendor/cybozulib/sample/socket_smpl.cpp +82 -0
  126. data/vendor/cybozulib/sample/ssl_smpl.cpp +39 -0
  127. data/vendor/cybozulib/sample/stacktrace_smpl.cpp +52 -0
  128. data/vendor/cybozulib/sample/svd_bench_smpl.cpp +143 -0
  129. data/vendor/cybozulib/sample/svd_smpl.cpp +94 -0
  130. data/vendor/cybozulib/sample/wm_bench_smpl.cpp +182 -0
  131. data/vendor/cybozulib/sample/zlib_smpl.cpp +41 -0
  132. data/vendor/cybozulib/src/Makefile +8 -0
  133. data/vendor/cybozulib/src/base/Makefile +19 -0
  134. data/vendor/cybozulib/test/Makefile +12 -0
  135. data/vendor/cybozulib/test/base/Makefile +37 -0
  136. data/vendor/cybozulib/test/base/array_test.cpp +173 -0
  137. data/vendor/cybozulib/test/base/atoi_test.cpp +774 -0
  138. data/vendor/cybozulib/test/base/atomic_test.cpp +49 -0
  139. data/vendor/cybozulib/test/base/base64_test.cpp +113 -0
  140. data/vendor/cybozulib/test/base/bit_operation_test.cpp +134 -0
  141. data/vendor/cybozulib/test/base/bitvector_test.cpp +204 -0
  142. data/vendor/cybozulib/test/base/condition_variable_cs_test.cpp +92 -0
  143. data/vendor/cybozulib/test/base/condition_variable_test.cpp +88 -0
  144. data/vendor/cybozulib/test/base/config_test.cpp +236 -0
  145. data/vendor/cybozulib/test/base/crypto_test.cpp +122 -0
  146. data/vendor/cybozulib/test/base/csucvector_test.cpp +63 -0
  147. data/vendor/cybozulib/test/base/csv_test.cpp +182 -0
  148. data/vendor/cybozulib/test/base/data/a.xml +26 -0
  149. data/vendor/cybozulib/test/base/endian_test.cpp +56 -0
  150. data/vendor/cybozulib/test/base/env_test.cpp +22 -0
  151. data/vendor/cybozulib/test/base/event_test.cpp +41 -0
  152. data/vendor/cybozulib/test/base/file_test.cpp +233 -0
  153. data/vendor/cybozulib/test/base/fmindex_test.cpp +118 -0
  154. data/vendor/cybozulib/test/base/format_test.cpp +12 -0
  155. data/vendor/cybozulib/test/base/frequency_test.cpp +104 -0
  156. data/vendor/cybozulib/test/base/itoa_test.cpp +522 -0
  157. data/vendor/cybozulib/test/base/line_stream_test.cpp +208 -0
  158. data/vendor/cybozulib/test/base/mecab_test.cpp +41 -0
  159. data/vendor/cybozulib/test/base/minixml_test.cpp +103 -0
  160. data/vendor/cybozulib/test/base/mmap_test.cpp +15 -0
  161. data/vendor/cybozulib/test/base/option_test.cpp +487 -0
  162. data/vendor/cybozulib/test/base/parallel_test.cpp +48 -0
  163. data/vendor/cybozulib/test/base/proj/array_test/array_test.vcxproj +86 -0
  164. data/vendor/cybozulib/test/base/proj/atoi_test/atoi_test.vcxproj +86 -0
  165. data/vendor/cybozulib/test/base/proj/atomic_test/atomic_test.vcxproj +86 -0
  166. data/vendor/cybozulib/test/base/proj/base64_test/base64_test.vcxproj +86 -0
  167. data/vendor/cybozulib/test/base/proj/condition_variable_cs_test/condition_variable_cs_test.vcxproj +86 -0
  168. data/vendor/cybozulib/test/base/proj/condition_variable_test/condition_variable_test.vcxproj +86 -0
  169. data/vendor/cybozulib/test/base/proj/config_test/config_test.vcxproj +86 -0
  170. data/vendor/cybozulib/test/base/proj/csv_test/csv_test.vcxproj +86 -0
  171. data/vendor/cybozulib/test/base/proj/endian_test/endian_test.vcxproj +86 -0
  172. data/vendor/cybozulib/test/base/proj/env_test/env_test.vcxproj +86 -0
  173. data/vendor/cybozulib/test/base/proj/event_test/event_test.vcxproj +86 -0
  174. data/vendor/cybozulib/test/base/proj/file_test/file_test.vcxproj +86 -0
  175. data/vendor/cybozulib/test/base/proj/itoa_test/itoa_test.vcxproj +86 -0
  176. data/vendor/cybozulib/test/base/proj/mecab_test/mecab_test.vcxproj +88 -0
  177. data/vendor/cybozulib/test/base/proj/minixml_test/minixml_test.vcxproj +86 -0
  178. data/vendor/cybozulib/test/base/proj/mmap_test/mmap_test.vcxproj +86 -0
  179. data/vendor/cybozulib/test/base/proj/serializer_test/serializer_test.vcxproj +86 -0
  180. data/vendor/cybozulib/test/base/proj/sha1_test/sha1_test.vcxproj +86 -0
  181. data/vendor/cybozulib/test/base/proj/stream_test/stream_test.vcxproj +86 -0
  182. data/vendor/cybozulib/test/base/proj/string_operation_test/string_operation_test.vcxproj +86 -0
  183. data/vendor/cybozulib/test/base/proj/string_test/string_test.vcxproj +86 -0
  184. data/vendor/cybozulib/test/base/proj/thread_test/thread_test.vcxproj +86 -0
  185. data/vendor/cybozulib/test/base/proj/time_test/time_test.vcxproj +86 -0
  186. data/vendor/cybozulib/test/base/proj/tls_test/tls_test.vcxproj +86 -0
  187. data/vendor/cybozulib/test/base/proj/zlib_test/zlib_test.vcxproj +86 -0
  188. data/vendor/cybozulib/test/base/random_generator_test.cpp +28 -0
  189. data/vendor/cybozulib/test/base/regex_test.cpp +74 -0
  190. data/vendor/cybozulib/test/base/serializer_test.cpp +483 -0
  191. data/vendor/cybozulib/test/base/sha1_test.cpp +61 -0
  192. data/vendor/cybozulib/test/base/sha2_test.cpp +191 -0
  193. data/vendor/cybozulib/test/base/siphash_test.cpp +33 -0
  194. data/vendor/cybozulib/test/base/socket_test.cpp +76 -0
  195. data/vendor/cybozulib/test/base/stream_test.cpp +101 -0
  196. data/vendor/cybozulib/test/base/string_operation_test.cpp +340 -0
  197. data/vendor/cybozulib/test/base/string_test.cpp +1705 -0
  198. data/vendor/cybozulib/test/base/sucvector_test.cpp +312 -0
  199. data/vendor/cybozulib/test/base/thread_test.cpp +62 -0
  200. data/vendor/cybozulib/test/base/time_test.cpp +164 -0
  201. data/vendor/cybozulib/test/base/tls_test.cpp +50 -0
  202. data/vendor/cybozulib/test/base/wavelet_matrix_test.cpp +145 -0
  203. data/vendor/cybozulib/test/base/zlib_test.cpp +371 -0
  204. data/vendor/cybozulib/test/nlp/Makefile +27 -0
  205. data/vendor/cybozulib/test/nlp/proj/random_test.vcproj +184 -0
  206. data/vendor/cybozulib/test/nlp/proj/sparse_test.vcproj +184 -0
  207. data/vendor/cybozulib/test/nlp/proj/svd_test.vcproj +184 -0
  208. data/vendor/cybozulib/test/nlp/random_test.cpp +62 -0
  209. data/vendor/cybozulib/test/nlp/sparse_test.cpp +347 -0
  210. data/vendor/cybozulib/test/nlp/svd_test.cpp +234 -0
  211. data/vendor/cybozulib/test/nlp/top_score_test.cpp +40 -0
  212. data/vendor/cybozulib/tool/create_vcproj.py +186 -0
  213. data/vendor/cybozulib/tool/vcproj_tmpl.py +185 -0
  214. data/vendor/msoffice/COPYRIGHT +27 -0
  215. data/vendor/msoffice/Makefile +29 -0
  216. data/vendor/msoffice/bin/64/msoc.dll +0 -0
  217. data/vendor/msoffice/bin/64/msocsample.exe +0 -0
  218. data/vendor/msoffice/bin/64/msoffice-crypt.exe +0 -0
  219. data/vendor/msoffice/bin/msoc.dll +0 -0
  220. data/vendor/msoffice/bin/msocsample.exe +0 -0
  221. data/vendor/msoffice/bin/msoffice-crypt.exe +0 -0
  222. data/vendor/msoffice/common.mk +71 -0
  223. data/vendor/msoffice/common.props +26 -0
  224. data/vendor/msoffice/debug.props +14 -0
  225. data/vendor/msoffice/include/attack.hpp +211 -0
  226. data/vendor/msoffice/include/cfb.hpp +777 -0
  227. data/vendor/msoffice/include/crypto_util.hpp +450 -0
  228. data/vendor/msoffice/include/custom_sha1.hpp +342 -0
  229. data/vendor/msoffice/include/decode.hpp +240 -0
  230. data/vendor/msoffice/include/encode.hpp +221 -0
  231. data/vendor/msoffice/include/make_dataspace.hpp +316 -0
  232. data/vendor/msoffice/include/msoc.h +129 -0
  233. data/vendor/msoffice/include/resource.hpp +7 -0
  234. data/vendor/msoffice/include/standard_encryption.hpp +145 -0
  235. data/vendor/msoffice/include/uint32vec.hpp +179 -0
  236. data/vendor/msoffice/include/util.hpp +212 -0
  237. data/vendor/msoffice/lib/.emptydir +0 -0
  238. data/vendor/msoffice/misc/decrypt-xls.vbs +46 -0
  239. data/vendor/msoffice/mk.bat +1 -0
  240. data/vendor/msoffice/mkdll.bat +3 -0
  241. data/vendor/msoffice/msoc.def +13 -0
  242. data/vendor/msoffice/msocsample.py +178 -0
  243. data/vendor/msoffice/msoffice12.sln +31 -0
  244. data/vendor/msoffice/readme.md +110 -0
  245. data/vendor/msoffice/release.props +28 -0
  246. data/vendor/msoffice/src/Makefile +19 -0
  247. data/vendor/msoffice/src/attack.cpp +124 -0
  248. data/vendor/msoffice/src/cfb_test.cpp +77 -0
  249. data/vendor/msoffice/src/minisample.c +54 -0
  250. data/vendor/msoffice/src/msocdll.cpp +276 -0
  251. data/vendor/msoffice/src/msocsample.c +136 -0
  252. data/vendor/msoffice/src/msoffice-crypt.cpp +219 -0
  253. data/vendor/msoffice/src/proj/attack/attack.vcxproj +88 -0
  254. data/vendor/msoffice/src/proj/main/msoffice-crypt.vcxproj +88 -0
  255. data/vendor/msoffice/src/sha1.cpp +234 -0
  256. data/vendor/msoffice/test/Makefile +20 -0
  257. data/vendor/msoffice/test/cfb_test.cpp +74 -0
  258. data/vendor/msoffice/test/hash_test.cpp +59 -0
  259. data/vendor/msoffice/test/proj/cfb/cfb_test.vcxproj +90 -0
  260. data/vendor/msoffice/test/proj/hash/hash_test.vcxproj +90 -0
  261. data/vendor/msoffice/test/sampl.bat +8 -0
  262. data/vendor/msoffice/test_all.py +46 -0
  263. data/vendor/update +4 -0
  264. metadata +351 -0
@@ -0,0 +1,486 @@
1
+ #pragma once
2
+ /**
3
+ @file
4
+ @brief fast non-probabilistic SVD
5
+
6
+ @author MITSUNARI Shigeo(@herumi)
7
+ @author MITSUNARI Shigeo
8
+ */
9
+ #include <assert.h>
10
+ #include <vector>
11
+ #include <string>
12
+ #include <fstream>
13
+ #include <sstream>
14
+ #include <iomanip>
15
+ //#define CYBOZU_NLP_SVD_USE_RANDOM
16
+ #ifdef CYBOZU_NLP_SVD_USE_RANDOM
17
+ #include <cybozu/nlp/random.hpp>
18
+ #endif
19
+ #ifdef _MSC_VER
20
+ #pragma warning(push)
21
+ #pragma warning(disable : 4714) // force inline
22
+ #endif
23
+ #define EIGEN_YES_I_KNOW_SPARSE_MODULE_IS_NOT_STABLE_YET
24
+ #include <eigen3/Eigen/Sparse>
25
+ #include <eigen3/Eigen/Dense>
26
+ #include <eigen3/Eigen/Eigenvalues>
27
+ #ifdef _MSC_VER
28
+ // #pragma warning(pop)
29
+ #endif
30
+
31
+ /***
32
+ text format
33
+
34
+ Matrix(dense)
35
+ ---
36
+ # M D <row> <col>
37
+ data1_1 data1_2 data1_3 ...
38
+ data2_1 data2_2 ...
39
+ ....
40
+ ---
41
+
42
+ Matrix(sparse)
43
+ ---
44
+ # M S <row> <col>
45
+ c1:data1_c1 c2:data1_c2 c3:data1_c3 ...
46
+ c1:data2_c1 c2:data2_c2 c3:data2_c3 ...
47
+ ....
48
+ ---
49
+
50
+ ex.
51
+ M = (1.0 2.0 3.0)
52
+ (1.2 2.4 3.5)
53
+ ---
54
+ # M D 2 3
55
+ 1.0 2.0 3.0
56
+ 1.2 2.4 3.5
57
+ ---
58
+
59
+ M = (1.0 0 3.0)
60
+ (0 4.2 0 )
61
+ ---
62
+ # M S 2 3
63
+ 0:1.0 2:3.0
64
+ 1:4.2
65
+ ---
66
+ */
67
+ namespace cybozu { namespace nlp {
68
+
69
+ namespace svd {
70
+
71
+ #ifdef CYBOZU_NLP_SVD_USE_RANDOM
72
+ template<class Matrix>
73
+ void InitRandomMatrix(Matrix& M)
74
+ {
75
+ cybozu::nlp::NormalRandomGenerator r;
76
+ for (int i = 0; i < M.rows(); i++) {
77
+ for (int j = 0; j < M.cols(); j++) {
78
+ M(i, j) = typename Matrix::Scalar(r.get());
79
+ }
80
+ }
81
+ }
82
+ #endif
83
+
84
/**
	make M a "compressed unit matrix" : M is cleared and then every row i
	gets a single 1 in column floor(i * col / row), so the rows are spread
	evenly over the columns
	@param M [inout] matrix of size (row, col) ; col <= row is asserted
	@note the column index is computed with a 64-bit intermediate because
	i * col does not fit in int for tall matrices
*/
template<class Matrix>
void InitUnitMatrix(Matrix& M)
{
	M.setZero();
	const int row = M.rows();
	const int col = M.cols();
	assert(col <= row);
#if 1
	const int adj = 0;//(col & 1) ? row/2 : 0;
	for (int i = 0; i < row; i++) {
		// the quotient is always < col, so narrowing it back to int is safe
		const long long pos = (static_cast<long long>(i) * col + adj) / row;
		M(i, static_cast<int>(pos)) = 1;
	}
#else
	// disabled variant : fractional weights on the boundary rows
	typedef typename Matrix::Scalar Double;
	const int q0 = row / col;
	const int r0 = row % col;
	const double rcol = 1.0 / col;
	int b = 0;
	int q = q0;
	int e = r0;
	int rowIdx = 0;
	int colIdx = 0;
	for (;;) {
		if (b > 0) {
			M(rowIdx, colIdx) = Double(b * rcol);
			rowIdx++;
		}
		for (int j = 0; j < q; j++) {
			M(rowIdx, colIdx) = 1;
			rowIdx++;
		}
		if (e > 0) {
			M(rowIdx, colIdx) = Double(e * rcol);
		}
		if (colIdx == col - 1) break;
		b = e == 0 ? 0 : col - e;
		e = r0 - b;
		if (e < 0) {
			q = q0 - 1;
			e += col;
		} else {
			q = q0;
		}
		colIdx++;
	}
	assert(rowIdx == row);
#endif
}
132
+ /*
133
+ m(row, col) => M(row, r)
134
+ r <= col
135
+ */
136
/**
	shrink the columns of m from col to r by adding up runs of adjacent columns
	out(i, j) = sum of m(i, k) for k in [ceil(j * col / r), ceil((j + 1) * col / r))
	@param out [out] resized to (m.rows(), r)
	@param m [in] source matrix
	@param r [in] number of output columns ; r <= m.cols() is asserted
*/
template<class Matrix1, class Matrix2>
void CompressCol(Matrix1& out, const Matrix2& m, int r)
{
	typedef typename Matrix1::Scalar Double;
	const int row = m.rows();
	const int col = m.cols();
	assert(r <= col);
	out.resize(row, r);
#if 1
	int first = 0;
	for (int dst = 0; dst < r; dst++) {
		// upper bound of this run, rounded up and clamped to the column count
		const int bound = ((dst + 1) * col + r - 1) / r;
		const int last = bound < col ? bound : col;
		for (int y = 0; y < row; y++) {
			double sum = 0;
			for (int src = first; src < last; src++) {
				sum += m(y, src);
			}
			out(y, dst) = Double(sum);
		}
		first = last;
	}
#else
	// disabled variant : boundary columns are shared with fractional weights
	const int q0 = col / r;
	const int r0 = col % r;
	const double rr = 1.0 / r;
	int b = 0;
	int q = q0;
	int e = r0;
	int colIdx = 0;
	int rIdx = 0;
	for (;;) {
		for (int i = 0; i < row; i++) {
			double x = 0;
			int k = colIdx;
			if (b > 0) {
				x += m(i, k) * b * rr;
				k++;
			}
			for (int j = 0; j < q; j++) {
				x += m(i, k);
				k++;
			}
			if (e > 0) {
				x += m(i, k) * e * rr;
			}
			out(i, rIdx) = Double(x);
		}
		if (b > 0) colIdx++;
		colIdx += q;
		if (rIdx == r - 1) break;
		b = e == 0 ? 0 : r - e;
		e = r0 - b;
		if (e < 0) {
			q = q0 - 1;
			e += r;
		} else {
			q = q0;
		}
		rIdx++;
	}
	assert(colIdx == col);
#endif
}
200
+
201
/**
	orthonormalize the columns of M in place, from left to right
	each column is scaled to unit length and its component is then removed
	from every column to its right ; a column whose norm is below 1e-5 is
	set to zero
	@param M [inout] matrix whose columns are processed
*/
template<class Matrix>
void OrthonormalizeMatrix(Matrix& M)
{
	typedef typename Matrix::Scalar Double;
	const double eps = 1e-5;
	for (int cur = 0; cur < M.cols(); cur++) {
		const double len = M.col(cur).norm();
		if (len < eps) {
			// too short to define a direction
			M.col(cur).setZero();
			continue;
		}
		const Double scale = Double(1.0 / len);
		M.col(cur) *= scale;
		for (int rest = cur + 1; rest < M.cols(); rest++) {
			const Double proj = M.col(cur).dot(M.col(rest));
			M.col(rest) -= M.col(cur) * proj;
		}
	}
}
220
+
221
/**
	open input and parse its first line, "# (M|V) (D|S) <row> [<col>]"
	@param isMatrix [out] true for 'M', false for 'V'
	@param isSparse [out] true for 'S', false for 'D'
	@param row [out] number of rows ; must be positive
	@param col [out] number of columns ; must be positive for a matrix, set to 1 for a vector
	@param ifs [out] stream opened on input ; on success it is positioned after the header line
	@param input [in] file name
	@return true if the file can be opened and the header is valid
	@note every failure is reported on stderr
*/
inline bool LoadHeader(bool *isMatrix, bool *isSparse, int *row, int *col, std::ifstream& ifs, const std::string& input)
{
	ifs.open(input.c_str(), std::ios::binary);
	if (!ifs) {
		fprintf(stderr, "can't open %s\n", input.c_str());
		return false;
	}
	std::string line;
	if (std::getline(ifs, line)) {
		std::istringstream is(line);
		// an empty or truncated line leaves the trailing fields unread, so
		// start from values that are defined and rejected by the checks below
		char c = '?', vec = '?', type = '?';
		*row = 0;
		*col = 0;
		is >> c >> vec >> type >> *row >> *col;
		if (c != '#') {
			fprintf(stderr, "top char is #(%c)\n", c);
			goto ERR;
		}
		if (*row <= 0) {
			fprintf(stderr, "row(%d) should be positive\n", *row);
			goto ERR;
		}
		if (type != 'S' && type != 'D') {
			fprintf(stderr, "type is D(dense) or S(sparse) (%c)\n", type);
			goto ERR;
		}
		*isSparse = type == 'S';
		switch (vec) {
		case 'M':
			if (*col <= 0) {
				fprintf(stderr, "col(%d) should be positive\n", *col);
				goto ERR;
			}
			*isMatrix = true;
			break;
		case 'V':
			// a vector header carries no column count
			*col = 1;
			*isMatrix = false;
			break;
		default:
			fprintf(stderr, "vec is M(matrix) or V(vector) (%c)\n", vec);
			goto ERR;
		}
		fprintf(stderr, "input (%c, %c, %d, %d)\n", vec, type, *row, *col);
		return true;
	}
ERR:
	fprintf(stderr, "bad format top line must be '# (M|V) (D|S) <row> <col>'\n");
	return false;
}
269
+
270
+ template<class Matrix>
271
+ bool LoadMatrix(Matrix& M, const std::string& input)
272
+ {
273
+ std::ifstream ifs;
274
+ bool isMatrix = false;
275
+ bool isSparse = false;
276
+ int row = 0, col = 0;
277
+ if (!LoadHeader(&isMatrix, &isSparse, &row, &col, ifs, input) || !isMatrix) {
278
+ return false;
279
+ }
280
+ M.resize(row, col);
281
+ if (isSparse) {
282
+ for (int i = 0; i < row; i++) {
283
+ M.row(i).setZero();
284
+ std::string line;
285
+ if (!std::getline(ifs, line)) {
286
+ fprintf(stderr, "can't read %d line\n", i);
287
+ return false;
288
+ }
289
+ std::istringstream is(line);
290
+ for (;;) {
291
+ int idx;
292
+ char sep;
293
+ double v;
294
+ is >> idx >> sep >> v;
295
+ if (!is) break;
296
+ if (sep != ':' || idx < 0 || idx >= col) {
297
+ fprintf(stderr, "can't read %s\n", line.c_str());
298
+ return false;
299
+ }
300
+ M(i, idx) = typename Matrix::Scalar(v);
301
+ }
302
+ }
303
+ } else {
304
+ for (int i = 0; i < row; i++) {
305
+ for (int j = 0; j < col; j++) {
306
+ double v;
307
+ ifs >> v;
308
+ if (!ifs) {
309
+ fprintf(stderr, "can't read (%d,%d)\n", i, j);
310
+ return false;
311
+ }
312
+ M(i, j) = typename Matrix::Scalar(v);
313
+ }
314
+ }
315
+ }
316
+ return true;
317
+ }
318
+
319
+ template<class Matrix>
320
+ bool LoadSparseMatrix(Matrix& M, const std::string& input)
321
+ {
322
+ std::ifstream ifs;
323
+ bool isMatrix = false;
324
+ bool isSparse = false;
325
+ int row = 0, col = 0;
326
+ if (!LoadHeader(&isMatrix, &isSparse, &row, &col, ifs, input) || !isMatrix) {
327
+ return false;
328
+ }
329
+ if (!isSparse) {
330
+ fprintf(stderr, "ERR not sparse\n");
331
+ return false;
332
+ }
333
+ M.resize(row, col);
334
+ for (int i = 0; i < row; i++) {
335
+ std::string line;
336
+ if (!std::getline(ifs, line)) {
337
+ fprintf(stderr, "can't read %d line\n", i);
338
+ return false;
339
+ }
340
+ std::istringstream is(line);
341
+ M.startVec(i);
342
+ for (;;) {
343
+ int idx;
344
+ char sep;
345
+ double v;
346
+ is >> idx >> sep >> v;
347
+ if (!is) break;
348
+ if (sep != ':' || idx < 0 || idx >= col) {
349
+ fprintf(stderr, "can't read %s\n", line.c_str());
350
+ return false;
351
+ }
352
+ M.insertBack(i, idx) = typename Matrix::Scalar(v);
353
+ }
354
+ }
355
+ M.finalize();
356
+ return true;
357
+ }
358
+
359
+ template<class Vector>
360
+ bool LoadVector(Vector& V, const std::string& input)
361
+ {
362
+ std::ifstream ifs;
363
+ bool isMatrix = false;
364
+ bool isSparse = false;
365
+ int row = 0, col = 0;
366
+ if (!LoadHeader(&isMatrix, &isSparse, &row, &col, ifs, input) || isMatrix) {
367
+ return false;
368
+ }
369
+ V.resize(row, 1);
370
+ for (int i = 0; i < row; i++) {
371
+ double v;
372
+ ifs >> v;
373
+ if (!ifs) {
374
+ fprintf(stderr, "can't read (%d)\n", i);
375
+ return false;
376
+ }
377
+ V(i) = typename Vector::Scalar(v);
378
+ }
379
+ return true;
380
+ }
381
+
382
/**
	write M in the dense text format : a "# M D <row> <col>" header followed
	by one line per row with the values separated by a space
	values are printed with 8 significant digits
	@param outName [in] file name
	@param M [in] matrix to save
	@return state of the stream (good()) after the last write
*/
template<class Matrix>
bool SaveMatrix(const std::string& outName, const Matrix& M)
{
	std::ofstream ofs(outName.c_str(), std::ios::binary);
	ofs << std::setprecision(8);

	ofs << "# M D " << M.rows() << " " << M.cols() << std::endl;
	for (int y = 0; y < M.rows(); y++) {
		const char *sep = "";
		for (int x = 0; x < M.cols(); x++) {
			ofs << sep << M(y, x);
			sep = " ";
		}
		ofs << std::endl;
	}
	return ofs.good();
}
398
+
399
/**
	write M in the sparse text format : a "# M S <row> <col>" header followed
	by one line per outer index listing its stored entries as "<col>:<value>"
	separated by a space
	values are printed with 8 significant digits
	@param outName [in] file name
	@param M [in] sparse matrix providing outerSize() and InnerIterator
	@return state of the stream (good()) after the last write
*/
template<class Matrix>
bool SaveSparseMatrix(const std::string& outName, const Matrix& M)
{
	std::ofstream ofs(outName.c_str(), std::ios::binary);
	ofs << std::setprecision(8);

	ofs << "# M S " << M.rows() << " " << M.cols() << std::endl;
	for (int outer = 0; outer < M.outerSize(); outer++) {
		const char *sep = "";
		for (typename Matrix::InnerIterator it(M, outer); it; ++it) {
			ofs << sep << it.col() << ':' << it.value();
			sep = " ";
		}
		ofs << std::endl;
	}
	return ofs.good();
}
420
+
421
/**
	write V in the text format : a "# V D <row>" header followed by one
	value per line, printed with 8 significant digits
	@param outName [in] file name
	@param V [in] vector to save
	@return state of the stream (good()) after the last write
*/
template<class Vector>
bool SaveVector(const std::string& outName, const Vector& V)
{
	std::ofstream ofs(outName.c_str(), std::ios::binary);
	ofs << std::setprecision(8);
	ofs << "# V D " << V.rows() << std::endl;
	int pos = 0;
	while (pos < V.rows()) {
		ofs << V(pos) << std::endl;
		pos++;
	}
	return ofs.good();
}
432
+
433
+ } // svd
434
+
435
+ /*
436
+ approximate singular value decomposition
437
+ A = U S t(V) with rank r
438
+
439
+ t(M) : transpose of M
440
+ t(U) U = I
441
+ t(V) V = I
442
+
443
+ R : compressed unit matrix
444
+ Y = t(A) R
445
+ Y = orthonormalize(Y) ; t(Y) Y = I
446
+ B = A Y
447
+ Z = orthonormalize(B) ; t(Z) Z = I
448
+ C = t(Z) B
449
+ C = U' S t(V')
450
+ A \simeq A Y t(Y)
451
+ = B t(Y)
452
+ \simeq Z t(Z) B t(Y)
453
+ = Z C t(Y)
454
+ = Z U' S t(V') t(Y)
455
+ = (Z U') S t(YV')
456
+ = U S t(V)
457
+ */
458
+ template<class Matrix, class Matrix2, class Vector>
459
+ bool ComputeSVD(Matrix& U, Vector& S, Matrix& V, const Matrix2& A, int rank)
460
+ {
461
+ const int r = std::min<int>(static_cast<int>(std::min(A.cols(), A.rows())), rank);
462
+ if (r <= 0) return false;
463
+
464
+ #if 1
465
+ Matrix R(A.rows(), r);
466
+ // svd::InitRandomMatrix(R);
467
+ svd::InitUnitMatrix(R);
468
+ Matrix Y = A.transpose() * R;
469
+ #else
470
+ Matrix Y;
471
+ svd::CompressCol(Y, A.transpose(), r);
472
+ #endif
473
+ svd::OrthonormalizeMatrix(Y);
474
+ const Matrix B = A * Y;
475
+ Matrix Z = B;
476
+ svd::OrthonormalizeMatrix(Z);
477
+ const Matrix C = Z.transpose() * B;
478
+ const Eigen::JacobiSVD<Matrix> svd(C, Eigen::ComputeThinU | Eigen::ComputeThinV);
479
+ U = Z * svd.matrixU();
480
+ S = svd.singularValues();
481
+ V = Y * svd.matrixV();
482
+ return true;
483
+ }
484
+
485
+ } } // cybozu::nlp
486
+
@@ -0,0 +1,226 @@
1
+ #pragma once
2
+ /**
3
+ @file
4
+ @brief TF-IDF
5
+
6
+ @author MITSUNARI Shigeo(@herumi)
7
+ */
8
+ #include <set>
9
+ #include <map>
10
+ #include <string>
11
+ #include <stdio.h>
12
+ #include <cybozu/string_operation.hpp>
13
+ #include <cybozu/nlp/sparse.hpp>
14
+
15
+ namespace cybozu { namespace nlp {
16
+
17
/**
	map from string to int with a dump helper
*/
struct Str2Int : std::map<std::string, int> {
	/// print every entry to stdout as "<key>:<value>", one per line, in key order
	void put() const
	{
		const_iterator it = begin();
		const const_iterator last = end();
		while (it != last) {
			printf("%s:%d\n", it->first.c_str(), it->second);
			++it;
		}
	}
};
25
/**
	map from int to int with a dump helper
*/
struct Int2Int : std::map<int, int> {
	/// print all entries to stdout on a single line as "<key>:<value> " and end the line
	void put() const
	{
		const_iterator it = begin();
		const const_iterator last = end();
		while (it != last) {
			printf("%d:%d ", it->first, it->second);
			++it;
		}
		printf("\n");
	}
};
34
/**
	vector of strings with a dump helper
*/
struct StrVec : std::vector<std::string> {
	/// print every element to stdout as "<index>:<string>", one per line
	void put() const
	{
		const size_t num = size();
		size_t pos = 0;
		while (pos < num) {
			printf("%d:%s\n", (int)pos, (*this)[pos].c_str());
			pos++;
		}
	}
};
42
+ typedef std::vector<double> DoubleVec;
43
+ typedef std::vector<Int2Int> Int2IntVec;
44
+ typedef std::set<std::string> StrSet;
45
+ typedef cybozu::nlp::SparseVector<double> DoubleSvec;
46
+ typedef std::vector<DoubleSvec> DoubleSvecVec;
47
+ typedef std::vector<int> IntVec;
48
+
49
+ struct Df {
50
+ struct Pair {
51
+ int id;
52
+ int freq;
53
+ Pair(int _id = 0, int _freq = 0) : id(_id), freq(_freq) { }
54
+ bool operator<(const Pair& rhs) const { return freq < rhs.freq; }
55
+ };
56
+ typedef std::vector<Pair> PairVec;
57
+ int docNum_;
58
+ Str2Int word2id_;
59
+ StrVec id2word_;
60
+ IntVec df_;
61
+ StrSet set_; // for one doc
62
+ PairVec pv_;
63
+ Df()
64
+ : docNum_(0)
65
+ {
66
+ }
67
+ void append(const std::string& word)
68
+ {
69
+ std::string lower;
70
+ cybozu::ToLower(lower, word);
71
+ std::pair<Str2Int::iterator, bool> ret = word2id_.insert(Str2Int::value_type(lower, (int)id2word_.size()));
72
+ //printf("word=%s, id=%d, ret=%d\n", ret.first->first.c_str(), ret.first->second, ret.second);
73
+ if (ret.second) {
74
+ id2word_.push_back(lower);
75
+ df_.resize(id2word_.size());
76
+ }
77
+ if (set_.insert(word).second) {
78
+ df_[ret.first->second]++;
79
+ }
80
+ }
81
+ void endDoc()
82
+ {
83
+ docNum_++;
84
+ set_.clear();
85
+ }
86
+ // sort freq order
87
+ void term(int lowerLimit = 3, double upperRateLimit = 0.98)
88
+ {
89
+ fprintf(stderr, "#doc=%d, #word=%d\n", docNum_, (int)df_.size());
90
+ for (size_t i = 0, n = id2word_.size(); i < n; i++) {
91
+ const int freq = df_[i];
92
+ if (freq <= lowerLimit) continue;
93
+ pv_.push_back(Pair(i, freq));
94
+ }
95
+ int pvNum = (int)(pv_.size() * upperRateLimit);
96
+ fprintf(stderr, "shrink %d -> %d\n", (int)pv_.size(), pvNum);
97
+ std::partial_sort(pv_.begin(), pv_.begin() + pvNum, pv_.end());
98
+ pv_.resize(pvNum);
99
+ }
100
+ };
101
+
102
+ inline std::ostream& operator<<(std::ostream& os, const Df& df)
103
+ {
104
+ const double logN = log(double(df.docNum_));
105
+ for (size_t i = 0, n = df.pv_.size(); i < n; i++) {
106
+ int freq = df.pv_[i].freq;
107
+ double idf = logN - log(double(freq));
108
+ os << df.id2word_[df.pv_[i].id] << '\t' << freq << '\t' << idf << std::endl;
109
+ }
110
+ return os;
111
+ }
112
+
113
+ struct TfIdf {
114
+ Str2Int word2id_;
115
+ StrVec id2word_;
116
+ IntVec df_;
117
+ Int2IntVec tf_;
118
+
119
+ DoubleVec idf_;
120
+ DoubleSvecVec sv_;
121
+
122
+ // work area
123
+ Int2Int *curTf_;
124
+ StrSet set_; // for one doc
125
+
126
+ TfIdf()
127
+ : curTf_(0)
128
+ {
129
+ }
130
+ bool loadKeywordFile(const std::string& keyFile)
131
+ {
132
+ std::ifstream ifs(keyFile.c_str(), std::ios::binary);
133
+ if (!ifs) return false;
134
+ std::string word;
135
+ while (std::getline(ifs, word)) {
136
+ size_t pos = word.find('\t');
137
+ if (pos == std::string::npos) break;
138
+ word.resize(pos);
139
+ std::pair<Str2Int::iterator, bool> ret = word2id_.insert(Str2Int::value_type(word, (int)id2word_.size()));
140
+ if (ret.second) {
141
+ id2word_.push_back(word);
142
+ } else {
143
+ fprintf(stderr, "ERR already set %s\n", word.c_str());
144
+ }
145
+ }
146
+ df_.resize(id2word_.size());
147
+ fprintf(stderr, "#word = %d\n", (int)df_.size());
148
+ return true;
149
+ }
150
+
151
+ void append(const std::string& word)
152
+ {
153
+ std::string lower;
154
+ cybozu::ToLower(lower, word);
155
+ Str2Int::const_iterator i = word2id_.find(lower);
156
+ if (i == word2id_.end()) return;
157
+ const int id = i->second;
158
+ if (curTf_ == 0) {
159
+ tf_.push_back(Int2Int());
160
+ curTf_ = &tf_.back();
161
+ }
162
+ (*curTf_)[id]++;
163
+ if (set_.insert(lower).second) {
164
+ df_[id]++;
165
+ }
166
+ }
167
+ void endDoc()
168
+ {
169
+ curTf_ = 0;
170
+ set_.clear();
171
+ }
172
+ void put() const
173
+ {
174
+ printf("docNum=%d\n", (int)tf_.size());
175
+ for (size_t i = 0, n = tf_.size(); i < n; i++) {
176
+ printf("%d ", (int)i);
177
+ tf_[i].put();
178
+ }
179
+ puts("word:idx");
180
+ word2id_.put();
181
+ }
182
+
183
+ void term()
184
+ {
185
+ const double logN = log(double(tf_.size()));
186
+ idf_.resize(df_.size());
187
+ for (size_t i = 0, n = df_.size(); i < n; i++) {
188
+ idf_[i] = logN - log(double(df_[i]));
189
+ }
190
+ for (size_t i = 0, n = df_.size(); i < n; i++) {
191
+ const Int2Int& iv = tf_[i];
192
+ DoubleSvec v;
193
+ for (Int2Int::const_iterator j = iv.begin(), je = iv.end(); j != je; ++j) {
194
+ v.push_back(j->first, j->second * idf_[j->first]);
195
+ }
196
+ sv_.push_back(v);
197
+ }
198
+ }
199
+ void put(int maxNum = 0x7fffffff) const
200
+ {
201
+ printf("docNum=%d, wordNum=%d\n", (int)tf_.size(), (int)df_.size());
202
+ for (int i = 0, n = std::min(maxNum, (int)sv_.size()); i < n; i++) {
203
+ const DoubleSvec& v = sv_[i];
204
+ for (DoubleSvec::const_iterator j = v.begin(), je = v.end(); j != je; ++j) {
205
+ printf("%d:%f ", (int)j->pos(), j->val());
206
+ }
207
+ printf("\n");
208
+ }
209
+ }
210
+ };
211
+
212
+ inline std::ostream& operator<<(std::ostream& os, const TfIdf& /*tfIdf*/)
213
+ {
214
+ #if 0
215
+ int num = 0;
216
+ for (TfIdf::Rank::const_iterator i = tfIdf.rank_.begin(), ie = tfIdf.rank_.end(); i != ie; ++i) {
217
+ TfIdf::Counter::const_iterator c = tfIdf.counter_.find(i->second);
218
+ assert(c != tfIdf.counter_.end());
219
+ os << i->first << ' ' << c->second.tf_ << ' ' << c->second.df_ << ' ' << i->second << std::endl;
220
+ num++;
221
+ }
222
+ #endif
223
+ return os;
224
+ }
225
+
226
+ } } // cybozu::nlp