ooxml_crypt 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +21 -0
- data/README.md +58 -0
- data/Rakefile +12 -0
- data/bin/console +15 -0
- data/bin/setup +8 -0
- data/ext/ooxml_crypt/extconf.rb +18 -0
- data/ext/ooxml_crypt/ooxml_crypt.c +27 -0
- data/ext/ooxml_crypt/ooxml_crypt.h +7 -0
- data/lib/ooxml_crypt/version.rb +5 -0
- data/lib/ooxml_crypt.rb +75 -0
- data/vendor/cybozulib/.github/workflows/main.yml +12 -0
- data/vendor/cybozulib/.gitignore +5 -0
- data/vendor/cybozulib/CMakeLists.txt +6 -0
- data/vendor/cybozulib/COPYRIGHT +27 -0
- data/vendor/cybozulib/Makefile +26 -0
- data/vendor/cybozulib/bin/libeay32.dll +0 -0
- data/vendor/cybozulib/bin/libmecab.dll +0 -0
- data/vendor/cybozulib/bin/ssleay32.dll +0 -0
- data/vendor/cybozulib/common.mk +116 -0
- data/vendor/cybozulib/common.props +25 -0
- data/vendor/cybozulib/cybozulib.sln +286 -0
- data/vendor/cybozulib/debug.props +14 -0
- data/vendor/cybozulib/include/cybozu/array.hpp +197 -0
- data/vendor/cybozulib/include/cybozu/atoi.hpp +238 -0
- data/vendor/cybozulib/include/cybozu/atomic.hpp +146 -0
- data/vendor/cybozulib/include/cybozu/base64.hpp +210 -0
- data/vendor/cybozulib/include/cybozu/benchmark.hpp +212 -0
- data/vendor/cybozulib/include/cybozu/bfd.hpp +105 -0
- data/vendor/cybozulib/include/cybozu/bit_operation.hpp +139 -0
- data/vendor/cybozulib/include/cybozu/bitvector.hpp +358 -0
- data/vendor/cybozulib/include/cybozu/condition_variable.hpp +113 -0
- data/vendor/cybozulib/include/cybozu/condition_variable_cs.hpp +74 -0
- data/vendor/cybozulib/include/cybozu/config.hpp +392 -0
- data/vendor/cybozulib/include/cybozu/critical_section.hpp +60 -0
- data/vendor/cybozulib/include/cybozu/crypto.hpp +321 -0
- data/vendor/cybozulib/include/cybozu/csucvector.hpp +624 -0
- data/vendor/cybozulib/include/cybozu/csv.hpp +294 -0
- data/vendor/cybozulib/include/cybozu/data_type.hpp +27 -0
- data/vendor/cybozulib/include/cybozu/endian.hpp +224 -0
- data/vendor/cybozulib/include/cybozu/env.hpp +63 -0
- data/vendor/cybozulib/include/cybozu/event.hpp +122 -0
- data/vendor/cybozulib/include/cybozu/exception.hpp +253 -0
- data/vendor/cybozulib/include/cybozu/file.hpp +626 -0
- data/vendor/cybozulib/include/cybozu/fmindex.hpp +291 -0
- data/vendor/cybozulib/include/cybozu/format.hpp +93 -0
- data/vendor/cybozulib/include/cybozu/frequency.hpp +264 -0
- data/vendor/cybozulib/include/cybozu/hash.hpp +67 -0
- data/vendor/cybozulib/include/cybozu/inttype.hpp +174 -0
- data/vendor/cybozulib/include/cybozu/itoa.hpp +336 -0
- data/vendor/cybozulib/include/cybozu/json.hpp +120 -0
- data/vendor/cybozulib/include/cybozu/line_stream.hpp +149 -0
- data/vendor/cybozulib/include/cybozu/link_libeay32.hpp +21 -0
- data/vendor/cybozulib/include/cybozu/link_mpir.hpp +18 -0
- data/vendor/cybozulib/include/cybozu/link_ssleay32.hpp +19 -0
- data/vendor/cybozulib/include/cybozu/log.hpp +237 -0
- data/vendor/cybozulib/include/cybozu/minixml.hpp +452 -0
- data/vendor/cybozulib/include/cybozu/mmap.hpp +143 -0
- data/vendor/cybozulib/include/cybozu/mutex.hpp +144 -0
- data/vendor/cybozulib/include/cybozu/nlp/mecab.hpp +96 -0
- data/vendor/cybozulib/include/cybozu/nlp/plsi.hpp +315 -0
- data/vendor/cybozulib/include/cybozu/nlp/random.hpp +74 -0
- data/vendor/cybozulib/include/cybozu/nlp/sparse.hpp +529 -0
- data/vendor/cybozulib/include/cybozu/nlp/svd.hpp +486 -0
- data/vendor/cybozulib/include/cybozu/nlp/tfidf.hpp +226 -0
- data/vendor/cybozulib/include/cybozu/nlp/top_score.hpp +75 -0
- data/vendor/cybozulib/include/cybozu/option.hpp +743 -0
- data/vendor/cybozulib/include/cybozu/parallel.hpp +88 -0
- data/vendor/cybozulib/include/cybozu/pcg.hpp +72 -0
- data/vendor/cybozulib/include/cybozu/process.hpp +324 -0
- data/vendor/cybozulib/include/cybozu/quit_signal_handler.hpp +66 -0
- data/vendor/cybozulib/include/cybozu/random_generator.hpp +144 -0
- data/vendor/cybozulib/include/cybozu/regex.hpp +463 -0
- data/vendor/cybozulib/include/cybozu/select8.hpp +279 -0
- data/vendor/cybozulib/include/cybozu/serializer.hpp +363 -0
- data/vendor/cybozulib/include/cybozu/sha1.hpp +209 -0
- data/vendor/cybozulib/include/cybozu/sha2.hpp +506 -0
- data/vendor/cybozulib/include/cybozu/siphash.hpp +105 -0
- data/vendor/cybozulib/include/cybozu/socket.hpp +785 -0
- data/vendor/cybozulib/include/cybozu/ssl.hpp +203 -0
- data/vendor/cybozulib/include/cybozu/stacktrace.hpp +291 -0
- data/vendor/cybozulib/include/cybozu/stream.hpp +269 -0
- data/vendor/cybozulib/include/cybozu/string.hpp +1746 -0
- data/vendor/cybozulib/include/cybozu/string_operation.hpp +365 -0
- data/vendor/cybozulib/include/cybozu/sucvector.hpp +378 -0
- data/vendor/cybozulib/include/cybozu/test.hpp +373 -0
- data/vendor/cybozulib/include/cybozu/thread.hpp +229 -0
- data/vendor/cybozulib/include/cybozu/time.hpp +281 -0
- data/vendor/cybozulib/include/cybozu/tls.hpp +115 -0
- data/vendor/cybozulib/include/cybozu/unordered_map.hpp +13 -0
- data/vendor/cybozulib/include/cybozu/unordered_set.hpp +13 -0
- data/vendor/cybozulib/include/cybozu/v128.hpp +376 -0
- data/vendor/cybozulib/include/cybozu/wavelet_matrix.hpp +345 -0
- data/vendor/cybozulib/include/cybozu/xorshift.hpp +189 -0
- data/vendor/cybozulib/include/cybozu/zlib.hpp +325 -0
- data/vendor/cybozulib/include/sais.hxx +364 -0
- data/vendor/cybozulib/misc/make_select8tbl.cpp +26 -0
- data/vendor/cybozulib/mk.bat +37 -0
- data/vendor/cybozulib/readme.md +29 -0
- data/vendor/cybozulib/release.props +12 -0
- data/vendor/cybozulib/sample/Makefile +30 -0
- data/vendor/cybozulib/sample/csucvector_smpl.cpp +42 -0
- data/vendor/cybozulib/sample/data/svd/org/test1.S +4 -0
- data/vendor/cybozulib/sample/data/svd/org/test1.U +4 -0
- data/vendor/cybozulib/sample/data/svd/org/test1.V +6 -0
- data/vendor/cybozulib/sample/data/svd/test1 +4 -0
- data/vendor/cybozulib/sample/data/svd/test2 +4 -0
- data/vendor/cybozulib/sample/desymbol.cpp +127 -0
- data/vendor/cybozulib/sample/exception_smpl.cpp +46 -0
- data/vendor/cybozulib/sample/fmindex_smpl.cpp +231 -0
- data/vendor/cybozulib/sample/log_smpl.cpp +19 -0
- data/vendor/cybozulib/sample/mecab_smpl.cpp +37 -0
- data/vendor/cybozulib/sample/option2_smpl.cpp +68 -0
- data/vendor/cybozulib/sample/option_smpl.cpp +42 -0
- data/vendor/cybozulib/sample/plsi_smpl.cpp +207 -0
- data/vendor/cybozulib/sample/proj/exception_smpl.vcproj +184 -0
- data/vendor/cybozulib/sample/proj/mecab_smpl.vcproj +184 -0
- data/vendor/cybozulib/sample/proj/ssl_smpl/ssl_smpl.vcxproj +85 -0
- data/vendor/cybozulib/sample/proj/ssl_smpl.vcproj +347 -0
- data/vendor/cybozulib/sample/proj/stacktrace_smpl/stacktrace_smpl.vcxproj +85 -0
- data/vendor/cybozulib/sample/proj/svd_smpl.vcproj +184 -0
- data/vendor/cybozulib/sample/quit_signal_handler.cpp +30 -0
- data/vendor/cybozulib/sample/serializer_smpl.cpp +196 -0
- data/vendor/cybozulib/sample/socket_smpl.cpp +82 -0
- data/vendor/cybozulib/sample/ssl_smpl.cpp +39 -0
- data/vendor/cybozulib/sample/stacktrace_smpl.cpp +52 -0
- data/vendor/cybozulib/sample/svd_bench_smpl.cpp +143 -0
- data/vendor/cybozulib/sample/svd_smpl.cpp +94 -0
- data/vendor/cybozulib/sample/wm_bench_smpl.cpp +182 -0
- data/vendor/cybozulib/sample/zlib_smpl.cpp +41 -0
- data/vendor/cybozulib/src/Makefile +8 -0
- data/vendor/cybozulib/src/base/Makefile +19 -0
- data/vendor/cybozulib/test/Makefile +12 -0
- data/vendor/cybozulib/test/base/Makefile +37 -0
- data/vendor/cybozulib/test/base/array_test.cpp +173 -0
- data/vendor/cybozulib/test/base/atoi_test.cpp +774 -0
- data/vendor/cybozulib/test/base/atomic_test.cpp +49 -0
- data/vendor/cybozulib/test/base/base64_test.cpp +113 -0
- data/vendor/cybozulib/test/base/bit_operation_test.cpp +134 -0
- data/vendor/cybozulib/test/base/bitvector_test.cpp +204 -0
- data/vendor/cybozulib/test/base/condition_variable_cs_test.cpp +92 -0
- data/vendor/cybozulib/test/base/condition_variable_test.cpp +88 -0
- data/vendor/cybozulib/test/base/config_test.cpp +236 -0
- data/vendor/cybozulib/test/base/crypto_test.cpp +122 -0
- data/vendor/cybozulib/test/base/csucvector_test.cpp +63 -0
- data/vendor/cybozulib/test/base/csv_test.cpp +182 -0
- data/vendor/cybozulib/test/base/data/a.xml +26 -0
- data/vendor/cybozulib/test/base/endian_test.cpp +56 -0
- data/vendor/cybozulib/test/base/env_test.cpp +22 -0
- data/vendor/cybozulib/test/base/event_test.cpp +41 -0
- data/vendor/cybozulib/test/base/file_test.cpp +233 -0
- data/vendor/cybozulib/test/base/fmindex_test.cpp +118 -0
- data/vendor/cybozulib/test/base/format_test.cpp +12 -0
- data/vendor/cybozulib/test/base/frequency_test.cpp +104 -0
- data/vendor/cybozulib/test/base/itoa_test.cpp +522 -0
- data/vendor/cybozulib/test/base/line_stream_test.cpp +208 -0
- data/vendor/cybozulib/test/base/mecab_test.cpp +41 -0
- data/vendor/cybozulib/test/base/minixml_test.cpp +103 -0
- data/vendor/cybozulib/test/base/mmap_test.cpp +15 -0
- data/vendor/cybozulib/test/base/option_test.cpp +487 -0
- data/vendor/cybozulib/test/base/parallel_test.cpp +48 -0
- data/vendor/cybozulib/test/base/proj/array_test/array_test.vcxproj +86 -0
- data/vendor/cybozulib/test/base/proj/atoi_test/atoi_test.vcxproj +86 -0
- data/vendor/cybozulib/test/base/proj/atomic_test/atomic_test.vcxproj +86 -0
- data/vendor/cybozulib/test/base/proj/base64_test/base64_test.vcxproj +86 -0
- data/vendor/cybozulib/test/base/proj/condition_variable_cs_test/condition_variable_cs_test.vcxproj +86 -0
- data/vendor/cybozulib/test/base/proj/condition_variable_test/condition_variable_test.vcxproj +86 -0
- data/vendor/cybozulib/test/base/proj/config_test/config_test.vcxproj +86 -0
- data/vendor/cybozulib/test/base/proj/csv_test/csv_test.vcxproj +86 -0
- data/vendor/cybozulib/test/base/proj/endian_test/endian_test.vcxproj +86 -0
- data/vendor/cybozulib/test/base/proj/env_test/env_test.vcxproj +86 -0
- data/vendor/cybozulib/test/base/proj/event_test/event_test.vcxproj +86 -0
- data/vendor/cybozulib/test/base/proj/file_test/file_test.vcxproj +86 -0
- data/vendor/cybozulib/test/base/proj/itoa_test/itoa_test.vcxproj +86 -0
- data/vendor/cybozulib/test/base/proj/mecab_test/mecab_test.vcxproj +88 -0
- data/vendor/cybozulib/test/base/proj/minixml_test/minixml_test.vcxproj +86 -0
- data/vendor/cybozulib/test/base/proj/mmap_test/mmap_test.vcxproj +86 -0
- data/vendor/cybozulib/test/base/proj/serializer_test/serializer_test.vcxproj +86 -0
- data/vendor/cybozulib/test/base/proj/sha1_test/sha1_test.vcxproj +86 -0
- data/vendor/cybozulib/test/base/proj/stream_test/stream_test.vcxproj +86 -0
- data/vendor/cybozulib/test/base/proj/string_operation_test/string_operation_test.vcxproj +86 -0
- data/vendor/cybozulib/test/base/proj/string_test/string_test.vcxproj +86 -0
- data/vendor/cybozulib/test/base/proj/thread_test/thread_test.vcxproj +86 -0
- data/vendor/cybozulib/test/base/proj/time_test/time_test.vcxproj +86 -0
- data/vendor/cybozulib/test/base/proj/tls_test/tls_test.vcxproj +86 -0
- data/vendor/cybozulib/test/base/proj/zlib_test/zlib_test.vcxproj +86 -0
- data/vendor/cybozulib/test/base/random_generator_test.cpp +28 -0
- data/vendor/cybozulib/test/base/regex_test.cpp +74 -0
- data/vendor/cybozulib/test/base/serializer_test.cpp +483 -0
- data/vendor/cybozulib/test/base/sha1_test.cpp +61 -0
- data/vendor/cybozulib/test/base/sha2_test.cpp +191 -0
- data/vendor/cybozulib/test/base/siphash_test.cpp +33 -0
- data/vendor/cybozulib/test/base/socket_test.cpp +76 -0
- data/vendor/cybozulib/test/base/stream_test.cpp +101 -0
- data/vendor/cybozulib/test/base/string_operation_test.cpp +340 -0
- data/vendor/cybozulib/test/base/string_test.cpp +1705 -0
- data/vendor/cybozulib/test/base/sucvector_test.cpp +312 -0
- data/vendor/cybozulib/test/base/thread_test.cpp +62 -0
- data/vendor/cybozulib/test/base/time_test.cpp +164 -0
- data/vendor/cybozulib/test/base/tls_test.cpp +50 -0
- data/vendor/cybozulib/test/base/wavelet_matrix_test.cpp +145 -0
- data/vendor/cybozulib/test/base/zlib_test.cpp +371 -0
- data/vendor/cybozulib/test/nlp/Makefile +27 -0
- data/vendor/cybozulib/test/nlp/proj/random_test.vcproj +184 -0
- data/vendor/cybozulib/test/nlp/proj/sparse_test.vcproj +184 -0
- data/vendor/cybozulib/test/nlp/proj/svd_test.vcproj +184 -0
- data/vendor/cybozulib/test/nlp/random_test.cpp +62 -0
- data/vendor/cybozulib/test/nlp/sparse_test.cpp +347 -0
- data/vendor/cybozulib/test/nlp/svd_test.cpp +234 -0
- data/vendor/cybozulib/test/nlp/top_score_test.cpp +40 -0
- data/vendor/cybozulib/tool/create_vcproj.py +186 -0
- data/vendor/cybozulib/tool/vcproj_tmpl.py +185 -0
- data/vendor/msoffice/COPYRIGHT +27 -0
- data/vendor/msoffice/Makefile +29 -0
- data/vendor/msoffice/bin/64/msoc.dll +0 -0
- data/vendor/msoffice/bin/64/msocsample.exe +0 -0
- data/vendor/msoffice/bin/64/msoffice-crypt.exe +0 -0
- data/vendor/msoffice/bin/msoc.dll +0 -0
- data/vendor/msoffice/bin/msocsample.exe +0 -0
- data/vendor/msoffice/bin/msoffice-crypt.exe +0 -0
- data/vendor/msoffice/common.mk +71 -0
- data/vendor/msoffice/common.props +26 -0
- data/vendor/msoffice/debug.props +14 -0
- data/vendor/msoffice/include/attack.hpp +211 -0
- data/vendor/msoffice/include/cfb.hpp +777 -0
- data/vendor/msoffice/include/crypto_util.hpp +450 -0
- data/vendor/msoffice/include/custom_sha1.hpp +342 -0
- data/vendor/msoffice/include/decode.hpp +240 -0
- data/vendor/msoffice/include/encode.hpp +221 -0
- data/vendor/msoffice/include/make_dataspace.hpp +316 -0
- data/vendor/msoffice/include/msoc.h +129 -0
- data/vendor/msoffice/include/resource.hpp +7 -0
- data/vendor/msoffice/include/standard_encryption.hpp +145 -0
- data/vendor/msoffice/include/uint32vec.hpp +179 -0
- data/vendor/msoffice/include/util.hpp +212 -0
- data/vendor/msoffice/lib/.emptydir +0 -0
- data/vendor/msoffice/misc/decrypt-xls.vbs +46 -0
- data/vendor/msoffice/mk.bat +1 -0
- data/vendor/msoffice/mkdll.bat +3 -0
- data/vendor/msoffice/msoc.def +13 -0
- data/vendor/msoffice/msocsample.py +178 -0
- data/vendor/msoffice/msoffice12.sln +31 -0
- data/vendor/msoffice/readme.md +110 -0
- data/vendor/msoffice/release.props +28 -0
- data/vendor/msoffice/src/Makefile +19 -0
- data/vendor/msoffice/src/attack.cpp +124 -0
- data/vendor/msoffice/src/cfb_test.cpp +77 -0
- data/vendor/msoffice/src/minisample.c +54 -0
- data/vendor/msoffice/src/msocdll.cpp +276 -0
- data/vendor/msoffice/src/msocsample.c +136 -0
- data/vendor/msoffice/src/msoffice-crypt.cpp +219 -0
- data/vendor/msoffice/src/proj/attack/attack.vcxproj +88 -0
- data/vendor/msoffice/src/proj/main/msoffice-crypt.vcxproj +88 -0
- data/vendor/msoffice/src/sha1.cpp +234 -0
- data/vendor/msoffice/test/Makefile +20 -0
- data/vendor/msoffice/test/cfb_test.cpp +74 -0
- data/vendor/msoffice/test/hash_test.cpp +59 -0
- data/vendor/msoffice/test/proj/cfb/cfb_test.vcxproj +90 -0
- data/vendor/msoffice/test/proj/hash/hash_test.vcxproj +90 -0
- data/vendor/msoffice/test/sampl.bat +8 -0
- data/vendor/msoffice/test_all.py +46 -0
- data/vendor/update +4 -0
- metadata +351 -0
@@ -0,0 +1,486 @@
|
|
1
|
+
#pragma once
|
2
|
+
/**
|
3
|
+
@file
|
4
|
+
@brief fast non-probabilistic SVD
|
5
|
+
|
6
|
+
@author MITSUNARI Shigeo(@herumi)
|
7
|
+
@author MITSUNARI Shigeo
|
8
|
+
*/
|
9
|
+
#include <assert.h>
|
10
|
+
#include <vector>
|
11
|
+
#include <string>
|
12
|
+
#include <fstream>
|
13
|
+
#include <sstream>
|
14
|
+
#include <iomanip>
|
15
|
+
//#define CYBOZU_NLP_SVD_USE_RANDOM
|
16
|
+
#ifdef CYBOZU_NLP_SVD_USE_RANDOM
|
17
|
+
#include <cybozu/nlp/random.hpp>
|
18
|
+
#endif
|
19
|
+
#ifdef _MSC_VER
|
20
|
+
#pragma warning(push)
|
21
|
+
#pragma warning(disable : 4714) // force inline
|
22
|
+
#endif
|
23
|
+
#define EIGEN_YES_I_KNOW_SPARSE_MODULE_IS_NOT_STABLE_YET
|
24
|
+
#include <eigen3/Eigen/Sparse>
|
25
|
+
#include <eigen3/Eigen/Dense>
|
26
|
+
#include <eigen3/Eigen/Eigenvalues>
|
27
|
+
#ifdef _MSC_VER
|
28
|
+
// #pragma warning(pop)
|
29
|
+
#endif
|
30
|
+
|
31
|
+
/***
|
32
|
+
text format
|
33
|
+
|
34
|
+
Matrix(dense)
|
35
|
+
---
|
36
|
+
# M D <row> <col>
|
37
|
+
data1_1 data1_2 data1_3 ...
|
38
|
+
data2_1 data2_2 ...
|
39
|
+
....
|
40
|
+
---
|
41
|
+
|
42
|
+
Matrix(sparse)
|
43
|
+
---
|
44
|
+
# M S <row> <col>
|
45
|
+
c1:data1_c1 c2:data1_c2 c3:data1_c3 ...
|
46
|
+
c1:data2_c1 c2:data2_c2 c3:data2_c3 ...
|
47
|
+
....
|
48
|
+
---
|
49
|
+
|
50
|
+
ex.
|
51
|
+
M = (1.0 2.0 3.0)
|
52
|
+
(1.2 2.4 3.5)
|
53
|
+
---
|
54
|
+
# M D 2 3
|
55
|
+
1.0 2.0 3.0
|
56
|
+
1.2 2.4 3.5
|
57
|
+
---
|
58
|
+
|
59
|
+
M = (1.0 0 3.0)
|
60
|
+
(0 4.2 0 )
|
61
|
+
---
|
62
|
+
# M S 2 3
|
63
|
+
0:1.0 2:3.0
|
64
|
+
1:4.2
|
65
|
+
---
|
66
|
+
*/
|
67
|
+
namespace cybozu { namespace nlp {
|
68
|
+
|
69
|
+
namespace svd {
|
70
|
+
|
71
|
+
#ifdef CYBOZU_NLP_SVD_USE_RANDOM
|
72
|
+
template<class Matrix>
|
73
|
+
void InitRandomMatrix(Matrix& M)
|
74
|
+
{
|
75
|
+
cybozu::nlp::NormalRandomGenerator r;
|
76
|
+
for (int i = 0; i < M.rows(); i++) {
|
77
|
+
for (int j = 0; j < M.cols(); j++) {
|
78
|
+
M(i, j) = typename Matrix::Scalar(r.get());
|
79
|
+
}
|
80
|
+
}
|
81
|
+
}
|
82
|
+
#endif
|
83
|
+
|
84
|
+
/**
	set M to a 0/1 matrix that has exactly one 1 in every row
	row i gets its 1 in column floor(i * cols / rows), so consecutive rows are
	grouped into cols bands of nearly equal height
	@param M [out] matrix to overwrite; its size is kept and must satisfy cols <= rows
	@note the column index is computed in 64 bits because i * cols does not fit
	in int once rows * cols exceeds INT_MAX
*/
template<class Matrix>
void InitUnitMatrix(Matrix& M)
{
	M.setZero();
	const int row = M.rows();
	const int col = M.cols();
	assert(col <= row);
	for (int i = 0; i < row; i++) {
		// the quotient is always < col, so narrowing it back to int is safe
		const int target = static_cast<int>(static_cast<long long>(i) * col / row);
		M(i, target) = 1;
	}
}
|
132
|
+
/*
	m(row, col) => out(row, r)
	r <= col

	the columns of m are split into r consecutive groups and out(i, j) is the sum of
	row i of m over group j; group j ends just before column ceil((j + 1) * col / r)
	@param out [out] resized to (m.rows(), r)
	@param m [in] source matrix
	@param r [in] number of output columns
*/
template<class Matrix1, class Matrix2>
void CompressCol(Matrix1& out, const Matrix2& m, int r)
{
	typedef typename Matrix1::Scalar Double;
	const int row = m.rows();
	const int col = m.cols();
	assert(r <= col);
	out.resize(row, r);
	int begin = 0;
	for (int j = 0; j < r; j++) {
		// 64-bit arithmetic: (j + 1) * col overflows int when col * r is large
		const long long ceilEnd = (static_cast<long long>(j + 1) * col + r - 1) / r;
		const int end = ceilEnd < col ? static_cast<int>(ceilEnd) : col;
		for (int i = 0; i < row; i++) {
			// accumulate in double whatever the scalar type of out is
			double x = 0;
			for (int k = begin; k < end; k++) {
				x += m(i, k);
			}
			out(i, j) = Double(x);
		}
		begin = end;
	}
}
|
200
|
+
|
201
|
+
/**
	orthonormalize the columns of M in place, from left to right
	each column is scaled to unit length and its component is then subtracted
	from every column to its right
	a column whose norm is below 1e-5 is set to zero instead of being scaled
	@param M [inout] matrix whose columns are replaced
*/
template<class Matrix>
void OrthonormalizeMatrix(Matrix& M)
{
	typedef typename Matrix::Scalar Double;
	const double eps = 1e-5;
	for (int cur = 0; cur < M.cols(); cur++) {
		const double len = M.col(cur).norm();
		if (len < eps) {
			M.col(cur).setZero();
			continue;
		}
		const Double scale = Double(1.0 / len);
		M.col(cur) *= scale;
		for (int rest = cur + 1; rest < M.cols(); rest++) {
			const Double proj = M.col(cur).dot(M.col(rest));
			M.col(rest) -= M.col(cur) * proj;
		}
	}
}
|
220
|
+
|
221
|
+
/**
	open input and parse its first line '# (M|V) (D|S) <row> <col>'
	@param isMatrix [out] true for 'M', false for 'V'
	@param isSparse [out] true for 'S', false for 'D'
	@param row [out] number of rows (> 0)
	@param col [out] number of columns (> 0); forced to 1 for 'V', whose <col> is not read
	@param ifs [inout] stream to open; on success it is positioned just after the header line
	@param input [in] file name
	@return true if the header is well formed; otherwise a message is written to stderr and false is returned
	@note the out parameters may be partly updated when false is returned
*/
inline bool LoadHeader(bool *isMatrix, bool *isSparse, int *row, int *col, std::ifstream& ifs, const std::string& input)
{
	ifs.open(input.c_str(), std::ios::binary);
	if (!ifs) {
		fprintf(stderr, "can't open %s\n", input.c_str());
		return false;
	}
	std::string line;
	if (std::getline(ifs, line)) {
		std::istringstream is(line);
		// initialized so that a truncated header never leads to reading indeterminate values
		char c = 0, vec = 0, type = 0;
		*row = 0;
		*col = 0;
		// a missing field or an out-of-range <row> puts the stream into a failed state
		if (!(is >> c >> vec >> type >> *row)) {
			goto ERR;
		}
		if (c != '#') {
			fprintf(stderr, "top char is #(%c)\n", c);
			goto ERR;
		}
		if (*row <= 0) {
			fprintf(stderr, "row(%d) should be positive\n", *row);
			goto ERR;
		}
		if (type != 'S' && type != 'D') {
			fprintf(stderr, "type is D(dense) or S(sparse) (%c)\n", type);
			goto ERR;
		}
		*isSparse = type == 'S';
		switch (vec) {
		case 'M':
			// <col> is mandatory for a matrix; reject a missing, overflowing or non-positive value
			if (!(is >> *col) || *col <= 0) {
				fprintf(stderr, "col(%d) should be positive\n", *col);
				goto ERR;
			}
			*isMatrix = true;
			break;
		case 'V':
			*col = 1;
			*isMatrix = false;
			break;
		default:
			fprintf(stderr, "vec is M(matrix) or V(vector) (%c)\n", vec);
			goto ERR;
		}
		fprintf(stderr, "input (%c, %c, %d, %d)\n", vec, type, *row, *col);
		return true;
	}
ERR:
	fprintf(stderr, "bad format top line must be '# (M|V) (D|S) <row> <col>'\n");
	return false;
}
|
269
|
+
|
270
|
+
template<class Matrix>
|
271
|
+
bool LoadMatrix(Matrix& M, const std::string& input)
|
272
|
+
{
|
273
|
+
std::ifstream ifs;
|
274
|
+
bool isMatrix = false;
|
275
|
+
bool isSparse = false;
|
276
|
+
int row = 0, col = 0;
|
277
|
+
if (!LoadHeader(&isMatrix, &isSparse, &row, &col, ifs, input) || !isMatrix) {
|
278
|
+
return false;
|
279
|
+
}
|
280
|
+
M.resize(row, col);
|
281
|
+
if (isSparse) {
|
282
|
+
for (int i = 0; i < row; i++) {
|
283
|
+
M.row(i).setZero();
|
284
|
+
std::string line;
|
285
|
+
if (!std::getline(ifs, line)) {
|
286
|
+
fprintf(stderr, "can't read %d line\n", i);
|
287
|
+
return false;
|
288
|
+
}
|
289
|
+
std::istringstream is(line);
|
290
|
+
for (;;) {
|
291
|
+
int idx;
|
292
|
+
char sep;
|
293
|
+
double v;
|
294
|
+
is >> idx >> sep >> v;
|
295
|
+
if (!is) break;
|
296
|
+
if (sep != ':' || idx < 0 || idx >= col) {
|
297
|
+
fprintf(stderr, "can't read %s\n", line.c_str());
|
298
|
+
return false;
|
299
|
+
}
|
300
|
+
M(i, idx) = typename Matrix::Scalar(v);
|
301
|
+
}
|
302
|
+
}
|
303
|
+
} else {
|
304
|
+
for (int i = 0; i < row; i++) {
|
305
|
+
for (int j = 0; j < col; j++) {
|
306
|
+
double v;
|
307
|
+
ifs >> v;
|
308
|
+
if (!ifs) {
|
309
|
+
fprintf(stderr, "can't read (%d,%d)\n", i, j);
|
310
|
+
return false;
|
311
|
+
}
|
312
|
+
M(i, j) = typename Matrix::Scalar(v);
|
313
|
+
}
|
314
|
+
}
|
315
|
+
}
|
316
|
+
return true;
|
317
|
+
}
|
318
|
+
|
319
|
+
template<class Matrix>
|
320
|
+
bool LoadSparseMatrix(Matrix& M, const std::string& input)
|
321
|
+
{
|
322
|
+
std::ifstream ifs;
|
323
|
+
bool isMatrix = false;
|
324
|
+
bool isSparse = false;
|
325
|
+
int row = 0, col = 0;
|
326
|
+
if (!LoadHeader(&isMatrix, &isSparse, &row, &col, ifs, input) || !isMatrix) {
|
327
|
+
return false;
|
328
|
+
}
|
329
|
+
if (!isSparse) {
|
330
|
+
fprintf(stderr, "ERR not sparse\n");
|
331
|
+
return false;
|
332
|
+
}
|
333
|
+
M.resize(row, col);
|
334
|
+
for (int i = 0; i < row; i++) {
|
335
|
+
std::string line;
|
336
|
+
if (!std::getline(ifs, line)) {
|
337
|
+
fprintf(stderr, "can't read %d line\n", i);
|
338
|
+
return false;
|
339
|
+
}
|
340
|
+
std::istringstream is(line);
|
341
|
+
M.startVec(i);
|
342
|
+
for (;;) {
|
343
|
+
int idx;
|
344
|
+
char sep;
|
345
|
+
double v;
|
346
|
+
is >> idx >> sep >> v;
|
347
|
+
if (!is) break;
|
348
|
+
if (sep != ':' || idx < 0 || idx >= col) {
|
349
|
+
fprintf(stderr, "can't read %s\n", line.c_str());
|
350
|
+
return false;
|
351
|
+
}
|
352
|
+
M.insertBack(i, idx) = typename Matrix::Scalar(v);
|
353
|
+
}
|
354
|
+
}
|
355
|
+
M.finalize();
|
356
|
+
return true;
|
357
|
+
}
|
358
|
+
|
359
|
+
template<class Vector>
|
360
|
+
bool LoadVector(Vector& V, const std::string& input)
|
361
|
+
{
|
362
|
+
std::ifstream ifs;
|
363
|
+
bool isMatrix = false;
|
364
|
+
bool isSparse = false;
|
365
|
+
int row = 0, col = 0;
|
366
|
+
if (!LoadHeader(&isMatrix, &isSparse, &row, &col, ifs, input) || isMatrix) {
|
367
|
+
return false;
|
368
|
+
}
|
369
|
+
V.resize(row, 1);
|
370
|
+
for (int i = 0; i < row; i++) {
|
371
|
+
double v;
|
372
|
+
ifs >> v;
|
373
|
+
if (!ifs) {
|
374
|
+
fprintf(stderr, "can't read (%d)\n", i);
|
375
|
+
return false;
|
376
|
+
}
|
377
|
+
V(i) = typename Vector::Scalar(v);
|
378
|
+
}
|
379
|
+
return true;
|
380
|
+
}
|
381
|
+
|
382
|
+
/**
	write M in the dense text format: a '# M D <row> <col>' header followed by
	one line per row with space-separated values (8 significant digits)
	@param outName [in] file name; the file is created or truncated
	@param M [in] matrix to write
	@return true if the file was opened and every write succeeded
*/
template<class Matrix>
bool SaveMatrix(const std::string& outName, const Matrix& M)
{
	std::ofstream ofs(outName.c_str(), std::ios::binary);
	ofs << std::setprecision(8);

	// '\n' instead of std::endl: avoid flushing the file after every row
	ofs << "# M D " << M.rows() << " " << M.cols() << '\n';
	for (int i = 0; i < M.rows(); i++) {
		for (int j = 0; j < M.cols(); j++) {
			if (j > 0) ofs << ' ';
			ofs << M(i, j);
		}
		ofs << '\n';
	}
	// flush once so that a failed write is visible to good() below
	ofs.flush();
	return ofs.good();
}
|
398
|
+
|
399
|
+
/**
	write M in the sparse text format: a '# M S <row> <col>' header followed by
	one line per outer vector holding space-separated '<col>:<value>' pairs
	(8 significant digits); an outer vector without entries gives an empty line
	@param outName [in] file name; the file is created or truncated
	@param M [in] sparse matrix traversed with Matrix::InnerIterator
	@return true if the file was opened and every write succeeded
	@note NOTE(review): one line is written per outer vector and labelled with col(),
	which matches the loader only if the outer dimension is the row - confirm that
	callers pass row-major matrices
*/
template<class Matrix>
bool SaveSparseMatrix(const std::string& outName, const Matrix& M)
{
	std::ofstream ofs(outName.c_str(), std::ios::binary);
	ofs << std::setprecision(8);

	// '\n' instead of std::endl: avoid flushing the file after every line
	ofs << "# M S " << M.rows() << " " << M.cols() << '\n';
	for (int i = 0; i < M.outerSize(); i++) {
		bool isFirst = true;
		for (typename Matrix::InnerIterator j(M, i); j; ++j) {
			if (isFirst) {
				isFirst = false;
			} else {
				ofs << ' ';
			}
			ofs << j.col() << ':' << j.value();
		}
		ofs << '\n';
	}
	// flush once so that a failed write is visible to good() below
	ofs.flush();
	return ofs.good();
}
|
420
|
+
|
421
|
+
/**
	write V in the dense vector text format: a '# V D <row>' header followed by
	one value per line (8 significant digits)
	@param outName [in] file name; the file is created or truncated
	@param V [in] vector to write
	@return true if the file was opened and every write succeeded
*/
template<class Vector>
bool SaveVector(const std::string& outName, const Vector& V)
{
	std::ofstream ofs(outName.c_str(), std::ios::binary);
	ofs << std::setprecision(8);
	// '\n' instead of std::endl: avoid flushing the file after every value
	ofs << "# V D " << V.rows() << '\n';
	for (int i = 0; i < V.rows(); i++) {
		ofs << V(i) << '\n';
	}
	// flush once so that a failed write is visible to good() below
	ofs.flush();
	return ofs.good();
}
|
432
|
+
|
433
|
+
} // svd
|
434
|
+
|
435
|
+
/*
|
436
|
+
approximate singular value decomposition
|
437
|
+
A = U S t(V) with rank r
|
438
|
+
|
439
|
+
t(M) : transpose of M
|
440
|
+
t(U) U = I
|
441
|
+
t(V) V = I
|
442
|
+
|
443
|
+
R : compressed unit matrix
|
444
|
+
Y = t(A) R
|
445
|
+
Y = orthonormalize(Y) ; t(Y) Y = I
|
446
|
+
B = A Y
|
447
|
+
Z = orthonormalize(B) ; t(Z) Z = I
|
448
|
+
C = t(Z) B
|
449
|
+
C = U' S t(V')
|
450
|
+
A \simeq A Y t(Y)
|
451
|
+
= B t(Y)
|
452
|
+
\simeq Z t(Z) B t(Y)
|
453
|
+
= Z C t(Y)
|
454
|
+
= Z U' S t(V') t(Y)
|
455
|
+
= (Z U') S t(YV')
|
456
|
+
= U S t(V)
|
457
|
+
*/
|
458
|
+
template<class Matrix, class Matrix2, class Vector>
|
459
|
+
bool ComputeSVD(Matrix& U, Vector& S, Matrix& V, const Matrix2& A, int rank)
|
460
|
+
{
|
461
|
+
const int r = std::min<int>(static_cast<int>(std::min(A.cols(), A.rows())), rank);
|
462
|
+
if (r <= 0) return false;
|
463
|
+
|
464
|
+
#if 1
|
465
|
+
Matrix R(A.rows(), r);
|
466
|
+
// svd::InitRandomMatrix(R);
|
467
|
+
svd::InitUnitMatrix(R);
|
468
|
+
Matrix Y = A.transpose() * R;
|
469
|
+
#else
|
470
|
+
Matrix Y;
|
471
|
+
svd::CompressCol(Y, A.transpose(), r);
|
472
|
+
#endif
|
473
|
+
svd::OrthonormalizeMatrix(Y);
|
474
|
+
const Matrix B = A * Y;
|
475
|
+
Matrix Z = B;
|
476
|
+
svd::OrthonormalizeMatrix(Z);
|
477
|
+
const Matrix C = Z.transpose() * B;
|
478
|
+
const Eigen::JacobiSVD<Matrix> svd(C, Eigen::ComputeThinU | Eigen::ComputeThinV);
|
479
|
+
U = Z * svd.matrixU();
|
480
|
+
S = svd.singularValues();
|
481
|
+
V = Y * svd.matrixV();
|
482
|
+
return true;
|
483
|
+
}
|
484
|
+
|
485
|
+
} } // cybozu::nlp
|
486
|
+
|
@@ -0,0 +1,226 @@
|
|
1
|
+
#pragma once
|
2
|
+
/**
|
3
|
+
@file
|
4
|
+
@brief TF-IDF
|
5
|
+
|
6
|
+
@author MITSUNARI Shigeo(@herumi)
|
7
|
+
*/
|
8
|
+
#include <set>
|
9
|
+
#include <map>
|
10
|
+
#include <string>
|
11
|
+
#include <stdio.h>
|
12
|
+
#include <cybozu/string_operation.hpp>
|
13
|
+
#include <cybozu/nlp/sparse.hpp>
|
14
|
+
|
15
|
+
namespace cybozu { namespace nlp {
|
16
|
+
|
17
|
+
/**
	std::map from string to int with a debug printer
*/
struct Str2Int : std::map<std::string, int> {
	/**
		print every entry to stdout as '<key>:<value>', one per line, in key order
	*/
	void put() const
	{
		const_iterator cur = begin();
		const const_iterator last = end();
		while (cur != last) {
			printf("%s:%d\n", cur->first.c_str(), cur->second);
			++cur;
		}
	}
};
|
25
|
+
/**
	std::map from int to int with a debug printer
*/
struct Int2Int : std::map<int, int> {
	/**
		print all entries to stdout on a single line as '<key>:<value> ' in key order,
		followed by a newline (an empty map prints only the newline)
	*/
	void put() const
	{
		const_iterator cur = begin();
		const const_iterator last = end();
		while (cur != last) {
			printf("%d:%d ", cur->first, cur->second);
			++cur;
		}
		printf("\n");
	}
};
|
34
|
+
/**
	std::vector of strings with a debug printer
*/
struct StrVec : std::vector<std::string> {
	/**
		print every element to stdout as '<index>:<string>', one per line
	*/
	void put() const
	{
		const size_t num = size();
		size_t pos = 0;
		while (pos < num) {
			printf("%d:%s\n", (int)pos, (*this)[pos].c_str());
			pos++;
		}
	}
};
|
42
|
+
typedef std::vector<double> DoubleVec;
|
43
|
+
typedef std::vector<Int2Int> Int2IntVec;
|
44
|
+
typedef std::set<std::string> StrSet;
|
45
|
+
typedef cybozu::nlp::SparseVector<double> DoubleSvec;
|
46
|
+
typedef std::vector<DoubleSvec> DoubleSvecVec;
|
47
|
+
typedef std::vector<int> IntVec;
|
48
|
+
|
49
|
+
struct Df {
|
50
|
+
struct Pair {
|
51
|
+
int id;
|
52
|
+
int freq;
|
53
|
+
Pair(int _id = 0, int _freq = 0) : id(_id), freq(_freq) { }
|
54
|
+
bool operator<(const Pair& rhs) const { return freq < rhs.freq; }
|
55
|
+
};
|
56
|
+
typedef std::vector<Pair> PairVec;
|
57
|
+
int docNum_;
|
58
|
+
Str2Int word2id_;
|
59
|
+
StrVec id2word_;
|
60
|
+
IntVec df_;
|
61
|
+
StrSet set_; // for one doc
|
62
|
+
PairVec pv_;
|
63
|
+
Df()
|
64
|
+
: docNum_(0)
|
65
|
+
{
|
66
|
+
}
|
67
|
+
void append(const std::string& word)
|
68
|
+
{
|
69
|
+
std::string lower;
|
70
|
+
cybozu::ToLower(lower, word);
|
71
|
+
std::pair<Str2Int::iterator, bool> ret = word2id_.insert(Str2Int::value_type(lower, (int)id2word_.size()));
|
72
|
+
//printf("word=%s, id=%d, ret=%d\n", ret.first->first.c_str(), ret.first->second, ret.second);
|
73
|
+
if (ret.second) {
|
74
|
+
id2word_.push_back(lower);
|
75
|
+
df_.resize(id2word_.size());
|
76
|
+
}
|
77
|
+
if (set_.insert(word).second) {
|
78
|
+
df_[ret.first->second]++;
|
79
|
+
}
|
80
|
+
}
|
81
|
+
void endDoc()
|
82
|
+
{
|
83
|
+
docNum_++;
|
84
|
+
set_.clear();
|
85
|
+
}
|
86
|
+
// sort freq order
|
87
|
+
void term(int lowerLimit = 3, double upperRateLimit = 0.98)
|
88
|
+
{
|
89
|
+
fprintf(stderr, "#doc=%d, #word=%d\n", docNum_, (int)df_.size());
|
90
|
+
for (size_t i = 0, n = id2word_.size(); i < n; i++) {
|
91
|
+
const int freq = df_[i];
|
92
|
+
if (freq <= lowerLimit) continue;
|
93
|
+
pv_.push_back(Pair(i, freq));
|
94
|
+
}
|
95
|
+
int pvNum = (int)(pv_.size() * upperRateLimit);
|
96
|
+
fprintf(stderr, "shrink %d -> %d\n", (int)pv_.size(), pvNum);
|
97
|
+
std::partial_sort(pv_.begin(), pv_.begin() + pvNum, pv_.end());
|
98
|
+
pv_.resize(pvNum);
|
99
|
+
}
|
100
|
+
};
|
101
|
+
|
102
|
+
inline std::ostream& operator<<(std::ostream& os, const Df& df)
|
103
|
+
{
|
104
|
+
const double logN = log(double(df.docNum_));
|
105
|
+
for (size_t i = 0, n = df.pv_.size(); i < n; i++) {
|
106
|
+
int freq = df.pv_[i].freq;
|
107
|
+
double idf = logN - log(double(freq));
|
108
|
+
os << df.id2word_[df.pv_[i].id] << '\t' << freq << '\t' << idf << std::endl;
|
109
|
+
}
|
110
|
+
return os;
|
111
|
+
}
|
112
|
+
|
113
|
+
struct TfIdf {
|
114
|
+
Str2Int word2id_;
|
115
|
+
StrVec id2word_;
|
116
|
+
IntVec df_;
|
117
|
+
Int2IntVec tf_;
|
118
|
+
|
119
|
+
DoubleVec idf_;
|
120
|
+
DoubleSvecVec sv_;
|
121
|
+
|
122
|
+
// work area
|
123
|
+
Int2Int *curTf_;
|
124
|
+
StrSet set_; // for one doc
|
125
|
+
|
126
|
+
TfIdf()
|
127
|
+
: curTf_(0)
|
128
|
+
{
|
129
|
+
}
|
130
|
+
bool loadKeywordFile(const std::string& keyFile)
|
131
|
+
{
|
132
|
+
std::ifstream ifs(keyFile.c_str(), std::ios::binary);
|
133
|
+
if (!ifs) return false;
|
134
|
+
std::string word;
|
135
|
+
while (std::getline(ifs, word)) {
|
136
|
+
size_t pos = word.find('\t');
|
137
|
+
if (pos == std::string::npos) break;
|
138
|
+
word.resize(pos);
|
139
|
+
std::pair<Str2Int::iterator, bool> ret = word2id_.insert(Str2Int::value_type(word, (int)id2word_.size()));
|
140
|
+
if (ret.second) {
|
141
|
+
id2word_.push_back(word);
|
142
|
+
} else {
|
143
|
+
fprintf(stderr, "ERR already set %s\n", word.c_str());
|
144
|
+
}
|
145
|
+
}
|
146
|
+
df_.resize(id2word_.size());
|
147
|
+
fprintf(stderr, "#word = %d\n", (int)df_.size());
|
148
|
+
return true;
|
149
|
+
}
|
150
|
+
|
151
|
+
void append(const std::string& word)
|
152
|
+
{
|
153
|
+
std::string lower;
|
154
|
+
cybozu::ToLower(lower, word);
|
155
|
+
Str2Int::const_iterator i = word2id_.find(lower);
|
156
|
+
if (i == word2id_.end()) return;
|
157
|
+
const int id = i->second;
|
158
|
+
if (curTf_ == 0) {
|
159
|
+
tf_.push_back(Int2Int());
|
160
|
+
curTf_ = &tf_.back();
|
161
|
+
}
|
162
|
+
(*curTf_)[id]++;
|
163
|
+
if (set_.insert(lower).second) {
|
164
|
+
df_[id]++;
|
165
|
+
}
|
166
|
+
}
|
167
|
+
void endDoc()
|
168
|
+
{
|
169
|
+
curTf_ = 0;
|
170
|
+
set_.clear();
|
171
|
+
}
|
172
|
+
void put() const
|
173
|
+
{
|
174
|
+
printf("docNum=%d\n", (int)tf_.size());
|
175
|
+
for (size_t i = 0, n = tf_.size(); i < n; i++) {
|
176
|
+
printf("%d ", (int)i);
|
177
|
+
tf_[i].put();
|
178
|
+
}
|
179
|
+
puts("word:idx");
|
180
|
+
word2id_.put();
|
181
|
+
}
|
182
|
+
|
183
|
+
void term()
|
184
|
+
{
|
185
|
+
const double logN = log(double(tf_.size()));
|
186
|
+
idf_.resize(df_.size());
|
187
|
+
for (size_t i = 0, n = df_.size(); i < n; i++) {
|
188
|
+
idf_[i] = logN - log(double(df_[i]));
|
189
|
+
}
|
190
|
+
for (size_t i = 0, n = df_.size(); i < n; i++) {
|
191
|
+
const Int2Int& iv = tf_[i];
|
192
|
+
DoubleSvec v;
|
193
|
+
for (Int2Int::const_iterator j = iv.begin(), je = iv.end(); j != je; ++j) {
|
194
|
+
v.push_back(j->first, j->second * idf_[j->first]);
|
195
|
+
}
|
196
|
+
sv_.push_back(v);
|
197
|
+
}
|
198
|
+
}
|
199
|
+
void put(int maxNum = 0x7fffffff) const
|
200
|
+
{
|
201
|
+
printf("docNum=%d, wordNum=%d\n", (int)tf_.size(), (int)df_.size());
|
202
|
+
for (int i = 0, n = std::min(maxNum, (int)sv_.size()); i < n; i++) {
|
203
|
+
const DoubleSvec& v = sv_[i];
|
204
|
+
for (DoubleSvec::const_iterator j = v.begin(), je = v.end(); j != je; ++j) {
|
205
|
+
printf("%d:%f ", (int)j->pos(), j->val());
|
206
|
+
}
|
207
|
+
printf("\n");
|
208
|
+
}
|
209
|
+
}
|
210
|
+
};
|
211
|
+
|
212
|
+
/**
	Stream output for TfIdf. Writes nothing at present: the only output code
	is disabled with #if 0 (it refers to TfIdf::Rank, rank_ and counter_,
	which the TfIdf defined in this file does not declare).
	@param os [out] destination stream
	@return os, unchanged
*/
inline std::ostream& operator<<(std::ostream& os, const TfIdf& /*tfIdf*/)
{
#if 0
	int num = 0;
	for (TfIdf::Rank::const_iterator i = tfIdf.rank_.begin(), ie = tfIdf.rank_.end(); i != ie; ++i) {
		TfIdf::Counter::const_iterator c = tfIdf.counter_.find(i->second);
		assert(c != tfIdf.counter_.end());
		os << i->first << ' ' << c->second.tf_ << ' ' << c->second.df_ << ' ' << i->second << std::endl;
		num++;
	}
#endif
	return os;
}
|
225
|
+
|
226
|
+
} } // cybozu::nlp
|