libmf 0.1.0 → 0.2.1
- checksums.yaml +4 -4
- data/CHANGELOG.md +25 -1
- data/LICENSE.txt +26 -18
- data/README.md +88 -32
- data/lib/libmf.rb +10 -7
- data/lib/libmf/ffi.rb +2 -6
- data/lib/libmf/model.rb +51 -25
- data/lib/libmf/version.rb +1 -1
- data/vendor/{libmf/COPYRIGHT → COPYRIGHT} +0 -0
- data/vendor/{libmf/demo → demo}/real_matrix.te.txt +0 -0
- data/vendor/{libmf/demo → demo}/real_matrix.tr.txt +0 -0
- data/vendor/libmf.arm64.dylib +0 -0
- data/vendor/libmf.dylib +0 -0
- data/vendor/libmf.so +0 -0
- data/vendor/mf.dll +0 -0
- metadata +17 -89
- data/ext/libmf/extconf.rb +0 -18
- data/lib/libmf.bundle +0 -0
- data/vendor/libmf/Makefile +0 -34
- data/vendor/libmf/Makefile.win +0 -36
- data/vendor/libmf/README +0 -637
- data/vendor/libmf/demo/all_one_matrix.te.txt +0 -1382
- data/vendor/libmf/demo/all_one_matrix.tr.txt +0 -5172
- data/vendor/libmf/demo/binary_matrix.te.txt +0 -1312
- data/vendor/libmf/demo/binary_matrix.tr.txt +0 -4937
- data/vendor/libmf/demo/demo.bat +0 -40
- data/vendor/libmf/demo/demo.sh +0 -58
- data/vendor/libmf/mf-predict.cpp +0 -207
- data/vendor/libmf/mf-train.cpp +0 -378
- data/vendor/libmf/mf.cpp +0 -4683
- data/vendor/libmf/mf.def +0 -21
- data/vendor/libmf/mf.h +0 -130
- data/vendor/libmf/windows/mf-predict.exe +0 -0
- data/vendor/libmf/windows/mf-train.exe +0 -0
- data/vendor/libmf/windows/mf.dll +0 -0
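
Taken together, the summary shows the shape of this release: the install-time C++ extension (data/ext/libmf/extconf.rb) and the vendored LIBMF sources are gone, replaced by prebuilt shared libraries under data/vendor/ (libmf.so, libmf.dylib, libmf.arm64.dylib, mf.dll) that the gem binds at runtime. For orientation, a hedged sketch of the gem's high-level API around this release; method and option names follow the project's README and are illustrative, not verified against lib/libmf/model.rb:

    require "libmf"

    # sparse training data as [row index, column index, value] triples
    data = [
      [0, 0, 5.0],
      [0, 2, 3.5],
      [1, 1, 4.0]
    ]

    model = Libmf::Model.new(factors: 8, iterations: 20)
    model.fit(data)
    model.predict(0, 2) # estimated value at row 0, column 2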
data/ext/libmf/extconf.rb
DELETED
@@ -1,18 +0,0 @@
-require "mkmf"
-
-arch = RbConfig::CONFIG["arch"]
-case arch
-when /mingw/
-  File.write("Makefile", dummy_makefile("libmf").join)
-else
-  abort "Missing stdc++" unless have_library("stdc++")
-  $CXXFLAGS << " -std=c++11"
-
-  # TODO
-  # if have_library("libomp")
-  # end
-
-  $objs = ["mf.o"]
-  vendor_path = File.expand_path("../../vendor/libmf", __dir__)
-  create_makefile("libmf", vendor_path)
-end
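
The deleted extconf.rb above compiled the vendored mf.cpp at gem install time. The 0.2 line instead ships the prebuilt binaries listed in the summary and binds them at runtime. A generic sketch of that approach using the ffi gem; the gem's actual lib/libmf/ffi.rb may differ in names and signatures, and the three attached functions follow the mf.h declarations quoted in the deleted README further down:

    require "ffi"

    module Libmf
      module FFI
        extend ::FFI::Library

        # pick the vendored binary that matches the current platform
        vendor = File.expand_path("../../vendor", __dir__)
        lib =
          if Gem.win_platform?
            "mf.dll"
          elsif RbConfig::CONFIG["host_os"] =~ /darwin/
            RbConfig::CONFIG["host_cpu"] =~ /arm|aarch64/ ? "libmf.arm64.dylib" : "libmf.dylib"
          else
            "libmf.so"
          end
        ffi_lib File.join(vendor, lib)

        attach_function :mf_save_model, [:pointer, :string], :int
        attach_function :mf_load_model, [:string], :pointer
        attach_function :mf_destroy_model, [:pointer], :void
      end
    end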
data/lib/libmf.bundle
DELETED
Binary file
data/vendor/libmf/Makefile
DELETED
@@ -1,34 +0,0 @@
-CXX = g++
-CXXFLAGS = -Wall -O3 -pthread -std=c++0x -march=native
-OMPFLAG = -fopenmp
-SHVER = 2
-
-# run `make clean all' if you change the following flags.
-
-# comment the following flag if you want to disable SSE or enable AVX
-DFLAG = -DUSESSE
-
-# uncomment the following flags if you want to use AVX
-#DFLAG = -DUSEAVX
-#CXXFLAGS += -mavx
-
-# uncomment the following flags if you do not want to use OpenMP
-DFLAG += -DUSEOMP
-CXXFLAGS += $(OMPFLAG)
-
-all: mf-train mf-predict
-
-lib:
-	$(CXX) -shared -Wl,-soname,libmf.so.$(SHVER) -o libmf.so.$(SHVER) mf.o
-
-mf-train: mf-train.cpp mf.o
-	$(CXX) $(CXXFLAGS) $(DFLAG) -o $@ $^
-
-mf-predict: mf-predict.cpp mf.o
-	$(CXX) $(CXXFLAGS) $(DFLAG) -o $@ $^
-
-mf.o: mf.cpp mf.h
-	$(CXX) $(CXXFLAGS) $(DFLAG) -c -fPIC -o $@ $<
-
-clean:
-	rm -f mf-train mf-predict mf.o libmf.so.$(SHVER)
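
The `lib' target above is what produces the shared object now shipped under vendor/. If you want to reproduce that build from Ruby (for example in a Rakefile), a sketch mirroring the Makefile's `mf.o' and `lib' recipes; the flags are copied from the Makefile above and assume a Linux host running in the LIBMF source directory:

    require "open3"

    # compile mf.o with -fPIC, then link the shared library (SHVER = 2)
    [
      "g++ -Wall -O3 -pthread -std=c++0x -march=native -DUSESSE -DUSEOMP -fopenmp -c -fPIC -o mf.o mf.cpp",
      "g++ -shared -Wl,-soname,libmf.so.2 -o libmf.so.2 mf.o"
    ].each do |cmd|
      output, status = Open3.capture2e(cmd)
      abort output unless status.success?
    end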
data/vendor/libmf/Makefile.win
DELETED
@@ -1,36 +0,0 @@
-CXX = cl.exe
-CFLAGS = /W4 /nologo /O2 /EHsc /D "_CRT_SECURE_NO_DEPRECATE"
-
-# Choose one instruction set to accelerate LIBMF
-# 1. use SSE
-CPUSET = /D "USESSE"
-# 2. use AVX
-#CPUSET = /D "USEAVX" /arch:AVX
-# 3. no acceleration
-#CPUSET =
-
-# Always use OpenMP because Visual Studio supports it.
-CFLAGS = $(CFLAGS) $(CPUSET) /D "USEOMP" /openmp
-# To disable OpenMP, please use the command below.
-#CFLAGS = $(CFLAGS) $(CPUSET) /D
-
-TARGET = windows
-
-all: $(TARGET)\mf-train.exe $(TARGET)\mf-predict.exe lib
-
-$(TARGET)\mf-predict.exe: mf.h mf-predict.cpp mf.obj
-	$(CXX) $(CFLAGS) mf-predict.cpp mf.obj -Fe$(TARGET)\mf-predict.exe
-
-$(TARGET)\mf-train.exe: mf.h mf-train.cpp mf.obj
-	$(CXX) $(CFLAGS) mf-train.cpp mf.obj -Fe$(TARGET)\mf-train.exe
-
-mf.obj: mf.cpp mf.h
-	$(CXX) $(CFLAGS) -c mf.cpp
-
-lib: mf.cpp mf.def mf.h
-	$(CXX) $(CFLAGS) -LD mf.cpp -Fe$(TARGET)\mf -link -DEF:mf.def
-
-clean:
-	-erase /Q *.obj *.dll *.lib *.exe $(TARGET)\.
-
-
data/vendor/libmf/README
DELETED
@@ -1,637 +0,0 @@
-LIBMF is a library for large-scale sparse matrix factorization. For the
-optimization problem it solves and the overall framework, please refer to [3].
-
-
-
-Table of Contents
-=================
-
-- Installation
-- Data Format
-- Model Format
-- Command Line Usage
-- Examples
-- Library Usage
-- SSE, AVX, and OpenMP
-- Building Windows and Mac Binaries
-- References
-
-
-
-Installation
-============
-
-- Requirements
-  To compile LIBMF, a compiler which supports C++11 is required. LIBMF can
-  use SSE, AVX, and OpenMP for acceleration. See Section SSE, AVX, and OpenMP
-  if you want to disable or enable these features.
-
-- Unix & Cygwin
-
-  Type `make' to build `mf-train' and `mf-predict.'
-
-- Windows & Mac
-
-  See `Building Windows and Mac Binaries' to compile. For Windows, pre-built
-  binaries are available in the directory `windows.'
-
-
-
-Data Format
-===========
-
-LIBMF's command-line tool can be used to factorize matrices with real or binary
-values. Each line in the training file stores a tuple,
-
-    <row_idx> <col_idx> <value>
-
-which records an entry of the training matrix. In the `demo' directory, the
-files `real_matrix.tr.txt' and `real_matrix.te.txt' are the training and test
-sets for a demonstration of real-valued matrix factorization (RVMF). For binary
-matrix factorization (BMF), the set of <value> is {-1, 1} as shown in
-`binary_matrix.tr.txt' and `binary_matrix.te.txt.' For one-class MF, all
-<value>'s are positive. See `all_one_matrix.tr.txt' and `all_one_matrix.te.txt'
-as examples.
-
-Note: If the values in the test set are unknown, please put dummy zeros.
-
-
-
-Model Format
-============
-
-LIBMF factorizes a training matrix `R' into a k-by-m matrix `P' and a
-k-by-n matrix `Q' such that `R' is approximated by P'Q. After the training
-process is finished, the two factor matrices `P' and `Q' are stored into a
-model file. The file starts with a header including:
-
-    `f': the loss function of the solved MF problem,
-    `m': the number of rows in the training matrix,
-    `n': the number of columns in the training matrix,
-    `k': the number of latent factors,
-    `b': the average of all elements in the training matrix.
-
-From the 5th line, the columns of `P' and `Q' are stored line by line. In
-each line, there are two leading tokens followed by the values of a
-column. The first token is the name of the stored column, and the second
-word indicates the type of values. If the second word is `T', the column is
-a real vector. Otherwise, all values in the column are NaN. For example, if
-
-        [1 NaN 2]       [-1 -2]
-    P = |3 NaN 4|,  Q = |-3 -4|,
-        [5 NaN 6]       [-5 -6]
-
-and the value `b' is 0.5, the content of the model file is:
-
-    --------model file--------
-    m 3
-    n 2
-    k 3
-    b 0.5
-    p0 T 1 3 5
-    p1 F 0 0 0
-    p2 T 2 4 6
-    q0 T -1 -3 -5
-    q1 T -2 -4 -6
-    --------------------------
-
-
-
-Command Line Usage
-==================
-
-- `mf-train'
-
-  usage: mf-train [options] training_set_file [model_file]
-
-  options:
-  -l1 <lambda>,<lambda>: set L1-regularization parameters for P and Q.
-      (default 0) If only one value is specified, P and Q share the same
-      lambda.
-  -l2 <lambda>,<lambda>: set L2-regularization parameters for P and Q.
-      (default 0.1) If only one value is specified, P and Q share the same
-      lambda.
-  -f <loss>: specify loss function (default 0)
-      for real-valued matrix factorization
-           0 -- squared error (L2-norm)
-           1 -- absolute error (L1-norm)
-           2 -- generalized KL-divergence (--nmf is required)
-      for binary matrix factorization
-           5 -- logarithmic error
-           6 -- squared hinge loss
-           7 -- hinge loss
-      for one-class matrix factorization
-          10 -- row-oriented pair-wise logarithmic loss
-          11 -- column-oriented pair-wise logarithmic loss
-          12 -- squared error (L2-norm)
-  -k <dimensions>: set number of dimensions (default 8)
-  -t <iter>: set number of iterations (default 20)
-  -r <eta>: set initial learning rate (default 0.1)
-  -a <alpha>: set coefficient of negative entries' loss (default 1)
-  -c <c>: set value of negative entries (default 0.0001).
-      Every positive entry is assumed to be 1.
-  -s <threads>: set number of threads (default 12)
-  -n <bins>: set number of bins (may be adjusted by LIBMF for speed)
-  -p <path>: set path to the validation set
-  -v <fold>: set number of folds for cross validation
-  --quiet: quiet mode (no outputs)
-  --nmf: perform non-negative matrix factorization
-  --disk: perform disk-level training (will create a buffer file)
-
-  `mf-train' is the main training command of LIBMF. At each iteration, the
-  following information is printed.
-
-  - iter: the index of iteration.
-  - tr_*: * is the evaluation criterion on the training set.
-  - tr_*+: * is the evaluation criterion on the positive entries in the
-    training set.
-  - tr_*-: * is the evaluation criterion on the negative entries in the
-    training set.
-  - va_*: the same criterion on the validation set if `-p' is set.
-  - va_*+: * is the evaluation criterion on the positive entries in the
-    validation set.
-  - va_*-: * is the evaluation criterion on the negative entries in the
-    validation set.
-  - obj: objective function value.
-  - reg: regularization term.
-
-  Here `tr_*' and `obj' are estimations because calculating true values
-  can be time-consuming. Different solvers can print different combinations
-  of those values.
-
-  For different losses, the criterion to be printed is listed below.
-
-  <loss>: <evaluation criterion>
-  - 0: root mean square error (RMSE)
-  - 1: mean absolute error (MAE)
-  - 2: generalized KL-divergence (KL)
-  - 5: logarithmic loss
-  - 6 & 7: accuracy
-  - 10 & 11: pair-wise logarithmic loss in Bayesian personalized ranking
-  - 12: sum of squared errors. The label of positive entries is 1,
-        while negative entries' value is set using command line
-        option -c.
-
-- `mf-predict'
-
-  usage: mf-predict [options] test_file model_file output_file
-
-  options:
-  -e <criterion>: set the evaluation criterion (default 0)
-       0: root mean square error
-       1: mean absolute error
-       2: generalized KL-divergence
-       5: logarithmic loss
-       6: accuracy
-      10: row-oriented mean percentile rank (row-oriented MPR)
-      11: column-oriented mean percentile rank (column-oriented MPR)
-      12: row-oriented area under ROC curve (row-oriented AUC)
-      13: column-oriented area under ROC curve (column-oriented AUC)
-
-  `mf-predict' outputs the prediction values of the entries specified in
-  `test_file' to the `output_file.' The selected criterion will be printed
-  as well.
-
-
-
-Examples
-========
-This section gives example commands of LIBMF using the data sets in the `demo'
-directory. In `demo,' a shell script `demo.sh' can be run for demonstration.
-
-> mf-train real_matrix.tr.txt model
-
-train a model using the default parameters
-
-> mf-train -l1 0.05 -l2 0.01 real_matrix.tr.txt model
-
-train a model with the following regularization coefficients:
-
-    coefficient of L1-norm regularization on P = 0.05
-    coefficient of L1-norm regularization on Q = 0.05
-    coefficient of L2-norm regularization on P = 0.01
-    coefficient of L2-norm regularization on Q = 0.01
-
-> mf-train -l1 0.015,0 -l2 0.01,0.005 real_matrix.tr.txt model
-
-train a model with the following regularization coefficients:
-
-    coefficient of L1-norm regularization on P = 0.015
-    coefficient of L1-norm regularization on Q = 0
-    coefficient of L2-norm regularization on P = 0.01
-    coefficient of L2-norm regularization on Q = 0.005
-
-> mf-train -f 5 -l1 0,0.02 -k 100 -t 30 -r 0.02 -s 4 binary_matrix.tr.txt model
-
-train a BMF model using logarithmic loss and the following parameters:
-
-    coefficient of L1-norm regularization on P = 0
-    coefficient of L1-norm regularization on Q = 0.02
-    latent factors = 100
-    iterations = 30
-    learning rate = 0.02
-    threads = 4
-
-> mf-train -p real_matrix.te.txt real_matrix.tr.txt model
-
-use real_matrix.te.txt for hold-out validation
-
-> mf-train -v 5 real_matrix.tr.txt
-
-do five-fold cross validation
-
-> mf-train -f 2 --nmf real_matrix.tr.txt
-
-do non-negative matrix factorization with generalized KL-divergence
-
-> mf-train --quiet real_matrix.tr.txt
-
-do not print messages to the screen
-
-> mf-train --disk real_matrix.tr.txt
-
-do disk-level training
-
-> mf-predict real_matrix.te.txt model output
-
-do prediction
-
-> mf-predict -e 1 real_matrix.te.txt model output
-
-do prediction and output MAE
-
-
-
-Library Usage
-=============
-
-These structures and functions are declared in the header file `mf.h.' You need
-to #include `mf.h' in your C/C++ source files and link your program with
-`mf.cpp.' Users can read `mf-train.cpp' and `mf-predict.cpp' as usage examples.
-
-Before predicting test data, we need to construct a model (`mf_model') using
-training data, which is either a C structure `mf_problem' or the path to the
-training file. For the first case, the whole data set needs to fit into
-memory. For the second case, a binary version of the training file will be
-created, and only some parts of the binary file are loaded at one time. Note
-that a model can also be saved in a file for later use. To evaluate the quality
-of a model, users can call an evaluation function in LIBMF with an `mf_problem'
-and an `mf_model.'
-
-
-There are four public data structures in LIBMF.
-
-- struct mf_node
-  {
-      mf_int u;
-      mf_int v;
-      mf_float r;
-  };
-
-  `mf_node' represents an element in a sparse matrix. `u' represents the row
-  index, `v' represents the column index, and `r' represents the value.
-
-
-- struct mf_problem
-  {
-      mf_int m;
-      mf_int n;
-      mf_long nnz;
-      struct mf_node *R;
-  };
-
-  `mf_problem' represents a sparse matrix. Each element is represented by
-  `mf_node.' `m' represents the number of rows, `n' represents the number of
-  columns, `nnz' represents the number of non-zero elements, and `R' is an
-  array of `mf_node' whose length is `nnz.'
-
-
-- struct mf_parameter
-  {
-      mf_int fun;
-      mf_int k;
-      mf_int nr_threads;
-      mf_int nr_bins;
-      mf_int nr_iters;
-      mf_float lambda_p1;
-      mf_float lambda_p2;
-      mf_float lambda_q1;
-      mf_float lambda_q2;
-      mf_float alpha;
-      mf_float c;
-      mf_float eta;
-      bool do_nmf;
-      bool quiet;
-      bool copy_data;
-  };
-
-  `mf_parameter' represents the parameters used for training. The meaning of
-  each variable is:
-
-  variable     meaning                                       default
-  ==================================================================
-  fun          loss function                                       0
-  k            number of latent factors                            8
-  nr_threads   number of threads used                             12
-  nr_bins      number of bins                                     20
-  nr_iters     number of iterations                               20
-  lambda_p1    coefficient of L1-norm regularization on P          0
-  lambda_p2    coefficient of L2-norm regularization on P        0.1
-  lambda_q1    coefficient of L1-norm regularization on Q          0
-  lambda_q2    coefficient of L2-norm regularization on Q        0.1
-  eta          learning rate                                     0.1
-  alpha        importance of negative entries                    0.1
-  c            desired value of negative entries              0.0001
-  do_nmf       perform non-negative MF (NMF)                   false
-  quiet        no outputs to stdout                            false
-  copy_data    copy data in training procedure                  true
-
-  There are two major algorithm categories in LIBMF. One is the stochastic
-  gradient method and the other one is the coordinate descent method. Both
-  of them support multi-threading. Currently, the only solver using the
-  coordinate descent method is the one implemented for fun=12; all other
-  types of loss functions, such as fun=0, use the stochastic gradient
-  method. Notice that when a framework does not support the parameters
-  specified, LIBMF may ignore them or throw an error.
-
-  LIBMF's framework for stochastic gradient method:
-
-      In LIBMF, we parallelize the computation by gridding the data matrix
-      into nr_bins^2 blocks. According to our experiments, results are not
-      sensitive to this parameter in either effectiveness or efficiency. In
-      most cases the default value should work well.
-
-      For disk-level training, `nr_bins' controls the memory usage
-      because one thread accesses an entire block at one time. If `nr_bins'
-      is 4 and `nr_threads' is 1, the expected usage of memory is 25% of the
-      memory needed to store the whole training matrix.
-
-      Suppose the training data is an `mf_problem.' By default, at the
-      beginning of the training procedure, the data matrix is copied because
-      it would be modified in the training process. To save memory,
-      `copy_data' can be set to false with the following effects.
-
-      (1) The raw data is directly used without being copied.
-      (2) The order of nodes may be changed.
-      (3) The value in each node may become slightly different.
-
-      Note that `copy_data' is invalid for disk-level training.
-
-      To obtain a parameter with default values, use the function
-      `mf_get_default_param.'
-
-      Note that parameters `alpha' and `c' are ignored under this framework.
-
-  LIBMF's framework for coordinate descent method:
-
-      Currently, only one solver is implemented under this framework. It
-      minimizes the squared errors over the whole training matrix. Its
-      regularization function is the Frobenius norm on the two factor
-      matrices P and Q. Note that the original training matrix R (m-by-n) is
-      approximated by P^TQ. This solver requires two copies of the original
-      positive entries if `copy_data' is true. That is, if your input data is
-      50MB, LIBMF may need 150MB of memory in total for data storage. By
-      setting `copy_data' to false, LIBMF will only make one extra copy.
-      Disk-level training is not supported.
-
-      Parameters recognized by this framework are `fun,' `k,' `nr_threads,'
-      `nr_iters,' `lambda_p2,' `lambda_q2,' `alpha,' `c,' `quiet,' and
-      `copy_data.'
-
-      Unlike the standard C++ thread class used in the stochastic gradient
-      method's framework, the parallel computation here relies on OpenMP, so
-      please make sure your compiler supports it.
-
-
-- struct mf_model
-  {
-      mf_int fun;
-      mf_int m;
-      mf_int n;
-      mf_int k;
-      mf_float b;
-      mf_float *P;
-      mf_float *Q;
-  };
-
-  `mf_model' is used to store models learned by LIBMF. `fun' indicates the
-  loss function of the solved MF problem. `m' represents the number of rows,
-  `n' represents the number of columns, `k' represents the number of latent
-  factors, and `b' is the average of all elements in the training matrix. `P'
-  is used to store a k-by-m matrix in column-oriented format. For example, if
-  `P' stores a 3x4 matrix, then the content of `P' is:
-
-      P11 P21 P31 P12 P22 P32 P13 P23 P33 P14 P24 P34
-
-  `Q' is used to store a k-by-n matrix in the same manner.
-
-
-Functions available in LIBMF include:
-
-
-- mf_parameter mf_get_default_param();
-
-  Get default parameters.
-
-- mf_int mf_save_model(struct mf_model const *model, char const *path);
-
-  Save a model. It returns 0 on success and 1 on failure.
-
-- struct mf_model* mf_load_model(char const *path);
-
-  Load a model. If the model could not be loaded, a nullptr is returned.
-
-- void mf_destroy_model(struct mf_model **model);
-
-  Destroy a model.
-
-- struct mf_model* mf_train(
-      struct mf_problem const *prob,
-      mf_parameter param);
-
-  Train a model. A nullptr is returned on failure.
-
-- struct mf_model* mf_train_on_disk(
-      char const *tr_path,
-      mf_parameter param);
-
-  Train a model while parts of the data are kept on disk to reduce memory
-  usage. A nullptr is returned on failure.
-
-  Notice: the model is still fully loaded during the training process.
-
-- struct mf_model* mf_train_with_validation(
-      struct mf_problem const *tr,
-      struct mf_problem const *va,
-      mf_parameter param);
-
-  Train a model with training set `tr' and validation set `va.' The
-  evaluation criterion of the validation set is printed at each iteration.
-
-- struct mf_model* mf_train_with_validation_on_disk(
-      char const *tr_path,
-      char const *va_path,
-      mf_parameter param);
-
-  Train a model using the training file `tr_path' and validation file
-  `va_path' for holdout validation. The same strategy is used to save memory
-  as in `mf_train_on_disk.' It also prints the same information as
-  `mf_train_with_validation.'
-
-  Notice: LIBMF assumes that the model and the validation set can be fully
-  loaded into memory.
-
-- mf_float mf_cross_validation(
-      struct mf_problem const *prob,
-      mf_int nr_folds,
-      mf_parameter param);
-
-  Do cross validation with `nr_folds' folds.
-
-- mf_float mf_predict(
-      struct mf_model const *model,
-      mf_int p_idx,
-      mf_int q_idx);
-
-  Predict the value at the position (p_idx, q_idx). The predicted value is a
-  real number for RVMF or OCMF. For BMF, the prediction values are in
-  {-1, 1}. If `p_idx' or `q_idx' cannot be found in the training set,
-  the function returns the average (mode if BMF) of all values in the
-  training matrix.
-
-- mf_double calc_rmse(mf_problem *prob, mf_model *model);
-
-  Calculate the RMSE of the model on a test set `prob.' It can be used to
-  evaluate the result of real-valued MF.
-
-- mf_double calc_mae(mf_problem *prob, mf_model *model);
-
-  Calculate the MAE of the model on a test set `prob.' It can be used to
-  evaluate the result of real-valued MF.
-
-- mf_double calc_gkl(mf_problem *prob, mf_model *model);
-
-  Calculate the generalized KL-divergence of the model on a test set `prob.'
-  It can be used to evaluate the result of non-negative RVMF.
-
-- mf_double calc_logloss(mf_problem *prob, mf_model *model);
-
-  Calculate the logarithmic loss of the model on a test set `prob.' It can
-  be used to evaluate the result of BMF.
-
-- mf_double calc_accuracy(mf_problem *prob, mf_model *model);
-
-  Calculate the accuracy of the model on a test set `prob.' It can be used
-  to evaluate the result of BMF.
-
-- mf_double calc_mpr(mf_problem *prob, mf_model *model, bool transpose);
-
-  Calculate the MPR of the model on a test set `prob.' If `transpose' is
-  `false,' row-oriented MPR is calculated; otherwise, column-oriented MPR.
-  It can be used to evaluate the result of OCMF.
-
-- mf_double calc_auc(mf_problem *prob, mf_model *model, bool transpose);
-
-  Calculate the row-oriented AUC of the model on a test set `prob' if
-  `transpose' is `false.' For column-oriented AUC, set `transpose' to
-  `true.' It can be used to evaluate the result of OCMF.
-
-
-
-SSE, AVX, and OpenMP
-====================
-
-LIBMF utilizes SSE instructions to accelerate the computation. If you cannot
-use SSE on your platform, then please comment out
-
-    DFLAG = -DUSESSE
-
-in the Makefile to disable SSE.
-
-Some modern CPUs support AVX, which is more powerful than SSE. To enable AVX,
-please comment out
-
-    DFLAG = -DUSESSE
-
-and uncomment the following lines in the Makefile.
-
-    DFLAG = -DUSEAVX
-    CXXFLAGS += -mavx
-
-If OpenMP is not available on your platform, please comment out the following
-lines in the Makefile.
-
-    DFLAG += -DUSEOMP
-    CXXFLAGS += -fopenmp
-
-Notice: Please always run `make clean all' if these flags are changed.
-
-
-
-Building Windows and Mac Binaries
-=================================
-
-- Windows
-
-  Windows binaries are in the directory `windows.' To build them via the
-  command-line tools of Microsoft Visual Studio, use the following steps:
-
-  1. Open a DOS command box (or Developer Command Prompt for Visual Studio)
-     and go to the libmf directory. If environment variables of VC++ have
-     not been set, type
-
-     "C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\bin\amd64\vcvars64.bat"
-
-     You may have to modify the above command according to which version of
-     VC++ you have and where it is installed.
-
-  2. Type
-
-     nmake -f Makefile.win clean all
-
-  3. (optional) To build the shared library mf.dll, type
-
-     nmake -f Makefile.win lib
-
-- Mac
-
-  To compile LIBMF on Mac, a GCC compiler is required, and users need to
-  slightly modify the Makefile. The following instructions are tested with
-  GCC 4.9.
-
-  1. Set the compiler path to your GCC compiler. For example, the first
-     line in the Makefile can be
-
-     CXX = g++-4.9
-
-  2. Remove `-march=native' from `CXXFLAGS.' The second line in the Makefile
-     should be
-
-     CXXFLAGS = -O3 -pthread -std=c++0x
-
-  3. If AVX is enabled, we add `-Wa,-q' to the `CXXFLAGS,' so the previous
-     `CXXFLAGS' becomes
-
-     CXXFLAGS = -O3 -pthread -std=c++0x -Wa,-q
-
-
-
-References
-==========
-
-[1] W.-S. Chin, Y. Zhuang, Y.-C. Juan, and C.-J. Lin. A Fast Parallel
-Stochastic Gradient Method for Matrix Factorization in Shared Memory Systems.
-ACM TIST, 2015. (www.csie.ntu.edu.tw/~cjlin/papers/libmf/libmf_journal.pdf)
-
-[2] W.-S. Chin, Y. Zhuang, Y.-C. Juan, and C.-J. Lin. A Learning-rate Schedule
-for Stochastic Gradient Methods to Matrix Factorization. PAKDD, 2015.
-(www.csie.ntu.edu.tw/~cjlin/papers/libmf/mf_adaptive_pakdd.pdf)
-
-[3] W.-S. Chin, B.-W. Yuan, M.-Y. Yang, Y. Zhuang, Y.-C. Juan, and C.-J. Lin.
-LIBMF: A Library for Parallel Matrix Factorization in Shared-memory Systems.
-JMLR, 2015.
-(www.csie.ntu.edu.tw/~cjlin/papers/libmf/libmf_open_source.pdf)
-
-For any questions and comments, please email:
-
-cjlin@csie.ntu.edu.tw