libmf 0.1.0
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/LICENSE.txt +22 -0
- data/README.md +125 -0
- data/ext/libmf/extconf.rb +18 -0
- data/lib/libmf.bundle +0 -0
- data/lib/libmf.rb +26 -0
- data/lib/libmf/ffi.rb +62 -0
- data/lib/libmf/model.rb +112 -0
- data/lib/libmf/version.rb +3 -0
- data/vendor/libmf/COPYRIGHT +31 -0
- data/vendor/libmf/Makefile +34 -0
- data/vendor/libmf/Makefile.win +36 -0
- data/vendor/libmf/README +637 -0
- data/vendor/libmf/demo/all_one_matrix.te.txt +1382 -0
- data/vendor/libmf/demo/all_one_matrix.tr.txt +5172 -0
- data/vendor/libmf/demo/binary_matrix.te.txt +1312 -0
- data/vendor/libmf/demo/binary_matrix.tr.txt +4937 -0
- data/vendor/libmf/demo/demo.bat +40 -0
- data/vendor/libmf/demo/demo.sh +58 -0
- data/vendor/libmf/demo/real_matrix.te.txt +794 -0
- data/vendor/libmf/demo/real_matrix.tr.txt +5000 -0
- data/vendor/libmf/mf-predict.cpp +207 -0
- data/vendor/libmf/mf-train.cpp +378 -0
- data/vendor/libmf/mf.cpp +4683 -0
- data/vendor/libmf/mf.def +21 -0
- data/vendor/libmf/mf.h +130 -0
- data/vendor/libmf/windows/mf-predict.exe +0 -0
- data/vendor/libmf/windows/mf-train.exe +0 -0
- data/vendor/libmf/windows/mf.dll +0 -0
- metadata +142 -0
data/vendor/libmf/Makefile.win ADDED
@@ -0,0 +1,36 @@
CXX = cl.exe
CFLAGS = /W4 /nologo /O2 /EHsc /D "_CRT_SECURE_NO_DEPRECATE"

# Choose one instruction set to accelerate LIBMF
# 1. use SSE
CPUSET = /D "USESSE"
# 2. use AVX
#CPUSET = /D "USEAVX" /arch:AVX
# 3. no acceleration
#CPUSET =

# Always use OpenMP because Visual Studio supports it.
CFLAGS = $(CFLAGS) $(CPUSET) /D "USEOMP" /openmp
# To disable OpenMP, please use the command below.
#CFLAGS = $(CFLAGS) $(CPUSET)

TARGET = windows

all: $(TARGET)\mf-train.exe $(TARGET)\mf-predict.exe lib

$(TARGET)\mf-predict.exe: mf.h mf-predict.cpp mf.obj
	$(CXX) $(CFLAGS) mf-predict.cpp mf.obj -Fe$(TARGET)\mf-predict.exe

$(TARGET)\mf-train.exe: mf.h mf-train.cpp mf.obj
	$(CXX) $(CFLAGS) mf-train.cpp mf.obj -Fe$(TARGET)\mf-train.exe

mf.obj: mf.cpp mf.h
	$(CXX) $(CFLAGS) -c mf.cpp

lib: mf.cpp mf.def mf.h
	$(CXX) $(CFLAGS) -LD mf.cpp -Fe$(TARGET)\mf -link -DEF:mf.def

clean:
	-erase /Q *.obj *.dll *.lib *.exe $(TARGET)\.
data/vendor/libmf/README ADDED
@@ -0,0 +1,637 @@
LIBMF is a library for large-scale sparse matrix factorization. For the
optimization problem it solves and the overall framework, please refer to [3].


Table of Contents
=================

- Installation
- Data Format
- Model Format
- Command Line Usage
- Examples
- Library Usage
- SSE, AVX, and OpenMP
- Building Windows and Mac Binaries
- References


Installation
============

- Requirements

  To compile LIBMF, a compiler which supports C++11 is required. LIBMF can
  use SSE, AVX, and OpenMP for acceleration. See the section `SSE, AVX, and
  OpenMP' if you want to disable or enable these features.

- Unix & Cygwin

  Type `make' to build `mf-train' and `mf-predict.'

- Windows & Mac

  See `Building Windows and Mac Binaries' to compile. For Windows, pre-built
  binaries are available in the directory `windows.'


Data Format
===========

LIBMF's command-line tool can be used to factorize matrices with real or binary
values. Each line in the training file stores a tuple,

    <row_idx> <col_idx> <value>

which records an entry of the training matrix. In the `demo' directory, the
files `real_matrix.tr.txt' and `real_matrix.te.txt' are the training and test
sets for a demonstration of real-valued matrix factorization (RVMF). For binary
matrix factorization (BMF), each <value> is in {-1, 1} as shown in
`binary_matrix.tr.txt' and `binary_matrix.te.txt.' For one-class MF, all
<value>'s are positive. See `all_one_matrix.tr.txt' and `all_one_matrix.te.txt'
as examples.

Note: If the values in the test set are unknown, please put dummy zeros.
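
For instance, the first few lines of a real-valued training file could look
like this (hypothetical row indices, column indices, and values):

    0 0 4.5
    0 3 2.0
    1 2 -0.5
    2 1 3.0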


Model Format
============

LIBMF factorizes a training matrix `R' into a k-by-m matrix `P' and a
k-by-n matrix `Q' such that `R' is approximated by P^TQ. After the training
process is finished, the two factor matrices `P' and `Q' are stored into a
model file. The file starts with a header including:

    `f': the loss function of the solved MF problem
    `m': the number of rows in the training matrix,
    `n': the number of columns in the training matrix,
    `k': the number of latent factors,
    `b': the average of all elements in the training matrix.

After the header, the columns of `P' and `Q' are stored line by line. In
each line, there are two leading tokens followed by the values of a
column. The first token is the name of the stored column, and the second
token indicates the type of values. If the second token is `T', the column is
a real vector. Otherwise, all values in the column are NaN. For example, if

        [1 NaN 2]      [-1 -2]
    P = |3 NaN 4|, Q = |-3 -4|,
        [5 NaN 6]      [-5 -6]

and the value `b' is 0.5, the content of the model file is:

    --------model file--------
    f 0
    m 3
    n 2
    k 3
    b 0.5
    p0 T 1 3 5
    p1 F 0 0 0
    p2 T 2 4 6
    q0 T -1 -3 -5
    q1 T -2 -4 -6
    --------------------------
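
To make the format concrete: for this real-valued model, the predicted value of
entry (0, 0) is the inner product of columns `p0' and `q0', namely
1*(-1) + 3*(-3) + 5*(-5) = -35. A column typed `F' (such as `p1') corresponds
to a row that never appeared in the training data, so predictions involving it
fall back to the average `b' (here 0.5), as described for `mf_predict' in the
Library Usage section.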


Command Line Usage
==================

- `mf-train'

  usage: mf-train [options] training_set_file [model_file]

  options:
  -l1 <lambda>,<lambda>: set L1-regularization parameters for P and Q.
      (default 0) If only one value is specified, P and Q share the same
      lambda.
  -l2 <lambda>,<lambda>: set L2-regularization parameters for P and Q.
      (default 0.1) If only one value is specified, P and Q share the same
      lambda.
  -f <loss>: specify loss function (default 0)
      for real-valued matrix factorization
       0 -- squared error (L2-norm)
       1 -- absolute error (L1-norm)
       2 -- generalized KL-divergence (--nmf is required)
      for binary matrix factorization
       5 -- logarithmic error
       6 -- squared hinge loss
       7 -- hinge loss
      for one-class matrix factorization
      10 -- row-oriented pair-wise logarithmic loss
      11 -- column-oriented pair-wise logarithmic loss
      12 -- squared error (L2-norm)
  -k <dimensions>: set number of dimensions (default 8)
  -t <iter>: set number of iterations (default 20)
  -r <eta>: set initial learning rate (default 0.1)
  -a <alpha>: set coefficient of negative entries' loss (default 1)
  -c <c>: set value of negative entries (default 0.0001).
      Every positive entry is assumed to be 1.
  -s <threads>: set number of threads (default 12)
  -n <bins>: set number of bins (may be adjusted by LIBMF for speed)
  -p <path>: set path to the validation set
  -v <fold>: set number of folds for cross validation
  --quiet: quiet mode (no outputs)
  --nmf: perform non-negative matrix factorization
  --disk: perform disk-level training (will create a buffer file)

  `mf-train' is the main training command of LIBMF. At each iteration, the
  following information is printed.

  - iter: the index of the iteration.
  - tr_*: * is the evaluation criterion on the training set.
  - tr_*+: * is the evaluation criterion on the positive entries in the
    training set.
  - tr_*-: * is the evaluation criterion on the negative entries in the
    training set.
  - va_*: the same criterion on the validation set if `-p' is set.
  - va_*+: * is the evaluation criterion on the positive entries in the
    validation set.
  - va_*-: * is the evaluation criterion on the negative entries in the
    validation set.
  - obj: objective function value.
  - reg: regularization term.

  Here `tr_*' and `obj' are estimations because calculating the true values
  can be time-consuming. Different solvers can print different combinations
  of those values.

  For different losses, the criterion to be printed is listed below.

  <loss>: <evaluation criterion>
  - 0: root mean square error (RMSE)
  - 1: mean absolute error (MAE)
  - 2: generalized KL-divergence (KL)
  - 5: logarithmic loss
  - 6 & 7: accuracy
  - 10 & 11: pair-wise logarithmic loss in Bayesian personalized ranking
  - 12: sum of squared errors. The label of positive entries is 1, while
    negative entries' value is set using the command-line option -c.
- `mf-predict'

  usage: mf-predict [options] test_file model_file output_file

  options:
  -e <criterion>: set the evaluation criterion (default 0)
       0: root mean square error
       1: mean absolute error
       2: generalized KL-divergence
       5: logarithmic loss
       6: accuracy
      10: row-oriented mean percentile rank (row-oriented MPR)
      11: column-oriented mean percentile rank (column-oriented MPR)
      12: row-oriented area under ROC curve (row-oriented AUC)
      13: column-oriented area under ROC curve (column-oriented AUC)

  `mf-predict' outputs the prediction values of the entries specified in
  `test_file' to `output_file.' The selected criterion will be printed
  as well.


Examples
========

This section gives example commands of LIBMF using the data sets in the `demo'
directory. In `demo,' a shell script `demo.sh' can be run for demonstration.

> mf-train real_matrix.tr.txt model

train a model using the default parameters

> mf-train -l1 0.05 -l2 0.01 real_matrix.tr.txt model

train a model with the following regularization coefficients:

    coefficient of L1-norm regularization on P = 0.05
    coefficient of L1-norm regularization on Q = 0.05
    coefficient of L2-norm regularization on P = 0.01
    coefficient of L2-norm regularization on Q = 0.01

> mf-train -l1 0.015,0 -l2 0.01,0.005 real_matrix.tr.txt model

train a model with the following regularization coefficients:

    coefficient of L1-norm regularization on P = 0.015
    coefficient of L1-norm regularization on Q = 0
    coefficient of L2-norm regularization on P = 0.01
    coefficient of L2-norm regularization on Q = 0.005

> mf-train -f 5 -l1 0,0.02 -k 100 -t 30 -r 0.02 -s 4 binary_matrix.tr.txt model

train a BMF model using logarithmic loss and the following parameters:

    coefficient of L1-norm regularization on P = 0
    coefficient of L1-norm regularization on Q = 0.02
    latent factors = 100
    iterations = 30
    learning rate = 0.02
    threads = 4

> mf-train -p real_matrix.te.txt real_matrix.tr.txt model

use real_matrix.te.txt for hold-out validation

> mf-train -v 5 real_matrix.tr.txt

do five-fold cross validation

> mf-train -f 2 --nmf real_matrix.tr.txt

do non-negative matrix factorization with generalized KL-divergence

> mf-train --quiet real_matrix.tr.txt

do not print messages to screen

> mf-train --disk real_matrix.tr.txt

do disk-level training

> mf-predict real_matrix.te.txt model output

do prediction

> mf-predict -e 1 real_matrix.te.txt model output

do prediction and output MAE


Library Usage
=============

These structures and functions are declared in the header file `mf.h.' You need
to #include `mf.h' in your C/C++ source files and link your program with
`mf.cpp.' Users can read `mf-train.cpp' and `mf-predict.cpp' as usage examples.

Before predicting test data, we need to construct a model (`mf_model') using
training data, which is either a C structure `mf_problem' or the path to the
training file. In the first case, the whole data set needs to fit in memory.
In the second case, a binary version of the training file is created, and only
some parts of the binary file are loaded at one time. Note that a model can
also be saved in a file for later use. To evaluate the quality of a model,
users can call an evaluation function in LIBMF with an `mf_problem' and an
`mf_model.'


There are four public data structures in LIBMF.

- struct mf_node
  {
      mf_int u;
      mf_int v;
      mf_float r;
  };

  `mf_node' represents an element in a sparse matrix. `u' represents the row
  index, `v' represents the column index, and `r' represents the value.

- struct mf_problem
  {
      mf_int m;
      mf_int n;
      mf_long nnz;
      struct mf_node *R;
  };

  `mf_problem' represents a sparse matrix. Each element is represented by
  `mf_node.' `m' represents the number of rows, `n' represents the number of
  columns, `nnz' represents the number of non-zero elements, and `R' is an
  array of `mf_node' whose length is `nnz.'
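
  As a concrete illustration (a minimal sketch, not part of the original
  examples; the matrix values are made up, and it assumes the vendored
  `mf.h,' whose declarations live in namespace mf when compiled as C++),
  an in-memory `mf_problem' can be filled like this:

      #include "mf.h"
      #include <vector>

      using namespace mf; // as in mf-train.cpp

      int main()
      {
          std::vector<mf_node> nodes;
          nodes.push_back(mf_node{0, 0, 4.0f}); // R(0,0) = 4
          nodes.push_back(mf_node{0, 1, 1.0f}); // R(0,1) = 1
          nodes.push_back(mf_node{1, 1, 3.0f}); // R(1,1) = 3

          mf_problem prob;
          prob.m = 2;                       // number of rows
          prob.n = 2;                       // number of columns
          prob.nnz = (mf_long)nodes.size(); // number of stored entries
          prob.R = nodes.data();            // array of mf_node
          return 0;
      }

  Keep the `nodes' buffer alive for as long as `prob' is used, since `R'
  only points into it.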

- struct mf_parameter
  {
      mf_int fun;
      mf_int k;
      mf_int nr_threads;
      mf_int nr_bins;
      mf_int nr_iters;
      mf_float lambda_p1;
      mf_float lambda_p2;
      mf_float lambda_q1;
      mf_float lambda_q2;
      mf_float alpha;
      mf_float c;
      mf_float eta;
      bool do_nmf;
      bool quiet;
      bool copy_data;
  };

  `mf_parameter' represents the parameters used for training. The meaning of
  each variable is:
  variable     meaning                                       default
  ================================================================
  fun          loss function                                 0
  k            number of latent factors                      8
  nr_threads   number of threads used                        12
  nr_bins      number of bins                                20
  nr_iters     number of iterations                          20
  lambda_p1    coefficient of L1-norm regularization on P    0
  lambda_p2    coefficient of L2-norm regularization on P    0.1
  lambda_q1    coefficient of L1-norm regularization on Q    0
  lambda_q2    coefficient of L2-norm regularization on Q    0.1
  eta          learning rate                                 0.1
  alpha        importance of negative entries                1
  c            desired value of negative entries             0.0001
  do_nmf       perform non-negative MF (NMF)                 false
  quiet        no outputs to stdout                          false
  copy_data    copy data in training procedure               true
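
  As an illustrative mapping (a sketch; the values mirror the earlier
  command-line example `-l1 0.015,0 -l2 0.01,0.005'), the same settings in
  library code would be:

      mf_parameter param = mf_get_default_param();
      param.lambda_p1 = 0.015f; // -l1, value for P
      param.lambda_q1 = 0.0f;   // -l1, value for Q
      param.lambda_p2 = 0.01f;  // -l2, value for P
      param.lambda_q2 = 0.005f; // -l2, value for Q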

  There are two major algorithm categories in LIBMF. One is the stochastic
  gradient method and the other one is the coordinate descent method. Both
  of them support multi-threading. Currently, the only solver using the
  coordinate descent method is the one implemented for fun=12. All other
  types of loss functions such as fun=0 use the stochastic gradient method.
  Notice that when a framework does not support the parameters specified,
  LIBMF may ignore them or throw an error.

  LIBMF's framework for the stochastic gradient method:

    In LIBMF, we parallelize the computation by gridding the data matrix
    into nr_bins^2 blocks. According to our experiments, neither
    effectiveness nor efficiency is sensitive to this parameter. In most
    cases the default value should work well.

    For disk-level training, `nr_bins' controls the memory usage because
    one thread accesses an entire block at one time. If `nr_bins' is 4 and
    `nr_threads' is 1, the expected usage of memory is 25% of the memory
    needed to store the whole training matrix.

    Suppose the training data is an `mf_problem.' By default, the data
    matrix is copied at the beginning of the training procedure because it
    would be modified in the training process. To save memory, `copy_data'
    can be set to false with the following effects.

    (1) The raw data is directly used without being copied.
    (2) The order of nodes may be changed.
    (3) The value in each node may become slightly different.

    Note that `copy_data' has no effect for disk-level training.

    To obtain a parameter with default values, use the function
    `mf_get_default_param.'

    Note that the parameters `alpha' and `c' are not used under this
    framework.

  LIBMF's framework for the coordinate descent method:

    Currently, only one solver is implemented under this framework. It
    minimizes the squared error over the whole training matrix. Its
    regularization function is the Frobenius norm on the two factor
    matrices P and Q. Note that the original training matrix R (m-by-n) is
    approximated by P^TQ. This solver requires two copies of the original
    positive entries if `copy_data' is true. That is, if your input data is
    50MB, LIBMF may need 150MB of memory in total for data storage. By
    setting `copy_data' to false, LIBMF will only make one extra copy.
    Disk-level training is not supported.

    Parameters recognized by this framework are `fun,' `k,' `nr_threads,'
    `nr_iters,' `lambda_p2,' `lambda_q2,' `alpha,' `c,' `quiet,' and
    `copy_data.'

    Unlike the standard C++ thread class used in the stochastic gradient
    method's framework, the parallel computation here relies on OpenMP, so
    please make sure your compiler supports it.

- struct mf_model
  {
      mf_int fun;
      mf_int m;
      mf_int n;
      mf_int k;
      mf_float b;
      mf_float *P;
      mf_float *Q;
  };

  `mf_model' is used to store models learned by LIBMF. `fun' indicates the
  loss function of the solved MF problem. `m' represents the number of rows,
  `n' represents the number of columns, `k' represents the number of latent
  factors, and `b' is the average of all elements in the training matrix. `P'
  is used to store a kxm matrix in column-oriented format. For example, if
  `P' stores a 3x4 matrix, then the content of `P' is:

      P11 P21 P31 P12 P22 P32 P13 P23 P33 P14 P24 P34

  `Q' is used to store a kxn matrix in the same manner.
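
  With this layout, column `u' of `P' occupies P[u*k] through P[u*k + k - 1].
  As a hedged sketch (not from the original README; it assumes
  `using namespace mf;' as in mf-train.cpp), the raw score behind a
  prediction for a pair (u, v) seen in training can be reproduced by a plain
  dot product:

      // Assumes 0 <= u < model->m and 0 <= v < model->n.
      mf_float manual_score(mf_model const *model, mf_int u, mf_int v)
      {
          mf_float const *p = model->P + (mf_long)u * model->k; // column u of P
          mf_float const *q = model->Q + (mf_long)v * model->k; // column v of Q
          mf_float sum = 0;
          for (mf_int d = 0; d < model->k; ++d)
              sum += p[d] * q[d];
          return sum;
      }

  For real-valued losses this should match `mf_predict' on seen indices; for
  BMF the library additionally maps the score into {-1, 1} (see `mf_predict'
  below).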


Functions available in LIBMF include:


- mf_parameter mf_get_default_param();

  Get default parameters.

- mf_int mf_save_model(struct mf_model const *model, char const *path);

  Save a model. It returns 0 on success and 1 on failure.

- struct mf_model* mf_load_model(char const *path);

  Load a model. If the model could not be loaded, a nullptr is returned.

- void mf_destroy_model(struct mf_model **model);

  Destroy a model.

- struct mf_model* mf_train(
      struct mf_problem const *prob,
      mf_parameter param);

  Train a model. A nullptr is returned on failure. (A usage sketch follows
  this list.)

- struct mf_model* mf_train_on_disk(
      char const *tr_path,
      mf_parameter param);

  Train a model while parts of the data are kept on disk to reduce memory
  usage. A nullptr is returned on failure.

  Notice: the model is still fully loaded during the training process.

- struct mf_model* mf_train_with_validation(
      struct mf_problem const *tr,
      struct mf_problem const *va,
      mf_parameter param);

  Train a model with training set `tr' and validation set `va.' The
  evaluation criterion of the validation set is printed at each iteration.

- struct mf_model* mf_train_with_validation_on_disk(
      char const *tr_path,
      char const *va_path,
      mf_parameter param);

  Train a model using the training file `tr_path' and validation file
  `va_path' for hold-out validation. The same strategy is used to save
  memory as in `mf_train_on_disk.' It also prints the same information as
  `mf_train_with_validation.'

  Notice: LIBMF assumes that the model and the validation set can be fully
  loaded into memory.

- mf_float mf_cross_validation(
      struct mf_problem const *prob,
      mf_int nr_folds,
      mf_parameter param);

  Do cross validation with `nr_folds' folds.

- mf_float mf_predict(
      struct mf_model const *model,
      mf_int p_idx,
      mf_int q_idx);

  Predict the value at the position (p_idx, q_idx). The predicted value is a
  real number for RVMF or OCMF. For BMF, the prediction value is in {-1, 1}.
  If `p_idx' or `q_idx' cannot be found in the training set, the function
  returns the average (mode if BMF) of all values in the training matrix.

- mf_double calc_rmse(mf_problem *prob, mf_model *model);

  Calculate the RMSE of the model on a test set `prob.' It can be used to
  evaluate the result of real-valued MF.

- mf_double calc_mae(mf_problem *prob, mf_model *model);

  Calculate the MAE of the model on a test set `prob.' It can be used to
  evaluate the result of real-valued MF.

- mf_double calc_gkl(mf_problem *prob, mf_model *model);

  Calculate the generalized KL-divergence of the model on a test set `prob.'
  It can be used to evaluate the result of non-negative RVMF.

- mf_double calc_logloss(mf_problem *prob, mf_model *model);

  Calculate the logarithmic loss of the model on a test set `prob.' It can
  be used to evaluate the result of BMF.

- mf_double calc_accuracy(mf_problem *prob, mf_model *model);

  Calculate the accuracy of the model on a test set `prob.' It can be used
  to evaluate the result of BMF.

- mf_double calc_mpr(mf_problem *prob, mf_model *model, bool transpose);

  Calculate the MPR of the model on a test set `prob.' If `transpose' is
  `false,' row-oriented MPR is calculated; otherwise, column-oriented MPR.
  It can be used to evaluate the result of OCMF.

- mf_double calc_auc(mf_problem *prob, mf_model *model, bool transpose);

  Calculate the row-oriented AUC of the model on a test set `prob' if
  `transpose' is `false.' For column-oriented AUC, set `transpose' to
  `true.' It can be used to evaluate the result of OCMF.
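
The following is a minimal end-to-end sketch (an illustration, not from the
original README; the file names `train.txt' and `model.txt' are placeholders,
and error handling is reduced to early returns):

    #include "mf.h"
    #include <cstdio>

    using namespace mf; // as in mf-train.cpp

    int main()
    {
        mf_parameter param = mf_get_default_param();
        param.k = 16;        // latent factors
        param.nr_iters = 30; // iterations

        // Train from a file in the <row_idx> <col_idx> <value> format,
        // keeping parts of the data on disk.
        mf_model *model = mf_train_on_disk("train.txt", param);
        if (model == nullptr)
        {
            std::fprintf(stderr, "training failed\n");
            return 1;
        }

        // Predict the entry at row 0, column 0.
        std::printf("prediction at (0, 0): %f\n", mf_predict(model, 0, 0));

        if (mf_save_model(model, "model.txt") != 0)
            std::fprintf(stderr, "could not save the model\n");

        mf_destroy_model(&model);
        return 0;
    }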


SSE, AVX, and OpenMP
====================

LIBMF utilizes SSE instructions to accelerate the computation. If you cannot
use SSE on your platform, then please comment out

    DFLAG = -DUSESSE

in Makefile to disable SSE.

Some modern CPUs support AVX, which is more powerful than SSE. To enable AVX,
please comment out

    DFLAG = -DUSESSE

and uncomment the following lines in Makefile.

    DFLAG = -DUSEAVX
    CFLAGS += -mavx

If OpenMP is not available on your platform, please comment out the following
lines in Makefile.

    DFLAG += -DUSEOMP
    CXXFLAGS += -fopenmp

Notice: Please always run `make clean all' if these flags are changed.


Building Windows and Mac Binaries
=================================

- Windows

  Windows binaries are in the directory `windows.' To build them via
  command-line tools of Microsoft Visual Studio, use the following steps:

  1. Open a DOS command box (or Developer Command Prompt for Visual Studio)
     and go to the libmf directory. If environment variables of VC++ have
     not been set, type

     "C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\bin\amd64\vcvars64.bat"

     You may have to modify the above command according to which version of
     VC++ you have and where it is installed.

  2. Type

     nmake -f Makefile.win clean all

  3. (optional) To build the shared library mf.dll, type

     nmake -f Makefile.win lib

- Mac

  To compile LIBMF on Mac, a GCC compiler is required, and users need to
  slightly modify the Makefile. The following instructions are tested with
  GCC 4.9.

  1. Set the compiler path to your GCC compiler. For example, the first
     line in the Makefile can be

     CXX = g++-4.9

  2. Remove `-march=native' from `CXXFLAGS.' The second line in the Makefile
     should be

     CXXFLAGS = -O3 -pthread -std=c++0x

  3. If AVX is enabled, add `-Wa,-q' to the `CXXFLAGS,' so the previous
     `CXXFLAGS' becomes

     CXXFLAGS = -O3 -pthread -std=c++0x -Wa,-q


References
==========

[1] W.-S. Chin, Y. Zhuang, Y.-C. Juan, and C.-J. Lin. A Fast Parallel
Stochastic Gradient Method for Matrix Factorization in Shared Memory Systems.
ACM TIST, 2015. (www.csie.ntu.edu.tw/~cjlin/papers/libmf/libmf_journal.pdf)

[2] W.-S. Chin, Y. Zhuang, Y.-C. Juan, and C.-J. Lin. A Learning-rate Schedule
for Stochastic Gradient Methods to Matrix Factorization. PAKDD, 2015.
(www.csie.ntu.edu.tw/~cjlin/papers/libmf/mf_adaptive_pakdd.pdf)

[3] W.-S. Chin, B.-W. Yuan, M.-Y. Yang, Y. Zhuang, Y.-C. Juan, and C.-J. Lin.
LIBMF: A Library for Parallel Matrix Factorization in Shared-memory Systems.
JMLR, 2015.
(www.csie.ntu.edu.tw/~cjlin/papers/libmf/libmf_open_source.pdf)

For any questions and comments, please email:

cjlin@csie.ntu.edu.tw