libmf 0.1.0

@@ -0,0 +1,4683 @@
1
+ #include <algorithm>
2
+ #include <cmath>
3
+ #include <condition_variable>
4
+ #include <cstdlib>
5
+ #include <cstring>
6
+ #include <fstream>
7
+ #include <iostream>
8
+ #include <iomanip>
9
+ #include <memory>
+ #include <mutex>
10
+ #include <numeric>
11
+ #include <queue>
12
+ #include <random>
13
+ #include <stdexcept>
14
+ #include <string>
15
+ #include <thread>
16
+ #include <unordered_set>
17
+ #include <vector>
18
+ #include <limits>
19
+
20
+ #include "mf.h"
21
+
22
+ #if defined USESSE
23
+ #include <pmmintrin.h>
24
+ #endif
25
+
26
+ #if defined USEAVX
27
+ #include <immintrin.h>
28
+ #endif
29
+
30
+ #if defined USEOMP
31
+ #include <omp.h>
32
+ #endif
33
+
34
+ namespace mf
35
+ {
36
+
37
+ using namespace std;
38
+
39
+ namespace // unnamed namespace
40
+ {
41
+
42
+ mf_int const kALIGNByte = 32;
43
+ mf_int const kALIGN = kALIGNByte/sizeof(mf_float);
44
+
45
+ //--------------------------------------
46
+ //---------Scheduler of Blocks----------
47
+ //--------------------------------------
48
+
49
+ class Scheduler
50
+ {
51
+ public:
52
+ Scheduler(mf_int nr_bins, mf_int nr_threads, vector<mf_int> cv_blocks);
53
+ mf_int get_job();
54
+ mf_int get_bpr_job(mf_int first_block, bool is_column_oriented);
55
+ void put_job(mf_int block, mf_double loss, mf_double error);
56
+ void put_bpr_job(mf_int first_block, mf_int second_block);
57
+ mf_double get_loss();
58
+ mf_double get_error();
59
+ mf_int get_negative(mf_int first_block, mf_int second_block,
60
+ mf_int m, mf_int n, bool is_column_oriented);
61
+ void wait_for_jobs_done();
62
+ void resume();
63
+ void terminate();
64
+ bool is_terminated();
65
+
66
+ private:
67
+ mf_int nr_bins;
68
+ mf_int nr_threads;
69
+ mf_int nr_done_jobs;
70
+ mf_int target;
71
+ mf_int nr_paused_threads;
72
+ bool terminated;
73
+ vector<mf_int> counts;
74
+ vector<mf_int> busy_p_blocks;
75
+ vector<mf_int> busy_q_blocks;
76
+ vector<mf_double> block_losses;
77
+ vector<mf_double> block_errors;
78
+ vector<minstd_rand0> block_generators;
79
+ unordered_set<mf_int> cv_blocks;
80
+ mutex mtx;
81
+ condition_variable cond_var;
82
+ default_random_engine generator;
83
+ uniform_real_distribution<mf_float> distribution;
84
+ priority_queue<pair<mf_float, mf_int>,
85
+ vector<pair<mf_float, mf_int>>,
86
+ greater<pair<mf_float, mf_int>>> pq;
87
+ };
88
+
89
+ Scheduler::Scheduler(mf_int nr_bins, mf_int nr_threads,
90
+ vector<mf_int> cv_blocks)
91
+ : nr_bins(nr_bins),
92
+ nr_threads(nr_threads),
93
+ nr_done_jobs(0),
94
+ target(nr_bins*nr_bins),
95
+ nr_paused_threads(0),
96
+ terminated(false),
97
+ counts(nr_bins*nr_bins, 0),
98
+ busy_p_blocks(nr_bins, 0),
99
+ busy_q_blocks(nr_bins, 0),
100
+ block_losses(nr_bins*nr_bins, 0),
101
+ block_errors(nr_bins*nr_bins, 0),
102
+ cv_blocks(cv_blocks.begin(), cv_blocks.end()),
103
+ distribution(0.0, 1.0)
104
+ {
105
+ for(mf_int i = 0; i < nr_bins*nr_bins; ++i)
106
+ {
107
+ if(this->cv_blocks.find(i) == this->cv_blocks.end())
108
+ pq.emplace(distribution(generator), i);
109
+ block_generators.push_back(minstd_rand0(rand()));
110
+ }
111
+ }
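+ // The constructor above places every block that is not held out for
+ // cross-validation into the priority queue with a random initial priority,
+ // and gives each block its own minstd_rand0 generator so negative sampling
+ // in get_negative() does not share one generator across threads.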
112
+
113
+ mf_int Scheduler::get_job()
114
+ {
115
+ bool is_found = false;
116
+ pair<mf_float, mf_int> block;
117
+
118
+ while(!is_found)
119
+ {
120
+ lock_guard<mutex> lock(mtx);
121
+ vector<pair<mf_float, mf_int>> locked_blocks;
122
+ mf_int p_block = 0;
123
+ mf_int q_block = 0;
124
+
125
+ while(!pq.empty())
126
+ {
127
+ block = pq.top();
128
+ pq.pop();
129
+
130
+ p_block = block.second/nr_bins;
131
+ q_block = block.second%nr_bins;
132
+
133
+ if(busy_p_blocks[p_block] || busy_q_blocks[q_block])
134
+ locked_blocks.push_back(block);
135
+ else
136
+ {
137
+ busy_p_blocks[p_block] = 1;
138
+ busy_q_blocks[q_block] = 1;
139
+ counts[block.second] += 1;
140
+ is_found = true;
141
+ break;
142
+ }
143
+ }
144
+
145
+ for(auto &block1 : locked_blocks)
146
+ pq.push(block1);
147
+ }
148
+
149
+ return block.second;
150
+ }
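+ // get_job() pops candidate blocks in increasing priority (update count plus
+ // a random tie-breaker) and hands out the first one whose row block and
+ // column block are both free; conflicting blocks are pushed back, so no two
+ // threads ever update the same rows of P or the same columns of Q at once.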
151
+
152
+ mf_int Scheduler::get_bpr_job(mf_int first_block, bool is_column_oriented)
153
+ {
154
+ lock_guard<mutex> lock(mtx);
155
+ mf_int another = first_block;
156
+ vector<pair<mf_float, mf_int>> locked_blocks;
157
+
158
+ while(!pq.empty())
159
+ {
160
+ pair<mf_float, mf_int> block = pq.top();
161
+ pq.pop();
162
+
163
+ mf_int p_block = block.second/nr_bins;
164
+ mf_int q_block = block.second%nr_bins;
165
+
166
+ auto is_rejected = [&] ()
167
+ {
168
+ if(is_column_oriented)
169
+ return first_block%nr_bins != q_block ||
170
+ busy_p_blocks[p_block];
171
+ else
172
+ return first_block/nr_bins != p_block ||
173
+ busy_q_blocks[q_block];
174
+ };
175
+
176
+ if(is_rejected())
177
+ locked_blocks.push_back(block);
178
+ else
179
+ {
180
+ busy_p_blocks[p_block] = 1;
181
+ busy_q_blocks[q_block] = 1;
182
+ another = block.second;
183
+ break;
184
+ }
185
+ }
186
+
187
+ for(auto &block : locked_blocks)
188
+ pq.push(block);
189
+
190
+ return another;
191
+ }
192
+
193
+ void Scheduler::put_job(mf_int block_idx, mf_double loss, mf_double error)
194
+ {
195
+ // Return the held block to the scheduler
196
+ {
197
+ lock_guard<mutex> lock(mtx);
198
+ busy_p_blocks[block_idx/nr_bins] = 0;
199
+ busy_q_blocks[block_idx%nr_bins] = 0;
200
+ block_losses[block_idx] = loss;
201
+ block_errors[block_idx] = error;
202
+ ++nr_done_jobs;
203
+ mf_float priority =
204
+ (mf_float)counts[block_idx]+distribution(generator);
205
+ pq.emplace(priority, block_idx);
206
+ ++nr_paused_threads;
207
+ // Tell others that a block is available again.
208
+ cond_var.notify_all();
209
+ }
210
+
211
+ // Wait if nr_done_jobs (i.e., the number of processed blocks) grows too large
212
+ // because we want to print out the training status roughly once every time
213
+ // all blocks have been processed. This is the only place where a solver
214
+ // thread should wait for something.
215
+ {
216
+ unique_lock<mutex> lock(mtx);
217
+ cond_var.wait(lock, [&] {
218
+ return nr_done_jobs < target;
219
+ });
220
+ }
221
+
222
+ // Nothing is blocking and this thread is going to take another block
223
+ {
224
+ lock_guard<mutex> lock(mtx);
225
+ --nr_paused_threads;
226
+ }
227
+ }
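+ // In put_job() above, a worker that has just returned a block parks once
+ // nr_done_jobs reaches the current target (one full pass over the
+ // nr_bins*nr_bins blocks). The main thread, blocked in wait_for_jobs_done(),
+ // can then print statistics and call resume(), which raises the target and
+ // wakes all workers again.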
228
+
229
+ void Scheduler::put_bpr_job(mf_int first_block, mf_int second_block)
230
+ {
231
+ if(first_block == second_block)
232
+ return;
233
+
234
+ lock_guard<mutex> lock(mtx);
235
+ {
236
+ busy_p_blocks[second_block/nr_bins] = 0;
237
+ busy_q_blocks[second_block%nr_bins] = 0;
238
+ mf_float priority =
239
+ (mf_float)counts[second_block]+distribution(generator);
240
+ pq.emplace(priority, second_block);
241
+ }
242
+ }
243
+
244
+ mf_double Scheduler::get_loss()
245
+ {
246
+ lock_guard<mutex> lock(mtx);
247
+ return accumulate(block_losses.begin(), block_losses.end(), 0.0);
248
+ }
249
+
250
+ mf_double Scheduler::get_error()
251
+ {
252
+ lock_guard<mutex> lock(mtx);
253
+ return accumulate(block_errors.begin(), block_errors.end(), 0.0);
254
+ }
255
+
256
+ mf_int Scheduler::get_negative(mf_int first_block, mf_int second_block,
257
+ mf_int m, mf_int n, bool is_column_oriented)
258
+ {
259
+ mf_int rand_val = (mf_int)block_generators[first_block]();
260
+
261
+ auto gen_random = [&] (mf_int block_id)
262
+ {
263
+ mf_int v_min, v_max;
264
+
265
+ if(is_column_oriented)
266
+ {
267
+ mf_int seg_size = (mf_int)ceil((double)m/nr_bins);
268
+ v_min = min((block_id/nr_bins)*seg_size, m-1);
269
+ v_max = min(v_min+seg_size, m-1);
270
+ }
271
+ else
272
+ {
273
+ mf_int seg_size = (mf_int)ceil((double)n/nr_bins);
274
+ v_min = min((block_id%nr_bins)*seg_size, n-1);
275
+ v_max = min(v_min+seg_size, n-1);
276
+ }
277
+ if(v_max == v_min)
278
+ return v_min;
279
+ else
280
+ return rand_val%(v_max-v_min)+v_min;
281
+ };
282
+
283
+ if(rand_val % 2)
284
+ return (mf_int)gen_random(first_block);
285
+ else
286
+ return (mf_int)gen_random(second_block);
287
+ }
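+ // get_negative() above supplies the negative example for BPR: with equal
+ // probability it picks one of the two blocks a worker currently holds and
+ // draws a random row index (column-oriented case) or column index from the
+ // segment of the matrix covered by that block.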
288
+
289
+ void Scheduler::wait_for_jobs_done()
290
+ {
291
+ unique_lock<mutex> lock(mtx);
292
+
293
+ // The first thing the main thread should wait for is that solver threads
294
+ // process enough matrix blocks.
295
+ // [REVIEW] Is it really needed? Solver threads automatically stop if they
296
+ // process too many blocks, so the next wait should be enough for stopping
297
+ // the main thread when nr_done_jobs is not large enough.
298
+ cond_var.wait(lock, [&] {
299
+ return nr_done_jobs >= target;
300
+ });
301
+
302
+ // Wait for all threads to stop. Once a thread realizes that all threads
303
+ // have processed enough blocks it should stop. Then, the main thread can
304
+ // print values safely.
305
+ cond_var.wait(lock, [&] {
306
+ return nr_paused_threads == nr_threads;
307
+ });
308
+ }
309
+
310
+ void Scheduler::resume()
311
+ {
312
+ lock_guard<mutex> lock(mtx);
313
+ target += nr_bins*nr_bins;
314
+ cond_var.notify_all();
315
+ }
316
+
317
+ void Scheduler::terminate()
318
+ {
319
+ lock_guard<mutex> lock(mtx);
320
+ terminated = true;
321
+ }
322
+
323
+ bool Scheduler::is_terminated()
324
+ {
325
+ lock_guard<mutex> lock(mtx);
326
+ return terminated;
327
+ }
328
+
329
+ //--------------------------------------
330
+ //------------Block of matrix-----------
331
+ //--------------------------------------
332
+
333
+ class BlockBase
334
+ {
335
+ public:
336
+ virtual bool move_next() { return false; };
337
+ virtual mf_node* get_current() { return nullptr; }
338
+ virtual void reload() {};
339
+ virtual void free() {};
340
+ virtual mf_long get_nnz() { return 0; };
341
+ virtual ~BlockBase() {};
342
+ };
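+ // BlockBase gives the solvers a small iterator-like interface (reload,
+ // move_next, get_current, free) so they can stream a block's nonzeros
+ // without knowing whether the block lives in memory (Block) or is paged in
+ // from a binary file (BlockOnDisk).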
343
+
344
+ class Block : public BlockBase
345
+ {
346
+ public:
347
+ Block() : first(nullptr), last(nullptr), current(nullptr) {};
348
+ Block(mf_node *first_, mf_node *last_)
349
+ : first(first_), last(last_), current(nullptr) {};
350
+ bool move_next() { return ++current != last; }
351
+ mf_node* get_current() { return current; }
352
+ void tie_to(mf_node *first_, mf_node *last_);
353
+ void reload() { current = first-1; };
354
+ mf_long get_nnz() { return last-first; };
355
+
356
+ private:
357
+ mf_node* first;
358
+ mf_node* last;
359
+ mf_node* current;
360
+ };
361
+
362
+ void Block::tie_to(mf_node *first_, mf_node *last_)
363
+ {
364
+ first = first_;
365
+ last = last_;
366
+ };
367
+
368
+ class BlockOnDisk : public BlockBase
369
+ {
370
+ public:
371
+ BlockOnDisk() : first(0), last(0), current(0),
372
+ source_path(""), buffer(0) {};
373
+ bool move_next() { return ++current < last-first; }
374
+ mf_node* get_current() { return &buffer[static_cast<size_t>(current)]; }
375
+ void tie_to(string source_path_, mf_long first_, mf_long last_);
376
+ void reload();
377
+ void free() { buffer.resize(0); };
378
+ mf_long get_nnz() { return last-first; };
379
+
380
+ private:
381
+ mf_long first;
382
+ mf_long last;
383
+ mf_long current;
384
+ string source_path;
385
+ vector<mf_node> buffer;
386
+ };
387
+
388
+ void BlockOnDisk::tie_to(string source_path_, mf_long first_, mf_long last_)
389
+ {
390
+ source_path = source_path_;
391
+ first = first_;
392
+ last = last_;
393
+ }
394
+
395
+ void BlockOnDisk::reload()
396
+ {
397
+ ifstream source(source_path, ifstream::in|ifstream::binary);
398
+ if(!source)
399
+ throw runtime_error("can not open "+source_path);
400
+
401
+ buffer.resize(static_cast<size_t>(last-first));
402
+ source.seekg(first*sizeof(mf_node));
403
+ source.read((char*)buffer.data(), (last-first)*sizeof(mf_node));
404
+ current = -1;
405
+ }
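+ // reload() pages one block back from the binary cache file written by
+ // grid_shuffle_scale_problem_on_disk(); current is set to -1 so that the
+ // first call to move_next() lands on element 0 of the buffer.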
406
+
407
+ //--------------------------------------
408
+ //-------------Miscellaneous------------
409
+ //--------------------------------------
410
+
411
+ struct sort_node_by_p
412
+ {
413
+ bool operator() (mf_node const &lhs, mf_node const &rhs)
414
+ {
415
+ return tie(lhs.u, lhs.v) < tie(rhs.u, rhs.v);
416
+ }
417
+ };
418
+
419
+ struct sort_node_by_q
420
+ {
421
+ bool operator() (mf_node const &lhs, mf_node const &rhs)
422
+ {
423
+ return tie(lhs.v, lhs.u) < tie(rhs.v, rhs.u);
424
+ }
425
+ };
426
+
427
+ struct deleter
428
+ {
429
+ void operator() (mf_problem *prob)
430
+ {
431
+ delete[] prob->R;
432
+ delete prob;
433
+ }
434
+ };
435
+
436
+
437
+ class Utility
438
+ {
439
+ public:
440
+ Utility(mf_int f, mf_int n) : fun(f), nr_threads(n) {};
441
+ void collect_info(mf_problem &prob, mf_float &avg, mf_float &std_dev);
442
+ void collect_info_on_disk(string data_path, mf_problem &prob,
443
+ mf_float &avg, mf_float &std_dev);
444
+ void shuffle_problem(mf_problem &prob, vector<mf_int> &p_map,
445
+ vector<mf_int> &q_map);
446
+ vector<mf_node*> grid_problem(mf_problem &prob, mf_int nr_bins,
447
+ vector<mf_int> &omega_p,
448
+ vector<mf_int> &omega_q,
449
+ vector<Block> &blocks);
450
+ void grid_shuffle_scale_problem_on_disk(mf_int m, mf_int n, mf_int nr_bins,
451
+ mf_float scale, string data_path,
452
+ vector<mf_int> &p_map,
453
+ vector<mf_int> &q_map,
454
+ vector<mf_int> &omega_p,
455
+ vector<mf_int> &omega_q,
456
+ vector<BlockOnDisk> &blocks);
457
+ void scale_problem(mf_problem &prob, mf_float scale);
458
+ mf_double calc_reg1(mf_model &model, mf_float lambda_p, mf_float lambda_q,
459
+ vector<mf_int> &omega_p, vector<mf_int> &omega_q);
460
+ mf_double calc_reg2(mf_model &model, mf_float lambda_p, mf_float lambda_q,
461
+ vector<mf_int> &omega_p, vector<mf_int> &omega_q);
462
+ string get_error_legend() const;
463
+ mf_double calc_error(vector<BlockBase*> &blocks,
464
+ vector<mf_int> &cv_block_ids,
465
+ mf_model const &model);
466
+ void scale_model(mf_model &model, mf_float scale);
467
+
468
+ static mf_problem* copy_problem(mf_problem const *prob, bool copy_data);
469
+ static vector<mf_int> gen_random_map(mf_int size);
470
+ // A function used to allocate an aligned float array.
471
+ // It hides platform-specific function calls. Memory
472
+ // allocated by malloc_aligned_float must be freed by using
473
+ // free_aligned_float.
474
+ static mf_float* malloc_aligned_float(mf_long size);
475
+ // A function used to free an aligned float array.
476
+ // It hides platform-specific function calls.
477
+ static void free_aligned_float(mf_float* ptr);
478
+ // Initialization function for stochastic gradient method.
479
+ // Factor matrices P and Q are both randomly initialized.
480
+ static mf_model* init_model(mf_int loss, mf_int m, mf_int n,
481
+ mf_int k, mf_float avg,
482
+ vector<mf_int> &omega_p,
483
+ vector<mf_int> &omega_q);
484
+ // Initialization function for one-class CD.
485
+ // It does zero-initialization on factor matrix P and random initialization
486
+ // on factor matrix Q.
487
+ static mf_model* init_model(mf_int m, mf_int n, mf_int k);
488
+ static mf_float inner_product(mf_float *p, mf_float *q, mf_int k);
489
+ static vector<mf_int> gen_inv_map(vector<mf_int> &map);
490
+ static void shrink_model(mf_model &model, mf_int k_new);
491
+ static void shuffle_model(mf_model &model,
492
+ vector<mf_int> &p_map,
493
+ vector<mf_int> &q_map);
494
+ mf_int get_thread_number() const { return nr_threads; };
495
+ private:
496
+ mf_int fun;
497
+ mf_int nr_threads;
498
+ };
499
+
500
+ void Utility::collect_info(
501
+ mf_problem &prob,
502
+ mf_float &avg,
503
+ mf_float &std_dev)
504
+ {
505
+ mf_double ex = 0;
506
+ mf_double ex2 = 0;
507
+
508
+ #if defined USEOMP
509
+ #pragma omp parallel for num_threads(nr_threads) schedule(static) reduction(+:ex,ex2)
510
+ #endif
511
+ for(mf_long i = 0; i < prob.nnz; ++i)
512
+ {
513
+ mf_node &N = prob.R[i];
514
+ ex += (mf_double)N.r;
515
+ ex2 += (mf_double)N.r*N.r;
516
+ }
517
+
518
+ ex /= (mf_double)prob.nnz;
519
+ ex2 /= (mf_double)prob.nnz;
520
+ avg = (mf_float)ex;
521
+ std_dev = (mf_float)sqrt(ex2-ex*ex);
522
+ }
523
+
524
+ void Utility::collect_info_on_disk(
525
+ string data_path,
526
+ mf_problem &prob,
527
+ mf_float &avg,
528
+ mf_float &std_dev)
529
+ {
530
+ mf_double ex = 0;
531
+ mf_double ex2 = 0;
532
+
533
+ ifstream source(data_path);
534
+ if(!source.is_open())
535
+ throw runtime_error("cannot open " + data_path);
536
+
537
+ for(mf_node N; source >> N.u >> N.v >> N.r;)
538
+ {
539
+ if(N.u+1 > prob.m)
540
+ prob.m = N.u+1;
541
+ if(N.v+1 > prob.n)
542
+ prob.n = N.v+1;
543
+ prob.nnz += 1;
544
+ ex += (mf_double)N.r;
545
+ ex2 += (mf_double)N.r*N.r;
546
+ }
547
+ source.close();
548
+
549
+ ex /= (mf_double)prob.nnz;
550
+ ex2 /= (mf_double)prob.nnz;
551
+ avg = (mf_float)ex;
552
+ std_dev = (mf_float)sqrt(ex2-ex*ex);
553
+ }
554
+
555
+ void Utility::scale_problem(mf_problem &prob, mf_float scale)
556
+ {
557
+ if(scale == 1.0)
558
+ return;
559
+
560
+ #if defined USEOMP
561
+ #pragma omp parallel for num_threads(nr_threads) schedule(static)
562
+ #endif
563
+ for(mf_long i = 0; i < prob.nnz; ++i)
564
+ prob.R[i].r *= scale;
565
+ }
566
+
567
+ void Utility::scale_model(mf_model &model, mf_float scale)
568
+ {
569
+ if(scale == 1.0)
570
+ return;
571
+
572
+ mf_int k = model.k;
573
+
574
+ model.b *= scale;
575
+
576
+ auto scale1 = [&] (mf_float *ptr, mf_int size, mf_float factor_scale)
577
+ {
578
+ #if defined USEOMP
579
+ #pragma omp parallel for num_threads(nr_threads) schedule(static)
580
+ #endif
581
+ for(mf_int i = 0; i < size; ++i)
582
+ {
583
+ mf_float *ptr1 = ptr+(mf_long)i*model.k;
584
+ for(mf_int d = 0; d < k; ++d)
585
+ ptr1[d] *= factor_scale;
586
+ }
587
+ };
588
+
589
+ scale1(model.P, model.m, sqrt(scale));
590
+ scale1(model.Q, model.n, sqrt(scale));
591
+ }
592
+
593
+ mf_float Utility::inner_product(mf_float *p, mf_float *q, mf_int k)
594
+ {
595
+ #if defined USESSE
596
+ __m128 XMM = _mm_setzero_ps();
597
+ for(mf_int d = 0; d < k; d += 4)
598
+ XMM = _mm_add_ps(XMM, _mm_mul_ps(
599
+ _mm_load_ps(p+d), _mm_load_ps(q+d)));
600
+ __m128 XMMtmp = _mm_add_ps(XMM, _mm_movehl_ps(XMM, XMM));
601
+ XMM = _mm_add_ps(XMMtmp, _mm_shuffle_ps(XMMtmp, XMMtmp, 1));
602
+ mf_float product;
603
+ _mm_store_ss(&product, XMM);
604
+ return product;
605
+ #elif defined USEAVX
606
+ __m256 XMM = _mm256_setzero_ps();
607
+ for(mf_int d = 0; d < k; d += 8)
608
+ XMM = _mm256_add_ps(XMM, _mm256_mul_ps(
609
+ _mm256_load_ps(p+d), _mm256_load_ps(q+d)));
610
+ XMM = _mm256_add_ps(XMM, _mm256_permute2f128_ps(XMM, XMM, 1));
611
+ XMM = _mm256_hadd_ps(XMM, XMM);
612
+ XMM = _mm256_hadd_ps(XMM, XMM);
613
+ mf_float product;
614
+ _mm_store_ss(&product, _mm256_castps256_ps128(XMM));
615
+ return product;
616
+ #else
617
+ return std::inner_product(p, p+k, q, (mf_float)0.0);
618
+ #endif
619
+ }
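+ // The SSE/AVX paths above assume p and q are kALIGNByte-aligned and that k
+ // is a multiple of 4 (SSE) or 8 (AVX); init_model() for the SG-based solvers
+ // guarantees this by rounding k up to a multiple of kALIGN and allocating P
+ // and Q with malloc_aligned_float().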
620
+
621
+ mf_double Utility::calc_reg1(mf_model &model,
622
+ mf_float lambda_p, mf_float lambda_q,
623
+ vector<mf_int> &omega_p, vector<mf_int> &omega_q)
624
+ {
625
+ auto calc_reg1_core = [&] (mf_float *ptr, mf_int size,
626
+ vector<mf_int> &omega)
627
+ {
628
+ mf_double reg = 0;
629
+ for(mf_int i = 0; i < size; ++i)
630
+ {
631
+ if(omega[i] <= 0)
632
+ continue;
633
+
634
+ mf_float tmp = 0;
635
+ for(mf_int j = 0; j < model.k; ++j)
636
+ tmp += abs(ptr[(mf_long)i*model.k+j]);
637
+ reg += omega[i]*tmp;
638
+ }
639
+ return reg;
640
+ };
641
+
642
+ return lambda_p*calc_reg1_core(model.P, model.m, omega_p)+
643
+ lambda_q*calc_reg1_core(model.Q, model.n, omega_q);
644
+ }
645
+
646
+ mf_double Utility::calc_reg2(mf_model &model,
647
+ mf_float lambda_p, mf_float lambda_q,
648
+ vector<mf_int> &omega_p, vector<mf_int> &omega_q)
649
+ {
650
+ auto calc_reg2_core = [&] (mf_float *ptr, mf_int size,
651
+ vector<mf_int> &omega)
652
+ {
653
+ mf_double reg = 0;
654
+ #if defined USEOMP
655
+ #pragma omp parallel for num_threads(nr_threads) schedule(static) reduction(+:reg)
656
+ #endif
657
+ for(mf_int i = 0; i < size; ++i)
658
+ {
659
+ if(omega[i] <= 0)
660
+ continue;
661
+
662
+ mf_float *ptr1 = ptr+(mf_long)i*model.k;
663
+ reg += omega[i]*Utility::inner_product(ptr1, ptr1, model.k);
664
+ }
665
+
666
+ return reg;
667
+ };
668
+
669
+ return lambda_p*calc_reg2_core(model.P, model.m, omega_p) +
670
+ lambda_q*calc_reg2_core(model.Q, model.n, omega_q);
671
+ }
672
+
673
+ mf_double Utility::calc_error(
674
+ vector<BlockBase*> &blocks,
675
+ vector<mf_int> &cv_block_ids,
676
+ mf_model const &model)
677
+ {
678
+ mf_double error = 0;
679
+ if(fun == P_L2_MFR || fun == P_L1_MFR || fun == P_KL_MFR ||
680
+ fun == P_LR_MFC || fun == P_L2_MFC || fun == P_L1_MFC)
681
+ {
682
+ #if defined USEOMP
683
+ #pragma omp parallel for num_threads(nr_threads) schedule(static) reduction(+:error)
684
+ #endif
685
+ for(mf_int i = 0; i < (mf_long)cv_block_ids.size(); ++i)
686
+ {
687
+ BlockBase *block = blocks[cv_block_ids[i]];
688
+ block->reload();
689
+ while(block->move_next())
690
+ {
691
+ mf_node const &N = *(block->get_current());
692
+ mf_float z = mf_predict(&model, N.u, N.v);
693
+ switch(fun)
694
+ {
695
+ case P_L2_MFR:
696
+ error += pow(N.r-z, 2);
697
+ break;
698
+ case P_L1_MFR:
699
+ error += abs(N.r-z);
700
+ break;
701
+ case P_KL_MFR:
702
+ error += N.r*log(N.r/z)-N.r+z;
703
+ break;
704
+ case P_LR_MFC:
705
+ if(N.r > 0)
706
+ error += log(1.0+exp(-z));
707
+ else
708
+ error += log(1.0+exp(z));
709
+ break;
710
+ case P_L2_MFC:
711
+ case P_L1_MFC:
712
+ if(N.r > 0)
713
+ error += z > 0? 1: 0;
714
+ else
715
+ error += z < 0? 1: 0;
716
+ break;
717
+ default:
718
+ throw invalid_argument("unknown error function");
719
+ break;
720
+ }
721
+ }
722
+ block->free();
723
+ }
724
+ }
725
+ else
726
+ {
727
+ minstd_rand0 generator(rand());
728
+ switch(fun)
729
+ {
730
+ case P_ROW_BPR_MFOC:
731
+ {
732
+ uniform_int_distribution<mf_int> distribution(0, model.n-1);
733
+ #if defined USEOMP
734
+ #pragma omp parallel for num_threads(nr_threads) schedule(static) reduction(+:error)
735
+ #endif
736
+ for(mf_int i = 0; i < (mf_long)cv_block_ids.size(); ++i)
737
+ {
738
+ BlockBase *block = blocks[cv_block_ids[i]];
739
+ block->reload();
740
+ while(block->move_next())
741
+ {
742
+ mf_node const &N = *(block->get_current());
743
+ mf_int w = distribution(generator);
744
+ error += log(1+exp(mf_predict(&model, N.u, w)-
745
+ mf_predict(&model, N.u, N.v)));
746
+ }
747
+ block->free();
748
+ }
749
+ break;
750
+ }
751
+ case P_COL_BPR_MFOC:
752
+ {
753
+ uniform_int_distribution<mf_int> distribution(0, model.m-1);
754
+ #if defined USEOMP
755
+ #pragma omp parallel for num_threads(nr_threads) schedule(static) reduction(+:error)
756
+ #endif
757
+ for(mf_int i = 0; i < (mf_long)cv_block_ids.size(); ++i)
758
+ {
759
+ BlockBase *block = blocks[cv_block_ids[i]];
760
+ block->reload();
761
+ while(block->move_next())
762
+ {
763
+ mf_node const &N = *(block->get_current());
764
+ mf_int w = distribution(generator);
765
+ error += log(1+exp(mf_predict(&model, w, N.v)-
766
+ mf_predict(&model, N.u, N.v)));
767
+ }
768
+ block->free();
769
+ }
770
+ break;
771
+ }
772
+ default:
773
+ {
774
+ throw invalid_argument("unknown error function");
775
+ break;
776
+ }
777
+ }
778
+ }
779
+
780
+ return error;
781
+ }
782
+
783
+ string Utility::get_error_legend() const
784
+ {
785
+ switch(fun)
786
+ {
787
+ case P_L2_MFR:
788
+ return string("rmse");
789
+ break;
790
+ case P_L1_MFR:
791
+ return string("mae");
792
+ break;
793
+ case P_KL_MFR:
794
+ return string("gkl");
795
+ break;
796
+ case P_LR_MFC:
797
+ return string("logloss");
798
+ break;
799
+ case P_L2_MFC:
800
+ case P_L1_MFC:
801
+ return string("accuracy");
802
+ break;
803
+ case P_ROW_BPR_MFOC:
804
+ case P_COL_BPR_MFOC:
805
+ return string("bprloss");
806
+ break;
807
+ case P_L2_MFOC:
808
+ return string("sqerror");
809
+ default:
810
+ return string();
811
+ break;
812
+ }
813
+ }
814
+
815
+ void Utility::shuffle_problem(
816
+ mf_problem &prob,
817
+ vector<mf_int> &p_map,
818
+ vector<mf_int> &q_map)
819
+ {
820
+ #if defined USEOMP
821
+ #pragma omp parallel for num_threads(nr_threads) schedule(static)
822
+ #endif
823
+ for(mf_long i = 0; i < prob.nnz; ++i)
824
+ {
825
+ mf_node &N = prob.R[i];
826
+ if(N.u < (mf_long)p_map.size())
827
+ N.u = p_map[N.u];
828
+ if(N.v < (mf_long)q_map.size())
829
+ N.v = q_map[N.v];
830
+ }
831
+ }
832
+
833
+ vector<mf_node*> Utility::grid_problem(
834
+ mf_problem &prob,
835
+ mf_int nr_bins,
836
+ vector<mf_int> &omega_p,
837
+ vector<mf_int> &omega_q,
838
+ vector<Block> &blocks)
839
+ {
840
+ vector<mf_long> counts(nr_bins*nr_bins, 0);
841
+
842
+ mf_int seg_p = (mf_int)ceil((double)prob.m/nr_bins);
843
+ mf_int seg_q = (mf_int)ceil((double)prob.n/nr_bins);
844
+
845
+ auto get_block_id = [=] (mf_int u, mf_int v)
846
+ {
847
+ return (u/seg_p)*nr_bins+v/seg_q;
848
+ };
849
+
850
+ for(mf_long i = 0; i < prob.nnz; ++i)
851
+ {
852
+ mf_node &N = prob.R[i];
853
+ mf_int block = get_block_id(N.u, N.v);
854
+ counts[block] += 1;
855
+ omega_p[N.u] += 1;
856
+ omega_q[N.v] += 1;
857
+ }
858
+
859
+ vector<mf_node*> ptrs(nr_bins*nr_bins+1);
860
+ mf_node *ptr = prob.R;
861
+ ptrs[0] = ptr;
862
+ for(mf_int block = 0; block < nr_bins*nr_bins; ++block)
863
+ ptrs[block+1] = ptrs[block] + counts[block];
864
+
865
+ vector<mf_node*> pivots(ptrs.begin(), ptrs.end()-1);
866
+ for(mf_int block = 0; block < nr_bins*nr_bins; ++block)
867
+ {
868
+ for(mf_node* pivot = pivots[block]; pivot != ptrs[block+1];)
869
+ {
870
+ mf_int curr_block = get_block_id(pivot->u, pivot->v);
871
+ if(curr_block == block)
872
+ {
873
+ ++pivot;
874
+ continue;
875
+ }
876
+
877
+ mf_node *next = pivots[curr_block];
878
+ swap(*pivot, *next);
879
+ pivots[curr_block] += 1;
880
+ }
881
+ }
882
+
883
+ #if defined USEOMP
884
+ #pragma omp parallel for num_threads(nr_threads) schedule(dynamic)
885
+ #endif
886
+ for(mf_int block = 0; block < nr_bins*nr_bins; ++block)
887
+ {
888
+ if(prob.m > prob.n)
889
+ sort(ptrs[block], ptrs[block+1], sort_node_by_p());
890
+ else
891
+ sort(ptrs[block], ptrs[block+1], sort_node_by_q());
892
+ }
893
+
894
+ for(mf_int i = 0; i < (mf_long)blocks.size(); ++i)
895
+ blocks[i].tie_to(ptrs[i], ptrs[i+1]);
896
+
897
+ return ptrs;
898
+ }
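+ // grid_problem() partitions R in place: a counting pass sizes the
+ // nr_bins*nr_bins blocks, a pivot-and-swap pass moves every node into its
+ // block's range, and each block is finally sorted by row index (if m > n) or
+ // by column index, which keeps consecutive updates close together in memory.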
899
+
900
+ void Utility::grid_shuffle_scale_problem_on_disk(
901
+ mf_int m, mf_int n, mf_int nr_bins,
902
+ mf_float scale, string data_path,
903
+ vector<mf_int> &p_map, vector<mf_int> &q_map,
904
+ vector<mf_int> &omega_p, vector<mf_int> &omega_q,
905
+ vector<BlockOnDisk> &blocks)
906
+ {
907
+ string const buffer_path = data_path+string(".disk");
908
+ mf_int seg_p = (mf_int)ceil((double)m/nr_bins);
909
+ mf_int seg_q = (mf_int)ceil((double)n/nr_bins);
910
+ vector<mf_long> counts(nr_bins*nr_bins+1, 0);
911
+ vector<mf_long> pivots(nr_bins*nr_bins, 0);
912
+ ifstream source(data_path);
913
+ fstream buffer(buffer_path, fstream::in|fstream::out|
914
+ fstream::binary|fstream::trunc);
915
+ auto get_block_id = [=] (mf_int u, mf_int v)
916
+ {
917
+ return (u/seg_p)*nr_bins+v/seg_q;
918
+ };
919
+
920
+ if(!source)
921
+ throw ios::failure(string("cannot to open ")+data_path);
922
+ if(!buffer)
923
+ throw ios::failure(string("cannot to open ")+buffer_path);
924
+
925
+ for(mf_node N; source >> N.u >> N.v >> N.r;)
926
+ {
927
+ N.u = p_map[N.u];
928
+ N.v = q_map[N.v];
929
+ mf_int bid = get_block_id(N.u, N.v);
930
+ omega_p[N.u] += 1;
931
+ omega_q[N.v] += 1;
932
+ counts[bid+1] += 1;
933
+ }
934
+
935
+ for(mf_int i = 1; i < nr_bins*nr_bins+1; ++i)
936
+ {
937
+ counts[i] += counts[i-1];
938
+ pivots[i-1] = counts[i-1];
939
+ }
940
+
941
+ source.clear();
942
+ source.seekg(0);
943
+ for(mf_node N; source >> N.u >> N.v >> N.r;)
944
+ {
945
+ N.u = p_map[N.u];
946
+ N.v = q_map[N.v];
947
+ N.r /= scale;
948
+ mf_int bid = get_block_id(N.u, N.v);
949
+ buffer.seekp(pivots[bid]*sizeof(mf_node));
950
+ buffer.write((char*)&N, sizeof(mf_node));
951
+ pivots[bid] += 1;
952
+ }
953
+
954
+ for(mf_int i = 0; i < nr_bins*nr_bins; ++i)
955
+ {
956
+ vector<mf_node> nodes(static_cast<size_t>(counts[i+1]-counts[i]));
957
+ buffer.clear();
958
+ buffer.seekg(counts[i]*sizeof(mf_node));
959
+ buffer.read((char*)nodes.data(), sizeof(mf_node)*nodes.size());
960
+
961
+ if(m > n)
962
+ sort(nodes.begin(), nodes.end(), sort_node_by_p());
963
+ else
964
+ sort(nodes.begin(), nodes.end(), sort_node_by_q());
965
+
966
+ buffer.clear();
967
+ buffer.seekp(counts[i]*sizeof(mf_node));
968
+ buffer.write((char*)nodes.data(), sizeof(mf_node)*nodes.size());
969
+ buffer.read((char*)nodes.data(), sizeof(mf_node)*nodes.size());
970
+ }
971
+
972
+ for(mf_int i = 0; i < (mf_long)blocks.size(); ++i)
973
+ blocks[i].tie_to(buffer_path, counts[i], counts[i+1]);
974
+ }
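+ // The on-disk variant reads the text-format ratings twice: the first pass
+ // counts entries per block and accumulates omega_p/omega_q, the second pass
+ // writes each shuffled and scaled node at its block's offset in the binary
+ // ".disk" cache file, and every block is then loaded, sorted, and written
+ // back in place.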
975
+
976
+ mf_float* Utility::malloc_aligned_float(mf_long size)
977
+ {
978
+ // Check if conversion from mf_long to size_t causes overflow.
979
+ if (size > numeric_limits<std::size_t>::max() / sizeof(mf_float) + 1)
980
+ throw bad_alloc();
981
+ // [REVIEW] I hope one day we can use C11 aligned_alloc to replace
982
+ // platform-dependent functions below. Neither Windows nor OSX currently
983
+ // supports that function.
984
+ void *ptr = nullptr;
985
+ #ifdef _WIN32
986
+ ptr = _aligned_malloc(static_cast<size_t>(size*sizeof(mf_float)),
987
+ kALIGNByte);
988
+ #else
989
+ int status = posix_memalign(&ptr, kALIGNByte, size*sizeof(mf_float));
990
+ if(status != 0)
991
+ throw bad_alloc();
992
+ #endif
993
+ if(ptr == nullptr)
994
+ throw bad_alloc();
995
+
996
+ return (mf_float*)ptr;
997
+ }
998
+
999
+ void Utility::free_aligned_float(mf_float *ptr)
1000
+ {
1001
+ #ifdef _WIN32
1002
+ // Unfortunately, Visual Studio doesn't support the
1003
+ // cross-platform deallocation below, so _aligned_free is used instead.
1004
+ _aligned_free(ptr);
1005
+ #else
1006
+ free(ptr);
1007
+ #endif
1008
+ }
1009
+
1010
+ mf_model* Utility::init_model(mf_int fun,
1011
+ mf_int m, mf_int n,
1012
+ mf_int k, mf_float avg,
1013
+ vector<mf_int> &omega_p,
1014
+ vector<mf_int> &omega_q)
1015
+ {
1016
+ mf_int k_real = k;
1017
+ mf_int k_aligned = (mf_int)ceil(mf_double(k)/kALIGN)*kALIGN;
1018
+
1019
+ mf_model *model = new mf_model;
1020
+
1021
+ model->fun = fun;
1022
+ model->m = m;
1023
+ model->n = n;
1024
+ model->k = k_aligned;
1025
+ model->b = avg;
1026
+ model->P = nullptr;
1027
+ model->Q = nullptr;
1028
+
1029
+ mf_float scale = (mf_float)sqrt(1.0/k_real);
1030
+ default_random_engine generator;
1031
+ uniform_real_distribution<mf_float> distribution(0.0, 1.0);
1032
+
1033
+ try
1034
+ {
1035
+ model->P = Utility::malloc_aligned_float((mf_long)model->m*model->k);
1036
+ model->Q = Utility::malloc_aligned_float((mf_long)model->n*model->k);
1037
+ }
1038
+ catch(bad_alloc const &e)
1039
+ {
1040
+ cerr << e.what() << endl;
1041
+ mf_destroy_model(&model);
1042
+ throw;
1043
+ }
1044
+
1045
+ auto init1 = [&](mf_float *start_ptr, mf_long size, vector<mf_int> counts)
1046
+ {
1047
+ memset(start_ptr, 0, static_cast<size_t>(
1048
+ sizeof(mf_float) * size*model->k));
1049
+ for(mf_long i = 0; i < size; ++i)
1050
+ {
1051
+ mf_float * ptr = start_ptr + i*model->k;
1052
+ if(counts[static_cast<size_t>(i)] > 0)
1053
+ for(mf_long d = 0; d < k_real; ++d, ++ptr)
1054
+ *ptr = (mf_float)(distribution(generator)*scale);
1055
+ else
1056
+ if(fun != P_ROW_BPR_MFOC && fun != P_COL_BPR_MFOC) // unseen for bpr is 0
1057
+ for(mf_long d = 0; d < k_real; ++d, ++ptr)
1058
+ *ptr = numeric_limits<mf_float>::quiet_NaN();
1059
+ }
1060
+ };
1061
+
1062
+ init1(model->P, m, omega_p);
1063
+ init1(model->Q, n, omega_q);
1064
+
1065
+ return model;
1066
+ }
1067
+
1068
+ // Initialize P=[\bar{p}_1, ..., \bar{p}_d] and Q=[\bar{q}_1, ..., \bar{q}_d].
1069
+ // Note that \bar{q}_{kv} is Q[k*n+v] and \bar{p}_{ku} is P[k*m+u]. One may
1070
+ // notice that P and Q here are actually the transposes of P and Q in fpsg(...)
1071
+ // because fpsg(...) uses P^TQ (where P and Q are respectively k-by-m and
1072
+ // k-by-n) to approximate the given rating matrix R while ccd_one_class(...)
1073
+ // uses PQ^T (where P and Q are respectively m-by-k and n-by-k).
1074
+ mf_model* Utility::init_model(mf_int m, mf_int n, mf_int k)
1075
+ {
1076
+ mf_model *model = new mf_model;
1077
+
1078
+ model->fun = P_L2_MFOC;
1079
+ model->m = m;
1080
+ model->n = n;
1081
+ model->k = k;
1082
+ model->b = 0.0; // One-class matrix factorization doesn't have bias.
1083
+ model->P = nullptr;
1084
+ model->Q = nullptr;
1085
+
1086
+ try
1087
+ {
1088
+ model->P = Utility::malloc_aligned_float((mf_long)model->m*model->k);
1089
+ model->Q = Utility::malloc_aligned_float((mf_long)model->n*model->k);
1090
+ }
1091
+ catch(bad_alloc const &e)
1092
+ {
1093
+ cerr << e.what() << endl;
1094
+ mf_destroy_model(&model);
1095
+ throw;
1096
+ }
1097
+
1098
+ // Our initialization strategy is to set all of P's elements to zero and do
1099
+ // random initialization on Q. Thus, all initial predicted ratings are zero
1100
+ // since the approximated rating matrix is PQ^T.
1101
+
1102
+ // Initialize P with zeros
1103
+ for(mf_long i = 0; i < k * m; ++i)
1104
+ model->P[i] = 0.0;
1105
+
1106
+ // Initialize Q with random numbers
1107
+ default_random_engine generator;
1108
+ uniform_real_distribution<mf_float> distribution(0.0, 1.0);
1109
+ for(mf_long i = 0; i < k * n; ++i)
1110
+ model->Q[i] = distribution(generator);
1111
+
1112
+ return model;
1113
+ }
1114
+
1115
+ vector<mf_int> Utility::gen_random_map(mf_int size)
1116
+ {
1117
+ srand(0);
1118
+ vector<mf_int> map(size, 0);
1119
+ for(mf_int i = 0; i < size; ++i)
1120
+ map[i] = i;
1121
+ random_shuffle(map.begin(), map.end());
1122
+ return map;
1123
+ }
1124
+
1125
+ vector<mf_int> Utility::gen_inv_map(vector<mf_int> &map)
1126
+ {
1127
+ vector<mf_int> inv_map(map.size());
1128
+ for(mf_int i = 0; i < (mf_long)map.size(); ++i)
1129
+ inv_map[map[i]] = i;
1130
+ return inv_map;
1131
+ }
1132
+
1133
+ void Utility::shuffle_model(
1134
+ mf_model &model,
1135
+ vector<mf_int> &p_map,
1136
+ vector<mf_int> &q_map)
1137
+ {
1138
+ auto inv_shuffle1 = [] (mf_float *vec, vector<mf_int> &map,
1139
+ mf_int size, mf_int k)
1140
+ {
1141
+ for(mf_int pivot = 0; pivot < size;)
1142
+ {
1143
+ if(pivot == map[pivot])
1144
+ {
1145
+ ++pivot;
1146
+ continue;
1147
+ }
1148
+
1149
+ mf_int next = map[pivot];
1150
+
1151
+ for(mf_int d = 0; d < k; ++d)
1152
+ swap(*(vec+(mf_long)pivot*k+d), *(vec+(mf_long)next*k+d));
1153
+
1154
+ map[pivot] = map[next];
1155
+ map[next] = next;
1156
+ }
1157
+ };
1158
+
1159
+ inv_shuffle1(model.P, p_map, model.m, model.k);
1160
+ inv_shuffle1(model.Q, q_map, model.n, model.k);
1161
+ }
1162
+
1163
+ void Utility::shrink_model(mf_model &model, mf_int k_new)
1164
+ {
1165
+ mf_int k_old = model.k;
1166
+ model.k = k_new;
1167
+
1168
+ auto shrink1 = [&] (mf_float *ptr, mf_int size)
1169
+ {
1170
+ for(mf_int i = 0; i < size; ++i)
1171
+ {
1172
+ mf_float *src = ptr+(mf_long)i*k_old;
1173
+ mf_float *dst = ptr+(mf_long)i*k_new;
1174
+ copy(src, src+k_new, dst);
1175
+ }
1176
+ };
1177
+
1178
+ shrink1(model.P, model.m);
1179
+ shrink1(model.Q, model.n);
1180
+ }
1181
+
1182
+ mf_problem* Utility::copy_problem(mf_problem const *prob, bool copy_data)
1183
+ {
1184
+ mf_problem *new_prob = new mf_problem;
1185
+
1186
+ if(prob == nullptr)
1187
+ {
1188
+ new_prob->m = 0;
1189
+ new_prob->n = 0;
1190
+ new_prob->nnz = 0;
1191
+ new_prob->R = nullptr;
1192
+
1193
+ return new_prob;
1194
+ }
1195
+
1196
+ new_prob->m = prob->m;
1197
+ new_prob->n = prob->n;
1198
+ new_prob->nnz = prob->nnz;
1199
+
1200
+ if(copy_data)
1201
+ {
1202
+ try
1203
+ {
1204
+ new_prob->R = new mf_node[static_cast<size_t>(prob->nnz)];
1205
+ copy(prob->R, prob->R+prob->nnz, new_prob->R);
1206
+ }
1207
+ catch(...)
1208
+ {
1209
+ delete new_prob;
1210
+ throw;
1211
+ }
1212
+ }
1213
+ else
1214
+ {
1215
+ new_prob->R = prob->R;
1216
+ }
1217
+
1218
+ return new_prob;
1219
+ }
1220
+
1221
+ //--------------------------------------
1222
+ //-----The base class of all solvers----
1223
+ //--------------------------------------
1224
+
1225
+ class SolverBase
1226
+ {
1227
+ public:
1228
+ SolverBase(Scheduler &scheduler, vector<BlockBase*> &blocks,
1229
+ mf_float *PG, mf_float *QG, mf_model &model, mf_parameter param,
1230
+ bool &slow_only)
1231
+ : scheduler(scheduler), blocks(blocks), PG(PG), QG(QG),
1232
+ model(model), param(param), slow_only(slow_only) {}
1233
+ void run();
1234
+ SolverBase(const SolverBase&) = delete;
1235
+ SolverBase& operator=(const SolverBase&) = delete;
1236
+ // The solver is a stateless functor, so the default destructor should be
1237
+ // good enough.
1238
+ virtual ~SolverBase() = default;
1239
+
1240
+ protected:
1241
+ #if defined USESSE
1242
+ static void calc_z(__m128 &XMMz, mf_int k, mf_float *p, mf_float *q);
1243
+ virtual void load_fixed_variables(
1244
+ __m128 &XMMlambda_p1, __m128 &XMMlambda_q1,
1245
+ __m128 &XMMlambda_p2, __m128 &XMMlabmda_q2,
1246
+ __m128 &XMMeta, __m128 &XMMrk_slow,
1247
+ __m128 &XMMrk_fast);
1248
+ virtual void arrange_block(__m128d &XMMloss, __m128d &XMMerror);
1249
+ virtual void prepare_for_sg_update(
1250
+ __m128 &XMMz, __m128d &XMMloss, __m128d &XMMerror) = 0;
1251
+ virtual void sg_update(mf_int d_begin, mf_int d_end, __m128 XMMz,
1252
+ __m128 XMMlambda_p1, __m128 XMMlambda_q1,
1253
+ __m128 XMMlambda_p2, __m128 XMMlamdba_q2,
1254
+ __m128 XMMeta, __m128 XMMrk) = 0;
1255
+ virtual void finalize(__m128d XMMloss, __m128d XMMerror);
1256
+ #elif defined USEAVX
1257
+ static void calc_z(__m256 &XMMz, mf_int k, mf_float *p, mf_float *q);
1258
+ virtual void load_fixed_variables(
1259
+ __m256 &XMMlambda_p1, __m256 &XMMlambda_q1,
1260
+ __m256 &XMMlambda_p2, __m256 &XMMlabmda_q2,
1261
+ __m256 &XMMeta, __m256 &XMMrk_slow,
1262
+ __m256 &XMMrk_fast);
1263
+ virtual void arrange_block(__m128d &XMMloss, __m128d &XMMerror);
1264
+ virtual void prepare_for_sg_update(
1265
+ __m256 &XMMz, __m128d &XMMloss, __m128d &XMMerror) = 0;
1266
+ virtual void sg_update(mf_int d_begin, mf_int d_end, __m256 XMMz,
1267
+ __m256 XMMlambda_p1, __m256 XMMlambda_q1,
1268
+ __m256 XMMlambda_p2, __m256 XMMlamdba_q2,
1269
+ __m256 XMMeta, __m256 XMMrk) = 0;
1270
+ virtual void finalize(__m128d XMMloss, __m128d XMMerror);
1271
+ #else
1272
+ static void calc_z(mf_float &z, mf_int k, mf_float *p, mf_float *q);
1273
+ virtual void load_fixed_variables();
1274
+ virtual void arrange_block();
1275
+ virtual void prepare_for_sg_update() = 0;
1276
+ virtual void sg_update(mf_int d_begin, mf_int d_end, mf_float rk) = 0;
1277
+ virtual void finalize();
1278
+ static float qrsqrt(float x);
1279
+ #endif
1280
+ virtual void update() { ++pG; ++qG; };
1281
+
1282
+ Scheduler &scheduler;
1283
+ vector<BlockBase*> &blocks;
1284
+ BlockBase *block;
1285
+ mf_float *PG;
1286
+ mf_float *QG;
1287
+ mf_model &model;
1288
+ mf_parameter param;
1289
+ bool &slow_only;
1290
+
1291
+ mf_node *N;
1292
+ mf_float z;
1293
+ mf_double loss;
1294
+ mf_double error;
1295
+ mf_float *p;
1296
+ mf_float *q;
1297
+ mf_float *pG;
1298
+ mf_float *qG;
1299
+ mf_int bid;
1300
+
1301
+ mf_float lambda_p1;
1302
+ mf_float lambda_q1;
1303
+ mf_float lambda_p2;
1304
+ mf_float lambda_q2;
1305
+ mf_float rk_slow;
1306
+ mf_float rk_fast;
1307
+ };
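+ // Each worker thread executes run(): it repeatedly takes a block from the
+ // scheduler, streams its nonzeros, and for every rating updates the first
+ // kALIGN factor dimensions; when slow_only is false it advances pG/qG to the
+ // second per-row gradient slot (PG and QG store two entries per row) and
+ // updates the remaining dimensions as well.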
1308
+
1309
+ #if defined USESSE
1310
+ inline void SolverBase::run()
1311
+ {
1312
+ __m128d XMMloss;
1313
+ __m128d XMMerror;
1314
+ __m128 XMMz;
1315
+ __m128 XMMlambda_p1;
1316
+ __m128 XMMlambda_q1;
1317
+ __m128 XMMlambda_p2;
1318
+ __m128 XMMlambda_q2;
1319
+ __m128 XMMeta;
1320
+ __m128 XMMrk_slow;
1321
+ __m128 XMMrk_fast;
1322
+ load_fixed_variables(XMMlambda_p1, XMMlambda_q1,
1323
+ XMMlambda_p2, XMMlambda_q2,
1324
+ XMMeta, XMMrk_slow,
1325
+ XMMrk_fast);
1326
+ while(!scheduler.is_terminated())
1327
+ {
1328
+ arrange_block(XMMloss, XMMerror);
1329
+ while(block->move_next())
1330
+ {
1331
+ N = block->get_current();
1332
+ p = model.P+(mf_long)N->u*model.k;
1333
+ q = model.Q+(mf_long)N->v*model.k;
1334
+ pG = PG+N->u*2;
1335
+ qG = QG+N->v*2;
1336
+ prepare_for_sg_update(XMMz, XMMloss, XMMerror);
1337
+ sg_update(0, kALIGN, XMMz, XMMlambda_p1, XMMlambda_q1,
1338
+ XMMlambda_p2, XMMlambda_q2, XMMeta, XMMrk_slow);
1339
+ if(slow_only)
1340
+ continue;
1341
+ update();
1342
+ sg_update(kALIGN, model.k, XMMz, XMMlambda_p1, XMMlambda_q1,
1343
+ XMMlambda_p2, XMMlambda_q2, XMMeta, XMMrk_fast);
1344
+ }
1345
+ finalize(XMMloss, XMMerror);
1346
+ }
1347
+ }
1348
+
1349
+ void SolverBase::load_fixed_variables(
1350
+ __m128 &XMMlambda_p1, __m128 &XMMlambda_q1,
1351
+ __m128 &XMMlambda_p2, __m128 &XMMlambda_q2,
1352
+ __m128 &XMMeta, __m128 &XMMrk_slow,
1353
+ __m128 &XMMrk_fast)
1354
+ {
1355
+ XMMlambda_p1 = _mm_set1_ps(param.lambda_p1);
1356
+ XMMlambda_q1 = _mm_set1_ps(param.lambda_q1);
1357
+ XMMlambda_p2 = _mm_set1_ps(param.lambda_p2);
1358
+ XMMlambda_q2 = _mm_set1_ps(param.lambda_q2);
1359
+ XMMeta = _mm_set1_ps(param.eta);
1360
+ XMMrk_slow = _mm_set1_ps((mf_float)1.0/kALIGN);
1361
+ XMMrk_fast = _mm_set1_ps((mf_float)1.0/(model.k-kALIGN));
1362
+ }
1363
+
1364
+ void SolverBase::arrange_block(__m128d &XMMloss, __m128d &XMMerror)
1365
+ {
1366
+ XMMloss = _mm_setzero_pd();
1367
+ XMMerror = _mm_setzero_pd();
1368
+ bid = scheduler.get_job();
1369
+ block = blocks[bid];
1370
+ block->reload();
1371
+ }
1372
+
1373
+ inline void SolverBase::calc_z(
1374
+ __m128 &XMMz, mf_int k, mf_float *p, mf_float *q)
1375
+ {
1376
+ XMMz = _mm_setzero_ps();
1377
+ for(mf_int d = 0; d < k; d += 4)
1378
+ XMMz = _mm_add_ps(XMMz, _mm_mul_ps(
1379
+ _mm_load_ps(p+d), _mm_load_ps(q+d)));
1380
+ // Bit-wise representation of 177 is {1,0}+{1,1}+{0,0}+{0,1} from
1381
+ // high-bit to low-bit, where "+" means concatenating two arrays.
1382
+ __m128 XMMtmp = _mm_add_ps(XMMz, _mm_shuffle_ps(XMMz, XMMz, 177));
1383
+ // Bit-wise representation of 78 is {0,1}+{0,0}+{1,1}+{1,0} from
1384
+ // high-bit to low-bit, where "+" means concatenating two arrays.
1385
+ XMMz = _mm_add_ps(XMMtmp, _mm_shuffle_ps(XMMtmp, XMMtmp, 78));
1386
+ }
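+ // After the two shuffle-and-add steps above, every lane of XMMz holds the
+ // full dot product of p and q, so any value the loss-specific code derives
+ // from XMMz is likewise broadcast across all four lanes.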
1387
+
1388
+ void SolverBase::finalize(__m128d XMMloss, __m128d XMMerror)
1389
+ {
1390
+ _mm_store_sd(&loss, XMMloss);
1391
+ _mm_store_sd(&error, XMMerror);
1392
+ block->free();
1393
+ scheduler.put_job(bid, loss, error);
1394
+ }
1395
+ #elif defined USEAVX
1396
+ inline void SolverBase::run()
1397
+ {
1398
+ __m128d XMMloss;
1399
+ __m128d XMMerror;
1400
+ __m256 XMMz;
1401
+ __m256 XMMlambda_p1;
1402
+ __m256 XMMlambda_q1;
1403
+ __m256 XMMlambda_p2;
1404
+ __m256 XMMlambda_q2;
1405
+ __m256 XMMeta;
1406
+ __m256 XMMrk_slow;
1407
+ __m256 XMMrk_fast;
1408
+ load_fixed_variables(XMMlambda_p1, XMMlambda_q1,
1409
+ XMMlambda_p2, XMMlambda_q2,
1410
+ XMMeta, XMMrk_slow, XMMrk_fast);
1411
+ while(!scheduler.is_terminated())
1412
+ {
1413
+ arrange_block(XMMloss, XMMerror);
1414
+ while(block->move_next())
1415
+ {
1416
+ N = block->get_current();
1417
+ p = model.P+(mf_long)N->u*model.k;
1418
+ q = model.Q+(mf_long)N->v*model.k;
1419
+ pG = PG+N->u*2;
1420
+ qG = QG+N->v*2;
1421
+ prepare_for_sg_update(XMMz, XMMloss, XMMerror);
1422
+ sg_update(0, kALIGN, XMMz, XMMlambda_p1, XMMlambda_q1,
1423
+ XMMlambda_p2, XMMlambda_q2, XMMeta, XMMrk_slow);
1424
+ if(slow_only)
1425
+ continue;
1426
+ update();
1427
+ sg_update(kALIGN, model.k, XMMz, XMMlambda_p1, XMMlambda_q1,
1428
+ XMMlambda_p2, XMMlambda_q2, XMMeta, XMMrk_fast);
1429
+ }
1430
+ finalize(XMMloss, XMMerror);
1431
+ }
1432
+ }
1433
+
1434
+ void SolverBase::load_fixed_variables(
1435
+ __m256 &XMMlambda_p1, __m256 &XMMlambda_q1,
1436
+ __m256 &XMMlambda_p2, __m256 &XMMlambda_q2,
1437
+ __m256 &XMMeta, __m256 &XMMrk_slow,
1438
+ __m256 &XMMrk_fast)
1439
+ {
1440
+ XMMlambda_p1 = _mm256_set1_ps(param.lambda_p1);
1441
+ XMMlambda_q1 = _mm256_set1_ps(param.lambda_q1);
1442
+ XMMlambda_p2 = _mm256_set1_ps(param.lambda_p2);
1443
+ XMMlambda_q2 = _mm256_set1_ps(param.lambda_q2);
1444
+ XMMeta = _mm256_set1_ps(param.eta);
1445
+ XMMrk_slow = _mm256_set1_ps((mf_float)1.0/kALIGN);
1446
+ XMMrk_fast = _mm256_set1_ps((mf_float)1.0/(model.k-kALIGN));
1447
+ }
1448
+
1449
+ void SolverBase::arrange_block(__m128d &XMMloss, __m128d &XMMerror)
1450
+ {
1451
+ XMMloss = _mm_setzero_pd();
1452
+ XMMerror = _mm_setzero_pd();
1453
+ bid = scheduler.get_job();
1454
+ block = blocks[bid];
1455
+ block->reload();
1456
+ }
1457
+
1458
+ inline void SolverBase::calc_z(
1459
+ __m256 &XMMz, mf_int k, mf_float *p, mf_float *q)
1460
+ {
1461
+ XMMz = _mm256_setzero_ps();
1462
+ for(mf_int d = 0; d < k; d += 8)
1463
+ XMMz = _mm256_add_ps(XMMz, _mm256_mul_ps(
1464
+ _mm256_load_ps(p+d), _mm256_load_ps(q+d)));
1465
+ XMMz = _mm256_add_ps(XMMz, _mm256_permute2f128_ps(XMMz, XMMz, 0x1));
1466
+ XMMz = _mm256_hadd_ps(XMMz, XMMz);
1467
+ XMMz = _mm256_hadd_ps(XMMz, XMMz);
1468
+ }
1469
+
1470
+ void SolverBase::finalize(__m128d XMMloss, __m128d XMMerror)
1471
+ {
1472
+ _mm_store_sd(&loss, XMMloss);
1473
+ _mm_store_sd(&error, XMMerror);
1474
+ block->free();
1475
+ scheduler.put_job(bid, loss, error);
1476
+ }
1477
+ #else
1478
+ inline void SolverBase::run()
1479
+ {
1480
+ load_fixed_variables();
1481
+ while(!scheduler.is_terminated())
1482
+ {
1483
+ arrange_block();
1484
+ while(block->move_next())
1485
+ {
1486
+ N = block->get_current();
1487
+ p = model.P+(mf_long)N->u*model.k;
1488
+ q = model.Q+(mf_long)N->v*model.k;
1489
+ pG = PG+N->u*2;
1490
+ qG = QG+N->v*2;
1491
+ prepare_for_sg_update();
1492
+ sg_update(0, kALIGN, rk_slow);
1493
+ if(slow_only)
1494
+ continue;
1495
+ update();
1496
+ sg_update(kALIGN, model.k, rk_fast);
1497
+ }
1498
+ finalize();
1499
+ }
1500
+ }
1501
+
1502
+ inline float SolverBase::qrsqrt(float x)
1503
+ {
1504
+ float xhalf = 0.5f*x;
1505
+ uint32_t i;
1506
+ memcpy(&i, &x, sizeof(i));
1507
+ i = 0x5f375a86 - (i>>1);
1508
+ memcpy(&x, &i, sizeof(i));
1509
+ x = x*(1.5f - xhalf*x*x);
1510
+ return x;
1511
+ }
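+ // qrsqrt() is the classic fast inverse square root: a bit-level initial
+ // guess (constant 0x5f375a86) refined by one Newton-Raphson step. It plays
+ // the same role as _mm_rsqrt_ps/_mm256_rsqrt_ps in the SIMD paths,
+ // approximating 1/sqrt(G) for the adaptive per-row learning rates.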
1512
+
1513
+ void SolverBase::load_fixed_variables()
1514
+ {
1515
+ lambda_p1 = param.lambda_p1;
1516
+ lambda_q1 = param.lambda_q1;
1517
+ lambda_p2 = param.lambda_p2;
1518
+ lambda_q2 = param.lambda_q2;
1519
+ rk_slow = (mf_float)1.0/kALIGN;
1520
+ rk_fast = (mf_float)1.0/(model.k-kALIGN);
1521
+ }
1522
+
1523
+ void SolverBase::arrange_block()
1524
+ {
1525
+ loss = 0.0;
1526
+ error = 0.0;
1527
+ bid = scheduler.get_job();
1528
+ block = blocks[bid];
1529
+ block->reload();
1530
+ }
1531
+
1532
+ inline void SolverBase::calc_z(mf_float &z, mf_int k, mf_float *p, mf_float *q)
1533
+ {
1534
+ z = 0;
1535
+ for(mf_int d = 0; d < k; ++d)
1536
+ z += p[d]*q[d];
1537
+ }
1538
+
1539
+ void SolverBase::finalize()
1540
+ {
1541
+ block->free();
1542
+ scheduler.put_job(bid, loss, error);
1543
+ }
1544
+ #endif
1545
+
1546
+ //--------------------------------------
1547
+ //-----Real-valued MF and binary MF-----
1548
+ //--------------------------------------
1549
+
1550
+ class MFSolver: public SolverBase
1551
+ {
1552
+ public:
1553
+ MFSolver(Scheduler &scheduler, vector<BlockBase*> &blocks,
1554
+ mf_float *PG, mf_float *QG, mf_model &model,
1555
+ mf_parameter param, bool &slow_only)
1556
+ : SolverBase(scheduler, blocks, PG, QG, model, param, slow_only) {}
1557
+
1558
+ protected:
1559
+ #if defined USESSE
1560
+ void sg_update(mf_int d_begin, mf_int d_end, __m128 XMMz,
1561
+ __m128 XMMlambda_p1, __m128 XMMlambda_q1,
1562
+ __m128 XMMlambda_p2, __m128 XMMlambda_q2,
1563
+ __m128 XMMeta, __m128 XMMrk);
1564
+ #elif defined USEAVX
1565
+ void sg_update(mf_int d_begin, mf_int d_end, __m256 XMMz,
1566
+ __m256 XMMlambda_p1, __m256 XMMlambda_q1,
1567
+ __m256 XMMlambda_p2, __m256 XMMlambda_q2,
1568
+ __m256 XMMeta, __m256 XMMrk);
1569
+ #else
1570
+ void sg_update(mf_int d_begin, mf_int d_end, mf_float rk);
1571
+ #endif
1572
+ };
1573
+
1574
+ #if defined USESSE
1575
+ void MFSolver::sg_update(mf_int d_begin, mf_int d_end, __m128 XMMz,
1576
+ __m128 XMMlambda_p1, __m128 XMMlambda_q1,
1577
+ __m128 XMMlambda_p2, __m128 XMMlambda_q2,
1578
+ __m128 XMMeta, __m128 XMMrk)
1579
+ {
1580
+ __m128 XMMpG = _mm_load1_ps(pG);
1581
+ __m128 XMMqG = _mm_load1_ps(qG);
1582
+ __m128 XMMeta_p = _mm_mul_ps(XMMeta, _mm_rsqrt_ps(XMMpG));
1583
+ __m128 XMMeta_q = _mm_mul_ps(XMMeta, _mm_rsqrt_ps(XMMqG));
1584
+ __m128 XMMpG1 = _mm_setzero_ps();
1585
+ __m128 XMMqG1 = _mm_setzero_ps();
1586
+
1587
+ for(mf_int d = d_begin; d < d_end; d += 4)
1588
+ {
1589
+ __m128 XMMp = _mm_load_ps(p+d);
1590
+ __m128 XMMq = _mm_load_ps(q+d);
1591
+
1592
+ __m128 XMMpg = _mm_sub_ps(_mm_mul_ps(XMMlambda_p2, XMMp),
1593
+ _mm_mul_ps(XMMz, XMMq));
1594
+ __m128 XMMqg = _mm_sub_ps(_mm_mul_ps(XMMlambda_q2, XMMq),
1595
+ _mm_mul_ps(XMMz, XMMp));
1596
+
1597
+ XMMpG1 = _mm_add_ps(XMMpG1, _mm_mul_ps(XMMpg, XMMpg));
1598
+ XMMqG1 = _mm_add_ps(XMMqG1, _mm_mul_ps(XMMqg, XMMqg));
1599
+
1600
+ XMMp = _mm_sub_ps(XMMp, _mm_mul_ps(XMMeta_p, XMMpg));
1601
+ XMMq = _mm_sub_ps(XMMq, _mm_mul_ps(XMMeta_q, XMMqg));
1602
+
1603
+ _mm_store_ps(p+d, XMMp);
1604
+ _mm_store_ps(q+d, XMMq);
1605
+ }
1606
+
1607
+ mf_float tmp = 0;
1608
+ _mm_store_ss(&tmp, XMMlambda_p1);
1609
+ if(tmp > 0)
1610
+ {
1611
+ for(mf_int d = d_begin; d < d_end; d += 4)
1612
+ {
1613
+ __m128 XMMp = _mm_load_ps(p+d);
1614
+ __m128 XMMflip = _mm_and_ps(_mm_cmple_ps(XMMp, _mm_set1_ps(0.0f)),
1615
+ _mm_set1_ps(-0.0f));
1616
+ XMMp = _mm_xor_ps(XMMflip,
1617
+ _mm_max_ps(_mm_sub_ps(_mm_xor_ps(XMMp, XMMflip),
1618
+ _mm_mul_ps(XMMeta_p, XMMlambda_p1)), _mm_set1_ps(0.0f)));
1619
+ _mm_store_ps(p+d, XMMp);
1620
+ }
1621
+ }
1622
+
1623
+ _mm_store_ss(&tmp, XMMlambda_q1);
1624
+ if(tmp > 0)
1625
+ {
1626
+ for(mf_int d = d_begin; d < d_end; d += 4)
1627
+ {
1628
+ __m128 XMMq = _mm_load_ps(q+d);
1629
+ __m128 XMMflip = _mm_and_ps(_mm_cmple_ps(XMMq, _mm_set1_ps(0.0f)),
1630
+ _mm_set1_ps(-0.0f));
1631
+ XMMq = _mm_xor_ps(XMMflip,
1632
+ _mm_max_ps(_mm_sub_ps(_mm_xor_ps(XMMq, XMMflip),
1633
+ _mm_mul_ps(XMMeta_q, XMMlambda_q1)), _mm_set1_ps(0.0f)));
1634
+ _mm_store_ps(q+d, XMMq);
1635
+ }
1636
+ }
1637
+
1638
+ if(param.do_nmf)
1639
+ {
1640
+ for(mf_int d = d_begin; d < d_end; d += 4)
1641
+ {
1642
+ __m128 XMMp = _mm_load_ps(p+d);
1643
+ __m128 XMMq = _mm_load_ps(q+d);
1644
+ XMMp = _mm_max_ps(XMMp, _mm_set1_ps(0.0f));
1645
+ XMMq = _mm_max_ps(XMMq, _mm_set1_ps(0.0f));
1646
+ _mm_store_ps(p+d, XMMp);
1647
+ _mm_store_ps(q+d, XMMq);
1648
+ }
1649
+ }
1650
+
1651
+ __m128 XMMtmp = _mm_add_ps(XMMpG1, _mm_movehl_ps(XMMpG1, XMMpG1));
1652
+ XMMpG1 = _mm_add_ps(XMMtmp, _mm_shuffle_ps(XMMtmp, XMMtmp, 1));
1653
+ XMMpG = _mm_add_ps(XMMpG, _mm_mul_ps(XMMpG1, XMMrk));
1654
+ _mm_store_ss(pG, XMMpG);
1655
+
1656
+ XMMtmp = _mm_add_ps(XMMqG1, _mm_movehl_ps(XMMqG1, XMMqG1));
1657
+ XMMqG1 = _mm_add_ps(XMMtmp, _mm_shuffle_ps(XMMtmp, XMMtmp, 1));
1658
+ XMMqG = _mm_add_ps(XMMqG, _mm_mul_ps(XMMqG1, XMMrk));
1659
+ _mm_store_ss(qG, XMMqG);
1660
+ }
1661
+ #elif defined USEAVX
1662
+ void MFSolver::sg_update(mf_int d_begin, mf_int d_end, __m256 XMMz,
1663
+ __m256 XMMlambda_p1, __m256 XMMlambda_q1,
1664
+ __m256 XMMlambda_p2, __m256 XMMlambda_q2,
1665
+ __m256 XMMeta, __m256 XMMrk)
1666
+ {
1667
+ __m256 XMMpG = _mm256_broadcast_ss(pG);
1668
+ __m256 XMMqG = _mm256_broadcast_ss(qG);
1669
+ __m256 XMMeta_p = _mm256_mul_ps(XMMeta, _mm256_rsqrt_ps(XMMpG));
1670
+ __m256 XMMeta_q = _mm256_mul_ps(XMMeta, _mm256_rsqrt_ps(XMMqG));
1671
+ __m256 XMMpG1 = _mm256_setzero_ps();
1672
+ __m256 XMMqG1 = _mm256_setzero_ps();
1673
+
1674
+ for(mf_int d = d_begin; d < d_end; d += 8)
1675
+ {
1676
+ __m256 XMMp = _mm256_load_ps(p+d);
1677
+ __m256 XMMq = _mm256_load_ps(q+d);
1678
+
1679
+ __m256 XMMpg = _mm256_sub_ps(_mm256_mul_ps(XMMlambda_p2, XMMp),
1680
+ _mm256_mul_ps(XMMz, XMMq));
1681
+ __m256 XMMqg = _mm256_sub_ps(_mm256_mul_ps(XMMlambda_q2, XMMq),
1682
+ _mm256_mul_ps(XMMz, XMMp));
1683
+
1684
+ XMMpG1 = _mm256_add_ps(XMMpG1, _mm256_mul_ps(XMMpg, XMMpg));
1685
+ XMMqG1 = _mm256_add_ps(XMMqG1, _mm256_mul_ps(XMMqg, XMMqg));
1686
+
1687
+ XMMp = _mm256_sub_ps(XMMp, _mm256_mul_ps(XMMeta_p, XMMpg));
1688
+ XMMq = _mm256_sub_ps(XMMq, _mm256_mul_ps(XMMeta_q, XMMqg));
1689
+ _mm256_store_ps(p+d, XMMp);
1690
+ _mm256_store_ps(q+d, XMMq);
1691
+ }
1692
+
1693
+ mf_float tmp = 0;
1694
+ _mm_store_ss(&tmp, _mm256_castps256_ps128(XMMlambda_p1));
1695
+ if(tmp > 0)
1696
+ {
1697
+ for(mf_int d = d_begin; d < d_end; d += 8)
1698
+ {
1699
+ __m256 XMMp = _mm256_load_ps(p+d);
1700
+ __m256 XMMflip = _mm256_and_ps(_mm256_cmp_ps(XMMp,
1701
+ _mm256_set1_ps(0.0f), _CMP_LE_OS),
1702
+ _mm256_set1_ps(-0.0f));
1703
+ XMMp = _mm256_xor_ps(XMMflip,
1704
+ _mm256_max_ps(_mm256_sub_ps(
1705
+ _mm256_xor_ps(XMMp, XMMflip),
1706
+ _mm256_mul_ps(XMMeta_p, XMMlambda_p1)),
1707
+ _mm256_set1_ps(0.0f)));
1708
+ _mm256_store_ps(p+d, XMMp);
1709
+ }
1710
+ }
1711
+
1712
+ _mm_store_ss(&tmp, _mm256_castps256_ps128(XMMlambda_q1));
1713
+ if(tmp > 0)
1714
+ {
1715
+ for(mf_int d = d_begin; d < d_end; d += 8)
1716
+ {
1717
+ __m256 XMMq = _mm256_load_ps(q+d);
1718
+ __m256 XMMflip = _mm256_and_ps(_mm256_cmp_ps(XMMq,
1719
+ _mm256_set1_ps(0.0f), _CMP_LE_OS),
1720
+ _mm256_set1_ps(-0.0f));
1721
+ XMMq = _mm256_xor_ps(XMMflip,
1722
+ _mm256_max_ps(_mm256_sub_ps(
1723
+ _mm256_xor_ps(XMMq, XMMflip),
1724
+ _mm256_mul_ps(XMMeta_q, XMMlambda_q1)),
1725
+ _mm256_set1_ps(0.0f)));
1726
+ _mm256_store_ps(q+d, XMMq);
1727
+ }
1728
+ }
1729
+
1730
+ if(param.do_nmf)
1731
+ {
1732
+ for(mf_int d = d_begin; d < d_end; d += 8)
1733
+ {
1734
+ __m256 XMMp = _mm256_load_ps(p+d);
1735
+ __m256 XMMq = _mm256_load_ps(q+d);
1736
+ XMMp = _mm256_max_ps(XMMp, _mm256_set1_ps(0));
1737
+ XMMq = _mm256_max_ps(XMMq, _mm256_set1_ps(0));
1738
+ _mm256_store_ps(p+d, XMMp);
1739
+ _mm256_store_ps(q+d, XMMq);
1740
+ }
1741
+ }
1742
+
1743
+ XMMpG1 = _mm256_add_ps(XMMpG1,
1744
+ _mm256_permute2f128_ps(XMMpG1, XMMpG1, 0x1));
1745
+ XMMpG1 = _mm256_hadd_ps(XMMpG1, XMMpG1);
1746
+ XMMpG1 = _mm256_hadd_ps(XMMpG1, XMMpG1);
1747
+
1748
+ XMMqG1 = _mm256_add_ps(XMMqG1,
1749
+ _mm256_permute2f128_ps(XMMqG1, XMMqG1, 0x1));
1750
+ XMMqG1 = _mm256_hadd_ps(XMMqG1, XMMqG1);
1751
+ XMMqG1 = _mm256_hadd_ps(XMMqG1, XMMqG1);
1752
+
1753
+ XMMpG = _mm256_add_ps(XMMpG, _mm256_mul_ps(XMMpG1, XMMrk));
1754
+ XMMqG = _mm256_add_ps(XMMqG, _mm256_mul_ps(XMMqG1, XMMrk));
1755
+
1756
+ _mm_store_ss(pG, _mm256_castps256_ps128(XMMpG));
1757
+ _mm_store_ss(qG, _mm256_castps256_ps128(XMMqG));
1758
+ }
1759
+ #else
1760
+ void MFSolver::sg_update(mf_int d_begin, mf_int d_end, mf_float rk)
1761
+ {
1762
+ mf_float eta_p = param.eta*qrsqrt(*pG);
1763
+ mf_float eta_q = param.eta*qrsqrt(*qG);
1764
+
1765
+ mf_float pG1 = 0;
1766
+ mf_float qG1 = 0;
1767
+
1768
+ for(mf_int d = d_begin; d < d_end; ++d)
1769
+ {
1770
+ mf_float gp = -z*q[d]+lambda_p2*p[d];
1771
+ mf_float gq = -z*p[d]+lambda_q2*q[d];
1772
+
1773
+ pG1 += gp*gp;
1774
+ qG1 += gq*gq;
1775
+
1776
+ p[d] -= eta_p*gp;
1777
+ q[d] -= eta_q*gq;
1778
+ }
1779
+
1780
+ if(lambda_p1 > 0)
1781
+ {
1782
+ for(mf_int d = d_begin; d < d_end; ++d)
1783
+ {
1784
+ mf_float p1 = max(abs(p[d])-lambda_p1*eta_p, 0.0f);
1785
+ p[d] = p[d] >= 0? p1: -p1;
1786
+ }
1787
+ }
1788
+
1789
+ if(lambda_q1 > 0)
1790
+ {
1791
+ for(mf_int d = d_begin; d < d_end; ++d)
1792
+ {
1793
+ mf_float q1 = max(abs(q[d])-lambda_q1*eta_q, 0.0f);
1794
+ q[d] = q[d] >= 0? q1: -q1;
1795
+ }
1796
+ }
1797
+
1798
+ if(param.do_nmf)
1799
+ {
1800
+ for(mf_int d = d_begin; d < d_end; ++d)
1801
+ {
1802
+ p[d] = max(p[d], (mf_float)0.0f);
1803
+ q[d] = max(q[d], (mf_float)0.0f);
1804
+ }
1805
+ }
1806
+
1807
+ *pG += pG1*rk;
1808
+ *qG += qG1*rk;
1809
+ }
1810
+ #endif
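+ // In all three variants of MFSolver::sg_update above, one SGD step is
+ // applied to row u of P and row v of Q: the gradient combines the loss term
+ // (through z prepared by the subclass) with the L2 penalties, an optional L1
+ // soft-thresholding step and a non-negativity projection (do_nmf) follow,
+ // and the accumulated squared gradients are folded back into the pG/qG
+ // entries that scale future learning rates.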
1811
+
1812
+ class L2_MFR : public MFSolver
1813
+ {
1814
+ public:
1815
+ L2_MFR(Scheduler &scheduler, vector<BlockBase*> &blocks, mf_float *PG, mf_float *QG,
1816
+ mf_model &model, mf_parameter param, bool &slow_only)
1817
+ : MFSolver(scheduler, blocks, PG, QG, model, param, slow_only) {}
1818
+
1819
+ protected:
1820
+ #if defined USESSE
1821
+ void prepare_for_sg_update(
1822
+ __m128 &XMMz, __m128d &XMMloss, __m128d &XMMerror);
1823
+ #elif defined USEAVX
1824
+ void prepare_for_sg_update(
1825
+ __m256 &XMMz, __m128d &XMMloss, __m128d &XMMerror);
1826
+ #else
1827
+ void prepare_for_sg_update();
1828
+ #endif
1829
+ };
1830
+
1831
+ #if defined USESSE
1832
+ void L2_MFR::prepare_for_sg_update(
1833
+ __m128 &XMMz, __m128d &XMMloss, __m128d &XMMerror)
1834
+ {
1835
+ calc_z(XMMz, model.k, p, q);
1836
+ XMMz = _mm_sub_ps(_mm_set1_ps(N->r), XMMz);
1837
+ XMMloss = _mm_add_pd(XMMloss, _mm_cvtps_pd(
1838
+ _mm_mul_ps(XMMz, XMMz)));
1839
+ XMMerror = XMMloss;
1840
+ }
1841
+ #elif defined USEAVX
1842
+ void L2_MFR::prepare_for_sg_update(
1843
+ __m256 &XMMz, __m128d &XMMloss, __m128d &XMMerror)
1844
+ {
1845
+ calc_z(XMMz, model.k, p, q);
1846
+ XMMz = _mm256_sub_ps(_mm256_set1_ps(N->r), XMMz);
1847
+ XMMloss = _mm_add_pd(XMMloss,
1848
+ _mm_cvtps_pd(_mm256_castps256_ps128(
1849
+ _mm256_mul_ps(XMMz, XMMz))));
1850
+ XMMerror = XMMloss;
1851
+ }
1852
+ #else
1853
+ void L2_MFR::prepare_for_sg_update()
1854
+ {
1855
+ calc_z(z, model.k, p, q);
1856
+ z = N->r-z;
1857
+ loss += z*z;
1858
+ error = loss;
1859
+ }
1860
+ #endif
1861
+ class L1_MFR : public MFSolver
1862
+ {
1863
+ public:
1864
+ L1_MFR(Scheduler &scheduler, vector<BlockBase*> &blocks, mf_float *PG, mf_float *QG,
1865
+ mf_model &model, mf_parameter param, bool &slow_only)
1866
+ : MFSolver(scheduler, blocks, PG, QG, model, param, slow_only) {}
1867
+
1868
+ protected:
1869
+ #if defined USESSE
1870
+ void prepare_for_sg_update(
1871
+ __m128 &XMMz, __m128d &XMMloss, __m128d &XMMerror);
1872
+ #elif defined USEAVX
1873
+ void prepare_for_sg_update(
1874
+ __m256 &XMMz, __m128d &XMMloss, __m128d &XMMerror);
1875
+ #else
1876
+ void prepare_for_sg_update();
1877
+ #endif
1878
+ };
1879
+
1880
+ #if defined USESSE
1881
+ void L1_MFR::prepare_for_sg_update(
1882
+ __m128 &XMMz, __m128d &XMMloss, __m128d &XMMerror)
1883
+ {
1884
+ calc_z(XMMz, model.k, p, q);
1885
+ XMMz = _mm_sub_ps(_mm_set1_ps(N->r), XMMz);
1886
+ XMMloss = _mm_add_pd(XMMloss, _mm_cvtps_pd(
1887
+ _mm_andnot_ps(_mm_set1_ps(-0.0f), XMMz)));
1888
+ XMMerror = XMMloss;
1889
+ XMMz = _mm_add_ps(_mm_and_ps(_mm_cmpgt_ps(XMMz, _mm_set1_ps(0.0f)),
1890
+ _mm_set1_ps(1.0f)),
1891
+ _mm_and_ps(_mm_cmplt_ps(XMMz, _mm_set1_ps(0.0f)),
1892
+ _mm_set1_ps(-1.0f)));
1893
+ }
1894
+ #elif defined USEAVX
1895
+ void L1_MFR::prepare_for_sg_update(
1896
+ __m256 &XMMz, __m128d &XMMloss, __m128d &XMMerror)
1897
+ {
1898
+ calc_z(XMMz, model.k, p, q);
1899
+ XMMz = _mm256_sub_ps(_mm256_set1_ps(N->r), XMMz);
1900
+ XMMloss = _mm_add_pd(XMMloss, _mm_cvtps_pd(_mm256_castps256_ps128(
1901
+ _mm256_andnot_ps(_mm256_set1_ps(-0.0f), XMMz))));
1902
+ XMMerror = XMMloss;
1903
+ XMMz = _mm256_add_ps(_mm256_and_ps(_mm256_cmp_ps(XMMz,
1904
+ _mm256_set1_ps(0.0f), _CMP_GT_OS), _mm256_set1_ps(1.0f)),
1905
+ _mm256_and_ps(_mm256_cmp_ps(XMMz,
1906
+ _mm256_set1_ps(0.0f), _CMP_LT_OS), _mm256_set1_ps(-1.0f)));
1907
+ }
1908
+ #else
1909
+ void L1_MFR::prepare_for_sg_update()
1910
+ {
1911
+ calc_z(z, model.k, p, q);
1912
+ z = N->r-z;
1913
+ loss += abs(z);
1914
+ error = loss;
1915
+ if(z > 0)
1916
+ z = 1;
1917
+ else if(z < 0)
1918
+ z = -1;
1919
+ }
1920
+ #endif
1921
+
1922
+ class KL_MFR : public MFSolver
1923
+ {
1924
+ public:
1925
+ KL_MFR(Scheduler &scheduler, vector<BlockBase*> &blocks, mf_float *PG, mf_float *QG,
1926
+ mf_model &model, mf_parameter param, bool &slow_only)
1927
+ : MFSolver(scheduler, blocks, PG, QG, model, param, slow_only) {}
1928
+
1929
+ protected:
1930
+ #if defined USESSE
1931
+ void prepare_for_sg_update(
1932
+ __m128 &XMMz, __m128d &XMMloss, __m128d &XMMerror);
1933
+ #elif defined USEAVX
1934
+ void prepare_for_sg_update(
1935
+ __m256 &XMMz, __m128d &XMMloss, __m128d &XMMerror);
1936
+ #else
1937
+ void prepare_for_sg_update();
1938
+ #endif
1939
+ };
1940
+
1941
+ #if defined USESSE
1942
+ void KL_MFR::prepare_for_sg_update(
1943
+ __m128 &XMMz, __m128d &XMMloss, __m128d &XMMerror)
1944
+ {
1945
+ calc_z(XMMz, model.k, p, q);
1946
+ XMMz = _mm_div_ps(_mm_set1_ps(N->r), XMMz);
1947
+ _mm_store_ss(&z, XMMz);
1948
+ XMMloss = _mm_add_pd(XMMloss, _mm_cvtps_pd(
1949
+ _mm_set1_ps(N->r*(log(z)-1+1/z))));
1950
+ XMMerror = XMMloss;
1951
+ XMMz = _mm_sub_ps(XMMz, _mm_set1_ps(1.0f));
1952
+ }
1953
+ #elif defined USEAVX
1954
+ void KL_MFR::prepare_for_sg_update(
1955
+ __m256 &XMMz, __m128d &XMMloss, __m128d &XMMerror)
1956
+ {
1957
+ calc_z(XMMz, model.k, p, q);
1958
+ XMMz = _mm256_div_ps(_mm256_set1_ps(N->r), XMMz);
1959
+ _mm_store_ss(&z, _mm256_castps256_ps128(XMMz));
1960
+ XMMloss = _mm_add_pd(XMMloss, _mm_cvtps_pd(
1961
+ _mm_set1_ps(N->r*(log(z)-1+1/z))));
1962
+ XMMerror = XMMloss;
1963
+ XMMz = _mm256_sub_ps(XMMz, _mm256_set1_ps(1.0f));
1964
+ }
1965
+ #else
1966
+ void KL_MFR::prepare_for_sg_update()
1967
+ {
1968
+ calc_z(z, model.k, p, q);
1969
+ z = N->r/z;
1970
+ loss += N->r*(log(z)-1+1/z);
1971
+ error = loss;
1972
+ z -= 1;
1973
+ }
1974
+ #endif
1975
+
1976
+ class LR_MFC : public MFSolver
1977
+ {
1978
+ public:
1979
+ LR_MFC(Scheduler &scheduler, vector<BlockBase*> &blocks,
1980
+ mf_float *PG, mf_float *QG, mf_model &model,
1981
+ mf_parameter param, bool &slow_only)
1982
+ : MFSolver(scheduler, blocks, PG, QG, model, param, slow_only) {}
1983
+
1984
+ protected:
1985
+ #if defined USESSE
1986
+ void prepare_for_sg_update(
1987
+ __m128 &XMMz, __m128d &XMMloss, __m128d &XMMerror);
1988
+ #elif defined USEAVX
1989
+ void prepare_for_sg_update(
1990
+ __m256 &XMMz, __m128d &XMMloss, __m128d &XMMerror);
1991
+ #else
1992
+ void prepare_for_sg_update();
1993
+ #endif
1994
+ };
1995
+
1996
+ #if defined USESSE
1997
+ void LR_MFC::prepare_for_sg_update(
1998
+ __m128 &XMMz, __m128d &XMMloss, __m128d &XMMerror)
1999
+ {
2000
+ calc_z(XMMz, model.k, p, q);
2001
+ _mm_store_ss(&z, XMMz);
2002
+ if(N->r > 0)
2003
+ {
2004
+ z = exp(-z);
2005
+ XMMloss = _mm_add_pd(XMMloss, _mm_set1_pd(log(1+z)));
2006
+ XMMz = _mm_set1_ps(z/(1+z));
2007
+ }
2008
+ else
2009
+ {
2010
+ z = exp(z);
2011
+ XMMloss = _mm_add_pd(XMMloss, _mm_set1_pd(log(1+z)));
2012
+ XMMz = _mm_set1_ps(-z/(1+z));
2013
+ }
2014
+ XMMerror = XMMloss;
2015
+ }
2016
+ #elif defined USEAVX
2017
+ void LR_MFC::prepare_for_sg_update(
2018
+ __m256 &XMMz, __m128d &XMMloss, __m128d &XMMerror)
2019
+ {
2020
+ calc_z(XMMz, model.k, p, q);
2021
+ _mm_store_ss(&z, _mm256_castps256_ps128(XMMz));
2022
+ if(N->r > 0)
2023
+ {
2024
+ z = exp(-z);
2025
+ XMMloss = _mm_add_pd(XMMloss, _mm_set1_pd(log(1.0+z)));
2026
+ XMMz = _mm256_set1_ps(z/(1+z));
2027
+ }
2028
+ else
2029
+ {
2030
+ z = exp(z);
2031
+ XMMloss = _mm_add_pd(XMMloss, _mm_set1_pd(log(1.0+z)));
2032
+ XMMz = _mm256_set1_ps(-z/(1+z));
2033
+ }
2034
+ XMMerror = XMMloss;
2035
+ }
2036
+ #else
2037
+ void LR_MFC::prepare_for_sg_update()
2038
+ {
2039
+ calc_z(z, model.k, p, q);
2040
+ if(N->r > 0)
2041
+ {
2042
+ z = exp(-z);
2043
+ loss += log(1+z);
2044
+ error = loss;
2045
+ z = z/(1+z);
2046
+ }
2047
+ else
2048
+ {
2049
+ z = exp(z);
2050
+ loss += log(1+z);
2051
+ error = loss;
2052
+ z = -z/(1+z);
2053
+ }
2054
+ }
2055
+ #endif
2056
+
2057
+ class L2_MFC : public MFSolver
2058
+ {
2059
+ public:
2060
+ L2_MFC(Scheduler &scheduler, vector<BlockBase*> &blocks,
2061
+ mf_float *PG, mf_float *QG, mf_model &model,
2062
+ mf_parameter param, bool &slow_only)
2063
+ : MFSolver(scheduler, blocks, PG, QG, model, param, slow_only) {}
2064
+
2065
+ protected:
2066
+ #if defined USESSE
2067
+ void prepare_for_sg_update(
2068
+ __m128 &XMMz, __m128d &XMMloss, __m128d &XMMerror);
2069
+ #elif defined USEAVX
2070
+ void prepare_for_sg_update(
2071
+ __m256 &XMMz, __m128d &XMMloss, __m128d &XMMerror);
2072
+ #else
2073
+ void prepare_for_sg_update();
2074
+ #endif
2075
+ };
2076
+
2077
+ #if defined USESSE
2078
+ void L2_MFC::prepare_for_sg_update(
2079
+ __m128 &XMMz, __m128d &XMMloss, __m128d &XMMerror)
2080
+ {
2081
+ calc_z(XMMz, model.k, p, q);
2082
+ if(N->r > 0)
2083
+ {
2084
+ __m128 mask = _mm_cmpgt_ps(XMMz, _mm_set1_ps(0.0f));
2085
+ XMMerror = _mm_add_pd(XMMerror, _mm_cvtps_pd(
2086
+ _mm_and_ps(_mm_set1_ps(1.0f), mask)));
2087
+ XMMz = _mm_max_ps(_mm_set1_ps(0.0f), _mm_sub_ps(
2088
+ _mm_set1_ps(1.0f), XMMz));
2089
+ }
2090
+ else
2091
+ {
2092
+ __m128 mask = _mm_cmplt_ps(XMMz, _mm_set1_ps(0.0f));
2093
+ XMMerror = _mm_add_pd(XMMerror, _mm_cvtps_pd(
2094
+ _mm_and_ps(_mm_set1_ps(1.0f), mask)));
2095
+ XMMz = _mm_min_ps(_mm_set1_ps(0.0f), _mm_sub_ps(
2096
+ _mm_set1_ps(-1.0f), XMMz));
2097
+ }
2098
+ XMMloss = _mm_add_pd(XMMloss, _mm_cvtps_pd(
2099
+ _mm_mul_ps(XMMz, XMMz)));
2100
+ }
2101
+ #elif defined USEAVX
2102
+ void L2_MFC::prepare_for_sg_update(
2103
+ __m256 &XMMz, __m128d &XMMloss, __m128d &XMMerror)
2104
+ {
2105
+ calc_z(XMMz, model.k, p, q);
2106
+ if(N->r > 0)
2107
+ {
2108
+ __m128 mask = _mm_cmpgt_ps(_mm256_castps256_ps128(XMMz),
2109
+ _mm_set1_ps(0.0f));
2110
+ XMMerror = _mm_add_pd(XMMerror, _mm_cvtps_pd(
2111
+ _mm_and_ps(_mm_set1_ps(1.0f), mask)));
2112
+ XMMz = _mm256_max_ps(_mm256_set1_ps(0.0f),
2113
+ _mm256_sub_ps(_mm256_set1_ps(1.0f), XMMz));
2114
+ }
2115
+ else
2116
+ {
2117
+ __m128 mask = _mm_cmplt_ps(_mm256_castps256_ps128(XMMz),
2118
+ _mm_set1_ps(0.0f));
2119
+ XMMerror = _mm_add_pd(XMMerror, _mm_cvtps_pd(
2120
+ _mm_and_ps(_mm_set1_ps(1.0f), mask)));
2121
+ XMMz = _mm256_min_ps(_mm256_set1_ps(0.0f),
2122
+ _mm256_sub_ps(_mm256_set1_ps(-1.0f), XMMz));
2123
+ }
2124
+ XMMloss = _mm_add_pd(XMMloss, _mm_cvtps_pd(
2125
+ _mm_mul_ps(_mm256_castps256_ps128(XMMz),
2126
+ _mm256_castps256_ps128(XMMz))));
2127
+ }
2128
+ #else
2129
+ void L2_MFC::prepare_for_sg_update()
2130
+ {
2131
+ calc_z(z, model.k, p, q);
2132
+ if(N->r > 0)
2133
+ {
2134
+ error += z > 0? 1: 0;
2135
+ z = max(0.0f, 1-z);
2136
+ }
2137
+ else
2138
+ {
2139
+ error += z < 0? 1: 0;
2140
+ z = min(0.0f, -1-z);
2141
+ }
2142
+ loss += z*z;
2143
+ }
2144
+ #endif
2145
+
2146
+ class L1_MFC : public MFSolver
2147
+ {
2148
+ public:
2149
+ L1_MFC(Scheduler &scheduler, vector<BlockBase*> &blocks, mf_float *PG, mf_float *QG,
2150
+ mf_model &model, mf_parameter param, bool &slow_only)
2151
+ : MFSolver(scheduler, blocks, PG, QG, model, param, slow_only) {}
2152
+
2153
+ protected:
2154
+ #if defined USESSE
2155
+ void prepare_for_sg_update(
2156
+ __m128 &XMMz, __m128d &XMMloss, __m128d &XMMerror);
2157
+ #elif defined USEAVX
2158
+ void prepare_for_sg_update(
2159
+ __m256 &XMMz, __m128d &XMMloss, __m128d &XMMerror);
2160
+ #else
2161
+ void prepare_for_sg_update();
2162
+ #endif
2163
+ };
2164
+
2165
+ #if defined USESSE
2166
+ void L1_MFC::prepare_for_sg_update(
2167
+ __m128 &XMMz, __m128d &XMMloss, __m128d &XMMerror)
2168
+ {
2169
+ calc_z(XMMz, model.k, p, q);
2170
+ if(N->r > 0)
2171
+ {
2172
+ XMMerror = _mm_add_pd(XMMerror, _mm_cvtps_pd(
2173
+ _mm_and_ps(_mm_cmpge_ps(XMMz, _mm_set1_ps(0.0f)),
2174
+ _mm_set1_ps(1.0f))));
2175
+ XMMz = _mm_sub_ps(_mm_set1_ps(1.0f), XMMz);
2176
+ XMMloss = _mm_add_pd(XMMloss, _mm_cvtps_pd(
2177
+ _mm_max_ps(_mm_set1_ps(0.0f), XMMz)));
2178
+ XMMz = _mm_and_ps(_mm_cmpge_ps(XMMz, _mm_set1_ps(0.0f)),
2179
+ _mm_set1_ps(1.0f));
2180
+ }
2181
+ else
2182
+ {
2183
+ XMMerror = _mm_add_pd(XMMerror, _mm_cvtps_pd(
2184
+ _mm_and_ps(_mm_cmplt_ps(XMMz, _mm_set1_ps(0.0f)),
2185
+ _mm_set1_ps(1.0f))));
2186
+ XMMz = _mm_add_ps(_mm_set1_ps(1.0f), XMMz);
2187
+ XMMloss = _mm_add_pd(XMMloss, _mm_cvtps_pd(
2188
+ _mm_max_ps(_mm_set1_ps(0.0f), XMMz)));
2189
+ XMMz = _mm_and_ps(_mm_cmpge_ps(XMMz, _mm_set1_ps(0.0f)),
2190
+ _mm_set1_ps(-1.0f));
2191
+ }
2192
+ }
2193
+ #elif defined USEAVX
2194
+ void L1_MFC::prepare_for_sg_update(
2195
+ __m256 &XMMz, __m128d &XMMloss, __m128d &XMMerror)
2196
+ {
2197
+ calc_z(XMMz, model.k, p, q);
2198
+ if(N->r > 0)
2199
+ {
2200
+ XMMerror = _mm_add_pd(XMMerror, _mm_cvtps_pd(_mm_and_ps(
2201
+ _mm_cmpge_ps(_mm256_castps256_ps128(XMMz),
2202
+ _mm_set1_ps(0.0f)), _mm_set1_ps(1.0f))));
2203
+ XMMz = _mm256_sub_ps(_mm256_set1_ps(1.0f), XMMz);
2204
+ XMMloss = _mm_add_pd(XMMloss, _mm_cvtps_pd(_mm_max_ps(
2205
+ _mm_set1_ps(0.0f), _mm256_castps256_ps128(XMMz))));
2206
+ XMMz = _mm256_and_ps(_mm256_cmp_ps(XMMz, _mm256_set1_ps(0.0f),
2207
+ _CMP_GE_OS), _mm256_set1_ps(1.0f));
2208
+ }
2209
+ else
2210
+ {
2211
+ XMMerror = _mm_add_pd(XMMerror, _mm_cvtps_pd(_mm_and_ps(
2212
+ _mm_cmplt_ps(_mm256_castps256_ps128(XMMz),
2213
+ _mm_set1_ps(0.0f)), _mm_set1_ps(1.0f))));
2214
+ XMMz = _mm256_add_ps(_mm256_set1_ps(1.0f), XMMz);
2215
+ XMMloss = _mm_add_pd(XMMloss, _mm_cvtps_pd(_mm_max_ps(
2216
+ _mm_set1_ps(0.0f), _mm256_castps256_ps128(XMMz))));
2217
+ XMMz = _mm256_and_ps(_mm256_cmp_ps(XMMz, _mm256_set1_ps(0.0f),
2218
+ _CMP_GE_OS), _mm256_set1_ps(-1.0f));
2219
+ }
2220
+ }
2221
+ #else
2222
+ void L1_MFC::prepare_for_sg_update()
2223
+ {
2224
+ calc_z(z, model.k, p, q);
2225
+ if(N->r > 0)
2226
+ {
2227
+ loss += max(0.0f, 1-z);
2228
+ error += z > 0? 1.0f: 0.0f;
2229
+ z = z > 1? 0.0f: 1.0f;
2230
+ }
2231
+ else
2232
+ {
2233
+ loss += max(0.0f, 1+z);
2234
+ error += z < 0? 1.0f: 0.0f;
2235
+ z = z < -1? 0.0f: -1.0f;
2236
+ }
2237
+ }
2238
+ #endif
2239
+ //--------------------------------------
2240
+ //------------One-class MF--------------
2241
+ //--------------------------------------
2242
+
2243
+ class BPRSolver : public SolverBase
2244
+ {
2245
+ public:
2246
+ BPRSolver(Scheduler &scheduler, vector<BlockBase*> &blocks,
2247
+ mf_float *PG, mf_float *QG, mf_model &model, mf_parameter param,
2248
+ bool &slow_only, bool is_column_oriented)
2249
+ : SolverBase(scheduler, blocks, PG, QG, model, param, slow_only),
2250
+ is_column_oriented(is_column_oriented) {}
2251
+
2252
+ protected:
2253
+ #if defined USESSE
2254
+ static void calc_z(__m128 &XMMz, mf_int k,
2255
+ mf_float *p, mf_float *q, mf_float *w);
2256
+ void arrange_block(__m128d &XMMloss, __m128d &XMMerror);
2257
+ void prepare_for_sg_update(
2258
+ __m128 &XMMz, __m128d &XMMloss, __m128d &XMMerror);
2259
+ void sg_update(mf_int d_begin, mf_int d_end, __m128 XMMz,
2260
+ __m128 XMMlambda_p1, __m128 XMMlambda_q1,
2261
+ __m128 XMMlambda_p2, __m128 XMMlambda_q2,
2262
+ __m128 XMMeta, __m128 XMMrk);
2263
+ void finalize(__m128d XMMloss, __m128d XMMerror);
2264
+ #elif defined USEAVX
2265
+ static void calc_z(__m256 &XMMz, mf_int k,
2266
+ mf_float *p, mf_float *q, mf_float *w);
2267
+ void arrange_block(__m128d &XMMloss, __m128d &XMMerror);
2268
+ void prepare_for_sg_update(
2269
+ __m256 &XMMz, __m128d &XMMloss, __m128d &XMMerror);
2270
+ void sg_update(mf_int d_begin, mf_int d_end, __m256 XMMz,
2271
+ __m256 XMMlambda_p1, __m256 XMMlambda_q1,
2272
+ __m256 XMMlambda_p2, __m256 XMMlambda_q2,
2273
+ __m256 XMMeta, __m256 XMMrk);
2274
+ void finalize(__m128d XMMloss, __m128d XMMerror);
2275
+ #else
2276
+ static void calc_z(mf_float &z, mf_int k,
2277
+ mf_float *p, mf_float *q, mf_float *w);
2278
+ void arrange_block();
2279
+ void prepare_for_sg_update();
2280
+ void sg_update(mf_int d_begin, mf_int d_end, mf_float rk);
2281
+ void finalize();
2282
+ #endif
2283
+ void update() { ++pG; ++qG; ++wG; };
2284
+ virtual void prepare_negative() = 0;
2285
+
2286
+ bool is_column_oriented;
2287
+ mf_int bpr_bid;
2288
+ mf_float *w;
2289
+ mf_float *wG;
2290
+ };
2291
+
2292
+
2293
+ #if defined USESSE
2294
+ inline void BPRSolver::calc_z(
2295
+ __m128 &XMMz, mf_int k, mf_float *p, mf_float *q, mf_float *w)
2296
+ {
2297
+ XMMz = _mm_setzero_ps();
2298
+ for(mf_int d = 0; d < k; d += 4)
2299
+ XMMz = _mm_add_ps(XMMz, _mm_mul_ps(_mm_load_ps(p+d),
2300
+ _mm_sub_ps(_mm_load_ps(q+d), _mm_load_ps(w+d))));
2301
+ // Bit-wise representation of 177 is {1,0}+{1,1}+{0,0}+{0,1} from
2302
+ // high-bit to low-bit, where "+" means concatenating two arrays.
2303
+ __m128 XMMtmp = _mm_add_ps(XMMz, _mm_shuffle_ps(XMMz, XMMz, 177));
2304
+ // Bit-wise representation of 78 is {0,1}+{0,0}+{1,1}+{1,0} from
2305
+ // high-bit to low-bit, where "+" means concatenating two arrays.
2306
+ XMMz = _mm_add_ps(XMMtmp, _mm_shuffle_ps(XMMtmp, XMMtmp, 78));
2307
+ }
2308
+
2309
+ void BPRSolver::arrange_block(__m128d &XMMloss, __m128d &XMMerror)
2310
+ {
2311
+ XMMloss = _mm_setzero_pd();
2312
+ XMMerror = _mm_setzero_pd();
2313
+ bid = scheduler.get_job();
2314
+ block = blocks[bid];
2315
+ block->reload();
2316
+ bpr_bid = scheduler.get_bpr_job(bid, is_column_oriented);
2317
+ }
2318
+
2319
+ void BPRSolver::finalize(__m128d XMMloss, __m128d XMMerror)
2320
+ {
2321
+ _mm_store_sd(&loss, XMMloss);
2322
+ _mm_store_sd(&error, XMMerror);
2323
+ scheduler.put_job(bid, loss, error);
2324
+ scheduler.put_bpr_job(bid, bpr_bid);
2325
+ }
2326
+
2327
+ void BPRSolver::sg_update(mf_int d_begin, mf_int d_end, __m128 XMMz,
2328
+ __m128 XMMlambda_p1, __m128 XMMlambda_q1,
2329
+ __m128 XMMlambda_p2, __m128 XMMlambda_q2,
2330
+ __m128 XMMeta, __m128 XMMrk)
2331
+ {
2332
+ __m128 XMMpG = _mm_load1_ps(pG);
2333
+ __m128 XMMqG = _mm_load1_ps(qG);
2334
+ __m128 XMMwG = _mm_load1_ps(wG);
2335
+ __m128 XMMeta_p = _mm_mul_ps(XMMeta, _mm_rsqrt_ps(XMMpG));
2336
+ __m128 XMMeta_q = _mm_mul_ps(XMMeta, _mm_rsqrt_ps(XMMqG));
2337
+ __m128 XMMeta_w = _mm_mul_ps(XMMeta, _mm_rsqrt_ps(XMMwG));
2338
+
2339
+ __m128 XMMpG1 = _mm_setzero_ps();
2340
+ __m128 XMMqG1 = _mm_setzero_ps();
2341
+ __m128 XMMwG1 = _mm_setzero_ps();
2342
+
2343
+ for(mf_int d = d_begin; d < d_end; d += 4)
2344
+ {
2345
+ __m128 XMMp = _mm_load_ps(p+d);
2346
+ __m128 XMMq = _mm_load_ps(q+d);
2347
+ __m128 XMMw = _mm_load_ps(w+d);
2348
+
2349
+ __m128 XMMpg = _mm_add_ps(_mm_mul_ps(XMMlambda_p2, XMMp),
2350
+ _mm_mul_ps(XMMz, _mm_sub_ps(XMMw, XMMq)));
2351
+ __m128 XMMqg = _mm_sub_ps(_mm_mul_ps(XMMlambda_q2, XMMq),
2352
+ _mm_mul_ps(XMMz, XMMp));
2353
+ __m128 XMMwg = _mm_add_ps(_mm_mul_ps(XMMlambda_q2, XMMw),
2354
+ _mm_mul_ps(XMMz, XMMp));
2355
+
2356
+ XMMpG1 = _mm_add_ps(XMMpG1, _mm_mul_ps(XMMpg, XMMpg));
2357
+ XMMqG1 = _mm_add_ps(XMMqG1, _mm_mul_ps(XMMqg, XMMqg));
2358
+ XMMwG1 = _mm_add_ps(XMMwG1, _mm_mul_ps(XMMwg, XMMwg));
2359
+
2360
+ XMMp = _mm_sub_ps(XMMp, _mm_mul_ps(XMMeta_p, XMMpg));
2361
+ XMMq = _mm_sub_ps(XMMq, _mm_mul_ps(XMMeta_q, XMMqg));
2362
+ XMMw = _mm_sub_ps(XMMw, _mm_mul_ps(XMMeta_w, XMMwg));
2363
+
2364
+ _mm_store_ps(p+d, XMMp);
2365
+ _mm_store_ps(q+d, XMMq);
2366
+ _mm_store_ps(w+d, XMMw);
2367
+ }
2368
+
2369
+ mf_float tmp = 0;
2370
+ _mm_store_ss(&tmp, XMMlambda_p1);
2371
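+ // Vectorized soft-thresholding for the L1 penalty: XMMflip keeps the sign bit
+ // of each non-positive lane, so XOR-ing with it yields |p|; the threshold
+ // eta_p*lambda_p1 is then subtracted, clipped at zero, and the original sign
+ // is restored by the final XOR.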
+ if(tmp > 0)
2372
+ {
2373
+ for(mf_int d = d_begin; d < d_end; d += 4)
2374
+ {
2375
+ __m128 XMMp = _mm_load_ps(p+d);
2376
+ __m128 XMMflip = _mm_and_ps(_mm_cmple_ps(XMMp, _mm_set1_ps(0.0f)),
2377
+ _mm_set1_ps(-0.0f));
2378
+ XMMp = _mm_xor_ps(XMMflip,
2379
+ _mm_max_ps(_mm_sub_ps(_mm_xor_ps(XMMp, XMMflip),
2380
+ _mm_mul_ps(XMMeta_p, XMMlambda_p1)), _mm_set1_ps(0.0f)));
2381
+ _mm_store_ps(p+d, XMMp);
2382
+ }
2383
+ }
2384
+
2385
+ _mm_store_ss(&tmp, XMMlambda_q1);
2386
+ if(tmp > 0)
2387
+ {
2388
+ for(mf_int d = d_begin; d < d_end; d += 4)
2389
+ {
2390
+ __m128 XMMq = _mm_load_ps(q+d);
2391
+ __m128 XMMw = _mm_load_ps(w+d);
2392
+ __m128 XMMflip = _mm_and_ps(_mm_cmple_ps(XMMq, _mm_set1_ps(0.0f)),
2393
+ _mm_set1_ps(-0.0f));
2394
+ XMMq = _mm_xor_ps(XMMflip,
2395
+ _mm_max_ps(_mm_sub_ps(_mm_xor_ps(XMMq, XMMflip),
2396
+ _mm_mul_ps(XMMeta_q, XMMlambda_q1)), _mm_set1_ps(0.0f)));
2397
+ _mm_store_ps(q+d, XMMq);
2398
+
2399
+
2400
+ XMMflip = _mm_and_ps(_mm_cmple_ps(XMMw, _mm_set1_ps(0.0f)),
2401
+ _mm_set1_ps(-0.0f));
2402
+ XMMw = _mm_xor_ps(XMMflip,
2403
+ _mm_max_ps(_mm_sub_ps(_mm_xor_ps(XMMw, XMMflip),
2404
+ _mm_mul_ps(XMMeta_w, XMMlambda_q1)), _mm_set1_ps(0.0f)));
2405
+ _mm_store_ps(w+d, XMMw);
2406
+ }
2407
+ }
2408
+
2409
+ if(param.do_nmf)
2410
+ {
2411
+ for(mf_int d = d_begin; d < d_end; d += 4)
2412
+ {
2413
+ __m128 XMMp = _mm_load_ps(p+d);
2414
+ __m128 XMMq = _mm_load_ps(q+d);
2415
+ __m128 XMMw = _mm_load_ps(w+d);
2416
+ XMMp = _mm_max_ps(XMMp, _mm_set1_ps(0.0f));
2417
+ XMMq = _mm_max_ps(XMMq, _mm_set1_ps(0.0f));
2418
+ XMMw = _mm_max_ps(XMMw, _mm_set1_ps(0.0f));
2419
+ _mm_store_ps(p+d, XMMp);
2420
+ _mm_store_ps(q+d, XMMq);
2421
+ _mm_store_ps(w+d, XMMw);
2422
+ }
2423
+ }
2424
+
2425
+ // Update learning rate of latent vector p. Squared derivatives along all
2426
+ // latent dimensions were computed above. Here their average is
2427
+ // added into the associated squared-gradient sum.
2428
+ __m128 XMMtmp = _mm_add_ps(XMMpG1, _mm_movehl_ps(XMMpG1, XMMpG1));
2429
+ XMMpG1 = _mm_add_ps(XMMtmp, _mm_shuffle_ps(XMMtmp, XMMtmp, 1));
2430
+ XMMpG = _mm_add_ps(XMMpG, _mm_mul_ps(XMMpG1, XMMrk));
2431
+ _mm_store_ss(pG, XMMpG);
2432
+
2433
+ // Similar code is used to update learning rate of latent vector q.
2434
+ XMMtmp = _mm_add_ps(XMMqG1, _mm_movehl_ps(XMMqG1, XMMqG1));
2435
+ XMMqG1 = _mm_add_ps(XMMtmp, _mm_shuffle_ps(XMMtmp, XMMtmp, 1));
2436
+ XMMqG = _mm_add_ps(XMMqG, _mm_mul_ps(XMMqG1, XMMrk));
2437
+ _mm_store_ss(qG, XMMqG);
2438
+
2439
+ // Similar code is used to update learning rate of latent vector w.
2440
+ XMMtmp = _mm_add_ps(XMMwG1, _mm_movehl_ps(XMMwG1, XMMwG1));
2441
+ XMMwG1 = _mm_add_ps(XMMtmp, _mm_shuffle_ps(XMMtmp, XMMtmp, 1));
2442
+ XMMwG = _mm_add_ps(XMMwG, _mm_mul_ps(XMMwG1, XMMrk));
2443
+ _mm_store_ss(wG, XMMwG);
2444
+ }
2445
+
2446
+ void BPRSolver::prepare_for_sg_update(
2447
+ __m128 &XMMz, __m128d &XMMloss, __m128d &XMMerror)
2448
+ {
2449
+ prepare_negative();
2450
+ calc_z(XMMz, model.k, p, q, w);
2451
+ _mm_store_ss(&z, XMMz);
2452
+ z = exp(-z);
2453
+ XMMloss = _mm_add_pd(XMMloss, _mm_set1_pd(log(1+z)));
2454
+ XMMerror = XMMloss;
2455
+ XMMz = _mm_set1_ps(z/(1+z));
2456
+ }
2457
+ #elif defined USEAVX
2458
+ inline void BPRSolver::calc_z(
2459
+ __m256 &XMMz, mf_int k, mf_float *p, mf_float *q, mf_float *w)
2460
+ {
2461
+ XMMz = _mm256_setzero_ps();
2462
+ for(mf_int d = 0; d < k; d += 8)
2463
+ XMMz = _mm256_add_ps(XMMz, _mm256_mul_ps(
2464
+ _mm256_load_ps(p+d), _mm256_sub_ps(
2465
+ _mm256_load_ps(q+d), _mm256_load_ps(w+d))));
2466
+ XMMz = _mm256_add_ps(XMMz, _mm256_permute2f128_ps(XMMz, XMMz, 0x1));
2467
+ XMMz = _mm256_hadd_ps(XMMz, XMMz);
2468
+ XMMz = _mm256_hadd_ps(XMMz, XMMz);
2469
+ }
2470
+
2471
+ void BPRSolver::arrange_block(__m128d &XMMloss, __m128d &XMMerror)
2472
+ {
2473
+ XMMloss = _mm_setzero_pd();
2474
+ XMMerror = _mm_setzero_pd();
2475
+ bid = scheduler.get_job();
2476
+ block = blocks[bid];
2477
+ block->reload();
2478
+ bpr_bid = scheduler.get_bpr_job(bid, is_column_oriented);
2479
+ }
2480
+
2481
+ void BPRSolver::finalize(__m128d XMMloss, __m128d XMMerror)
2482
+ {
2483
+ _mm_store_sd(&loss, XMMloss);
2484
+ _mm_store_sd(&error, XMMerror);
2485
+ scheduler.put_job(bid, loss, error);
2486
+ scheduler.put_bpr_job(bid, bpr_bid);
2487
+ }
2488
+
2489
+ void BPRSolver::sg_update(mf_int d_begin, mf_int d_end, __m256 XMMz,
2490
+ __m256 XMMlambda_p1, __m256 XMMlambda_q1,
2491
+ __m256 XMMlambda_p2, __m256 XMMlambda_q2,
2492
+ __m256 XMMeta, __m256 XMMrk)
2493
+ {
2494
+ __m256 XMMpG = _mm256_broadcast_ss(pG);
2495
+ __m256 XMMqG = _mm256_broadcast_ss(qG);
2496
+ __m256 XMMwG = _mm256_broadcast_ss(wG);
2497
+ __m256 XMMeta_p =
2498
+ _mm256_mul_ps(XMMeta, _mm256_rsqrt_ps(XMMpG));
2499
+ __m256 XMMeta_q =
2500
+ _mm256_mul_ps(XMMeta, _mm256_rsqrt_ps(XMMqG));
2501
+ __m256 XMMeta_w =
2502
+ _mm256_mul_ps(XMMeta, _mm256_rsqrt_ps(XMMwG));
2503
+
2504
+ __m256 XMMpG1 = _mm256_setzero_ps();
2505
+ __m256 XMMqG1 = _mm256_setzero_ps();
2506
+ __m256 XMMwG1 = _mm256_setzero_ps();
2507
+
2508
+ for(mf_int d = d_begin; d < d_end; d += 8)
2509
+ {
2510
+ __m256 XMMp = _mm256_load_ps(p+d);
2511
+ __m256 XMMq = _mm256_load_ps(q+d);
2512
+ __m256 XMMw = _mm256_load_ps(w+d);
2513
+ __m256 XMMpg = _mm256_add_ps(_mm256_mul_ps(XMMlambda_p2, XMMp),
2514
+ _mm256_mul_ps(XMMz, _mm256_sub_ps(XMMw, XMMq)));
2515
+ __m256 XMMqg = _mm256_sub_ps(_mm256_mul_ps(XMMlambda_q2, XMMq),
2516
+ _mm256_mul_ps(XMMz, XMMp));
2517
+ __m256 XMMwg = _mm256_add_ps(_mm256_mul_ps(XMMlambda_q2, XMMw),
2518
+ _mm256_mul_ps(XMMz, XMMp));
2519
+
2520
+ XMMpG1 = _mm256_add_ps(XMMpG1, _mm256_mul_ps(XMMpg, XMMpg));
2521
+ XMMqG1 = _mm256_add_ps(XMMqG1, _mm256_mul_ps(XMMqg, XMMqg));
2522
+ XMMwG1 = _mm256_add_ps(XMMwG1, _mm256_mul_ps(XMMwg, XMMwg));
2523
+
2524
+ XMMp = _mm256_sub_ps(XMMp, _mm256_mul_ps(XMMeta_p, XMMpg));
2525
+ XMMq = _mm256_sub_ps(XMMq, _mm256_mul_ps(XMMeta_q, XMMqg));
2526
+ XMMw = _mm256_sub_ps(XMMw, _mm256_mul_ps(XMMeta_w, XMMwg));
2527
+
2528
+ _mm256_store_ps(p+d, XMMp);
2529
+ _mm256_store_ps(q+d, XMMq);
2530
+ _mm256_store_ps(w+d, XMMw);
2531
+ }
2532
+
2533
+ mf_float tmp = 0;
2534
+ _mm_store_ss(&tmp, _mm256_castps256_ps128(XMMlambda_p1));
2535
+ if(tmp > 0)
2536
+ {
2537
+ for(mf_int d = d_begin; d < d_end; d += 8)
2538
+ {
2539
+ __m256 XMMp = _mm256_load_ps(p+d);
2540
+ __m256 XMMflip =
2541
+ _mm256_and_ps(
2542
+ _mm256_cmp_ps(XMMp, _mm256_set1_ps(0.0f), _CMP_LE_OS),
2543
+ _mm256_set1_ps(-0.0f));
2544
+ XMMp = _mm256_xor_ps(XMMflip,
2545
+ _mm256_max_ps(_mm256_sub_ps(_mm256_xor_ps(XMMp, XMMflip),
2546
+ _mm256_mul_ps(XMMeta_p, XMMlambda_p1)),
2547
+ _mm256_set1_ps(0.0f)));
2548
+ _mm256_store_ps(p+d, XMMp);
2549
+ }
2550
+ }
2551
+
2552
+ _mm_store_ss(&tmp, _mm256_castps256_ps128(XMMlambda_q1));
2553
+ if(tmp > 0)
2554
+ {
2555
+ for(mf_int d = d_begin; d < d_end; d += 8)
2556
+ {
2557
+ __m256 XMMq = _mm256_load_ps(q+d);
2558
+ __m256 XMMw = _mm256_load_ps(w+d);
2559
+ __m256 XMMflip;
2560
+
2561
+ XMMflip = _mm256_and_ps(
2562
+ _mm256_cmp_ps(XMMq, _mm256_set1_ps(0.0f), _CMP_LE_OS),
2563
+ _mm256_set1_ps(-0.0f));
2564
+ XMMq = _mm256_xor_ps(XMMflip,
2565
+ _mm256_max_ps(_mm256_sub_ps(_mm256_xor_ps(XMMq, XMMflip),
2566
+ _mm256_mul_ps(XMMeta_q, XMMlambda_q1)),
2567
+ _mm256_set1_ps(0.0f)));
2568
+ _mm256_store_ps(q+d, XMMq);
2569
+
2570
+
2571
+ XMMflip = _mm256_and_ps(
2572
+ _mm256_cmp_ps(XMMw, _mm256_set1_ps(0.0f), _CMP_LE_OS),
2573
+ _mm256_set1_ps(-0.0f));
2574
+ XMMw = _mm256_xor_ps(XMMflip,
2575
+ _mm256_max_ps(_mm256_sub_ps(_mm256_xor_ps(XMMw, XMMflip),
2576
+ _mm256_mul_ps(XMMeta_w, XMMlambda_q1)),
2577
+ _mm256_set1_ps(0.0f)));
2578
+ _mm256_store_ps(w+d, XMMw);
2579
+ }
2580
+ }
2581
+
2582
+ if(param.do_nmf)
2583
+ {
2584
+ for(mf_int d = d_begin; d < d_end; d += 8)
2585
+ {
2586
+ __m256 XMMp = _mm256_load_ps(p+d);
2587
+ __m256 XMMq = _mm256_load_ps(q+d);
2588
+ __m256 XMMw = _mm256_load_ps(w+d);
2589
+ XMMp = _mm256_max_ps(XMMp, _mm256_set1_ps(0.0f));
2590
+ XMMq = _mm256_max_ps(XMMq, _mm256_set1_ps(0.0f));
2591
+ XMMw = _mm256_max_ps(XMMw, _mm256_set1_ps(0.0f));
2592
+ _mm256_store_ps(p+d, XMMp);
2593
+ _mm256_store_ps(q+d, XMMq);
2594
+ _mm256_store_ps(w+d, XMMw);
2595
+ }
2596
+ }
2597
+
2598
+ XMMpG1 = _mm256_add_ps(XMMpG1,
2599
+ _mm256_permute2f128_ps(XMMpG1, XMMpG1, 0x1));
2600
+ XMMpG1 = _mm256_hadd_ps(XMMpG1, XMMpG1);
2601
+ XMMpG1 = _mm256_hadd_ps(XMMpG1, XMMpG1);
2602
+
2603
+ XMMqG1 = _mm256_add_ps(XMMqG1,
2604
+ _mm256_permute2f128_ps(XMMqG1, XMMqG1, 0x1));
2605
+ XMMqG1 = _mm256_hadd_ps(XMMqG1, XMMqG1);
2606
+ XMMqG1 = _mm256_hadd_ps(XMMqG1, XMMqG1);
2607
+
2608
+ XMMwG1 = _mm256_add_ps(XMMwG1,
2609
+ _mm256_permute2f128_ps(XMMwG1, XMMwG1, 0x1));
2610
+ XMMwG1 = _mm256_hadd_ps(XMMwG1, XMMwG1);
2611
+ XMMwG1 = _mm256_hadd_ps(XMMwG1, XMMwG1);
2612
+
2613
+ XMMpG = _mm256_add_ps(XMMpG, _mm256_mul_ps(XMMpG1, XMMrk));
2614
+ XMMqG = _mm256_add_ps(XMMqG, _mm256_mul_ps(XMMqG1, XMMrk));
2615
+ XMMwG = _mm256_add_ps(XMMwG, _mm256_mul_ps(XMMwG1, XMMrk));
2616
+
2617
+ _mm_store_ss(pG, _mm256_castps256_ps128(XMMpG));
2618
+ _mm_store_ss(qG, _mm256_castps256_ps128(XMMqG));
2619
+ _mm_store_ss(wG, _mm256_castps256_ps128(XMMwG));
2620
+ }
2621
+
2622
+ void BPRSolver::prepare_for_sg_update(
2623
+ __m256 &XMMz, __m128d &XMMloss, __m128d &XMMerror)
2624
+ {
2625
+ prepare_negative();
2626
+ calc_z(XMMz, model.k, p, q, w);
2627
+ _mm_store_ss(&z, _mm256_castps256_ps128(XMMz));
2628
+ z = exp(-z);
2629
+ XMMloss = _mm_add_pd(XMMloss, _mm_set1_pd(log(1+z)));
2630
+ XMMerror = XMMloss;
2631
+ XMMz = _mm256_set1_ps(z/(1+z));
2632
+ }
2633
+ #else
2634
+ inline void BPRSolver::calc_z(
2635
+ mf_float &z, mf_int k, mf_float *p, mf_float *q, mf_float *w)
2636
+ {
2637
+ z = 0;
2638
+ for(mf_int d = 0; d < k; ++d)
2639
+ z += p[d]*(q[d]-w[d]);
2640
+ }
2641
+
2642
+ void BPRSolver::arrange_block()
2643
+ {
2644
+ loss = 0.0;
2645
+ error = 0.0;
2646
+ bid = scheduler.get_job();
2647
+ block = blocks[bid];
2648
+ block->reload();
2649
+ bpr_bid = scheduler.get_bpr_job(bid, is_column_oriented);
2650
+ }
2651
+
2652
+ void BPRSolver::finalize()
2653
+ {
2654
+ scheduler.put_job(bid, loss, error);
2655
+ scheduler.put_bpr_job(bid, bpr_bid);
2656
+ }
2657
+
2658
+ void BPRSolver::sg_update(mf_int d_begin, mf_int d_end, mf_float rk)
2659
+ {
2660
+ mf_float eta_p = param.eta*qrsqrt(*pG);
2661
+ mf_float eta_q = param.eta*qrsqrt(*qG);
2662
+ mf_float eta_w = param.eta*qrsqrt(*wG);
2663
+
2664
+ mf_float pG1 = 0;
2665
+ mf_float qG1 = 0;
2666
+ mf_float wG1 = 0;
2667
+
2668
+ for(mf_int d = d_begin; d < d_end; ++d)
2669
+ {
2670
+ mf_float gp = z*(w[d]-q[d]) + lambda_p2*p[d];
2671
+ mf_float gq = -z*p[d] + lambda_q2*q[d];
2672
+ mf_float gw = z*p[d] + lambda_q2*w[d];
2673
+
2674
+ pG1 += gp*gp;
2675
+ qG1 += gq*gq;
2676
+ wG1 += gw*gw;
2677
+
2678
+ p[d] -= eta_p*gp;
2679
+ q[d] -= eta_q*gq;
2680
+ w[d] -= eta_w*gw;
2681
+ }
2682
+
2683
+ if(lambda_p1 > 0)
2684
+ {
2685
+ for(mf_int d = d_begin; d < d_end; ++d)
2686
+ {
2687
+ mf_float p1 = max(abs(p[d])-lambda_p1*eta_p, 0.0f);
2688
+ p[d] = p[d] >= 0? p1: -p1;
2689
+ }
2690
+ }
2691
+
2692
+ if(lambda_q1 > 0)
2693
+ {
2694
+ for(mf_int d = d_begin; d < d_end; ++d)
2695
+ {
2696
+ mf_float q1 = max(abs(w[d])-lambda_q1*eta_w, 0.0f);
2697
+ w[d] = w[d] >= 0? q1: -q1;
2698
+ q1 = max(abs(q[d])-lambda_q1*eta_q, 0.0f);
2699
+ q[d] = q[d] >= 0? q1: -q1;
2700
+ }
2701
+ }
2702
+
2703
+ if(param.do_nmf)
2704
+ {
2705
+ for(mf_int d = d_begin; d < d_end; ++d)
2706
+ {
2707
+ p[d] = max(p[d], (mf_float)0.0);
2708
+ q[d] = max(q[d], (mf_float)0.0);
2709
+ w[d] = max(w[d], (mf_float)0.0);
2710
+ }
2711
+ }
2712
+
2713
+ *pG += pG1*rk;
2714
+ *qG += qG1*rk;
2715
+ *wG += wG1*rk;
2716
+ }
2717
+
2718
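+ // Computes the pairwise score x = p^T(q-w) for the sampled positive/negative
+ // pair, accumulates the logistic loss log(1+exp(-x)), and leaves
+ // z = exp(-x)/(1+exp(-x)), the common factor of the gradients in sg_update().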
+ void BPRSolver::prepare_for_sg_update()
2719
+ {
2720
+ prepare_negative();
2721
+ calc_z(z, model.k, p, q, w);
2722
+ z = exp(-z);
2723
+ loss += log(1+z);
2724
+ error = loss;
2725
+ z = z/(1+z);
2726
+ }
2727
+ #endif
2728
+
2729
+ class COL_BPR_MFOC : public BPRSolver
2730
+ {
2731
+ public:
2732
+ COL_BPR_MFOC(Scheduler &scheduler, vector<BlockBase*> &blocks,
2733
+ mf_float *PG, mf_float *QG, mf_model &model,
2734
+ mf_parameter param, bool &slow_only,
2735
+ bool is_column_oriented=true)
2736
+ : BPRSolver(scheduler, blocks, PG, QG, model, param,
2737
+ slow_only, is_column_oriented) {}
2738
+ protected:
2739
+ #if defined USESSE
2740
+ void load_fixed_variables(
2741
+ __m128 &XMMlambda_p1, __m128 &XMMlambda_q1,
2742
+ __m128 &XMMlambda_p2, __m128 &XMMlambda_q2,
2743
+ __m128 &XMMeta, __m128 &XMMrk_slow,
2744
+ __m128 &XMMrk_fast);
2745
+ #elif defined USEAVX
2746
+ void load_fixed_variables(
2747
+ __m256 &XMMlambda_p1, __m256 &XMMlambda_q1,
2748
+ __m256 &XMMlambda_p2, __m256 &XMMlambda_q2,
2749
+ __m256 &XMMeta, __m256 &XMMrk_slow,
2750
+ __m256 &XMMrk_fast);
2751
+ #else
2752
+ void load_fixed_variables();
2753
+ #endif
2754
+ void prepare_negative();
2755
+ };
2756
+
2757
+ void COL_BPR_MFOC::prepare_negative()
2758
+ {
2759
+ mf_int negative = scheduler.get_negative(bid, bpr_bid, model.m, model.n,
2760
+ is_column_oriented);
2761
+ w = model.P + negative*model.k;
2762
+ wG = PG + negative*2;
2763
+ swap(p, q);
2764
+ swap(pG, qG);
2765
+ }
2766
+
2767
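+ // Because prepare_negative() above swaps p/q (and pG/qG) in the
+ // column-oriented case, load_fixed_variables() below assigns the user's
+ // p-side regularization parameters to the q-side variables and vice versa,
+ // so that each coefficient still applies to the intended factor matrix.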
+ #if defined USESSE
2768
+ void COL_BPR_MFOC::load_fixed_variables(
2769
+ __m128 &XMMlambda_p1, __m128 &XMMlambda_q1,
2770
+ __m128 &XMMlambda_p2, __m128 &XMMlambda_q2,
2771
+ __m128 &XMMeta, __m128 &XMMrk_slow,
2772
+ __m128 &XMMrk_fast)
2773
+ {
2774
+ XMMlambda_p1 = _mm_set1_ps(param.lambda_q1);
2775
+ XMMlambda_q1 = _mm_set1_ps(param.lambda_p1);
2776
+ XMMlambda_p2 = _mm_set1_ps(param.lambda_q2);
2777
+ XMMlambda_q2 = _mm_set1_ps(param.lambda_p2);
2778
+ XMMeta = _mm_set1_ps(param.eta);
2779
+ XMMrk_slow = _mm_set1_ps((mf_float)1.0/kALIGN);
2780
+ XMMrk_fast = _mm_set1_ps((mf_float)1.0/(model.k-kALIGN));
2781
+ }
2782
+ #elif defined USEAVX
2783
+ void COL_BPR_MFOC::load_fixed_variables(
2784
+ __m256 &XMMlambda_p1, __m256 &XMMlambda_q1,
2785
+ __m256 &XMMlambda_p2, __m256 &XMMlambda_q2,
2786
+ __m256 &XMMeta, __m256 &XMMrk_slow,
2787
+ __m256 &XMMrk_fast)
2788
+ {
2789
+ XMMlambda_p1 = _mm256_set1_ps(param.lambda_q1);
2790
+ XMMlambda_q1 = _mm256_set1_ps(param.lambda_p1);
2791
+ XMMlambda_p2 = _mm256_set1_ps(param.lambda_q2);
2792
+ XMMlambda_q2 = _mm256_set1_ps(param.lambda_p2);
2793
+ XMMeta = _mm256_set1_ps(param.eta);
2794
+ XMMrk_slow = _mm256_set1_ps((mf_float)1.0/kALIGN);
2795
+ XMMrk_fast = _mm256_set1_ps((mf_float)1.0/(model.k-kALIGN));
2796
+ }
2797
+ #else
2798
+ void COL_BPR_MFOC::load_fixed_variables()
2799
+ {
2800
+ lambda_p1 = param.lambda_q1;
2801
+ lambda_q1 = param.lambda_p1;
2802
+ lambda_p2 = param.lambda_q2;
2803
+ lambda_q2 = param.lambda_p2;
2804
+ rk_slow = (mf_float)1.0/kALIGN;
2805
+ rk_fast = (mf_float)1.0/(model.k-kALIGN);
2806
+ }
2807
+ #endif
2808
+
2809
+ class ROW_BPR_MFOC : public BPRSolver
2810
+ {
2811
+ public:
2812
+ ROW_BPR_MFOC(Scheduler &scheduler, vector<BlockBase*> &blocks,
2813
+ mf_float *PG, mf_float *QG, mf_model &model,
2814
+ mf_parameter param, bool &slow_only,
2815
+ bool is_column_oriented = false)
2816
+ : BPRSolver(scheduler, blocks, PG, QG, model, param,
2817
+ slow_only, is_column_oriented) {}
2818
+ protected:
2819
+ void prepare_negative();
2820
+ };
2821
+
2822
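+ // Sample a negative index from the paired block and point w/wG at the
+ // corresponding latent vector in Q and its squared-gradient accumulators in
+ // QG (two accumulator entries are kept per vector, hence the stride of 2).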
+ void ROW_BPR_MFOC::prepare_negative()
2823
+ {
2824
+ mf_int negative = scheduler.get_negative(bid, bpr_bid, model.m, model.n,
2825
+ is_column_oriented);
2826
+ w = model.Q + negative*model.k;
2827
+ wG = QG + negative*2;
2828
+ }
2829
+
2830
+
2831
+ class SolverFactory
2832
+ {
2833
+ public:
2834
+ static shared_ptr<SolverBase> get_solver(
2835
+ Scheduler &scheduler,
2836
+ vector<BlockBase*> &blocks,
2837
+ mf_float *PG,
2838
+ mf_float *QG,
2839
+ mf_model &model,
2840
+ mf_parameter param,
2841
+ bool &slow_only);
2842
+ };
2843
+
2844
+ shared_ptr<SolverBase> SolverFactory::get_solver(
2845
+ Scheduler &scheduler,
2846
+ vector<BlockBase*> &blocks,
2847
+ mf_float *PG,
2848
+ mf_float *QG,
2849
+ mf_model &model,
2850
+ mf_parameter param,
2851
+ bool &slow_only)
2852
+ {
2853
+ shared_ptr<SolverBase> solver;
2854
+
2855
+ switch(param.fun)
2856
+ {
2857
+ case P_L2_MFR:
2858
+ solver = shared_ptr<SolverBase>(new L2_MFR(scheduler, blocks,
2859
+ PG, QG, model, param, slow_only));
2860
+ break;
2861
+ case P_L1_MFR:
2862
+ solver = shared_ptr<SolverBase>(new L1_MFR(scheduler, blocks,
2863
+ PG, QG, model, param, slow_only));
2864
+ break;
2865
+ case P_KL_MFR:
2866
+ solver = shared_ptr<SolverBase>(new KL_MFR(scheduler, blocks,
2867
+ PG, QG, model, param, slow_only));
2868
+ break;
2869
+ case P_LR_MFC:
2870
+ solver = shared_ptr<SolverBase>(new LR_MFC(scheduler, blocks,
2871
+ PG, QG, model, param, slow_only));
2872
+ break;
2873
+ case P_L2_MFC:
2874
+ solver = shared_ptr<SolverBase>(new L2_MFC(scheduler, blocks,
2875
+ PG, QG, model, param, slow_only));
2876
+ break;
2877
+ case P_L1_MFC:
2878
+ solver = shared_ptr<SolverBase>(new L1_MFC(scheduler, blocks,
2879
+ PG, QG, model, param, slow_only));
2880
+ break;
2881
+ case P_ROW_BPR_MFOC:
2882
+ solver = shared_ptr<SolverBase>(new ROW_BPR_MFOC(scheduler,
2883
+ blocks, PG, QG, model, param, slow_only));
2884
+ break;
2885
+ case P_COL_BPR_MFOC:
2886
+ solver = shared_ptr<SolverBase>(new COL_BPR_MFOC(scheduler,
2887
+ blocks, PG, QG, model, param, slow_only));
2888
+ break;
2889
+ default:
2890
+ throw invalid_argument("unknown error function");
2891
+ }
2892
+ return solver;
2893
+ }
2894
+
2895
+ void fpsg_core(
2896
+ Utility &util,
2897
+ Scheduler &sched,
2898
+ mf_problem *tr,
2899
+ mf_problem *va,
2900
+ mf_parameter param,
2901
+ mf_float scale,
2902
+ vector<BlockBase*> &block_ptrs,
2903
+ vector<mf_int> &omega_p,
2904
+ vector<mf_int> &omega_q,
2905
+ shared_ptr<mf_model> &model,
2906
+ vector<mf_int> cv_blocks,
2907
+ mf_double *cv_error)
2908
+ {
2909
+ #if defined USESSE || defined USEAVX
2910
+ auto flush_zero_mode = _MM_GET_FLUSH_ZERO_MODE();
2911
+ _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
2912
+ #endif
2913
+ if(tr->nnz == 0)
2914
+ {
2915
+ cout << "warning: train on an empty training set" << endl;
2916
+ return;
2917
+ }
2918
+
2919
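+ // The training ratings are divided by scale before fpsg_core is called (see
+ // fpsg and fpsg_on_disk), so the regularization coefficients of the
+ // regression losses are rescaled here to keep the scaled problem roughly
+ // equivalent to the original one.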
+ if(param.fun == P_L2_MFR ||
2920
+ param.fun == P_L1_MFR ||
2921
+ param.fun == P_KL_MFR)
2922
+ {
2923
+ switch(param.fun)
2924
+ {
2925
+ case P_L2_MFR:
2926
+ param.lambda_p2 /= scale;
2927
+ param.lambda_q2 /= scale;
2928
+ param.lambda_p1 /= (mf_float)pow(scale, 1.5);
2929
+ param.lambda_q1 /= (mf_float)pow(scale, 1.5);
2930
+ break;
2931
+ case P_L1_MFR:
2932
+ case P_KL_MFR:
2933
+ param.lambda_p1 /= sqrt(scale);
2934
+ param.lambda_q1 /= sqrt(scale);
2935
+ break;
2936
+ }
2937
+ }
2938
+
2939
+ if(!param.quiet)
2940
+ {
2941
+ cout.width(4);
2942
+ cout << "iter";
2943
+ cout.width(13);
2944
+ cout << "tr_"+util.get_error_legend();
2945
+ if(va->nnz != 0)
2946
+ {
2947
+ cout.width(13);
2948
+ cout << "va_"+util.get_error_legend();
2949
+ }
2950
+ cout.width(13);
2951
+ cout << "obj";
2952
+ cout << "\n";
2953
+ }
2954
+
2955
+ bool slow_only = param.lambda_p1 == 0 && param.lambda_q1 == 0? true: false;
2956
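+ // Per-vector squared-gradient accumulators for the adaptive learning rates.
+ // Two entries are kept for each row of P and each column of Q (the solvers
+ // advance to the second entry via update(); cf. rk_slow/rk_fast), and all
+ // entries start at 1.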
+ vector<mf_float> PG(model->m*2, 1), QG(model->n*2, 1);
2957
+
2958
+ vector<shared_ptr<SolverBase>> solvers(param.nr_threads);
2959
+ vector<thread> threads;
2960
+ threads.reserve(param.nr_threads);
2961
+ for(mf_int i = 0; i < param.nr_threads; ++i)
2962
+ {
2963
+ solvers[i] = SolverFactory::get_solver(sched, block_ptrs,
2964
+ PG.data(), QG.data(),
2965
+ *model, param, slow_only);
2966
+ threads.emplace_back(&SolverBase::run, solvers[i].get());
2967
+ }
2968
+
2969
+ for(mf_int iter = 0; iter < param.nr_iters; ++iter)
2970
+ {
2971
+ sched.wait_for_jobs_done();
2972
+
2973
+ if(!param.quiet)
2974
+ {
2975
+ mf_double reg = 0;
2976
+ mf_double reg1 = util.calc_reg1(*model, param.lambda_p1,
2977
+ param.lambda_q1, omega_p, omega_q);
2978
+ mf_double reg2 = util.calc_reg2(*model, param.lambda_p2,
2979
+ param.lambda_q2, omega_p, omega_q);
2980
+ mf_double tr_loss = sched.get_loss();
2981
+ mf_double tr_error = sched.get_error()/tr->nnz;
2982
+
2983
+ switch(param.fun)
2984
+ {
2985
+ case P_L2_MFR:
2986
+ reg = (reg1+reg2)*scale*scale;
2987
+ tr_loss *= scale*scale;
2988
+ tr_error = sqrt(tr_error*scale*scale);
2989
+ break;
2990
+ case P_L1_MFR:
2991
+ case P_KL_MFR:
2992
+ reg = (reg1+reg2)*scale;
2993
+ tr_loss *= scale;
2994
+ tr_error *= scale;
2995
+ break;
2996
+ default:
2997
+ reg = reg1+reg2;
2998
+ break;
2999
+ }
3000
+
3001
+ cout.width(4);
3002
+ cout << iter;
3003
+ cout.width(13);
3004
+ cout << fixed << setprecision(4) << tr_error;
3005
+ if(va->nnz != 0)
3006
+ {
3007
+ Block va_block(va->R, va->R+va->nnz);
3008
+ vector<BlockBase*> va_blocks(1, &va_block);
3009
+ vector<mf_int> va_block_ids(1, 0);
3010
+ mf_double va_error =
3011
+ util.calc_error(va_blocks, va_block_ids, *model)/va->nnz;
3012
+ switch(param.fun)
3013
+ {
3014
+ case P_L2_MFR:
3015
+ va_error = sqrt(va_error*scale*scale);
3016
+ break;
3017
+ case P_L1_MFR:
3018
+ case P_KL_MFR:
3019
+ va_error *= scale;
3020
+ break;
3021
+ }
3022
+
3023
+ cout.width(13);
3024
+ cout << fixed << setprecision(4) << va_error;
3025
+ }
3026
+ cout.width(13);
3027
+ cout << fixed << setprecision(4) << scientific << reg+tr_loss;
3028
+ cout << "\n" << flush;
3029
+ }
3030
+
3031
+ if(iter == 0)
3032
+ slow_only = false;
3033
+ if(iter == param.nr_iters - 1)
3034
+ sched.terminate();
3035
+ sched.resume();
3036
+ }
3037
+
3038
+ for(auto &thread : threads)
3039
+ thread.join();
3040
+
3041
+ if(cv_error != nullptr && cv_blocks.size() > 0)
3042
+ {
3043
+ mf_long cv_count = 0;
3044
+ for(auto block : cv_blocks)
3045
+ cv_count += block_ptrs[block]->get_nnz();
3046
+
3047
+ *cv_error = util.calc_error(block_ptrs, cv_blocks, *model)/cv_count;
3048
+
3049
+ switch(param.fun)
3050
+ {
3051
+ case P_L2_MFR:
3052
+ *cv_error = sqrt(*cv_error*scale*scale);
3053
+ break;
3054
+ case P_L1_MFR:
3055
+ case P_KL_MFR:
3056
+ *cv_error *= scale;
3057
+ break;
3058
+ }
3059
+ }
3060
+
3061
+ #if defined USESSE || defined USEAVX
3062
+ _MM_SET_FLUSH_ZERO_MODE(flush_zero_mode);
3063
+ #endif
3064
+ }
3065
+
3066
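+ // In-memory training entry point: copies (or borrows) the problem, shuffles
+ // and scales it, grids it into nr_bins x nr_bins blocks, runs fpsg_core, and
+ // finally restores the data (when it was modified in place) and rescales and
+ // unshuffles the returned model.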
+ shared_ptr<mf_model> fpsg(
3067
+ mf_problem const *tr_,
3068
+ mf_problem const *va_,
3069
+ mf_parameter param,
3070
+ vector<mf_int> cv_blocks=vector<mf_int>(),
3071
+ mf_double *cv_error=nullptr)
3072
+ {
3073
+ shared_ptr<mf_model> model;
3074
+ try
3075
+ {
3076
+ Utility util(param.fun, param.nr_threads);
3077
+ Scheduler sched(param.nr_bins, param.nr_threads, cv_blocks);
3078
+ shared_ptr<mf_problem> tr;
3079
+ shared_ptr<mf_problem> va;
3080
+ vector<Block> blocks(param.nr_bins*param.nr_bins);
3081
+ vector<BlockBase*> block_ptrs(param.nr_bins*param.nr_bins);
3082
+ vector<mf_node*> ptrs;
3083
+ vector<mf_int> p_map;
3084
+ vector<mf_int> q_map;
3085
+ vector<mf_int> inv_p_map;
3086
+ vector<mf_int> inv_q_map;
3087
+ vector<mf_int> omega_p;
3088
+ vector<mf_int> omega_q;
3089
+ mf_float avg = 0;
3090
+ mf_float std_dev = 0;
3091
+ mf_float scale = 1;
3092
+
3093
+ if(param.copy_data)
3094
+ {
3095
+ tr = shared_ptr<mf_problem>(
3096
+ Utility::copy_problem(tr_, true), deleter());
3097
+ va = shared_ptr<mf_problem>(
3098
+ Utility::copy_problem(va_, true), deleter());
3099
+ }
3100
+ else
3101
+ {
3102
+ tr = shared_ptr<mf_problem>(Utility::copy_problem(tr_, false));
3103
+ va = shared_ptr<mf_problem>(Utility::copy_problem(va_, false));
3104
+ }
3105
+
3106
+ util.collect_info(*tr, avg, std_dev);
3107
+
3108
+ if(param.fun == P_L2_MFR ||
3109
+ param.fun == P_L1_MFR ||
3110
+ param.fun == P_KL_MFR)
3111
+ scale = max((mf_float)1e-4, std_dev);
3112
+
3113
+ p_map = Utility::gen_random_map(tr->m);
3114
+ q_map = Utility::gen_random_map(tr->n);
3115
+ inv_p_map = Utility::gen_inv_map(p_map);
3116
+ inv_q_map = Utility::gen_inv_map(q_map);
3117
+ omega_p = vector<mf_int>(tr->m, 0);
3118
+ omega_q = vector<mf_int>(tr->n, 0);
3119
+
3120
+ util.shuffle_problem(*tr, p_map, q_map);
3121
+ util.shuffle_problem(*va, p_map, q_map);
3122
+ util.scale_problem(*tr, (mf_float)1.0/scale);
3123
+ util.scale_problem(*va, (mf_float)1.0/scale);
3124
+ ptrs = util.grid_problem(*tr, param.nr_bins, omega_p, omega_q, blocks);
3125
+
3126
+ model = shared_ptr<mf_model>(Utility::init_model(param.fun,
3127
+ tr->m, tr->n, param.k, avg/scale, omega_p, omega_q),
3128
+ [] (mf_model *ptr) { mf_destroy_model(&ptr); });
3129
+
3130
+ for(mf_int i = 0; i < (mf_long)blocks.size(); ++i)
3131
+ block_ptrs[i] = &blocks[i];
3132
+
3133
+ fpsg_core(util, sched, tr.get(), va.get(), param, scale,
3134
+ block_ptrs, omega_p, omega_q, model, cv_blocks, cv_error);
3135
+
3136
+ if(!param.copy_data)
3137
+ {
3138
+ util.scale_problem(*tr, scale);
3139
+ util.scale_problem(*va, scale);
3140
+ util.shuffle_problem(*tr, inv_p_map, inv_q_map);
3141
+ util.shuffle_problem(*va, inv_p_map, inv_q_map);
3142
+ }
3143
+
3144
+ util.scale_model(*model, scale);
3145
+ Utility::shrink_model(*model, param.k);
3146
+ Utility::shuffle_model(*model, inv_p_map, inv_q_map);
3147
+ }
3148
+ catch(exception const &e)
3149
+ {
3150
+ cerr << e.what() << endl;
3151
+ throw;
3152
+ }
3153
+ return model;
3154
+ }
3155
+
3156
+ shared_ptr<mf_model> fpsg_on_disk(
3157
+ const string tr_path,
3158
+ const string va_path,
3159
+ mf_parameter param,
3160
+ vector<mf_int> cv_blocks=vector<mf_int>(),
3161
+ mf_double *cv_error=nullptr)
3162
+ {
3163
+ shared_ptr<mf_model> model;
3164
+ try
3165
+ {
3166
+ Utility util(param.fun, param.nr_threads);
3167
+ Scheduler sched(param.nr_bins, param.nr_threads, cv_blocks);
3168
+ mf_problem tr = {};
3169
+ mf_problem va = read_problem(va_path.c_str());
3170
+ vector<BlockOnDisk> blocks(param.nr_bins*param.nr_bins);
3171
+ vector<BlockBase*> block_ptrs(param.nr_bins*param.nr_bins);
3172
+ vector<mf_int> p_map;
3173
+ vector<mf_int> q_map;
3174
+ vector<mf_int> inv_p_map;
3175
+ vector<mf_int> inv_q_map;
3176
+ vector<mf_int> omega_p;
3177
+ vector<mf_int> omega_q;
3178
+ mf_float avg = 0;
3179
+ mf_float std_dev = 0;
3180
+ mf_float scale = 1;
3181
+
3182
+ util.collect_info_on_disk(tr_path, tr, avg, std_dev);
3183
+
3184
+ if(param.fun == P_L2_MFR ||
3185
+ param.fun == P_L1_MFR ||
3186
+ param.fun == P_KL_MFR)
3187
+ scale = max((mf_float)1e-4, std_dev);
3188
+
3189
+ p_map = Utility::gen_random_map(tr.m);
3190
+ q_map = Utility::gen_random_map(tr.n);
3191
+ inv_p_map = Utility::gen_inv_map(p_map);
3192
+ inv_q_map = Utility::gen_inv_map(q_map);
3193
+ omega_p = vector<mf_int>(tr.m, 0);
3194
+ omega_q = vector<mf_int>(tr.n, 0);
3195
+
3196
+ util.shuffle_problem(va, p_map, q_map);
3197
+ util.scale_problem(va, (mf_float)1.0/scale);
3198
+
3199
+ util.grid_shuffle_scale_problem_on_disk(
3200
+ tr.m, tr.n, param.nr_bins, scale, tr_path,
3201
+ p_map, q_map, omega_p, omega_q, blocks);
3202
+
3203
+ model = shared_ptr<mf_model>(Utility::init_model(param.fun,
3204
+ tr.m, tr.n, param.k, avg/scale, omega_p, omega_q),
3205
+ [] (mf_model *ptr) { mf_destroy_model(&ptr); });
3206
+
3207
+ for(mf_int i = 0; i < (mf_long)blocks.size(); ++i)
3208
+ block_ptrs[i] = &blocks[i];
3209
+
3210
+ fpsg_core(util, sched, &tr, &va, param, scale,
3211
+ block_ptrs, omega_p, omega_q, model, cv_blocks, cv_error);
3212
+
3213
+ delete [] va.R;
3214
+
3215
+ util.scale_model(*model, scale);
3216
+ Utility::shrink_model(*model, param.k);
3217
+ Utility::shuffle_model(*model, inv_p_map, inv_q_map);
3218
+ }
3219
+ catch(exception const &e)
3220
+ {
3221
+ cerr << e.what() << endl;
3222
+ throw;
3223
+ }
3224
+ return model;
3225
+ }
3226
+
3227
+ // This function implements an efficient method to compute the objective function
3228
+ // minimized by the coordinate descent method.
3229
+ //
3230
+ // \min_{P, Q} 0.5 * \sum_{(u,v)\in\Omega^+} (1-r_{u,v})^2 +
3231
+ // 0.5 * \alpha \sum_{(u,v)\not\in\Omega^+} (c-r_{u,v})^2 +
3232
+ // 0.5 * \lambda_p2 * ||P||_F^2 + 0.5 * \lambda_q2 * ||Q||_F^2
3233
+ // where
3234
+ // 1. (u,v) is a tuple of row index and column index,
3235
+ // 2. \Omega^+ is a collection of (u,v) pairs which specifies the locations of
3236
+ // positive entries in the training matrix.
3237
+ // 3. r_{u,v} is the predicted rating at (u,v)
3238
+ // 4. \alpha is the weight of negative entries' loss.
3239
+ // 5. c is the desired value at every negative entry.
3240
+ // 6. ||P||_F is matrix P's Frobenius norm.
3241
+ // 7. \lambda_p2 is the regularization coefficient of P.
3242
+ //
3243
+ // Note that coordinate descent method's P and Q are the transpose
3244
+ // counterparts of P and Q in stochastic gradient method. Let R denote
3245
+ // the training matrix. For stochastic gradient method, we have R ~ P^TQ.
3246
+ // For coordinate descent method, we have R ~ PQ^T.
3247
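+ //
+ // A sketch of the identity this function relies on to evaluate the
+ // negative-entry loss without scanning all m*n entries: expanding the square,
+ //
+ // \sum_{u,v} (c - r_{u,v})^2 = c^2*m*n
+ //                              - 2c * \sum_k (\sum_u P[u+k*m]) * (\sum_v Q[v+k*n])
+ //                              + trace((P^T*P) * (Q^T*Q)),
+ //
+ // so only the column sums of P and Q and the d-by-d products P^T*P and Q^T*Q
+ // are needed (all_p_sum/all_q_sum, PTP, and QTQ below); positive entries
+ // included in this sum are compensated through positive_loss2.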
+ void calc_ccd_one_class_obj(const mf_int nr_threads,
3248
+ const mf_float alpha, const mf_float c,
3249
+ const mf_int m, const mf_int n, const mf_int d,
3250
+ const mf_float lambda_p2, const mf_float lambda_q2,
3251
+ const mf_float *P, const mf_float *Q,
3252
+ shared_ptr<const mf_problem> data,
3253
+ /*output*/ mf_double &obj,
3254
+ /*output*/ mf_double &positive_loss,
3255
+ /*output*/ mf_double &negative_loss,
3256
+ /*output*/ mf_double &reg)
3257
+ {
3258
+ // Declare regularization term of P.
3259
+ mf_double p_square_norm = 0.0;
3260
+ // Reduce P along column axis, which is the sum of rows in P.
3261
+ vector<mf_double> all_p_sum(d, 0.0);
3262
+ // Compute square of Frobenius norm on P and sum of all rows in P.
3263
+ for(mf_int k = 0; k < d; ++k)
3264
+ {
3265
+ // Declare a temporary buffer for all_p_sum[k] for the OpenMP reduction.
3266
+ mf_double all_p_sum_k = 0.0;
3267
+ #if defined USEOMP
3268
+ #pragma omp parallel for num_threads(nr_threads) schedule(static) reduction(+:p_square_norm,all_p_sum_k)
3269
+ #endif
3270
+ for(mf_int u = 0; u < m; ++u)
3271
+ {
3272
+ const mf_float &p_ku = P[u + k * m];
3273
+ p_square_norm += p_ku * p_ku;
3274
+ all_p_sum_k += p_ku;
3275
+ }
3276
+ all_p_sum[k] = all_p_sum_k;
3277
+ }
3278
+
3279
+ // Declare regularization term of Q
3280
+ mf_double q_square_norm = 0.0;
3281
+ // Reduce Q along column axis, which is the sum of rows in Q.
3282
+ vector<mf_double> all_q_sum(d, 0.0);
3283
+ // Compute square of Frobenius norm on Q and sum of all elements in Q
3284
+ for(mf_int k = 0; k < d; ++k)
3285
+ {
3286
+ // Declare a temporary buffer for all_q_sum[k] for the OpenMP reduction.
3287
+ mf_double all_q_sum_k = 0.0;
3288
+ #if defined USEOMP
3289
+ #pragma omp parallel for num_threads(nr_threads) schedule(static) reduction(+:q_square_norm,all_q_sum_k)
3290
+ #endif
3291
+ for(mf_int v = 0; v < n; ++v)
3292
+ {
3293
+ const mf_float &q_kv = Q[v + k * n];
3294
+ q_square_norm += q_kv * q_kv;
3295
+ all_q_sum_k += q_kv;
3296
+ }
3297
+ all_q_sum[k] = all_q_sum_k;
3298
+ }
3299
+
3300
+ // PTP = P^T * P, where P^T is the transpose of P. Note that P is an m-by-d
3301
+ // matrix and PTP is a d-by-d matrix.
3302
+ vector<mf_double> PTP(d * d, 0.0);
3303
+ // QTQ = Q^T * Q, a d-by-d matrix.
3304
+ vector<mf_double> QTQ(d * d, 0.0);
3305
+ // We calculate PTP and QTQ because they are needed in the computation of
3306
+ // negative entries' loss function.
3307
+ for(mf_int k1 = 0; k1 < d; ++k1)
3308
+ {
3309
+ for(mf_int k2 = 0; k2 < d; ++k2)
3310
+ {
3311
+ // Inner product of the k1-th and k2-th columns of P, an m-by-d matrix.
3312
+ mf_double p_k1_p_k2_inner_product = 0.0;
3313
+ #if defined USEOMP
3314
+ #pragma omp parallel for num_threads(nr_threads) schedule(static) reduction(+:p_k1_p_k2_inner_product)
3315
+ #endif
3316
+ for(mf_int u = 0; u < m; ++u)
3317
+ p_k1_p_k2_inner_product += P[u + k1 * m] * P[u + k2 * m];
3318
+ PTP[k1 * d + k2] = p_k1_p_k2_inner_product;
3319
+
3320
+ // Inner product of the k1-th and k2-th columns of Q, an n-by-d matrix.
3321
+ mf_double q_k1_q_k2_inner_product = 0.0;
3322
+ #if defined USEOMP
3323
+ #pragma omp parallel for num_threads(nr_threads) schedule(static) reduction(+:q_k1_q_k2_inner_product)
3324
+ #endif
3325
+ for(mf_int v = 0; v < n; ++v)
3326
+ q_k1_q_k2_inner_product += Q[v + k1 * n] * Q[v + k2 * n];
3327
+ QTQ[k1 * d + k2] = q_k1_q_k2_inner_product;
3328
+ }
3329
+ }
3330
+
3331
+ // Initialize loss function value of positive matrix entries.
3332
+ // It consists of two parts. The first part is the true prediction error
3333
+ // while the second part is only used for implementing a faster algorithm.
3334
+ mf_double positive_loss1 = 0.0;
3335
+ mf_double positive_loss2 = 0.0;
3336
+ // Scan through positive matrix entries to compute their loss values.
3337
+ // Notice that we assume that positive entries' values are all one.
3338
+ #if defined USEOMP
3339
+ #pragma omp parallel for num_threads(nr_threads) schedule(static) reduction(+:positive_loss1,positive_loss2)
3340
+ #endif
3341
+ for(mf_long i = 0; i < data->nnz; ++i)
3342
+ {
3343
+ const mf_double &r = data->R[i].r;
3344
+ positive_loss1 += (1.0 - r) * (1.0 - r);
3345
+ positive_loss2 -= alpha * (c - r) * (c - r);
3346
+ }
3347
+ positive_loss1 *= 0.5;
3348
+ positive_loss2 *= 0.5;
3349
+
3350
+ // Declare loss terms related to negative matrix entries.
3351
+ mf_double negative_loss1 = c * c * m * n;
3352
+ mf_double negative_loss2 = 0.0;
3353
+ mf_double negative_loss3 = 0.0;
3354
+ // Compute loss terms.
3355
+ for(mf_int k1 = 0; k1 < d; ++k1)
3356
+ {
3357
+ negative_loss2 += all_p_sum[k1] * all_q_sum[k1];
3358
+ for(mf_int k2 = 0; k2 < d; ++k2)
3359
+ negative_loss3 += PTP[k1 + k2 * d] * QTQ[k2 + k1 * d];
3360
+ }
3361
+ // Compute the loss function of negative matrix entries.
3362
+ mf_double negative_loss4 = 0.5 * alpha *
3363
+ (negative_loss1 - 2 * c * negative_loss2 + negative_loss3);
3364
+
3365
+ // Assign results to output variables.
3366
+ reg = 0.5 * lambda_p2 * p_square_norm + 0.5 * lambda_q2 * q_square_norm;
3367
+
3368
+ // The function minimized by coordinate descent method.
3369
+ obj = positive_loss1 + positive_loss2 + negative_loss4 + reg;
3370
+
3371
+ // Sum of squared errors over positive matrix entries (i.e., those mf_node's
3372
+ // in data).
3373
+ positive_loss = positive_loss1;
3374
+
3375
+ // Sum of squared errors over negative matrix entries (i.e., the entries not
3376
+ // stored in data). The value negative_loss4 contains the squared errors by
3377
+ // considering positive entries as negative entries, so positive_loss2 is
3378
+ // added to compensate that.
3379
+ negative_loss = negative_loss4 + positive_loss2;
3380
+ }
3381
+
3382
+ void ccd_one_class_core(
3383
+ const Utility &util,
3384
+ shared_ptr<const mf_problem> tr_csr,
3385
+ shared_ptr<const mf_problem> tr_csc,
3386
+ shared_ptr<const mf_problem> va,
3387
+ const mf_parameter param,
3388
+ const vector<mf_node*> &ptrs_u,
3389
+ const vector<mf_node*> &ptrs_v,
3390
+ /*output*/ shared_ptr<mf_model> &model)
3391
+ {
3392
+ // Check problems stored in CSR and CSC formats
3393
+ if(tr_csr == nullptr) throw invalid_argument("CSR problem pointer is null.");
3394
+ if(tr_csc == nullptr) throw invalid_argument("CSC problem pointer is null.");
3395
+
3396
+ if(tr_csr->m != tr_csc->m)
3397
+ throw logic_error(
3398
+ "Row counts must be identical in CSR and CSC formats: " +
3399
+ to_string(tr_csr->m) + " != " + to_string(tr_csc->m));
3400
+ const mf_int m = tr_csr->m;
3401
+
3402
+ if(tr_csr->n != tr_csc->n)
3403
+ throw logic_error(
3404
+ "Column counts must be identical in CSR and CSC formats: " +
3405
+ to_string(tr_csr->n) + " != " + to_string(tr_csc->n));
3406
+ const mf_int n = tr_csr->n;
3407
+
3408
+ if(tr_csr->nnz != tr_csc->nnz)
3409
+ throw logic_error(
3410
+ "Numbers of data points must be identical in CSR and CSC formats: " +
3411
+ to_string(tr_csr->nnz) + " != " + to_string(tr_csc->nnz));
3412
+ const mf_long nnz = tr_csr->nnz;
3413
+
3414
+ // Check formulation parameters
3415
+ if(param.k <= 0)
3416
+ throw invalid_argument(
3417
+ "Latent dimension must be positive but got " +
3418
+ to_string(param.k));
3419
+ const mf_int d = param.k;
3420
+
3421
+ if(param.lambda_p1 != 0)
3422
+ throw invalid_argument(
3423
+ "P's L1-regularization coefficient must be zero but got " +
3424
+ to_string(param.lambda_p1));
3425
+ if(param.lambda_q1 != 0)
3426
+ throw invalid_argument(
3427
+ "Q's L1-regularization coefficient must be zero but got " +
3428
+ to_string(param.lambda_q1));
3429
+
3430
+ if(param.lambda_p2 <= 0)
3431
+ throw invalid_argument(
3432
+ "P's L2-regularization coefficient must be positive but got " +
3433
+ to_string(param.lambda_p2));
3434
+ if(param.lambda_q2 <= 0)
3435
+ throw invalid_argument(
3436
+ "Q's L2-regularization coefficient must be positive but got " +
3437
+ to_string(param.lambda_q2));
3438
+
3439
+ // REVIEW: It is not difficult to support non-negative matrix factorization
3440
+ // for coordinate descent method; we just need to project the updated value
3441
+ // back to the feasible region by using max(0, new_value) right after each
3442
+ // Newton step. LIBMF does not support it only because we have not seen actual
3443
+ // users.
3444
+ if(param.do_nmf)
3445
+ throw invalid_argument(
3446
+ "Coordinate descent does not support non-negative constraint");
3447
+
3448
+
3449
+ // Check some resources prepared internally
3450
+ if(ptrs_u.size() != (size_t)m + 1)
3451
+ throw invalid_argument("Number of row pointers must be " +
3452
+ to_string(m + 1) + " but got " + to_string(ptrs_u.size()));
3453
+ if(ptrs_v.size() != (size_t)n + 1)
3454
+ throw invalid_argument("Number of column pointers must be " +
3455
+ to_string(n + 1) + " but got " + to_string(ptrs_v.size()));
3456
+
3457
+ // Some constants of the formulation.
3458
+ // alpha: coefficient of negative part
3459
+ // c: the desired prediction values of unobserved ratings
3460
+ // lambda_p2: regularization coefficient of P's L2-norm
3461
+ // lambda_q2: regularization coefficient of Q's L2-norm
3462
+ const mf_float alpha = param.alpha;
3463
+ const mf_float c = param.c;
3464
+ const mf_float lambda_p2 = param.lambda_p2;
3465
+ const mf_float lambda_q2 = param.lambda_q2;
3466
+
3467
+ // Initialize P and Q. Note that \bar{q}_{kv} is Q[k*n+v]
3468
+ // and \bar{p}_{ku} is P[k*m+u]. One may notice that P and
3469
+ // Q here are actually the transposes of P and Q in FPSG.
3470
+ mf_float *P = model->P;
3471
+ mf_float *Q = model->Q;
3472
+
3473
+ // Cache the prediction values on positive matrix entries.
3474
+ // Given that P is zero-initialized and Q is randomly initialized in
3475
+ // Utility::init_model(mf_int m, mf_int n, mf_int k),
3476
+ // all predictions are zeros.
3477
+ #if defined USEOMP
3478
+ #pragma omp parallel for num_threads(util.get_thread_number()) schedule(static)
3479
+ #endif
3480
+ for(mf_long i = 0; i < nnz; ++i)
3481
+ {
3482
+ tr_csr->R[i].r = 0.0;
3483
+ tr_csc->R[i].r = 0.0;
3484
+ }
3485
+
3486
+ // If the model is not initialized by
3487
+ // Utility::init_model(mf_int m, mf_int n, mf_int k),
3488
+ // please use the following initialization code to compute
3489
+ // and cache all prediction values on positive entries.
3490
+ /*
3491
+ for(mf_long i = 0; i < nnz; ++i)
3492
+ {
3493
+ mf_node &node = tr_csr->R[i];
3494
+ node.r = 0;
3495
+ for(mf_int k = 0; k < d; ++k)
3496
+ node.r += P[node.u + k * m]*Q[node.v + k * n];
3497
+ }
3498
+ for(mf_long i = 0; i < nnz; ++i)
3499
+ {
3500
+ mf_node &node = tr_csc->R[i];
3501
+ node.r = 0;
3502
+ for(mf_int k = 0; k < d; ++k)
3503
+ node.r += P[node.u + k * m]*Q[node.v + k * n];
3504
+ }
3505
+ */
3506
+
3507
+ if(!param.quiet)
3508
+ {
3509
+ cout.width(4);
3510
+ cout << "iter";
3511
+ cout.width(13);
3512
+ cout << "tr_"+util.get_error_legend();
3513
+ cout.width(14);
3514
+ cout << "tr_"+util.get_error_legend() << "+";
3515
+ cout.width(14);
3516
+ cout << "tr_"+util.get_error_legend() << "-";
3517
+ if(va->nnz != 0)
3518
+ {
3519
+ cout.width(13);
3520
+ cout << "va_"+util.get_error_legend();
3521
+ cout.width(14);
3522
+ cout << "va_"+util.get_error_legend() << "+";
3523
+ cout.width(14);
3524
+ cout << "va_"+util.get_error_legend() << "-";
3525
+ }
3526
+ cout.width(13);
3527
+ cout << "obj";
3528
+ cout << "\n";
3529
+ }
3530
+
3531
+ /////////////////////////////////////////////////////////////////
3532
+ // Minimize the objective function via coordinate descent method
3533
+ ////////////////////////////////////////////////////////////////
3534
+ // Solve P and Q using coordinate descent.
3535
+ // P = [\bar{p}_1, ..., \bar{p}_d] \in R^{m \times d}
3536
+ // Q = [\bar{q}_1, ..., \bar{q}_d] \in R^{n \times d}
3537
+ // Finally, the rating matrix R is approximated via
3538
+ // R ~ PQ^T \in R^{m \times n}
3539
+ for(mf_int outer = 0; outer < param.nr_iters; ++outer)
3540
+ {
3541
+ // Update \bar{p}_k and \bar{q}_k. The basic idea is
3542
+ // to replace \bar{p}_k and \bar{q}_k with a and b,
3543
+ // and then minimize the original objective function.
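+ // In other words, with every other factor pair fixed, the sub-problem in
+ // (a, b) is a regularized rank-one fit: choose vectors a and b so that
+ // a b^T best replaces the old contribution \bar{p}_k \bar{q}_k^T.
+ // The inner loop below alternates a few Newton sweeps over a and b.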
3544
+ for(mf_int k = 0; k < d; ++k)
3545
+ {
3546
+ // Get the pointer to the first element of \bar{p}_k (and
3547
+ // \bar{q}_k).
3548
+ mf_float *P_k = P + m * k;
3549
+ mf_float *Q_k = Q + n * k;
3550
+
3551
+ // Initialize a and b with the values they are about to replace
3552
+ // so that we can ensure improvement at each iteration.
3553
+ vector<mf_float> a(P_k, P_k + m);
3554
+ vector<mf_float> b(Q_k, Q_k + n);
3555
+
3556
+ for(mf_int inner = 0; inner < 3; ++inner)
3557
+ {
3558
+ ///////////////////////////////////////////////////////////////
3559
+ // Update a:
3560
+ // 1. Compute and cache constants
3561
+ // 2. For each coordinate of a, calculate optimal update using
3562
+ // Newton method
3563
+ ///////////////////////////////////////////////////////////////
3564
+
3565
+ // Compute and cache constants
3566
+ // \hat{b} = \sum_{v=1}^n \bar{b}_v
3567
+ // \tilde{b} = \sum_{v=1}^n \bar{b}_v^2
3568
+ mf_double b_hat = 0.0;
3569
+ mf_double b_tilde = 0.0;
3570
+ #if defined USEOMP
3571
+ #pragma omp parallel for num_threads(util.get_thread_number()) schedule(static) reduction(+:b_hat,b_tilde)
3572
+ #endif
3573
+ for(mf_int v = 0; v < n; ++v)
3574
+ {
3575
+ const mf_double &b_v = b[v];
3576
+ b_hat += b_v;
3577
+ b_tilde += b_v * b_v;
3578
+ }
3579
+
3580
+ // Compute and cache a constant vector
3581
+ // s_k = \sum_{v=1}^n \bar{q}_{kv}b_v, k = 1, ..., d
3582
+ vector<mf_double> s(d, 0.0);
3583
+ for(mf_int k1 = 0; k1 < d; ++k1)
3584
+ {
3585
+ // Buffer variable for using OpenMP
3586
+ mf_double s_k1 = 0;
3587
+ const mf_float *Q_k1 = Q + k1 * n;
3588
+ #if defined USEOMP
3589
+ #pragma omp parallel for num_threads(util.get_thread_number()) schedule(static) reduction(+:s_k1)
3590
+ #endif
3591
+ for(mf_int v = 0; v < n; ++v)
3592
+ s_k1 += Q_k1[v] * b[v];
3593
+ s[k1] = s_k1;
3594
+ }
3595
+
3596
+ // Solve a's sub-problem
3597
+ #if defined USEOMP
3598
+ #pragma omp parallel for num_threads(util.get_thread_number()) schedule(static)
3599
+ #endif
3600
+ for(mf_int u = 0; u < m; ++u)
3601
+ {
3602
+ ////////////////////////////////////////////////////////
3603
+ // Update a[u] via Newton method. Let g_u and h_u denote
3604
+ // the first-order and second-order derivatives w.r.t.
3605
+ // a[u]. The following code implements
3606
+ // a[u] <-- a[u] - g_u/h_u
3607
+ ////////////////////////////////////////////////////////
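+ // Written out, the quantities accumulated below are
+ //   g_u = -(1 - alpha*c) * sum_{v in Omega_u} b_v
+ //         + (1 - alpha) * sum_{v in Omega_u} (\hat{r}_{uv} - p_{ku} q_{kv} + a_u b_v) b_v
+ //         + alpha * (sum_{k'} p_{k'u} s_{k'} - c*\hat{b} - p_{ku} s_k + a_u \tilde{b})
+ //         + lambda_p2 * a_u
+ //   h_u = (1 - alpha) * sum_{v in Omega_u} b_v^2 + alpha * \tilde{b} + lambda_p2
+ // where Omega_u is the set of observed entries in row u and \hat{r}_{uv}
+ // is the cached prediction stored in node.r.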
3608
+
3609
+ // Initialize temporary variables for calculating the gradient and Hessian.
3610
+ mf_double g_u_1 = 0.0;
3611
+ mf_double h_u_1 = 0.0;
3612
+ mf_double g_u_2 = 0.0;
3613
+ // Scan through the positive entries in the u-th row
3614
+ for(const mf_node *ptr = ptrs_u[u]; ptr != ptrs_u[u+1]; ++ptr)
3615
+ {
3616
+ const mf_int &v = ptr->v;
3617
+ const mf_float &b_v = b[v];
3618
+ g_u_1 += b_v;
3619
+ h_u_1 += b_v * b_v;
3620
+ g_u_2 += (ptr->r - P_k[u] * Q_k[v] + a[u] * b_v) * b_v;
3621
+ }
3622
+ mf_double g_u_3 = -c * b_hat - P_k[u] * s[k] + a[u] * b_tilde;
3623
+ for(mf_int k1 = 0; k1 < d; ++k1)
3624
+ g_u_3 += P[m * k1 + u] * s[k1];
3625
+ mf_double g_u = -(1.0 - alpha * c) * g_u_1 + (1.0 - alpha) * g_u_2 + alpha * g_u_3 + lambda_p2 * a[u];
3626
+ mf_double h_u = (1.0 - alpha) * h_u_1 + alpha * b_tilde + lambda_p2;
3627
+ a[u] -= static_cast<mf_float>(g_u / h_u);
3628
+ }
3629
+
3630
+ ///////////////////////////////////////////////////////////////
3631
+ // Update b:
3632
+ // 1. Compute and cache constants
3633
+ // 2. For each coordinate of b, calculate optimal update using
3634
+ // Newton method
3635
+ ///////////////////////////////////////////////////////////////
3636
+ // Compute and cache a_hat, a_tilde
3637
+ // \hat{a} = \sum_{u=1}^m \bar{a}_u
3638
+ // \tilde{a} = \sum_{u=1}^m \bar{a}_u^2
3639
+ mf_double a_hat = 0.0;
3640
+ mf_double a_tilde = 0.0;
3641
+ #if defined USEOMP
3642
+ #pragma omp parallel for num_threads(util.get_thread_number()) schedule(static) reduction(+:a_hat,a_tilde)
3643
+ #endif
3644
+ for(mf_int u = 0; u < m; ++u)
3645
+ {
3646
+ const mf_float &a_u = a[u];
3647
+ a_hat += a_u;
3648
+ a_tilde += a_u * a_u;
3649
+ }
3650
+
3651
+ // Compute and cache t
3652
+ // t_k = \sum_{u=1}^m \bar{p}_{ku}a_u, k = 1, ..., d
3653
+ vector<mf_double> t(d, 0.0);
3654
+ for(mf_int k1 = 0; k1 < d; ++k1)
3655
+ {
3656
+ // Declare buffer variable for using OpenMP
3657
+ mf_double t_k1 = 0;
3658
+ const mf_float *P_k1 = P + k1 * m;
3659
+ #if defined USEOMP
3660
+ #pragma omp parallel for num_threads(util.get_thread_number()) schedule(static) reduction(+:t_k1)
3661
+ #endif
3662
+ for(mf_int u = 0; u < m; ++u)
3663
+ t_k1 += P_k1[u] * a[u];
3664
+ t[k1] = t_k1;
3665
+ }
3666
+
3667
+ #if defined USEOMP
3668
+ #pragma omp parallel for num_threads(util.get_thread_number()) schedule(static)
3669
+ #endif
3670
+ for(mf_int v = 0; v < n; ++v)
3671
+ {
3672
+ ////////////////////////////////////////////////////////
3673
+ // Update b[v] via Newton method. Let g_v and h_v denote
3674
+ // the first-order and second-order derivatives w.r.t.
3675
+ // b[v]. The following code implements
3676
+ // b[v] <-- b[v] - g_v/h_v
3677
+ ////////////////////////////////////////////////////////
3678
+
3679
+ // Initialize temporary variables for calculating the gradient and Hessian.
3680
+ mf_double g_v_1 = 0;
3681
+ mf_double g_v_2 = 0;
3682
+ mf_double h_v_1 = 0;
3683
+ // Scan through all positive entries in column v
3684
+ for(const mf_node *ptr = ptrs_v[v]; ptr != ptrs_v[v+1]; ++ptr)
3685
+ {
3686
+ const mf_int &u = ptr->u;
3687
+ const mf_float &a_u = a[u];
3688
+ g_v_1 += a_u;
3689
+ h_v_1 += a_u * a_u;
3690
+ g_v_2 += (ptr->r - P_k[u] * Q_k[v] + a_u * b[v]) * a_u;
3691
+ }
3692
+ mf_double g_v_3 = -c * a_hat - Q_k[v] * t[k] + b[v] * a_tilde;
3693
+ for(mf_int k1 = 0; k1 < d; ++k1)
3694
+ g_v_3 += Q[n * k1 + v] * t[k1];
3695
+ mf_double g_v = -(1.0 - alpha * c) * g_v_1 + (1.0 - alpha) * g_v_2 +
3696
+ alpha * g_v_3 + lambda_q2 * b[v];
3697
+ mf_double h_v = (1 - alpha) * h_v_1 + alpha * a_tilde + lambda_q2;
3698
+ b[v] -= static_cast<mf_float>(g_v / h_v);
3699
+ }
3700
+
3701
+ ///////////////////////////////////////////////////////////////
3702
+ // Update cached variables.
3703
+ ///////////////////////////////////////////////////////////////
3704
+ // Update cached prediction values in the CSR and CSC copies
3705
+ // \bar{r}_{uv} <- \bar{r}_{uv} - \bar{p}_{ku}\bar{q}_{kv} + a_u b_v
3706
+ #if defined USEOMP
3707
+ #pragma omp parallel for num_threads(util.get_thread_number()) schedule(static)
3708
+ #endif
3709
+ for(mf_long i = 0; i < tr_csr->nnz; ++i)
3710
+ {
3711
+ // Update prediction values of positive entries in CSR
3712
+ mf_node *csr_ptr = tr_csr->R + i;
3713
+ const mf_int &u_csr = csr_ptr->u;
3714
+ const mf_int &v_csr = csr_ptr->v;
3715
+ csr_ptr->r += a[u_csr] * b[v_csr] - P_k[u_csr] * Q_k[v_csr];
3716
+
3717
+ // Update prediction values of positive entries in CSC
3718
+ mf_node *csc_ptr = tr_csc->R + i;
3719
+ const mf_int &u_csc = csc_ptr->u;
3720
+ const mf_int &v_csc = csc_ptr->v;
3721
+ csc_ptr->r += a[u_csc] * b[v_csc] - P_k[u_csc] * Q_k[v_csc];
3722
+ }
3723
+
3724
+ #if defined USEOMP
3725
+ #pragma omp parallel for num_threads(util.get_thread_number()) schedule(static)
3726
+ #endif
3727
+ // Update P_k and Q_k
3728
+ for(mf_int u = 0; u < m; ++u)
3729
+ P_k[u] = a[u];
3730
+ #if defined USEOMP
3731
+ #pragma omp parallel for num_threads(util.get_thread_number()) schedule(static)
3732
+ #endif
3733
+ for(mf_int v = 0; v < n; ++v)
3734
+ Q_k[v] = b[v];
3735
+ }
3736
+ }
3737
+
3738
+ // Skip the whole evaluation if nothing should be printed out.
3739
+ if(param.quiet)
3740
+ continue;
3741
+
3742
+ // Declare variable for storing objective value being minimized
3743
+ // by the training procedure. Note that the objective value consists
3744
+ // of two parts, loss function and regularization function.
3745
+ mf_double obj = 0;
3746
+ // Declare variables for storing loss function's value.
3747
+ mf_double positive_loss = 0; // for positive entries in training matrix.
3748
+ mf_double negative_loss = 0; // for negative entries in training matrix.
3749
+ // Declare variable for storing regularization function's value.
3750
+ mf_double reg = 0;
3751
+
3752
+ // Compute objective value, loss function value, and regularization
3753
+ // function value
3754
+ calc_ccd_one_class_obj(util.get_thread_number(), alpha, c, m, n, d,
3755
+ lambda_p2, lambda_q2, P, Q, tr_csr,
3756
+ obj, positive_loss, negative_loss, reg);
3757
+
3758
+ // Print number of outer iterations.
3759
+ cout.width(4);
3760
+ cout << outer;
3761
+ cout.width(13);
3762
+ cout << fixed << setprecision(4) << positive_loss + negative_loss;
3763
+ cout.width(15);
3764
+ cout << fixed << setprecision(4) << positive_loss;
3765
+ cout.width(15);
3766
+ cout << fixed << setprecision(4) << negative_loss;
3767
+
3768
+ if(va->nnz != 0)
3769
+ {
3770
+ // The following loop computes prediction scores on validation set.
3771
+ // Because training scores are maintained within the coordinate descent
3772
+ // framework, we don't need to recompute scores on the training set.
3773
+ #if defined USEOMP
3774
+ #pragma omp parallel for num_threads(util.get_thread_number()) schedule(static)
3775
+ #endif
3776
+ for(mf_long i = 0; i < va->nnz; ++i)
3777
+ {
3778
+ mf_node &node = va->R[i];
3779
+ node.r = 0;
3780
+ for(mf_int k = 0; k < d; ++k)
3781
+ node.r += P[node.u + k * m]*Q[node.v + k * n];
3782
+ }
3783
+
3784
+ mf_double va_obj = 0;
3785
+ mf_double va_positive_loss = 0;
3786
+ mf_double va_negative_loss = 0;
3787
+ mf_double va_reg = 0;
3788
+
3789
+ calc_ccd_one_class_obj(util.get_thread_number(), alpha, c, m, n, d,
3790
+ lambda_p2, lambda_q2, P, Q, va,
3791
+ va_obj, va_positive_loss, va_negative_loss, va_reg);
3792
+
3793
+ cout.width(13);
3794
+ cout << fixed << setprecision(4) << va_positive_loss + va_negative_loss;
3795
+ cout.width(15);
3796
+ cout << fixed << setprecision(4) << va_positive_loss;
3797
+ cout.width(15);
3798
+ cout << fixed << setprecision(4) << va_negative_loss;
3799
+ }
3800
+
3801
+ cout.width(13);
3802
+ cout << fixed << setprecision(4) << scientific << obj;
3803
+ cout << "\n" << flush;
3804
+ }
3805
+
3806
+ // Transpose P and Q. Note that the formats of P and Q here are different
3807
+ // from those used by mf_model.
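+ // Internally this solver keeps the factors factor-major, i.e. the k-th
+ // latent coordinate of user u lives at P[k*m + u]; mf_model expects one
+ // contiguous length-d vector per user/item, i.e. P[u*d + k], so we copy
+ // into that layout here.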
3808
+
3809
+ mf_float *P_transpose = Utility::malloc_aligned_float((mf_long)m * d);
3810
+ #if defined USEOMP
3811
+ #pragma omp parallel for num_threads(util.get_thread_number()) schedule(static)
3812
+ #endif
3813
+ for(mf_int u = 0; u < m; ++u)
3814
+ for(mf_int k = 0; k < d; ++k)
3815
+ P_transpose[k + u * d] = P[u + k * m];
3816
+ Utility::free_aligned_float(P);
3817
+ mf_float *Q_transpose = Utility::malloc_aligned_float((mf_long)n * d);
3818
+ #if defined USEOMP
3819
+ #pragma omp parallel for num_threads(util.get_thread_number()) schedule(static)
3820
+ #endif
3821
+ for(mf_int v = 0; v < n; ++v)
3822
+ for(mf_int k = 0; k < d; ++k)
3823
+ Q_transpose[k + v * d] = Q[v + k * n];
3824
+ Utility::free_aligned_float(Q);
3825
+
3826
+ // Set the passed-in model to the result learned from the given data
3827
+ // (the model object was pre-allocated by the caller).
3828
+ model->m = m;
3829
+ model->n = n;
3830
+ model->k = d;
3831
+ model->b = 0.0;
3832
+ model->P = P_transpose;
3833
+ model->Q = Q_transpose;
3834
+ }
3835
+
3836
+ shared_ptr<mf_model> ccd_one_class(
3837
+ mf_problem const *tr_,
3838
+ mf_problem const *va_,
3839
+ mf_parameter param)
3840
+ {
3841
+ shared_ptr<mf_model> model;
3842
+ try
3843
+ {
3844
+ Utility util(param.fun, param.nr_threads);
3845
+ // Training matrix in compressed row format (sort nodes by user id)
3846
+ shared_ptr<mf_problem> tr_csr;
3847
+ // Training matrix in compressed column format (sort nodes by item id)
3848
+ shared_ptr<mf_problem> tr_csc;
3849
+ shared_ptr<mf_problem> va;
3850
+ // In tr_csr->R, the i-th row starts at ptrs_u[i] and ends right before ptrs_u[i+1]
3851
+ vector<mf_node*> ptrs_u(tr_->m + 1, nullptr);
3852
+ // In tr_csc->R, the i-th column starts at ptrs_v[i] and ends right before ptrs_v[i+1]
3853
+ vector<mf_node*> ptrs_v(tr_->n + 1, nullptr);
3854
+
3855
+ if(param.copy_data)
3856
+ {
3857
+ // We need both row-major and column-major copies of the training data,
3858
+ // so two duplicates are made.
3859
+ tr_csr = shared_ptr<mf_problem>(
3860
+ Utility::copy_problem(tr_, true), deleter());
3861
+ tr_csc = shared_ptr<mf_problem>(
3862
+ Utility::copy_problem(tr_, true), deleter());
3863
+ va = shared_ptr<mf_problem>(
3864
+ Utility::copy_problem(va_, true), deleter());
3865
+ }
3866
+ else
3867
+ {
3868
+ // We need both row-major and column-major training formats.
3869
+ // The original data is reused as the row-major copy, so
3870
+ // only one duplicate (the column-major copy) is created.
3871
+ tr_csr = shared_ptr<mf_problem>(Utility::copy_problem(tr_, false));
3872
+ tr_csc = shared_ptr<mf_problem>(Utility::copy_problem(tr_, true));
3873
+ va = shared_ptr<mf_problem>(Utility::copy_problem(va_, false));
3874
+ }
3875
+
3876
+ // Make the training set CSR/CSC by sorting its nodes. More specifically,
3877
+ // a matrix with values sorted by row index is CSR and vice versa. We will
3878
+ // compute the starting location for each row (CSR) and each column (CSC)
3879
+ // later.
3880
+ sort(tr_csr->R, tr_csr->R+tr_csr->nnz, sort_node_by_p());
3881
+ sort(tr_csc->R, tr_csc->R+tr_csc->nnz, sort_node_by_q());
3882
+
3883
+ // Save starting addresses of rows for CSR and columns for CSC.
3884
+ mf_int u_current = -1;
3885
+ mf_int v_current = -1;
3886
+ for(mf_long i = 0; i < tr_->nnz; ++i)
3887
+ {
3888
+ mf_node* N = nullptr;
3889
+
3890
+ // Deal with CSR format.
3891
+ N = tr_csr->R + i;
3892
+ // Since tr_csr has been sorted by index u, seeing a larger index
3893
+ // implies a new row. Assume a node is encoded as a tuple (u, v, r),
3894
+ // where u is row index, v is column index, and r is entry value.
3895
+ // The nodes in tr_csr->R could be
3896
+ // (0, 1, 0.5), (0, 2, 3.7), (0, 4, -1.2), (2, 0, 1.2), (2, 4, 2.5)
3897
+ // Then, we can see the first element of the 3rd row (indexed by 2)
3898
+ // is (2, 0, 1.2), which is the 4th element in tr_csr->R. Note that
3899
+ // we use the row pointer of the next non-empty row as the pointers
3900
+ // of empty rows. That is,
3901
+ // ptrs[0] = pointer of (0, 1, 0.5)
3902
+ // ptrs[1] = pointer of (2, 0, 1.2)
3903
+ // ptrs[2] = pointer of (2, 0, 1.2)
3904
+ if(N->u > u_current)
3905
+ {
3906
+ // We (if u_current != -1) have assigned starting addresses to rows
3907
+ // indexed by values smaller than or equal to u_current. Thus, we
3908
+ // should handle all rows indexed starting from u_current+1 to the
3909
+ // seen row index N->u.
3910
+ for(mf_int u_passed = u_current + 1; u_passed <= N->u; ++u_passed)
3911
+ {
3912
+ // i-th non-zero value's location in tr_csr is the starting
3913
+ // address of u_passed-th row.
3914
+ ptrs_u[u_passed] = tr_csr->R + i;
3915
+ }
3916
+ u_current = N->u;
3917
+ }
3918
+
3919
+ // Deal with CSC format
3920
+ N = tr_csc->R + i;
3921
+ if(N->v > v_current)
3922
+ {
3923
+ // We (if v_current != -1) have assigned starting addresses to columns
3924
+ // indexed by values smaller than or equal to v_current. Thus, we
3925
+ // should handle all columns indexed starting from v_current+1 to
3926
+ // the seen column index N->v.
3927
+ for(mf_int v_passed = v_current + 1; v_passed <= N->v; ++v_passed)
3928
+ {
3929
+ // i-th non-zero value's location in tr_csc is the starting
3930
+ // address of v_passed-th column.
3931
+ ptrs_v[v_passed] = tr_csc->R + i;
3932
+ }
3933
+ v_current = N->v;
3934
+ }
3935
+
3936
+ }
3937
+ // The bound of the last row. It's the address one element past the last
3938
+ // matrix entry.
3939
+ for(mf_int u_passed = u_current + 1; u_passed <= tr_->m; ++u_passed)
3940
+ ptrs_u[u_passed] = tr_csr->R + tr_csr->nnz;
3941
+ // The bound of the last column.
3942
+ for(mf_int v_passed = v_current + 1; v_passed <= tr_->n; ++v_passed)
3943
+ ptrs_v[v_passed] = tr_csc->R + tr_csc->nnz;
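+
+ // With these pointers in place, the observed entries of row u (or column
+ // v) can be walked with a plain pointer range, which is exactly how the
+ // core solver consumes them, e.g.
+ //   for(const mf_node *ptr = ptrs_u[u]; ptr != ptrs_u[u+1]; ++ptr) { ... }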
3944
+
3945
+
3946
+ model = shared_ptr<mf_model>(Utility::init_model(tr_->m, tr_->n, param.k),
3947
+ [] (mf_model *ptr) { mf_destroy_model(&ptr); });
3948
+
3949
+ ccd_one_class_core(util, tr_csr, tr_csc, va, param, ptrs_u, ptrs_v, model);
3950
+ }
3951
+ catch(exception const &e)
3952
+ {
3953
+ cerr << e.what() << endl;
3954
+ throw;
3955
+ }
3956
+ return model;
3957
+ }
3958
+
3959
+ bool check_parameter(mf_parameter param)
3960
+ {
3961
+ if(param.fun != P_L2_MFR &&
3962
+ param.fun != P_L1_MFR &&
3963
+ param.fun != P_KL_MFR &&
3964
+ param.fun != P_LR_MFC &&
3965
+ param.fun != P_L2_MFC &&
3966
+ param.fun != P_L1_MFC &&
3967
+ param.fun != P_ROW_BPR_MFOC &&
3968
+ param.fun != P_COL_BPR_MFOC &&
3969
+ param.fun != P_L2_MFOC)
3970
+ {
3971
+ cerr << "unknown loss function" << endl;
3972
+ return false;
3973
+ }
3974
+
3975
+ if(param.k < 1)
3976
+ {
3977
+ cerr << "number of factors must be greater than zero" << endl;
3978
+ return false;
3979
+ }
3980
+
3981
+ if(param.nr_threads < 1)
3982
+ {
3983
+ cerr << "number of threads must be greater than zero" << endl;
3984
+ return false;
3985
+ }
3986
+
3987
+ if(param.nr_bins < 1 || param.nr_bins < param.nr_threads)
3988
+ {
3989
+ cerr << "number of bins must not be smaller than the number of threads"
3990
+ << endl;
3991
+ return false;
3992
+ }
3993
+
3994
+ if(param.nr_iters < 1)
3995
+ {
3996
+ cerr << "number of iterations must be greater than zero" << endl;
3997
+ return false;
3998
+ }
3999
+
4000
+ if(param.lambda_p1 < 0 ||
4001
+ param.lambda_p2 < 0 ||
4002
+ param.lambda_q1 < 0 ||
4003
+ param.lambda_q2 < 0)
4004
+ {
4005
+ cerr << "regularization coefficient must be non-negative" << endl;
4006
+ return false;
4007
+ }
4008
+
4009
+ if(param.eta <= 0)
4010
+ {
4011
+ cerr << "learning rate must be greater than zero" << endl;
4012
+ return false;
4013
+ }
4014
+
4015
+ if(param.fun == P_KL_MFR && !param.do_nmf)
4016
+ {
4017
+ cerr << "--nmf must be set when using generalized KL-divergence"
4018
+ << endl;
4019
+ return false;
4020
+ }
4021
+
4022
+ if(param.nr_bins <= 2*param.nr_threads)
4023
+ {
4024
+ cerr << "Warning: insufficient blocks may slow down the training"
4025
+ << " process (4*nr_threads^2+1 blocks is suggested)" << endl;
4026
+ }
4027
+
4034
+ if(param.alpha < 0)
4035
+ {
4036
+ cerr << "alpha must be a non-negative number" << endl;
4037
+ return false;
+ }
4038
+
4039
+ return true;
4040
+ }
4041
+
4042
+ //--------------------------------------
4043
+ //-----Classes for cross validation-----
4044
+ //--------------------------------------
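+ //
+ // Cross validation works at the block level: the nr_bins x nr_bins grid
+ // of data blocks is shuffled once, split into nr_folds groups, and each
+ // fold trains with one group of blocks hidden while the error is measured
+ // on exactly those hidden blocks. The reported value is the average error
+ // over all folds.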
4045
+
4046
+ class CrossValidatorBase
4047
+ {
4048
+ public:
4049
+ CrossValidatorBase(mf_parameter param_, mf_int nr_folds_);
4050
+ mf_double do_cross_validation();
4051
+ virtual mf_double do_cv1(vector<mf_int> &hidden_blocks) = 0;
4052
+ protected:
4053
+ mf_parameter param;
4054
+ mf_int nr_bins;
4055
+ mf_int nr_folds;
4056
+ mf_int nr_blocks_per_fold;
4057
+ bool quiet;
4058
+ Utility util;
4059
+ mf_double cv_error;
4060
+ };
4061
+
4062
+ CrossValidatorBase::CrossValidatorBase(mf_parameter param_, mf_int nr_folds_)
4063
+ : param(param_), nr_bins(param_.nr_bins), nr_folds(nr_folds_),
4064
+ nr_blocks_per_fold(nr_bins*nr_bins/nr_folds), quiet(param_.quiet),
4065
+ util(param.fun, param.nr_threads), cv_error(0)
4066
+ {
4067
+ param.quiet = true;
4068
+ }
4069
+
4070
+ mf_double CrossValidatorBase::do_cross_validation()
4071
+ {
4072
+ vector<mf_int> cv_blocks;
4073
+ srand(0);
4074
+ for(mf_int block = 0; block < nr_bins*nr_bins; ++block)
4075
+ cv_blocks.push_back(block);
4076
+ random_shuffle(cv_blocks.begin(), cv_blocks.end());
4077
+
4078
+ if(!quiet)
4079
+ {
4080
+ cout.width(4);
4081
+ cout << "fold";
4082
+ cout.width(10);
4083
+ cout << util.get_error_legend();
4084
+ cout << endl;
4085
+ }
4086
+
4087
+ cv_error = 0;
4088
+
4089
+ for(mf_int fold = 0; fold < nr_folds; ++fold)
4090
+ {
4091
+ mf_int begin = fold*nr_blocks_per_fold;
4092
+ mf_int end = min((fold+1)*nr_blocks_per_fold, nr_bins*nr_bins);
4093
+ vector<mf_int> hidden_blocks(cv_blocks.begin()+begin,
4094
+ cv_blocks.begin()+end);
4095
+
4096
+ mf_double err = do_cv1(hidden_blocks);
4097
+ cv_error += err;
4098
+
4099
+ if(!quiet)
4100
+ {
4101
+ cout.width(4);
4102
+ cout << fold;
4103
+ cout.width(10);
4104
+ cout << fixed << setprecision(4) << err;
4105
+ cout << endl;
4106
+ }
4107
+ }
4108
+
4109
+ if(!quiet)
4110
+ {
4111
+ cout.width(14);
4112
+ cout.fill('=');
4113
+ cout << "" << endl;
4114
+ cout.fill(' ');
4115
+ cout.width(4);
4116
+ cout << "avg";
4117
+ cout.width(10);
4118
+ cout << fixed << setprecision(4) << cv_error/nr_folds;
4119
+ cout << endl;
4120
+ }
4121
+
4122
+ return cv_error/nr_folds;
4123
+ }
4124
+
4125
+ class CrossValidator : public CrossValidatorBase
4126
+ {
4127
+ public:
4128
+ CrossValidator(
4129
+ mf_parameter param_, mf_int nr_folds_, mf_problem const *prob_)
4130
+ : CrossValidatorBase(param_, nr_folds_), prob(prob_) {};
4131
+ mf_double do_cv1(vector<mf_int> &hidden_blocks);
4132
+ private:
4133
+ mf_problem const *prob;
4134
+ };
4135
+
4136
+ mf_double CrossValidator::do_cv1(vector<mf_int> &hidden_blocks)
4137
+ {
4138
+ mf_double err = 0;
4139
+ fpsg(prob, nullptr, param, hidden_blocks, &err);
4140
+ return err;
4141
+ }
4142
+
4143
+ class CrossValidatorOnDisk : public CrossValidatorBase
4144
+ {
4145
+ public:
4146
+ CrossValidatorOnDisk(
4147
+ mf_parameter param_, mf_int nr_folds_, string data_path_)
4148
+ : CrossValidatorBase(param_, nr_folds_), data_path(data_path_) {};
4149
+ mf_double do_cv1(vector<mf_int> &hidden_blocks);
4150
+ private:
4151
+ string data_path;
4152
+ };
4153
+
4154
+ mf_double CrossValidatorOnDisk::do_cv1(vector<mf_int> &hidden_blocks)
4155
+ {
4156
+ mf_double err = 0;
4157
+ fpsg_on_disk(data_path, string(), param, hidden_blocks, &err);
4158
+ return err;
4159
+ }
4160
+
4161
+ } // unnamed namespace
4162
+
4163
+ mf_model* mf_train_with_validation(
4164
+ mf_problem const *tr,
4165
+ mf_problem const *va,
4166
+ mf_parameter param)
4167
+ {
4168
+ if(!check_parameter(param))
4169
+ return nullptr;
4170
+
4171
+ shared_ptr<mf_model> model(nullptr);
4172
+
4173
+ if(param.fun != P_L2_MFOC)
4174
+ // Use stochastic gradient method
4175
+ model = fpsg(tr, va, param);
4176
+ else
4177
+ // Use coordinate descent method
4178
+ model = ccd_one_class(tr, va, param);
4179
+
4180
+ mf_model *model_ret = new mf_model;
4181
+
4182
+ model_ret->fun = model->fun;
4183
+ model_ret->m = model->m;
4184
+ model_ret->n = model->n;
4185
+ model_ret->k = model->k;
4186
+ model_ret->b = model->b;
4187
+
4188
+ model_ret->P = model->P;
4189
+ model->P = nullptr;
4190
+
4191
+ model_ret->Q = model->Q;
4192
+ model->Q = nullptr;
4193
+
4194
+ return model_ret;
4195
+ }
4196
+
4197
+ mf_model* mf_train_with_validation_on_disk(
4198
+ char const *tr_path,
4199
+ char const *va_path,
4200
+ mf_parameter param)
4201
+ {
4202
+ // Two conditions lead to an empty model. First, a parameter is outside its
4203
+ // supported range. Second, one-class matrix factorization with L2-loss
4204
+ // (-f 12) doesn't support disk-level training.
4205
+ if(!check_parameter(param) || param.fun == P_L2_MFOC)
4206
+ return nullptr;
4207
+
4208
+ shared_ptr<mf_model> model = fpsg_on_disk(
4209
+ string(tr_path), string(va_path), param);
4210
+
4211
+ mf_model *model_ret = new mf_model;
4212
+
4213
+ model_ret->fun = model->fun;
4214
+ model_ret->m = model->m;
4215
+ model_ret->n = model->n;
4216
+ model_ret->k = model->k;
4217
+ model_ret->b = model->b;
4218
+
4219
+ model_ret->P = model->P;
4220
+ model->P = nullptr;
4221
+
4222
+ model_ret->Q = model->Q;
4223
+ model->Q = nullptr;
4224
+
4225
+ return model_ret;
4226
+ }
4227
+
4228
+ mf_model* mf_train(mf_problem const *prob, mf_parameter param)
4229
+ {
4230
+ return mf_train_with_validation(prob, nullptr, param);
4231
+ }
4232
+
4233
+ mf_model* mf_train_on_disk(char const *tr_path, mf_parameter param)
4234
+ {
4235
+ return mf_train_with_validation_on_disk(tr_path, "", param);
4236
+ }
4237
+
4238
+ mf_double mf_cross_validation(
4239
+ mf_problem const *prob,
4240
+ mf_int nr_folds,
4241
+ mf_parameter param)
4242
+ {
4243
+ // Two conditions make this function return zero. First, a parameter is outside its
4244
+ // supported range. Second, one-class matrix factorization with L2-loss
4245
+ // (-f 12) doesn't support cross-validation.
4246
+ if(!check_parameter(param) || param.fun == P_L2_MFOC)
4247
+ return 0;
4248
+
4249
+ CrossValidator validator(param, nr_folds, prob);
4250
+
4251
+ return validator.do_cross_validation();
4252
+ }
4253
+
4254
+ mf_double mf_cross_validation_on_disk(
4255
+ char const *prob,
4256
+ mf_int nr_folds,
4257
+ mf_parameter param)
4258
+ {
4259
+ // Two conditions make this function return zero. First, a parameter is outside its
4260
+ // supported range. Second, one-class matrix factorization with L2-loss
4261
+ // (-f 12) doesn't support disk-level training.
4262
+ if(!check_parameter(param) || param.fun == P_L2_MFOC)
4263
+ return 0;
4264
+
4265
+ CrossValidatorOnDisk validator(param, nr_folds, string(prob));
4266
+
4267
+ return validator.do_cross_validation();
4268
+ }
4269
+
4270
+ mf_problem read_problem(string path)
4271
+ {
4272
+ mf_problem prob;
4273
+ prob.m = 0;
4274
+ prob.n = 0;
4275
+ prob.nnz = 0;
4276
+ prob.R = nullptr;
4277
+
4278
+ if(path.empty())
4279
+ return prob;
4280
+
4281
+ ifstream f(path);
4282
+ if(!f.is_open())
4283
+ return prob;
4284
+
4285
+ string line;
4286
+ while(getline(f, line))
4287
+ prob.nnz += 1;
4288
+
4289
+ mf_node *R = new mf_node[static_cast<size_t>(prob.nnz)];
4290
+
4291
+ f.close();
4292
+ f.open(path);
4293
+
4294
+ mf_long idx = 0;
4295
+ for(mf_node N; f >> N.u >> N.v >> N.r;)
4296
+ {
4297
+ if(N.u+1 > prob.m)
4298
+ prob.m = N.u+1;
4299
+ if(N.v+1 > prob.n)
4300
+ prob.n = N.v+1;
4301
+ R[idx] = N;
4302
+ ++idx;
4303
+ }
4304
+ prob.R = R;
4305
+
4306
+ f.close();
4307
+
4308
+ return prob;
4309
+ }
4310
+
4311
+ mf_int mf_save_model(mf_model const *model, char const *path)
4312
+ {
4313
+ ofstream f(path);
4314
+ if(!f.is_open())
4315
+ return 1;
4316
+
4317
+ f << "f " << model->fun << endl;
4318
+ f << "m " << model->m << endl;
4319
+ f << "n " << model->n << endl;
4320
+ f << "k " << model->k << endl;
4321
+ f << "b " << model->b << endl;
4322
+
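+ // The model is stored as plain text: five header lines (f, m, n, k, b)
+ // followed by one line per row of P and then per row of Q. Each line is
+ // "p<i> T v_1 ... v_k" for a valid factor vector, or "p<i> F 0 ... 0"
+ // when the vector was never trained (its values are NaN); Q rows use the
+ // prefix 'q'. mf_load_model below parses exactly this layout.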
4323
+ auto write = [&] (mf_float *ptr, mf_int size, char prefix)
4324
+ {
4325
+ for(mf_int i = 0; i < size; ++i)
4326
+ {
4327
+ mf_float *ptr1 = ptr + (mf_long)i*model->k;
4328
+ f << prefix << i << " ";
4329
+ if(isnan(ptr1[0]))
4330
+ {
4331
+ f << "F ";
4332
+ for(mf_int d = 0; d < model->k; ++d)
4333
+ f << 0 << " ";
4334
+ }
4335
+ else
4336
+ {
4337
+ f << "T ";
4338
+ for(mf_int d = 0; d < model->k; ++d)
4339
+ f << ptr1[d] << " ";
4340
+ }
4341
+ f << endl;
4342
+ }
4343
+
4344
+ };
4345
+
4346
+ write(model->P, model->m, 'p');
4347
+ write(model->Q, model->n, 'q');
4348
+
4349
+ f.close();
4350
+
4351
+ return 0;
4352
+ }
4353
+
4354
+ mf_model* mf_load_model(char const *path)
4355
+ {
4356
+ ifstream f(path);
4357
+ if(!f.is_open())
4358
+ return nullptr;
4359
+
4360
+ string dummy;
4361
+
4362
+ mf_model *model = new mf_model;
4363
+ model->P = nullptr;
4364
+ model->Q = nullptr;
4365
+
4366
+ f >> dummy >> model->fun >> dummy >> model->m >> dummy >> model->n >>
4367
+ dummy >> model->k >> dummy >> model->b;
4368
+
4369
+ try
4370
+ {
4371
+ model->P = Utility::malloc_aligned_float((mf_long)model->m*model->k);
4372
+ model->Q = Utility::malloc_aligned_float((mf_long)model->n*model->k);
4373
+ }
4374
+ catch(bad_alloc const &e)
4375
+ {
4376
+ cerr << e.what() << endl;
4377
+ mf_destroy_model(&model);
4378
+ return nullptr;
4379
+ }
4380
+
4381
+ auto read = [&] (mf_float *ptr, mf_int size)
4382
+ {
4383
+ for(mf_int i = 0; i < size; ++i)
4384
+ {
4385
+ mf_float *ptr1 = ptr + (mf_long)i*model->k;
4386
+ f >> dummy >> dummy;
4387
+ if(dummy.compare("F") == 0) // nan vector starts with "F"
4388
+ for(mf_int d = 0; d < model->k; ++d)
4389
+ {
4390
+ f >> dummy;
4391
+ ptr1[d] = numeric_limits<mf_float>::quiet_NaN();
4392
+ }
4393
+ else
4394
+ for(mf_int d = 0; d < model->k; ++d)
4395
+ f >> ptr1[d];
4396
+ }
4397
+ };
4398
+
4399
+ read(model->P, model->m);
4400
+ read(model->Q, model->n);
4401
+
4402
+ f.close();
4403
+
4404
+ return model;
4405
+ }
4406
+
4407
+ void mf_destroy_model(mf_model **model)
4408
+ {
4409
+ if(model == nullptr || *model == nullptr)
4410
+ return;
4411
+ Utility::free_aligned_float((*model)->P);
4412
+ Utility::free_aligned_float((*model)->Q);
4413
+ delete *model;
4414
+ *model = nullptr;
4415
+ }
4416
+
4417
+ mf_float mf_predict(mf_model const *model, mf_int u, mf_int v)
4418
+ {
4419
+ if(u < 0 || u >= model->m || v < 0 || v >= model->n)
4420
+ return model->b;
4421
+
4422
+ mf_float *p = model->P+(mf_long)u*model->k;
4423
+ mf_float *q = model->Q+(mf_long)v*model->k;
4424
+
4425
+ mf_float z = std::inner_product(p, p+model->k, q, (mf_float)0.0f);
4426
+
4427
+ if(isnan(z))
4428
+ z = model->b;
4429
+
4430
+ if(model->fun == P_L2_MFC ||
4431
+ model->fun == P_L1_MFC ||
4432
+ model->fun == P_LR_MFC)
4433
+ z = z > 0.0f? 1.0f: -1.0f;
4434
+
4435
+ return z;
4436
+ }
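+
+ // A note on mf_predict: indices are zero-based. Out-of-range (u, v) pairs
+ // and NaN factor vectors fall back to the bias model->b, and for the
+ // classification losses the returned value is the predicted label
+ // (+1 or -1) rather than a raw score.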
4437
+
4438
+ mf_double calc_rmse(mf_problem *prob, mf_model *model)
4439
+ {
4440
+ if(prob->nnz == 0)
4441
+ return 0;
4442
+ mf_double loss = 0;
4443
+ #if defined USEOMP
4444
+ #pragma omp parallel for schedule(static) reduction(+:loss)
4445
+ #endif
4446
+ for(mf_long i = 0; i < prob->nnz; ++i)
4447
+ {
4448
+ mf_node &N = prob->R[i];
4449
+ mf_float e = N.r - mf_predict(model, N.u, N.v);
4450
+ loss += e*e;
4451
+ }
4452
+ return sqrt(loss/prob->nnz);
4453
+ }
4454
+
4455
+ mf_double calc_mae(mf_problem *prob, mf_model *model)
4456
+ {
4457
+ if(prob->nnz == 0)
4458
+ return 0;
4459
+ mf_double loss = 0;
4460
+ #if defined USEOMP
4461
+ #pragma omp parallel for schedule(static) reduction(+:loss)
4462
+ #endif
4463
+ for(mf_long i = 0; i < prob->nnz; ++i)
4464
+ {
4465
+ mf_node &N = prob->R[i];
4466
+ loss += abs(N.r - mf_predict(model, N.u, N.v));
4467
+ }
4468
+ return loss/prob->nnz;
4469
+ }
4470
+
4471
+ mf_double calc_gkl(mf_problem *prob, mf_model *model)
4472
+ {
4473
+ if(prob->nnz == 0)
4474
+ return 0;
4475
+ mf_double loss = 0;
4476
+ #if defined USEOMP
4477
+ #pragma omp parallel for schedule(static) reduction(+:loss)
4478
+ #endif
4479
+ for(mf_long i = 0; i < prob->nnz; ++i)
4480
+ {
4481
+ mf_node &N = prob->R[i];
4482
+ mf_float z = mf_predict(model, N.u, N.v);
4483
+ loss += N.r*log(N.r/z)-N.r+z;
4484
+ }
4485
+ return loss/prob->nnz;
4486
+ }
4487
+
4488
+ mf_double calc_logloss(mf_problem *prob, mf_model *model)
4489
+ {
4490
+ if(prob->nnz == 0)
4491
+ return 0;
4492
+ mf_double logloss = 0;
4493
+ #if defined USEOMP
4494
+ #pragma omp parallel for schedule(static) reduction(+:logloss)
4495
+ #endif
4496
+ for(mf_long i = 0; i < prob->nnz; ++i)
4497
+ {
4498
+ mf_node &N = prob->R[i];
4499
+ mf_float z = mf_predict(model, N.u, N.v);
4500
+ if(N.r > 0)
4501
+ logloss += log(1.0+exp(-z));
4502
+ else
4503
+ logloss += log(1.0+exp(z));
4504
+ }
4505
+ return logloss/prob->nnz;
4506
+ }
4507
+
4508
+ mf_double calc_accuracy(mf_problem *prob, mf_model *model)
4509
+ {
4510
+ if(prob->nnz == 0)
4511
+ return 0;
4512
+ mf_double acc = 0;
4513
+ #if defined USEOMP
4514
+ #pragma omp parallel for schedule(static) reduction(+:acc)
4515
+ #endif
4516
+ for(mf_long i = 0; i < prob->nnz; ++i)
4517
+ {
4518
+ mf_node &N = prob->R[i];
4519
+ mf_float z = mf_predict(model, N.u, N.v);
4520
+ if(N.r > 0)
4521
+ acc += z > 0? 1: 0;
4522
+ else
4523
+ acc += z < 0? 1: 0;
4524
+ }
4525
+ return acc/prob->nnz;
4526
+ }
4527
+
4528
+ pair<mf_double, mf_double> calc_mpr_auc(mf_problem *prob,
4529
+ mf_model *model, bool transpose)
4530
+ {
4531
+ mf_int mf_node::*row_ptr;
4532
+ mf_int mf_node::*col_ptr;
4533
+ mf_int m = 0, n = 0;
4534
+ if(!transpose)
4535
+ {
4536
+ row_ptr = &mf_node::u;
4537
+ col_ptr = &mf_node::v;
4538
+ m = max(prob->m, model->m);
4539
+ n = max(prob->n, model->n);
4540
+ }
4541
+ else
4542
+ {
4543
+ row_ptr = &mf_node::v;
4544
+ col_ptr = &mf_node::u;
4545
+ m = max(prob->n, model->n);
4546
+ n = max(prob->m, model->m);
4547
+ }
4548
+
4549
+ auto sort_by_id = [&] (mf_node const &lhs, mf_node const &rhs)
4550
+ {
4551
+ return tie(lhs.*row_ptr, lhs.*col_ptr) <
4552
+ tie(rhs.*row_ptr, rhs.*col_ptr);
4553
+ };
4554
+
4555
+ sort(prob->R, prob->R+prob->nnz, sort_by_id);
4556
+
4557
+ auto sort_by_pred = [&] (pair<mf_node, mf_float> const &lhs,
4558
+ pair<mf_node, mf_float> const &rhs) { return lhs.second < rhs.second; };
4559
+
4560
+ vector<mf_int> pos_cnts(m+1, 0);
4561
+ for(mf_int i = 0; i < prob->nnz; ++i)
4562
+ pos_cnts[prob->R[i].*row_ptr+1] += 1;
4563
+ for(mf_int i = 1; i < m+1; ++i)
4564
+ pos_cnts[i] += pos_cnts[i-1];
4565
+
4566
+ mf_int total_m = 0;
4567
+ mf_long total_pos = 0;
4568
+ mf_double all_u_mpr = 0;
4569
+ mf_double all_u_auc = 0;
4570
+ #if defined USEOMP
4571
+ #pragma omp parallel for schedule(static) reduction(+: total_m, total_pos, all_u_mpr, all_u_auc)
4572
+ #endif
4573
+ for(mf_int i = 0; i < m; ++i)
4574
+ {
4575
+ if(pos_cnts[i+1]-pos_cnts[i] < 1)
4576
+ continue;
4577
+
4578
+ vector<pair<mf_node, mf_float>> row(n);
4579
+
4580
+ for(mf_int j = 0; j < n; ++j)
4581
+ {
4582
+ mf_node N;
4583
+ N.*row_ptr = i;
4584
+ N.*col_ptr = j;
4585
+ N.r = 0;
4586
+ row[j] = make_pair(N, mf_predict(model, N.u, N.v));
4587
+ }
4588
+
4589
+ mf_int pos = 0;
4590
+ vector<mf_int> index(pos_cnts[i+1]-pos_cnts[i], 0);
4591
+ for(mf_int j = pos_cnts[i]; j < pos_cnts[i+1]; ++j)
4592
+ {
4593
+ if(prob->R[j].r <= 0)
4594
+ continue;
4595
+
4596
+ mf_int col = prob->R[j].*col_ptr;
4597
+ row[col].first.r = prob->R[j].r;
4598
+ index[pos] = col;
4599
+ pos += 1;
4600
+ }
4601
+
4602
+ if(n-pos < 1 || pos < 1)
4603
+ continue;
4604
+
4605
+ ++total_m;
4606
+ total_pos += pos;
4607
+
4608
+ mf_int count = 0;
4609
+ for(mf_int k = 0; k < pos; ++k)
4610
+ {
4611
+ swap(row[count], row[index[k]]);
4612
+ ++count;
4613
+ }
4614
+ sort(row.begin(), row.begin()+pos, sort_by_pred);
4615
+
4616
+ mf_double u_mpr = 0;
4617
+ mf_double u_auc = 0;
4618
+ for(auto neg_it = row.begin()+pos; neg_it != row.end(); ++neg_it)
4619
+ {
4620
+ if(row[pos-1].second <= neg_it->second)
4621
+ {
4622
+ u_mpr += pos;
4623
+ continue;
4624
+ }
4625
+
4626
+ mf_int left = 0;
4627
+ mf_int right = pos-1;
4628
+ while(left < right)
4629
+ {
4630
+ mf_int mid = (left+right)/2;
4631
+ if(row[mid].second > neg_it->second)
4632
+ right = mid;
4633
+ else
4634
+ left = mid+1;
4635
+ }
4636
+ u_mpr += left;
4637
+ u_auc += pos-left;
4638
+ }
4639
+
4640
+ all_u_mpr += u_mpr/(n-pos);
4641
+ all_u_auc += u_auc/(n-pos)/pos;
4642
+ }
4643
+
4644
+ all_u_mpr /= total_pos;
4645
+ all_u_auc /= total_m;
4646
+
4647
+ return make_pair(all_u_mpr, all_u_auc);
4648
+ }
4649
+
4650
+ mf_double calc_mpr(mf_problem *prob, mf_model *model, bool transpose)
4651
+ {
4652
+ return calc_mpr_auc(prob, model, transpose).first;
4653
+ }
4654
+
4655
+ mf_double calc_auc(mf_problem *prob, mf_model *model, bool transpose)
4656
+ {
4657
+ return calc_mpr_auc(prob, model, transpose).second;
4658
+ }
4659
+
4660
+ mf_parameter mf_get_default_param()
4661
+ {
4662
+ mf_parameter param;
4663
+
4664
+ param.fun = P_L2_MFR;
4665
+ param.k = 8;
4666
+ param.nr_threads = 12;
4667
+ param.nr_bins = 20;
4668
+ param.nr_iters = 20;
4669
+ param.lambda_p1 = 0.0f;
4670
+ param.lambda_q1 = 0.0f;
4671
+ param.lambda_p2 = 0.1f;
4672
+ param.lambda_q2 = 0.1f;
4673
+ param.eta = 0.1f;
4674
+ param.alpha = 1.0f;
4675
+ param.c = 0.0001f;
4676
+ param.do_nmf = false;
4677
+ param.quiet = false;
4678
+ param.copy_data = true;
4679
+
4680
+ return param;
4681
+ }
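+
+ // A minimal usage sketch of the public API defined in this file. It is
+ // kept inside a comment on purpose; the file paths are illustrative only.
+ /*
+ int main()
+ {
+     mf_parameter param = mf_get_default_param();
+     param.nr_threads = 4;
+     mf_problem tr = read_problem("train.txt"); // "u v r" triplets, one per line
+     mf_model *model = mf_train(&tr, param);
+     mf_float score = mf_predict(model, 0, 0);  // predict entry (0, 0)
+     mf_save_model(model, "model.txt");
+     mf_destroy_model(&model);
+     delete[] tr.R; // read_problem allocates R with new[]
+     return 0;
+ }
+ */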
4682
+
4683
+ } // namespace mf