rumale 0.12.8 → 0.12.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/README.md +2 -2
- data/ext/rumale/rumale.c +80 -48
- data/lib/rumale.rb +1 -0
- data/lib/rumale/clustering/k_medoids.rb +157 -0
- data/lib/rumale/tree/decision_tree_regressor.rb +2 -5
- data/lib/rumale/version.rb +1 -1
- data/rumale.gemspec +1 -1
- metadata +4 -3
    
        checksums.yaml
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            ---
         | 
| 2 2 | 
             
            SHA1:
         | 
| 3 | 
            -
              metadata.gz:  | 
| 4 | 
            -
              data.tar.gz:  | 
| 3 | 
            +
              metadata.gz: c55e2ab90432838616c16fdf35d4eac150cc02b8
         | 
| 4 | 
            +
              data.tar.gz: c605feef7c8d3d7dce4e8330419ba88288d17f74
         | 
| 5 5 | 
             
            SHA512:
         | 
| 6 | 
            -
              metadata.gz:  | 
| 7 | 
            -
              data.tar.gz:  | 
| 6 | 
            +
              metadata.gz: f3ec59d17a66d74d978860537271c0d7c8881924cce6589345d43079897879ac603b6c01c7b0884419457e4bf6a99187345d203e8638be6d96aabe1ce513560f
         | 
| 7 | 
            +
              data.tar.gz: 86f0cbf4c92b72b9caff2e5a9ed39b47013e4c11bdacf6661148a01f3c69a72253bc8690fa5c28207888461b8bc1070f39b87bc23df11866b9018d61cd37b2fd
         | 
    
        data/CHANGELOG.md
    CHANGED
    
    
    
        data/README.md
    CHANGED
    
    | @@ -6,14 +6,14 @@ | |
| 6 6 | 
             
            [](https://coveralls.io/github/yoshoku/rumale?branch=master)
         | 
| 7 7 | 
             
            [](https://badge.fury.io/rb/rumale)
         | 
| 8 8 | 
             
            [](https://github.com/yoshoku/rumale/blob/master/LICENSE.txt)
         | 
| 9 | 
            -
            [](https://www.rubydoc.info/gems/rumale/0.12. | 
| 9 | 
            +
            [](https://www.rubydoc.info/gems/rumale/0.12.9)
         | 
| 10 10 |  | 
| 11 11 | 
             
            Rumale (**Ru**by **ma**chine **le**arning) is a machine learning library in Ruby.
         | 
| 12 12 | 
             
            Rumale provides machine learning algorithms with interfaces similar to Scikit-Learn in Python.
         | 
| 13 13 | 
             
            Rumale supports Linear / Kernel Support Vector Machine,
         | 
| 14 14 | 
             
            Logistic Regression, Linear Regression, Ridge, Lasso, Factorization Machine,
         | 
| 15 15 | 
             
            Naive Bayes, Decision Tree, AdaBoost, Gradient Tree Boosting, Random Forest, Extra-Trees, K-nearest neighbor classifier,
         | 
| 16 | 
            -
            K-Means, Gaussian Mixture Model, DBSCAN, Power Iteration Clustering,
         | 
| 16 | 
            +
            K-Means, K-Medoids, Gaussian Mixture Model, DBSCAN, Power Iteration Clustering,
         | 
| 17 17 | 
             
            Multidimensional Scaling, t-SNE, Principal Component Analysis, and Non-negative Matrix Factorization.
         | 
| 18 18 |  | 
| 19 19 | 
             
            This project was formerly known as "SVMKit".
         | 
    
        data/ext/rumale/rumale.c
    CHANGED
    
    | @@ -131,9 +131,9 @@ calc_impurity_cls(const char* criterion, VALUE histogram, const long n_elements) | |
| 131 131 | 
             
            }
         | 
| 132 132 |  | 
| 133 133 | 
             
            double
         | 
| 134 | 
            -
            calc_impurity_reg( | 
| 134 | 
            +
            calc_impurity_reg(const char* criterion, VALUE target_vecs, VALUE sum_vec)
         | 
| 135 135 | 
             
            {
         | 
| 136 | 
            -
              if (strcmp( | 
| 136 | 
            +
              if (strcmp(criterion, "mae") == 0) {
         | 
| 137 137 | 
             
                return calc_mae(target_vecs, sum_vec);
         | 
| 138 138 | 
             
              }
         | 
| 139 139 | 
             
              return calc_mse(target_vecs, sum_vec);
         | 
| @@ -286,83 +286,115 @@ find_split_params_cls(VALUE self, VALUE criterion, VALUE impurity, VALUE order, | |
| 286 286 |  | 
| 287 287 | 
             
            /**
         | 
| 288 288 | 
             
             * @!visibility private
         | 
| 289 | 
            -
             * Find for split point with maximum information gain.
         | 
| 290 | 
            -
             *
         | 
| 291 | 
            -
             * @overload find_split_params(criterion, impurity, sorted_features, sorted_targets) -> Array<Float>
         | 
| 292 | 
            -
             *
         | 
| 293 | 
            -
             * @param criterion [String] The function to evaluate spliting point. Supported criteria are 'mae' and 'mse'.
         | 
| 294 | 
            -
             * @param impurity [Float] The impurity of whole dataset.
         | 
| 295 | 
            -
             * @param sorted_features [Numo::DFloat] (shape: [n_samples]) The feature values sorted in ascending order.
         | 
| 296 | 
            -
             * @param sorted_targets [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values sorted according to feature values.
         | 
| 297 | 
            -
             * @return [Float] The array consists of optimal parameters including impurities of child nodes, threshold, and gain.
         | 
| 298 289 | 
             
             */
         | 
| 299 | 
            -
             | 
| 300 | 
            -
             | 
| 290 | 
            +
            typedef struct {
         | 
| 291 | 
            +
              char* criterion;
         | 
| 292 | 
            +
              double impurity;
         | 
| 293 | 
            +
            } split_opts_reg;
         | 
| 294 | 
            +
            /**
         | 
| 295 | 
            +
             * @!visibility private
         | 
| 296 | 
            +
             */
         | 
| 297 | 
            +
            static void
         | 
| 298 | 
            +
            iter_find_split_params_reg(na_loop_t const* lp)
         | 
| 301 299 | 
             
            {
         | 
| 302 | 
            -
              const  | 
| 303 | 
            -
              const  | 
| 304 | 
            -
              const double  | 
| 305 | 
            -
              long  | 
| 300 | 
            +
              const int32_t* o = (int32_t*)NDL_PTR(lp, 0);
         | 
| 301 | 
            +
              const double* f = (double*)NDL_PTR(lp, 1);
         | 
| 302 | 
            +
              const double* y = (double*)NDL_PTR(lp, 2);
         | 
| 303 | 
            +
              const long n_elements = NDL_SHAPE(lp, 0)[0];
         | 
| 304 | 
            +
              const long n_outputs = NDL_SHAPE(lp, 2)[1];
         | 
| 305 | 
            +
              const char* criterion = ((split_opts_reg*)lp->opt_ptr)->criterion;
         | 
| 306 | 
            +
              const double w_impurity = ((split_opts_reg*)lp->opt_ptr)->impurity;
         | 
| 307 | 
            +
              double* params = (double*)NDL_PTR(lp, 3);
         | 
| 308 | 
            +
              long i, j;
         | 
| 306 309 | 
             
              long curr_pos = 0;
         | 
| 307 310 | 
             
              long next_pos = 0;
         | 
| 308 311 | 
             
              long n_l_elements = 0;
         | 
| 309 312 | 
             
              long n_r_elements = n_elements;
         | 
| 310 | 
            -
              double  | 
| 311 | 
            -
              double  | 
| 313 | 
            +
              double curr_el = f[o[0]];
         | 
| 314 | 
            +
              double last_el = f[o[n_elements - 1]];
         | 
| 312 315 | 
             
              double next_el;
         | 
| 313 316 | 
             
              double l_impurity;
         | 
| 314 317 | 
             
              double r_impurity;
         | 
| 315 318 | 
             
              double gain;
         | 
| 316 | 
            -
              VALUE l_sum_vec = create_zero_vector( | 
| 317 | 
            -
              VALUE r_sum_vec = create_zero_vector( | 
| 319 | 
            +
              VALUE l_sum_vec = create_zero_vector(n_outputs);
         | 
| 320 | 
            +
              VALUE r_sum_vec = create_zero_vector(n_outputs);
         | 
| 318 321 | 
             
              VALUE l_target_vecs = rb_ary_new();
         | 
| 319 322 | 
             
              VALUE r_target_vecs = rb_ary_new();
         | 
| 320 323 | 
             
              VALUE target;
         | 
| 321 | 
            -
              VALUE opt_params = rb_ary_new2(4);
         | 
| 322 324 |  | 
| 323 325 | 
             
              /* Initialize optimal parameters. */
         | 
| 324 | 
            -
               | 
| 325 | 
            -
               | 
| 326 | 
            -
               | 
| 327 | 
            -
               | 
| 326 | 
            +
              params[0] = 0.0;        /* left impurity */
         | 
| 327 | 
            +
              params[1] = w_impurity; /* right impurity */
         | 
| 328 | 
            +
              params[2] = curr_el;    /* threshold */
         | 
| 329 | 
            +
              params[3] = 0.0;        /* gain */
         | 
| 328 330 |  | 
| 329 331 | 
             
              /* Initialize child node variables. */
         | 
| 330 | 
            -
              for ( | 
| 331 | 
            -
                target =  | 
| 332 | 
            +
              for (i = 0; i < n_elements; i++) {
         | 
| 333 | 
            +
                target = rb_ary_new2(n_outputs);
         | 
| 334 | 
            +
                for (j = 0; j < n_outputs; j++) {
         | 
| 335 | 
            +
                  rb_ary_store(target, j, DBL2NUM(y[o[i] * n_outputs + j]));
         | 
| 336 | 
            +
                }
         | 
| 332 337 | 
             
                add_sum_vec(r_sum_vec, target);
         | 
| 333 338 | 
             
                rb_ary_push(r_target_vecs, target);
         | 
| 334 339 | 
             
              }
         | 
| 335 340 |  | 
| 336 341 | 
             
              /* Find optimal parameters. */
         | 
| 337 342 | 
             
              while (curr_pos < n_elements && curr_el != last_el) {
         | 
| 338 | 
            -
                next_el =  | 
| 343 | 
            +
                next_el = f[o[next_pos]];
         | 
| 339 344 | 
             
                while (next_pos < n_elements && next_el == curr_el) {
         | 
| 340 | 
            -
                  target =  | 
| 341 | 
            -
                   | 
| 345 | 
            +
                  target = rb_ary_shift(r_target_vecs);
         | 
| 346 | 
            +
                  n_r_elements--;
         | 
| 347 | 
            +
                  sub_sum_vec(r_sum_vec, target);
         | 
| 342 348 | 
             
                  rb_ary_push(l_target_vecs, target);
         | 
| 343 349 | 
             
                  n_l_elements++;
         | 
| 344 | 
            -
                   | 
| 345 | 
            -
                   | 
| 346 | 
            -
                   | 
| 347 | 
            -
                  next_el = NUM2DBL(rb_ary_entry(sorted_f, ++next_pos));
         | 
| 350 | 
            +
                  add_sum_vec(l_sum_vec, target);
         | 
| 351 | 
            +
                  next_pos++;
         | 
| 352 | 
            +
                  next_el = f[o[next_pos]];
         | 
| 348 353 | 
             
                }
         | 
| 349 354 | 
             
                /* Calculate gain of new split. */
         | 
| 350 355 | 
             
                l_impurity = calc_impurity_reg(criterion, l_target_vecs, l_sum_vec);
         | 
| 351 356 | 
             
                r_impurity = calc_impurity_reg(criterion, r_target_vecs, r_sum_vec);
         | 
| 352 357 | 
             
                gain = w_impurity - (n_l_elements * l_impurity + n_r_elements * r_impurity) / n_elements;
         | 
| 353 358 | 
             
                /* Update optimal parameters. */
         | 
| 354 | 
            -
                if (gain >  | 
| 355 | 
            -
                   | 
| 356 | 
            -
                   | 
| 357 | 
            -
                   | 
| 358 | 
            -
                   | 
| 359 | 
            +
                if (gain > params[3]) {
         | 
| 360 | 
            +
                  params[0] = l_impurity;
         | 
| 361 | 
            +
                  params[1] = r_impurity;
         | 
| 362 | 
            +
                  params[2] = 0.5 * (curr_el + next_el);
         | 
| 363 | 
            +
                  params[3] = gain;
         | 
| 359 364 | 
             
                }
         | 
| 360 365 | 
             
                if (next_pos == n_elements) break;
         | 
| 361 366 | 
             
                curr_pos = next_pos;
         | 
| 362 | 
            -
                curr_el =  | 
| 367 | 
            +
                curr_el = f[o[curr_pos]];
         | 
| 363 368 | 
             
              }
         | 
| 364 | 
            -
             | 
| 365 | 
            -
             | 
| 369 | 
            +
            }
         | 
| 370 | 
            +
            /**
         | 
| 371 | 
            +
             * @!visibility private
         | 
| 372 | 
            +
             * Find the split point with maximum information gain.
         | 
| 373 | 
            +
             *
         | 
| 374 | 
            +
             * @overload find_split_params(criterion, impurity, order, features, targets) -> Array<Float>
         | 
| 375 | 
            +
             *
         | 
| 376 | 
            +
             * @param criterion [String] The function to evaluate splitting point. Supported criteria are 'mae' and 'mse'.
         | 
| 377 | 
            +
             * @param impurity [Float] The impurity of whole dataset.
         | 
| 378 | 
            +
             * @param order [Numo::Int32] (shape: [n_samples]) The element indices sorted according to feature values in ascending order.
         | 
| 379 | 
            +
             * @param features [Numo::DFloat] (shape: [n_samples]) The feature values.
         | 
| 380 | 
            +
             * @param targets [Numo::DFloat] (shape: [n_samples, n_outputs]) The target values.
         | 
| 381 | 
            +
             * @return [Array<Float>] The array consists of optimal parameters including impurities of child nodes, threshold, and gain.
         | 
| 382 | 
            +
             */
         | 
| 383 | 
            +
            static VALUE
         | 
| 384 | 
            +
            find_split_params_reg(VALUE self, VALUE criterion, VALUE impurity, VALUE order, VALUE features, VALUE targets)
         | 
| 385 | 
            +
            {
         | 
| 386 | 
            +
              ndfunc_arg_in_t ain[3] = { {numo_cInt32, 1}, {numo_cDFloat, 1}, {numo_cDFloat, 2} };
         | 
| 387 | 
            +
              size_t out_shape[1] = { 4 };
         | 
| 388 | 
            +
              ndfunc_arg_out_t aout[1] = { {numo_cDFloat, 1, out_shape} };
         | 
| 389 | 
            +
              ndfunc_t ndf = { (na_iter_func_t)iter_find_split_params_reg, NO_LOOP, 3, 1, ain, aout };
         | 
| 390 | 
            +
              split_opts_reg opts = { StringValuePtr(criterion), NUM2DBL(impurity) };
         | 
| 391 | 
            +
              VALUE params = na_ndloop3(&ndf, &opts, 3, order, features, targets);
         | 
| 392 | 
            +
              VALUE results = rb_ary_new2(4);
         | 
| 393 | 
            +
              rb_ary_store(results, 0, DBL2NUM(((double*)na_get_pointer_for_read(params))[0]));
         | 
| 394 | 
            +
              rb_ary_store(results, 1, DBL2NUM(((double*)na_get_pointer_for_read(params))[1]));
         | 
| 395 | 
            +
              rb_ary_store(results, 2, DBL2NUM(((double*)na_get_pointer_for_read(params))[2]));
         | 
| 396 | 
            +
              rb_ary_store(results, 3, DBL2NUM(((double*)na_get_pointer_for_read(params))[3]));
         | 
| 397 | 
            +
              return results;
         | 
| 366 398 | 
             
            }
         | 
| 367 399 |  | 
| 368 400 | 
             
            /**
         | 
| @@ -487,7 +519,7 @@ node_impurity_cls(VALUE self, VALUE criterion, VALUE y_nary, VALUE n_elements_, | |
| 487 519 | 
             
             * @overload node_impurity(criterion, y) -> Float
         | 
| 488 520 | 
             
             *
         | 
| 489 521 | 
             
             * @param criterion [String] The function to calculate impurity. Supported criteria are 'mae' and 'mse'.
         | 
| 490 | 
            -
             * @param y [ | 
| 522 | 
            +
             * @param y [Array<Float>] (shape: [n_samples, n_outputs]) The target values.
         | 
| 491 523 | 
             
             * @return [Float] impurity
         | 
| 492 524 | 
             
             */
         | 
| 493 525 | 
             
            static VALUE
         | 
| @@ -495,8 +527,8 @@ node_impurity_reg(VALUE self, VALUE criterion, VALUE y) | |
| 495 527 | 
             
            {
         | 
| 496 528 | 
             
              long i;
         | 
| 497 529 | 
             
              const long n_elements = RARRAY_LEN(y);
         | 
| 498 | 
            -
              const long  | 
| 499 | 
            -
              VALUE sum_vec = create_zero_vector( | 
| 530 | 
            +
              const long n_outputs = RARRAY_LEN(rb_ary_entry(y, 0));
         | 
| 531 | 
            +
              VALUE sum_vec = create_zero_vector(n_outputs);
         | 
| 500 532 | 
             
              VALUE target_vecs = rb_ary_new();
         | 
| 501 533 | 
             
              VALUE target;
         | 
| 502 534 |  | 
| @@ -506,7 +538,7 @@ node_impurity_reg(VALUE self, VALUE criterion, VALUE y) | |
| 506 538 | 
             
                rb_ary_push(target_vecs, target);
         | 
| 507 539 | 
             
              }
         | 
| 508 540 |  | 
| 509 | 
            -
              return DBL2NUM(calc_impurity_reg(criterion, target_vecs, sum_vec));
         | 
| 541 | 
            +
              return DBL2NUM(calc_impurity_reg(StringValuePtr(criterion), target_vecs, sum_vec));
         | 
| 510 542 | 
             
            }
         | 
| 511 543 |  | 
| 512 544 | 
             
            void Init_rumale(void)
         | 
| @@ -536,7 +568,7 @@ void Init_rumale(void) | |
| 536 568 | 
             
              VALUE mExtGTreeReg = rb_define_module_under(mTree, "ExtGradientTreeRegressor");
         | 
| 537 569 |  | 
| 538 570 | 
             
              rb_define_private_method(mExtDTreeCls, "find_split_params", find_split_params_cls, 6);
         | 
| 539 | 
            -
              rb_define_private_method(mExtDTreeReg, "find_split_params", find_split_params_reg,  | 
| 571 | 
            +
              rb_define_private_method(mExtDTreeReg, "find_split_params", find_split_params_reg, 5);
         | 
| 540 572 | 
             
              rb_define_private_method(mExtGTreeReg, "find_split_params", find_split_params_grad_reg, 7);
         | 
| 541 573 | 
             
              rb_define_private_method(mExtDTreeCls, "node_impurity", node_impurity_cls, 4);
         | 
| 542 574 | 
             
              rb_define_private_method(mExtDTreeReg, "node_impurity", node_impurity_reg, 2);
         | 
    
        data/lib/rumale.rb
    CHANGED
    
    | @@ -57,6 +57,7 @@ require 'rumale/ensemble/random_forest_regressor' | |
| 57 57 | 
             
            require 'rumale/ensemble/extra_trees_classifier'
         | 
| 58 58 | 
             
            require 'rumale/ensemble/extra_trees_regressor'
         | 
| 59 59 | 
             
            require 'rumale/clustering/k_means'
         | 
| 60 | 
            +
            require 'rumale/clustering/k_medoids'
         | 
| 60 61 | 
             
            require 'rumale/clustering/gaussian_mixture'
         | 
| 61 62 | 
             
            require 'rumale/clustering/dbscan'
         | 
| 62 63 | 
             
            require 'rumale/clustering/power_iteration'
         | 
| @@ -0,0 +1,157 @@ | |
| 1 | 
            +
            # frozen_string_literal: true
         | 
| 2 | 
            +
             | 
| 3 | 
            +
            require 'rumale/base/base_estimator'
         | 
| 4 | 
            +
            require 'rumale/base/cluster_analyzer'
         | 
| 5 | 
            +
            require 'rumale/pairwise_metric'
         | 
| 6 | 
            +
             | 
| 7 | 
            +
            module Rumale
         | 
| 8 | 
            +
              module Clustering
         | 
| 9 | 
            +
                # KMedoids is a class that implements K-Medoids cluster analysis.
         | 
| 10 | 
            +
                #
         | 
| 11 | 
            +
                # @example
         | 
| 12 | 
            +
                #   analyzer = Rumale::Clustering::KMedoids.new(n_clusters: 10, max_iter: 50)
         | 
| 13 | 
            +
                #   cluster_labels = analyzer.fit_predict(samples)
         | 
| 14 | 
            +
                #
         | 
| 15 | 
            +
                # *Reference*
         | 
| 16 | 
            +
                # - D. Arthur and S. Vassilvitskii, "k-means++: the advantages of careful seeding," Proc. SODA'07, pp. 1027--1035, 2007.
         | 
| 17 | 
            +
                class KMedoids
         | 
| 18 | 
            +
                  include Base::BaseEstimator
         | 
| 19 | 
            +
                  include Base::ClusterAnalyzer
         | 
| 20 | 
            +
             | 
| 21 | 
            +
                  # Return the indices of medoids.
         | 
| 22 | 
            +
                  # @return [Numo::Int32] (shape: [n_clusters])
         | 
| 23 | 
            +
                  attr_reader :medoid_ids
         | 
| 24 | 
            +
             | 
| 25 | 
            +
                  # Return the random generator.
         | 
| 26 | 
            +
                  # @return [Random]
         | 
| 27 | 
            +
                  attr_reader :rng
         | 
| 28 | 
            +
             | 
| 29 | 
            +
                  # Create a new cluster analyzer with K-Medoids method.
         | 
| 30 | 
            +
                  #
         | 
| 31 | 
            +
                  # @param n_clusters [Integer] The number of clusters.
         | 
| 32 | 
            +
                  # @param metric [String] The metric to calculate the distances in original space.
         | 
| 33 | 
            +
                  #   If metric is 'euclidean', Euclidean distance is calculated for distance in original space.
         | 
| 34 | 
            +
                  #   If metric is 'precomputed', the fit and fit_predict methods expect to be given a distance matrix.
         | 
| 35 | 
            +
                  # @param init [String] The initialization method for centroids ('random' or 'k-means++').
         | 
| 36 | 
            +
                  # @param max_iter [Integer] The maximum number of iterations.
         | 
| 37 | 
            +
                  # @param tol [Float] The tolerance of termination criterion.
         | 
| 38 | 
            +
                  # @param random_seed [Integer] The seed value used to initialize the random generator.
         | 
| 39 | 
            +
                  def initialize(n_clusters: 8, metric: 'euclidean', init: 'k-means++', max_iter: 50, tol: 1.0e-4, random_seed: nil)
         | 
| 40 | 
            +
                    check_params_integer(n_clusters: n_clusters, max_iter: max_iter)
         | 
| 41 | 
            +
                    check_params_float(tol: tol)
         | 
| 42 | 
            +
                    check_params_string(metric: metric, init: init)
         | 
| 43 | 
            +
                    check_params_type_or_nil(Integer, random_seed: random_seed)
         | 
| 44 | 
            +
                    check_params_positive(n_clusters: n_clusters, max_iter: max_iter)
         | 
| 45 | 
            +
                    @params = {}
         | 
| 46 | 
            +
                    @params[:n_clusters] = n_clusters
         | 
| 47 | 
            +
                    @params[:metric] = metric == 'precomputed' ? 'precomputed' : 'euclidean'
         | 
| 48 | 
            +
                    @params[:init] = init == 'random' ? 'random' : 'k-means++'
         | 
| 49 | 
            +
                    @params[:max_iter] = max_iter
         | 
| 50 | 
            +
                    @params[:tol] = tol
         | 
| 51 | 
            +
                    @params[:random_seed] = random_seed
         | 
| 52 | 
            +
                    @params[:random_seed] ||= srand
         | 
| 53 | 
            +
                    @medoid_ids = nil
         | 
| 54 | 
            +
                    @cluster_centers = nil
         | 
| 55 | 
            +
                    @rng = Random.new(@params[:random_seed])
         | 
| 56 | 
            +
                  end
         | 
| 57 | 
            +
             | 
| 58 | 
            +
                  # Analyze clusters with given training data.
         | 
| 59 | 
            +
                  #
         | 
| 60 | 
            +
                  # @overload fit(x) -> KMedoids
         | 
| 61 | 
            +
                  #
         | 
| 62 | 
            +
                  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for fitting the model.
         | 
| 63 | 
            +
                  #   If the metric is 'precomputed', x must be a square distance matrix (shape: [n_samples, n_samples]).
         | 
| 64 | 
            +
                  # @return [KMedoids] The learned cluster analyzer itself.
         | 
| 65 | 
            +
                  def fit(x, _not_used = nil)
         | 
| 66 | 
            +
                    check_sample_array(x)
         | 
| 67 | 
            +
                    raise ArgumentError, 'Expect the input distance matrix to be square.' if @params[:metric] == 'precomputed' && x.shape[0] != x.shape[1]
         | 
| 68 | 
            +
                    # initialize some variables.
         | 
| 69 | 
            +
                    distance_mat = @params[:metric] == 'precomputed' ? x : Rumale::PairwiseMetric.euclidean_distance(x)
         | 
| 70 | 
            +
                    init_cluster_centers(distance_mat)
         | 
| 71 | 
            +
                    error = distance_mat[true, @medoid_ids].mean
         | 
| 72 | 
            +
                    @params[:max_iter].times do |_t|
         | 
| 73 | 
            +
                      cluster_labels = assign_cluster(distance_mat[true, @medoid_ids])
         | 
| 74 | 
            +
                      @params[:n_clusters].times do |n|
         | 
| 75 | 
            +
                        assigned_ids = cluster_labels.eq(n).where
         | 
| 76 | 
            +
                        @medoid_ids[n] = assigned_ids[distance_mat[assigned_ids, assigned_ids].sum(axis: 1).min_index]
         | 
| 77 | 
            +
                      end
         | 
| 78 | 
            +
                      new_error = distance_mat[true, @medoid_ids].mean
         | 
| 79 | 
            +
                      break if (error - new_error).abs <= @params[:tol]
         | 
| 80 | 
            +
                      error = new_error
         | 
| 81 | 
            +
                    end
         | 
| 82 | 
            +
                    @cluster_centers = x[@medoid_ids, true].dup if @params[:metric] == 'euclidean'
         | 
| 83 | 
            +
                    self
         | 
| 84 | 
            +
                  end
         | 
| 85 | 
            +
             | 
| 86 | 
            +
                  # Predict cluster labels for samples.
         | 
| 87 | 
            +
                  #
         | 
| 88 | 
            +
                  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the cluster label.
         | 
| 89 | 
            +
                  #   If the metric is 'precomputed', x must be distances between samples and medoids (shape: [n_samples, n_clusters]).
         | 
| 90 | 
            +
                  # @return [Numo::Int32] (shape: [n_samples]) Predicted cluster label per sample.
         | 
| 91 | 
            +
                  def predict(x)
         | 
| 92 | 
            +
                    check_sample_array(x)
         | 
| 93 | 
            +
                    distance_mat = @params[:metric] == 'precomputed' ? x : Rumale::PairwiseMetric.euclidean_distance(x, @cluster_centers)
         | 
| 94 | 
            +
                    if @params[:metric] == 'precomputed' && distance_mat.shape[1] != @medoid_ids.size
         | 
| 95 | 
            +
                      raise ArgumentError, 'Expect the size input matrix to be n_samples-by-n_clusters.'
         | 
| 96 | 
            +
                    end
         | 
| 97 | 
            +
                    assign_cluster(distance_mat)
         | 
| 98 | 
            +
                  end
         | 
| 99 | 
            +
             | 
| 100 | 
            +
                  # Analyze clusters and assign samples to clusters.
         | 
| 101 | 
            +
                  #
         | 
| 102 | 
            +
                  # @param x [Numo::DFloat] (shape: [n_samples, n_features]) The training data to be used for cluster analysis.
         | 
| 103 | 
            +
                  #   If the metric is 'precomputed', x must be a square distance matrix (shape: [n_samples, n_samples]).
         | 
| 104 | 
            +
                  # @return [Numo::Int32] (shape: [n_samples]) Predicted cluster label per sample.
         | 
| 105 | 
            +
                  def fit_predict(x)
         | 
| 106 | 
            +
                    check_sample_array(x)
         | 
| 107 | 
            +
                    fit(x)
         | 
| 108 | 
            +
                    if @params[:metric] == 'precomputed'
         | 
| 109 | 
            +
                      predict(x[true, @medoid_ids])
         | 
| 110 | 
            +
                    else
         | 
| 111 | 
            +
                      predict(x)
         | 
| 112 | 
            +
                    end
         | 
| 113 | 
            +
                  end
         | 
| 114 | 
            +
             | 
| 115 | 
            +
                  # Dump marshal data.
         | 
| 116 | 
            +
                  # @return [Hash] The marshal data.
         | 
| 117 | 
            +
                  def marshal_dump
         | 
| 118 | 
            +
                    { params: @params,
         | 
| 119 | 
            +
                      medoid_ids: @medoid_ids,
         | 
| 120 | 
            +
                      cluster_centers: @cluster_centers,
         | 
| 121 | 
            +
                      rng: @rng }
         | 
| 122 | 
            +
                  end
         | 
| 123 | 
            +
             | 
| 124 | 
            +
                  # Load marshal data.
         | 
| 125 | 
            +
                  # @return [nil]
         | 
| 126 | 
            +
                  def marshal_load(obj)
         | 
| 127 | 
            +
                    @params = obj[:params]
         | 
| 128 | 
            +
                    @medoid_ids = obj[:medoid_ids]
         | 
| 129 | 
            +
                    @cluster_centers = obj[:cluster_centers]
         | 
| 130 | 
            +
                    @rng = obj[:rng]
         | 
| 131 | 
            +
                    nil
         | 
| 132 | 
            +
                  end
         | 
| 133 | 
            +
             | 
| 134 | 
            +
                  private
         | 
| 135 | 
            +
             | 
| 136 | 
            +
                  def assign_cluster(distances_to_medoids)
         | 
| 137 | 
            +
                    distances_to_medoids.min_index(axis: 1) - Numo::Int32[*0.step(distances_to_medoids.size - 1, @params[:n_clusters])]
         | 
| 138 | 
            +
                  end
         | 
| 139 | 
            +
             | 
| 140 | 
            +
                  def init_cluster_centers(distance_mat)
         | 
| 141 | 
            +
                    # random initialize
         | 
| 142 | 
            +
                    n_samples = distance_mat.shape[0]
         | 
| 143 | 
            +
                    sub_rng = @rng.dup
         | 
| 144 | 
            +
                    @medoid_ids = Numo::Int32.asarray([*0...n_samples].sample(@params[:n_clusters], random: sub_rng))
         | 
| 145 | 
            +
                    return unless @params[:init] == 'k-means++'
         | 
| 146 | 
            +
                    # k-means++ initialize
         | 
| 147 | 
            +
                    (1...@params[:n_clusters]).each do |n|
         | 
| 148 | 
            +
                      distances = distance_mat[true, @medoid_ids[0...n]]
         | 
| 149 | 
            +
                      min_distances = distances.flatten[distances.min_index(axis: 1)]
         | 
| 150 | 
            +
                      probs = min_distances**2 / (min_distances**2).sum
         | 
| 151 | 
            +
                      cum_probs = probs.cumsum
         | 
| 152 | 
            +
                      @medoid_ids[n] = cum_probs.gt(sub_rng.rand).where.to_a.first
         | 
| 153 | 
            +
                    end
         | 
| 154 | 
            +
                  end
         | 
| 155 | 
            +
                end
         | 
| 156 | 
            +
              end
         | 
| 157 | 
            +
            end
         | 
| @@ -126,11 +126,8 @@ module Rumale | |
| 126 126 | 
             
                    node
         | 
| 127 127 | 
             
                  end
         | 
| 128 128 |  | 
| 129 | 
            -
                  def best_split( | 
| 130 | 
            -
                     | 
| 131 | 
            -
                    sorted_f = features[order].to_a
         | 
| 132 | 
            -
                    sorted_y = y[order, true].to_a
         | 
| 133 | 
            -
                    find_split_params(@params[:criterion], whole_impurity, sorted_f, sorted_y)
         | 
| 129 | 
            +
                  def best_split(f, y, impurity)
         | 
| 130 | 
            +
                    find_split_params(@params[:criterion], impurity, f.sort_index, f, y)
         | 
| 134 131 | 
             
                  end
         | 
| 135 132 |  | 
| 136 133 | 
             
                  def impurity(y)
         | 
    
        data/lib/rumale/version.rb
    CHANGED
    
    
    
        data/rumale.gemspec
    CHANGED
    
    | @@ -19,7 +19,7 @@ Gem::Specification.new do |spec| | |
| 19 19 | 
             
                Rumale currently supports Linear / Kernel Support Vector Machine,
         | 
| 20 20 | 
             
                Logistic Regression, Linear Regression, Ridge, Lasso, Factorization Machine,
         | 
| 21 21 | 
             
                Naive Bayes, Decision Tree, AdaBoost, Gradient Tree Boosting, Random Forest, Extra-Trees, K-nearest neighbor algorithm,
         | 
| 22 | 
            -
                K-Means, Gaussian Mixture Model, DBSCAN, Power Iteration Clustering,
         | 
| 22 | 
            +
                K-Means, K-Medoids, Gaussian Mixture Model, DBSCAN, Power Iteration Clustering,
         | 
| 23 23 | 
             
                Multidimensional Scaling, t-SNE, Principal Component Analysis, and Non-negative Matrix Factorization.
         | 
| 24 24 | 
             
              MSG
         | 
| 25 25 | 
             
              spec.homepage      = 'https://github.com/yoshoku/rumale'
         | 
    
        metadata
    CHANGED
    
    | @@ -1,14 +1,14 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification
         | 
| 2 2 | 
             
            name: rumale
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version
         | 
| 4 | 
            -
              version: 0.12. | 
| 4 | 
            +
              version: 0.12.9
         | 
| 5 5 | 
             
            platform: ruby
         | 
| 6 6 | 
             
            authors:
         | 
| 7 7 | 
             
            - yoshoku
         | 
| 8 8 | 
             
            autorequire: 
         | 
| 9 9 | 
             
            bindir: exe
         | 
| 10 10 | 
             
            cert_chain: []
         | 
| 11 | 
            -
            date: 2019-07- | 
| 11 | 
            +
            date: 2019-07-27 00:00:00.000000000 Z
         | 
| 12 12 | 
             
            dependencies:
         | 
| 13 13 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 14 14 | 
             
              name: numo-narray
         | 
| @@ -114,7 +114,7 @@ description: | | |
| 114 114 | 
             
              Rumale currently supports Linear / Kernel Support Vector Machine,
         | 
| 115 115 | 
             
              Logistic Regression, Linear Regression, Ridge, Lasso, Factorization Machine,
         | 
| 116 116 | 
             
              Naive Bayes, Decision Tree, AdaBoost, Gradient Tree Boosting, Random Forest, Extra-Trees, K-nearest neighbor algorithm,
         | 
| 117 | 
            -
              K-Means, Gaussian Mixture Model, DBSCAN, Power Iteration Clustering,
         | 
| 117 | 
            +
              K-Means, K-Medoids, Gaussian Mixture Model, DBSCAN, Power Iteration Clustering,
         | 
| 118 118 | 
             
              Multidimensional Scaling, t-SNE, Principal Component Analysis, and Non-negative Matrix Factorization.
         | 
| 119 119 | 
             
            email:
         | 
| 120 120 | 
             
            - yoshoku@outlook.com
         | 
| @@ -150,6 +150,7 @@ files: | |
| 150 150 | 
             
            - lib/rumale/clustering/dbscan.rb
         | 
| 151 151 | 
             
            - lib/rumale/clustering/gaussian_mixture.rb
         | 
| 152 152 | 
             
            - lib/rumale/clustering/k_means.rb
         | 
| 153 | 
            +
            - lib/rumale/clustering/k_medoids.rb
         | 
| 153 154 | 
             
            - lib/rumale/clustering/power_iteration.rb
         | 
| 154 155 | 
             
            - lib/rumale/dataset.rb
         | 
| 155 156 | 
             
            - lib/rumale/decomposition/nmf.rb
         |