RubyGems - fselector - Versions diffs - 1.0.1 → 1.1.0 - Mend

fselector 1.0.1 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (55)

data/ChangeLog +9 -0
data/README.md +62 -26
data/lib/fselector.rb +1 -1
data/lib/fselector/algo_base/base.rb +89 -34
data/lib/fselector/algo_base/base_CFS.rb +20 -7
data/lib/fselector/algo_base/base_Relief.rb +5 -5
data/lib/fselector/algo_base/base_ReliefF.rb +11 -3
data/lib/fselector/algo_base/base_discrete.rb +8 -0
data/lib/fselector/algo_continuous/BSS_WSS.rb +3 -1
data/lib/fselector/algo_continuous/CFS_c.rb +3 -1
data/lib/fselector/algo_continuous/FTest.rb +2 -0
data/lib/fselector/algo_continuous/PMetric.rb +4 -2
data/lib/fselector/algo_continuous/ReliefF_c.rb +11 -0
data/lib/fselector/algo_continuous/Relief_c.rb +14 -3
data/lib/fselector/algo_continuous/TScore.rb +5 -3
data/lib/fselector/algo_continuous/WilcoxonRankSum.rb +5 -3
data/lib/fselector/algo_discrete/Accuracy.rb +2 -0
data/lib/fselector/algo_discrete/AccuracyBalanced.rb +2 -0
data/lib/fselector/algo_discrete/BiNormalSeparation.rb +3 -1
data/lib/fselector/algo_discrete/CFS_d.rb +3 -0
data/lib/fselector/algo_discrete/ChiSquaredTest.rb +3 -0
data/lib/fselector/algo_discrete/CorrelationCoefficient.rb +2 -0
data/lib/fselector/algo_discrete/DocumentFrequency.rb +2 -0
data/lib/fselector/algo_discrete/F1Measure.rb +2 -0
data/lib/fselector/algo_discrete/FastCorrelationBasedFilter.rb +12 -1
data/lib/fselector/algo_discrete/FishersExactTest.rb +3 -1
data/lib/fselector/algo_discrete/GMean.rb +2 -0
data/lib/fselector/algo_discrete/GSSCoefficient.rb +2 -0
data/lib/fselector/algo_discrete/GiniIndex.rb +3 -1
data/lib/fselector/algo_discrete/INTERACT.rb +3 -0
data/lib/fselector/algo_discrete/InformationGain.rb +12 -1
data/lib/fselector/algo_discrete/LasVegasFilter.rb +3 -0
data/lib/fselector/algo_discrete/LasVegasIncremental.rb +3 -0
data/lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb +2 -0
data/lib/fselector/algo_discrete/McNemarsTest.rb +3 -0
data/lib/fselector/algo_discrete/MutualInformation.rb +3 -1
data/lib/fselector/algo_discrete/OddsRatio.rb +2 -0
data/lib/fselector/algo_discrete/OddsRatioNumerator.rb +2 -0
data/lib/fselector/algo_discrete/Power.rb +4 -1
data/lib/fselector/algo_discrete/Precision.rb +2 -0
data/lib/fselector/algo_discrete/ProbabilityRatio.rb +2 -0
data/lib/fselector/algo_discrete/Random.rb +3 -0
data/lib/fselector/algo_discrete/ReliefF_d.rb +3 -1
data/lib/fselector/algo_discrete/Relief_d.rb +4 -2
data/lib/fselector/algo_discrete/Sensitivity.rb +2 -0
data/lib/fselector/algo_discrete/Specificity.rb +2 -0
data/lib/fselector/algo_discrete/SymmetricalUncertainty.rb +4 -1
data/lib/fselector/discretizer.rb +7 -7
data/lib/fselector/ensemble.rb +375 -115
data/lib/fselector/entropy.rb +2 -2
data/lib/fselector/fileio.rb +83 -70
data/lib/fselector/normalizer.rb +2 -2
data/lib/fselector/replace_missing_values.rb +137 -3
data/lib/fselector/util.rb +17 -5
metadata +4 -4

data/lib/fselector/entropy.rb CHANGED

@@ -36,8 +36,8 @@ module Entropy
   # @return [Float] H(X|Y)
   # @note vecX and vecY must be of same length
    def get_conditional_entropy(vecX, vecY)
-    abort "[#{__FILE__}@#{__LINE__}]: "+
-          "vector must be of same length" if not vecX.size == vecY.size
+    abort "[#{__FILE__}@#{__LINE__}]: \n"+
+          "  two vectors must be of same length" if not vecX.size == vecY.size
     hxy = 0.0
     n = vecX.size.to_f

data/lib/fselector/fileio.rb CHANGED

@@ -27,9 +27,9 @@ module FileIO
   # @param [Integer] nclass number of classes
   # @param [Integer] nfeature number of features
   # @param [Integer] ncategory number of categories for each feature
-  #  1 => binary feature with only on bit
-  #  >1 => discrete feature with multiple values
-  #  otherwise => continuous feature with vaule in the range of [0, 1)
+  #   1      # binary feature with only on bit
+  #   >1     # discrete feature with multiple values
+  #   other  # continuous feature with vaule in the range of [0, 1)
   # @param [true, false] allow_mv whether missing value of feature is alowed or not
   #
   def data_from_random(nsample=100, nclass=2, nfeature=10, ncategory=2, allow_mv=true)
@@ -38,7 +38,7 @@ module FileIO
     nsample.times do
       k = "c#{rand(nclass)+1}".to_sym
-      data[k] = [] if not data.has_key? k
+      data[k] ||= []
       feats = {}
       fs = (1..nfeature).to_a
@@ -57,7 +57,7 @@ module FileIO
         elsif ncategory > 1
           feats[f] = rand(ncategory)+1
         else
-          feats[f] = rand
+          feats[f] = rand.round(3) # round to 3-digit precision
         end
       end
@@ -77,7 +77,7 @@ module FileIO
   # ....
   #
   # @param [String] fname file to read from
-  #   :stdin => read from standard input instead of file
+  #   :stdin  # read from standard input instead of file
   #
   def data_from_libsvm(fname=:stdin)
     data = {}
@@ -85,8 +85,8 @@ module FileIO
     if fname == :stdin
       ifs = $stdin
     elsif not File.exists? fname
-      abort "[#{__FILE__}@#{__LINE__}]: "+
-            "File '#{fname}' does not exist!"
+      abort "[#{__FILE__}@#{__LINE__}]: \n"+
+            "  File '#{fname}' does not exist!"
     else
       ifs = File.open(fname)
     end
@@ -94,7 +94,7 @@ module FileIO
     ifs.each_line do |ln|
       label, *features = ln.chomp.split(/\s+/)
       label = label.to_sym
-      data[label] = [] if not data.has_key? label
+      data[label] ||= []
       feats = {}
       features.each do |fv|
@@ -116,7 +116,7 @@ module FileIO
   # write to libsvm
   #
   # @param [String] fname file to write
-  #   :stdout => write to standard ouput instead of file
+  #   :stdout  # write to standard ouput instead of file
   #
   def data_to_libsvm(fname=:stdout)
     if fname == :stdout
@@ -139,8 +139,8 @@ module FileIO
     each_sample do |k, s|
       ofs.print "#{k2idx[k]} "
-      s.keys.sort { |x, y| x.to_s.to_i <=> y.to_s.to_i }.each do |i|
-        ofs.print " #{f2idx[i]}:#{s[i]}" if not s[i].zero? # implicit mode
+      s.keys.sort { |x, y| f2idx[x] <=> f2idx[y] }.each do |f|
+        ofs.print " #{f2idx[f]}:#{s[f]}" if not s[f].zero? # implicit mode
       end
       ofs.puts
     end
@@ -155,20 +155,20 @@ module FileIO
   #
   # file should have the format with the first two rows
   # specifying features and their data types e.g.
-  # feat1,feat2,...,featn
-  # data\_type1,data\_type2,...,data\_typen
+  # feat\_name1,feat\_name2,...,feat\_namen
+  # feat\_type1,feat\_type2,...,feat\_typen
   #
   # and the remaing rows showing data e.g.
   # class\_label,feat\_value1,feat\_value2,...,feat\_value3
   # ...
   #
-  # allowed data types are:
+  # allowed feature types (case-insensitive) are:
   # INTEGER, REAL, NUMERIC, CONTINUOUS, STRING, NOMINAL, CATEGORICAL
   #
   # @param [String] fname file to read from
-  #   :stdin => read from standard input instead of file
+  #   :stdin  # read from standard input instead of file
   #
-  # @note missing values allowed
+  # @note missing values are allowed, and feature types are stored as lower-case symbols
   #
   def data_from_csv(fname=:stdin)
     data = {}
@@ -176,29 +176,26 @@ module FileIO
     if fname == :stdin
       ifs = $stdin
     elsif not File.exists? fname
-      abort "[#{__FILE__}@#{__LINE__}]: "+
-            "File '#{fname}' does not exist!"
+      abort "[#{__FILE__}@#{__LINE__}]: \n"+
+            "  File '#{fname}' does not exist!"
     else
       ifs = File.open(fname)
     end
     first_row, second_row = true, true
-    feats, types = [], []
+    features, types = [], []
     ifs.each_line do |ln|
       if first_row # first row
         first_row = false
-        *feats = ln.chomp.split(/,/).to_sym
+        features = ln.chomp.split(/,/).to_sym
       elsif second_row # second row
         second_row = false
-        *types = ln.chomp.split(/,/)
-        if types.size == feats.size
-          types.each_with_index do |t, i|
-            set_opt(feats[i], t.upcase) # record data type
-          end
-        else
-          abort "[#{__FILE__}@#{__LINE__}]: "+
-                "the first two rows must have same number of fields"
+        # store feature type as lower-case symbol
+        types = ln.chomp.split(/,/).collect { |t| t.downcase.to_sym }
+        if not types.size == features.size
+          abort "[#{__FILE__}@#{__LINE__}]: \n"+
+                "  the first two rows must have same number of fields"
         end
       else # data rows
         label, *fvs = ln.chomp.split(/,/)
@@ -208,20 +205,20 @@ module FileIO
         fs = {}
         fvs.each_with_index do |v, i|
           next if v.empty? # missing value
-          data_type = get_opt(feats[i])
-          if data_type == 'INTEGER'
+          feat_type = types[i]
+          if feat_type == :integer
             v = v.to_i
-          elsif ['REAL', 'NUMERIC', 'CONTINUOUS'].include? data_type
+          elsif [:real, :numeric, :continuous].include? feat_type
             v = v.to_f
-          elsif ['STRING', 'NOMINAL', 'CATEGORICAL'].include? data_type
+          elsif [:string, :nominal, :categorical].include? feat_type
             #
           else
-            abort "[#{__FILE__}@#{__LINE__}]: "+
-                  "please specify correct data type "+
+            abort "[#{__FILE__}@#{__LINE__}]: \n"+
+                  "  please specify correct type "+
                   "for each feature in the 2nd row"
           end
-          fs[feats[i]] = v
+          fs[features[i]] = v
         end
         data[label] << fs
@@ -232,6 +229,11 @@ module FileIO
     ifs.close if not ifs == $stdin
     set_data(data)
+    set_features(features)
+    # set feature type
+    features.each_with_index do |f, i|
+      set_opt(f, types[i])
+    end
   end # data_from_csv
@@ -243,7 +245,7 @@ module FileIO
   # and the remaing rows showing data
   #
   # @param [String] fname file to write
-  #   :stdout => write to standard ouput instead of file
+  #   :stdout  # write to standard ouput instead of file
   #
   def data_to_csv(fname=:stdout)
     if fname == :stdout
@@ -254,7 +256,7 @@ module FileIO
     ofs.puts get_features.join(',')
     ofs.puts get_features.collect { |f|
-      get_opt(f) || 'STRING'
+      get_opt(f) || :string
     }.join(',')
     each_sample do |k, s|
@@ -270,7 +272,7 @@ module FileIO
     end
     # close file
-    ofs.close if not ofs == $stdout
+    ofs.close if not ofs == $stdout
   end # data_to_csv
@@ -278,7 +280,7 @@ module FileIO
   # read from WEKA ARFF file
   #
   # @param [String] fname file to read from
-  #   :stdin => read from standard input instead of file
+  #   :stdin  # read from standard input instead of file
   # @note it's ok if string containes spaces quoted by quote_char
   #
   def data_from_weka(fname=:stdin, quote_char='"')
@@ -287,13 +289,13 @@ module FileIO
     if fname == :stdin
       ifs = $stdin
     elsif not File.exists? fname
-      abort "[#{__FILE__}@#{__LINE__}]: "+
-            "File '#{fname}' does not exist!"
+      abort "[#{__FILE__}@#{__LINE__}]: \n"+
+            "  File '#{fname}' does not exist!"
     else
       ifs = File.open(fname)
     end
-    features, classes, comments = [], [], []
+    relation, features, classes, types, comments = '', [], [], [], []
     has_class, has_data = false, false
     ifs.each_line do |ln|
@@ -307,7 +309,6 @@ module FileIO
       # relation
       elsif ln =~ /^@RELATION/i
         tmp, relation = ln.split_me(/\s+/, quote_char)
-        set_opt('@RELATION', relation)
       # class attribute
       elsif ln =~ /^@ATTRIBUTE\s+class\s+{(.+)}/i
         has_class = true
@@ -318,13 +319,14 @@ module FileIO
         f = $1.to_sym
         features << f
         #$2.split_me(/,\s*/, quote_char) # feature nominal values
-        set_opt(f, 'NOMINAL')
+        types << :nominal
       # feature attribute (integer, real, numeric, string, date)
       elsif ln =~ /^@ATTRIBUTE/i
         tmp, v1, v2 = ln.split_me(/\s+/, quote_char)
         f = v1.to_sym
         features << f
-        set_opt(f, v2.upcase) # record feature data type
+        # store feture type as lower-case symbol
+        types << v2.downcase.to_sym
       # data header
       elsif ln =~ /^@DATA/i
         has_data = true
@@ -337,29 +339,30 @@ module FileIO
           label = label.to_sym
           fs = {}
-          nonzero_fi = []
+          # indices of feature with zero value
+          zero_fi = (0...features.size).to_a
           feats.each do |fi_fv|
             fi, fv = fi_fv.split_me(/\s+/, quote_char)
             fi = fi.to_i
-            add_feature_weka(fs, features[fi], fv)
-            nonzero_fi << fi
+            add_feature_weka(fs, features[fi], fv, types[fi])
+            zero_fi.delete(fi)
           end
           # feature with zero value
-          features.each_with_index do |f0, i|
-            add_feature_weka(fs, f0, 0) if not nonzero_fi.include?(i)
+          zero_fi.each do |zi|
+            add_feature_weka(fs, features[zi], 0, types[zi])
           end
           data[label] << fs
         else # regular ARFF
           feats = ln.split_me(/,\s*/, quote_char)
-          label = feats.pop.to_sym
+          label = feats.pop.to_sym
           fs = {}
           feats.each_with_index do |fv, i|
-            add_feature_weka(fs, features[i], fv)
+            add_feature_weka(fs, features[i], fv, types[i])
           end
           data[label] << fs if label
         end
       else
@@ -373,7 +376,11 @@ module FileIO
     set_data(data)
     set_classes(classes)
     set_features(features)
-    set_opt('COMMENTS', comments) if not comments.empty?
+    set_opt(:relation, relation)
+    features.each_with_index do |f, i|
+      set_opt(f, types[i])
+    end
+    set_opt(:comments, comments) if not comments.empty?
   end # data_from_weak
@@ -381,11 +388,11 @@ module FileIO
   # write to WEKA ARFF file
   #
   # @param [String] fname file to write
-  #   :stdout => write to standard ouput instead of file
+  #   :stdout  # write to standard ouput instead of file
   # @param [Symbol] format sparse or regular ARFF
-  #   :sparse => sparse ARFF, otherwise regular ARFF
+  #   :sparse  # sparse ARFF, otherwise regular ARFF
   #
-  def data_to_weka(fname=:stdout, format=:sparse)
+  def data_to_weka(fname=:stdout, format=nil)
     if fname == :stdout
       ofs = $stdout
     else
@@ -393,14 +400,14 @@ module FileIO
     end
     # comments
-    comments = get_opt('COMMENTS')
+    comments = get_opt(:comments)
     if comments
       ofs.puts comments.join("\n")
       ofs.puts
     end
     # relation
-    relation = get_opt('@RELATION')
+    relation = get_opt(:relation)
     if relation
       ofs.puts "@RELATION #{relation}"
     else
@@ -412,15 +419,15 @@ module FileIO
     # feature attribute
     each_feature do |f|
       ofs.print "@ATTRIBUTE #{f} "
-      type = get_opt(f)
+      type = get_opt(f) # feature type
       if type
-        if type == 'NOMINAL'
+        if type == :nominal
           ofs.puts "{#{get_feature_values(f).uniq.sort.join(',')}}"
         else
           ofs.puts type
         end
-      else # treat all other data types as string
-        ofs.puts "STRING"
+      else # treat all other feature types as string
+        ofs.puts :string
       end
     end
@@ -462,21 +469,27 @@ module FileIO
   private
   # handle and add each feature for WEKA format
-  def add_feature_weka(fs, f, v)
+  #
+  # @param [Hash] fs sample that stores feature and its value
+  # @param [Symbol] f feature
+  # @param [String] v feature value
+  # @param [Symbol] type feature type
+  #
+  def add_feature_weka(fs, f, v, type)
     if v == '?' # missing value
       return
-    elsif get_opt(f) == 'INTEGER'
+    elsif type == :integer
       fs[f] = v.to_i
-    elsif get_opt(f) == 'REAL' or get_opt(f) == 'NUMERIC'
+    elsif type == :real or type == :numeric
       fs[f] = v.to_f
-    elsif get_opt(f) == 'STRING' or get_opt(f) == 'NOMINAL'
+    elsif type == :string or type == :nominal
       fs[f] = v
-    elsif get_opt(f) == 'DATE' # convert into integer
+    elsif type == :date # convert into integer
       fs[f] = (DateTime.parse(v)-DateTime.new(1970,1,1)).to_i
     else
        return
     end
-  end # add_feature
+  end # add_feature_weka
 end # module

data/lib/fselector/normalizer.rb CHANGED

@@ -13,8 +13,8 @@ module Normalizer
         if s[f] > 0.0
           s[f] = Math.log(s[f], base)
         else
-          abort "[#{__FILE__}@#{__LINE__}]: "+
-             "feature value must be positive"
+          abort "[#{__FILE__}@#{__LINE__}]: \n"+
+                "feature values must be positive!"
         end
       end
     end

data/lib/fselector/replace_missing_values.rb CHANGED

@@ -19,22 +19,28 @@ module ReplaceMissingValues
     # clear variables
     clear_vars
-  end # replace_by_fixed_value
+  end # replace_by_fixed_value!
   #
   # replace missing feature value by mean feature value,
   # applicable only to continuous feature
   #
+  # @param [Symbol] mode column or row mode
+  #   - :by\_column # use the mean value of the same feature among all instances
+  #   - :by\_row    # use the mean value of different features in current instance
+  #
   # @note data structure will be altered
   #
-  def replace_by_mean_value!
+  def replace_by_mean_value!(mode = :by_column)
     each_sample do |k, s|
+      mean = s.values.mean if mode == :by_row
       each_feature do |f|
         fv = get_feature_values(f)
         next if fv.size == get_sample_size # no missing values
-        mean = fv.ave
+        mean = fv.ave if mode == :by_column
         if not s.has_key? f
           s[f] = mean
         end
@@ -46,6 +52,36 @@ module ReplaceMissingValues
   end # replace_by_mean_value!
+  #
+  # replace missing feature value by median feature value,
+  # applicable only to continuous feature
+  #
+  # @param [Symbol] mode column or row mode
+  #   - :by\_column # use the mean value of the same feature among all instances
+  #   - :by\_row    # use the mean value of different features in current instance
+  #
+  # @note data structure will be altered
+  #
+  def replace_by_median_value!(mode = :by_column)
+    each_sample do |k, s|
+      median = s.values.median if mode == :by_row
+      each_feature do |f|
+        fv = get_feature_values(f)
+        next if fv.size == get_sample_size # no missing values
+        median = fv.median if mode == :by_column
+        if not s.has_key? f
+          s[f] = median
+        end
+      end
+    end
+    # clear variables
+    clear_vars
+  end # replace_by_median_value!
   #
   # replace missing feature value by most seen feature value,
   # applicable only to discrete feature
@@ -78,4 +114,102 @@ module ReplaceMissingValues
   end # replace_by_mean_value!
+  #
+  # replace missing feature value by weighted k-nearest neighbors' value,
+  # applicable only to continuous feature
+  #
+  #     val = sigma_k (val_k * w_k)
+  #
+  #     where w_k = (sum_d - d_k) / ((K-1) * sum_d)
+  #           sum_d = sigma_k (d_k)
+  #           K: number of d_k
+  #           sigma_k (w_k) = 1, normalized to 1
+  #
+  # @param [Integer] k number of nearest neighbors
+  # @note data structure will be altered, and the nearest neighbors
+  #   are determined by Euclidean distance
+  #
+  # ref: [Microarray missing data imputation based on a set theoretic framework and biological knowledge](http://nar.oxfordjournals.org/content/34/5/1608)
+  #
+  def replace_by_knn_value!(k=1)
+    each_sample do |ki, si|
+      # potential features having missing value
+      mv_fs = get_features - si.keys
+      next if mv_fs.empty? # sample si has no missing value
+      # record object value for each feature missing value
+      f2val = {}
+      mv_fs.each do |mv_f|
+        knn_s, knn_d = [], []
+        each_sample do |kj, sj|
+          # sample sj also has missing value of mv_f
+          next if not sj.has_key? mv_f
+          d = euclidean_distance(si, sj)
+          idx = knn_d.index { |di| d<di }
+          if idx
+            knn_s.insert(idx, sj)
+            knn_d.insert(idx, d)
+            if knn_s.size > k
+              knn_s = knn_s[0...k]
+              knn_d = knn_d[0...k]
+            end
+          else
+            if knn_s.size < k
+              knn_s << sj
+              knn_d << d
+            end
+          end
+        end
+        # distance-weighted value from knn
+        knn_d_sum = knn_d.sum
+        sz = knn_d.size
+        val = 0.0
+        knn_s.each_with_index do |s, i|
+          if sz > 1
+            if not knn_d_sum.zero?
+              val += s[mv_f] * (knn_d_sum-knn_d[i]) / ((sz-1)*knn_d_sum)
+            else
+              val += s[mv_f] * 1.0 / sz
+            end
+          else # only one nearest neighbor
+            val = s[mv_f]
+          end
+        end
+        f2val[mv_f] = val
+       #pp [si, mv_f, knn_s, knn_d, val]
+      end
+      # set value
+      f2val.each do |f, v|
+        si[f] = v
+      end
+    end
+    # clear variables
+    clear_vars
+  end # replace_by_knn_value!
+  private
+  # Euclidean distance of two samples
+  #
+  # @note features with missing value are ignored
+  def euclidean_distance(s1, s2)
+    d2 = 0.0
+    get_features.each do |f|
+      if s1.has_key? f and s2.has_key? f
+        d2 += (s1[f]-s2[f])**2
+      end
+    end
+    Math.sqrt(d2)
+  end # euclidean_distance
 end # ReplaceMissingValues