fselector 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +21 -0
 - data/README.md +195 -0
 - data/lib/fselector.rb +41 -0
 - data/lib/fselector/algo_continuous/PMetric.rb +51 -0
 - data/lib/fselector/algo_continuous/ReliefF_c.rb +190 -0
 - data/lib/fselector/algo_continuous/Relief_c.rb +150 -0
 - data/lib/fselector/algo_continuous/TScore.rb +52 -0
 - data/lib/fselector/algo_continuous/discretizer.rb +219 -0
 - data/lib/fselector/algo_continuous/normalizer.rb +59 -0
 - data/lib/fselector/algo_discrete/Accuracy.rb +35 -0
 - data/lib/fselector/algo_discrete/AccuracyBalanced.rb +37 -0
 - data/lib/fselector/algo_discrete/BiNormalSeparation.rb +45 -0
 - data/lib/fselector/algo_discrete/ChiSquaredTest.rb +69 -0
 - data/lib/fselector/algo_discrete/CorrelationCoefficient.rb +42 -0
 - data/lib/fselector/algo_discrete/DocumentFrequency.rb +36 -0
 - data/lib/fselector/algo_discrete/F1Measure.rb +41 -0
 - data/lib/fselector/algo_discrete/FishersExactTest.rb +47 -0
 - data/lib/fselector/algo_discrete/GMean.rb +37 -0
 - data/lib/fselector/algo_discrete/GSSCoefficient.rb +43 -0
 - data/lib/fselector/algo_discrete/GiniIndex.rb +44 -0
 - data/lib/fselector/algo_discrete/InformationGain.rb +96 -0
 - data/lib/fselector/algo_discrete/MatthewsCorrelationCoefficient.rb +45 -0
 - data/lib/fselector/algo_discrete/McNemarsTest.rb +57 -0
 - data/lib/fselector/algo_discrete/MutualInformation.rb +42 -0
 - data/lib/fselector/algo_discrete/OddsRatio.rb +46 -0
 - data/lib/fselector/algo_discrete/OddsRatioNumerator.rb +41 -0
 - data/lib/fselector/algo_discrete/Power.rb +46 -0
 - data/lib/fselector/algo_discrete/Precision.rb +31 -0
 - data/lib/fselector/algo_discrete/ProbabilityRatio.rb +41 -0
 - data/lib/fselector/algo_discrete/Random.rb +40 -0
 - data/lib/fselector/algo_discrete/ReliefF_d.rb +173 -0
 - data/lib/fselector/algo_discrete/Relief_d.rb +135 -0
 - data/lib/fselector/algo_discrete/Sensitivity.rb +38 -0
 - data/lib/fselector/algo_discrete/Specificity.rb +35 -0
 - data/lib/fselector/base.rb +322 -0
 - data/lib/fselector/base_continuous.rb +25 -0
 - data/lib/fselector/base_discrete.rb +355 -0
 - data/lib/fselector/ensemble.rb +181 -0
 - data/lib/fselector/fileio.rb +455 -0
 - data/lib/fselector/util.rb +707 -0
 - metadata +86 -0
 
| 
         @@ -0,0 +1,455 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            #
         
     | 
| 
      
 2 
     | 
    
         
            +
            # read and write various file formats
         
     | 
| 
      
 3 
     | 
    
         
            +
            #
         
     | 
| 
      
 4 
     | 
    
         
            +
            # @note class labels and features are treated as symbols,
         
     | 
| 
      
 5 
     | 
    
         
            +
            #       e.g. length => :length
         
     | 
| 
      
 6 
     | 
    
         
            +
            #
         
     | 
| 
      
 7 
     | 
    
         
            +
            module FileIO
         
     | 
| 
      
 8 
     | 
    
         
            +
              #
         
     | 
| 
      
 9 
     | 
    
         
            +
              # read from random data (for test)
         
     | 
| 
      
 10 
     | 
    
         
            +
              #
         
     | 
| 
      
 11 
     | 
    
         
            +
              # @param [Integer] nsample number of total samples
         
     | 
| 
      
 12 
     | 
    
         
            +
              # @param [Integer] nclass number of classes
         
     | 
| 
      
 13 
     | 
    
         
            +
              # @param [Integer] nfeature number of features
         
     | 
| 
      
 14 
     | 
    
         
            +
              # @param [Integer] ncategory number of categories for each feature  
         
     | 
| 
      
 15 
     | 
    
         
            +
              #  1 => binary feature with only one bit  
         
     | 
| 
      
 16 
     | 
    
         
            +
              #  >1 => discrete feature with multiple values  
         
     | 
| 
      
 17 
     | 
    
         
            +
              #  otherwise => continuous feature with value in the range of [0, 1)
         
     | 
| 
      
 18 
     | 
    
         
            +
              # @param [true, false] allow_mv whether missing value of feature is allowed or not
         
     | 
| 
      
 19 
     | 
    
         
            +
              #
         
     | 
| 
      
 20 
     | 
    
         
            +
              def data_from_random(nsample=100, nclass=2, nfeature=10, ncategory=2, allow_mv=true)
                # class label (Symbol) => array of samples, each sample
                # being a hash of feature (Symbol) => value
                data = {}

                nsample.times do
                  # pick a random class label :c0 .. :c<nclass-1>
                  k = "c#{rand(nclass)}".to_sym
                  data[k] ||= []

                  # candidate feature indices 1..nfeature (already in ascending order)
                  fs = (1..nfeature).to_a

                  # simulate missing values by dropping a random number
                  # (0 .. nfeature-1) of randomly chosen features.
                  # rand(0) returns a Float rather than an Integer, so the
                  # removal step must be skipped when there are no features.
                  if allow_mv && nfeature > 0
                    rand(nfeature).times { fs.delete_at(rand(fs.size)) }
                  end

                  feats = {}
                  fs.each do |i|
                    f = "f#{i}".to_sym
                    feats[f] =
                      if ncategory == 1
                        1 # binary feature: present means 1
                      elsif ncategory > 1
                        rand(ncategory) # discrete value in 0...ncategory
                      else
                        rand # continuous value in [0, 1)
                      end
                  end

                  data[k] << feats
                end

                set_data(data)
              end # data_from_random
         
     | 
| 
      
 54 
     | 
    
         
            +
              
         
     | 
| 
      
 55 
     | 
    
         
            +
              
         
     | 
| 
      
 56 
     | 
    
         
            +
              #
         
     | 
| 
      
 57 
     | 
    
         
            +
              # read from libsvm
         
     | 
| 
      
 58 
     | 
    
         
            +
              #
         
     | 
| 
      
 59 
     | 
    
         
            +
              # file has the following format  
         
     | 
| 
      
 60 
     | 
    
         
            +
              # +1  2:1 4:1 ...  
         
     | 
| 
      
 61 
     | 
    
         
            +
              # -1  3:1 4:1 ...  
         
     | 
| 
      
 62 
     | 
    
         
            +
              # ....
         
     | 
| 
      
 63 
     | 
    
         
            +
              #
         
     | 
| 
      
 64 
     | 
    
         
            +
              # @param [String] fname file to read from  
         
     | 
| 
      
 65 
     | 
    
         
            +
              #   :stdin => read from standard input instead of file
         
     | 
| 
      
 66 
     | 
    
         
            +
              #
         
     | 
| 
      
 67 
     | 
    
         
            +
              def data_from_libsvm(fname=:stdin)
                # class label (Symbol) => array of samples, each sample
                # being a hash of feature (Symbol) => Float value
                data = {}

                if fname == :stdin
                  ifs = $stdin
                elsif !File.exist?(fname) # File.exists? was removed in Ruby 3.2
                  abort "[#{__FILE__}@#{__LINE__}]: "+
                        "File '#{fname}' does not exist!"
                else
                  ifs = File.open(fname)
                end

                begin
                  ifs.each_line do |ln|
                    # a blank line carries no label; skip it instead of
                    # failing on nil.to_sym
                    next if ln.strip.empty?

                    label, *features = ln.chomp.split(/\s+/)
                    label = label.to_sym
                    data[label] ||= []

                    feats = {}
                    features.each do |fv|
                      # each token has the form "index:value"
                      f, v = fv.split(/:/)
                      feats[f.to_sym] = v.to_f
                    end

                    data[label] << feats
                  end
                ensure
                  # close the file even if parsing raised; never close $stdin
                  ifs.close unless ifs == $stdin
                end

                set_data(data)
              end # data_from_libsvm
         
     | 
| 
      
 98 
     | 
    
         
            +
              
         
     | 
| 
      
 99 
     | 
    
         
            +
              
         
     | 
| 
      
 100 
     | 
    
         
            +
              #
         
     | 
| 
      
 101 
     | 
    
         
            +
              # write to libsvm
         
     | 
| 
      
 102 
     | 
    
         
            +
              #
         
     | 
| 
      
 103 
     | 
    
         
            +
              # @param [String] fname file to write  
         
     | 
| 
      
 104 
     | 
    
         
            +
              #   :stdout => write to standard output instead of file
         
     | 
| 
      
 105 
     | 
    
         
            +
              #
         
     | 
| 
      
 106 
     | 
    
         
            +
              def data_to_libsvm(fname=:stdout)
                ofs = (fname == :stdout) ? $stdout : File.open(fname, 'w')

                begin
                  # one line per sample: the class label followed by the
                  # non-zero "feature:value" pairs
                  each_sample do |k, s|
                    ofs.print "#{k} "
                    # order features by their numeric index
                    s.keys.sort { |x, y| x.to_s.to_i <=> y.to_s.to_i }.each do |i|
                      # zero-valued features are omitted from the output
                      ofs.print " #{i}:#{s[i]}" unless s[i].zero?
                    end
                    ofs.puts
                  end
                ensure
                  # close the file even if writing raised; never close $stdout
                  ofs.close unless ofs == $stdout
                end

                nil
              end # data_to_libsvm
         
     | 
| 
      
 124 
     | 
    
         
            +
              
         
     | 
| 
      
 125 
     | 
    
         
            +
              
         
     | 
| 
      
 126 
     | 
    
         
            +
              #
         
     | 
| 
      
 127 
     | 
    
         
            +
              # read from csv
         
     | 
| 
      
 128 
     | 
    
         
            +
              #
         
     | 
| 
      
 129 
     | 
    
         
            +
              # file should have the format with the first two rows
         
     | 
| 
      
 130 
     | 
    
         
            +
              # specifying features and their data types e.g.  
         
     | 
| 
      
 131 
     | 
    
         
            +
              # feat1,feat2,...,featn  
         
     | 
| 
      
 132 
     | 
    
         
            +
              # data\_type1,data\_type2,...,data\_typen  
         
     | 
| 
      
 133 
     | 
    
         
            +
              # 
         
     | 
| 
      
 134 
     | 
    
         
            +
              # and the remaining rows showing data e.g.  
         
     | 
| 
      
 135 
     | 
    
         
            +
              # class\_label,feat\_value1,feat\_value2,...,feat\_valuen  
         
     | 
| 
      
 136 
     | 
    
         
            +
              # ...  
         
     | 
| 
      
 137 
     | 
    
         
            +
              # 
         
     | 
| 
      
 138 
     | 
    
         
            +
              # allowed data types are:  
         
     | 
| 
      
 139 
     | 
    
         
            +
              # INTEGER, REAL, NUMERIC, CONTINUOUS, STRING, NOMINAL, CATEGORICAL
         
     | 
| 
      
 140 
     | 
    
         
            +
              #
         
     | 
| 
      
 141 
     | 
    
         
            +
              # @param [String] fname file to read from  
         
     | 
| 
      
 142 
     | 
    
         
            +
              #   :stdin => read from standard input instead of file
         
     | 
| 
      
 143 
     | 
    
         
            +
              #
         
     | 
| 
      
 144 
     | 
    
         
            +
              # @note missing values allowed
         
     | 
| 
      
 145 
     | 
    
         
            +
              #
         
     | 
| 
      
 146 
     | 
    
         
            +
              def data_from_csv(fname=:stdin)
                # class label (Symbol) => array of samples, each sample
                # being a hash of feature (Symbol) => typed value
                data = {}

                if fname == :stdin
                  ifs = $stdin
                elsif !File.exist?(fname) # File.exists? was removed in Ruby 3.2
                  abort "[#{__FILE__}@#{__LINE__}]: "+
                        "File '#{fname}' does not exist!"
                else
                  ifs = File.open(fname)
                end

                feats, types = [], []
                header_rows = 0 # number of header rows consumed so far

                begin
                  ifs.each_line do |ln|
                    # blank lines carry neither header nor data; skip them
                    # instead of failing on nil.to_sym
                    next if ln.strip.empty?

                    fields = ln.chomp.split(/,/)

                    if header_rows == 0 # first row: feature names
                      header_rows = 1
                      feats = fields.map { |name| name.to_sym }
                    elsif header_rows == 1 # second row: data types
                      header_rows = 2
                      types = fields
                      if types.size != feats.size
                        abort "[#{__FILE__}@#{__LINE__}]: "+
                              "1st and 2nd row must have same fields"
                      end
                      # record the (upper-cased) data type of each feature
                      types.each_with_index do |t, i|
                        set_opt(feats[i], t.upcase)
                      end
                    else # data rows: class label followed by feature values
                      next if fields.empty? # row consisting of commas only

                      label, *fvs = fields
                      label = label.to_sym
                      data[label] ||= []

                      fs = {}
                      fvs.each_with_index do |v, i|
                        next if v.empty? # missing value

                        # convert the raw string according to the declared type
                        data_type = get_opt(feats[i])
                        if data_type == 'INTEGER'
                          v = v.to_i
                        elsif ['REAL', 'NUMERIC', 'CONTINUOUS'].include? data_type
                          v = v.to_f
                        elsif ['STRING', 'NOMINAL', 'CATEGORICAL'].include? data_type
                          # keep the value as a string
                        else
                          abort "[#{__FILE__}@#{__LINE__}]: "+
                                "please specify correct data type "+
                                "for each feature in the 2nd row"
                        end

                        fs[feats[i]] = v
                      end

                      data[label] << fs
                    end
                  end
                ensure
                  # close the file even if parsing raised; never close $stdin
                  ifs.close unless ifs == $stdin
                end

                set_data(data)
              end # data_from_csv
         
     | 
| 
      
 209 
     | 
    
         
            +
              
         
     | 
| 
      
 210 
     | 
    
         
            +
              
         
     | 
| 
      
 211 
     | 
    
         
            +
              #
         
     | 
| 
      
 212 
     | 
    
         
            +
              # write to csv
         
     | 
| 
      
 213 
     | 
    
         
            +
              #
         
     | 
| 
      
 214 
     | 
    
         
            +
              # file has the format with the first two rows
         
     | 
| 
      
 215 
     | 
    
         
            +
              # specifying features and their data types
         
     | 
| 
      
 216 
     | 
    
         
            +
              # and the remaining rows showing data
         
     | 
| 
      
 217 
     | 
    
         
            +
              #
         
     | 
| 
      
 218 
     | 
    
         
            +
              # @param [String] fname file to write  
         
     | 
| 
      
 219 
     | 
    
         
            +
              #   :stdout => write to standard output instead of file
         
     | 
| 
      
 220 
     | 
    
         
            +
              #
         
     | 
| 
      
 221 
     | 
    
         
            +
              def data_to_csv(fname=:stdout)
                ofs = (fname == :stdout) ? $stdout : File.open(fname, 'w')

                begin
                  # first header row: feature names
                  ofs.puts get_features.join(',')
                  # second header row: data type per feature, 'STRING' when
                  # no type has been recorded
                  ofs.puts get_features.map { |f| get_opt(f) || 'STRING' }.join(',')

                  # data rows: class label followed by one field per feature
                  each_sample do |k, s|
                    ofs.print "#{k}"
                    each_feature do |f|
                      # a missing value is written as an empty field
                      ofs.print(s.has_key?(f) ? ",#{s[f]}" : ",")
                    end
                    ofs.puts
                  end
                ensure
                  # close the file even if writing raised; never close $stdout
                  ofs.close unless ofs == $stdout
                end

                nil
              end # data_to_csv
         
     | 
| 
      
 248 
     | 
    
         
            +
              
         
     | 
| 
      
 249 
     | 
    
         
            +
              
         
     | 
| 
      
 250 
     | 
    
         
            +
              #
         
     | 
| 
      
 251 
     | 
    
         
            +
              # read from WEKA ARFF file
         
     | 
| 
      
 252 
     | 
    
         
            +
              #
         
     | 
| 
      
 253 
     | 
    
         
            +
              # @param [String] fname file to read from  
         
     | 
| 
      
 254 
     | 
    
         
            +
              #   :stdin => read from standard input instead of file
         
     | 
| 
      
 255 
     | 
    
         
            +
              # @note it's ok if string contains spaces quoted by quote_char
         
     | 
| 
      
 256 
     | 
    
         
            +
              #
         
     | 
| 
      
 257 
     | 
    
         
            +
              def data_from_weka(fname=:stdin, quote_char='"')
                # Reads a WEKA ARFF stream (regular or sparse rows) and stores the
                # result through set_data / set_classes / set_features / set_opt.
                #
                # fname      - path of the ARFF file, or :stdin to read standard input
                # quote_char - quote character handed to String#split_me
                #
                # Aborts the program when fname is a path that does not exist.
                data = {}

                if fname == :stdin
                  ifs = $stdin
                elsif not File.exist? fname # File.exists? was removed in Ruby 3.2
                  abort "[#{__FILE__}@#{__LINE__}]: "+
                        "File '#{fname}' does not exist!"
                else
                  ifs = File.open(fname)
                end

                features, classes, comments = [], [], []
                has_class, has_data = false, false

                begin
                  ifs.each_line do |ln|
                    next if ln.blank? # blank lines

                    ln = ln.chomp

                    if ln.comment?('%')
                      # comment line
                      comments << ln
                    elsif ln =~ /^@RELATION/i
                      # relation
                      _, relation = ln.split_me(/\s+/, quote_char)
                      set_opt('@RELATION', relation)
                    elsif ln =~ /^@ATTRIBUTE\s+class\s+{(.+)}/i
                      # class attribute: every declared label gets its own sample list
                      has_class = true
                      classes = $1.split_me(/,\s*/, quote_char).to_sym
                      classes.each { |k| data[k] = [] }
                    elsif ln =~ /^@ATTRIBUTE\s+(\S+)\s+{(.+)}/i
                      # feature attribute (nominal); the listed values ($2) are not stored
                      f = $1.to_sym
                      features << f
                      set_opt(f, 'NOMINAL')
                    elsif ln =~ /^@ATTRIBUTE/i
                      # feature attribute (integer, real, numeric, string, date)
                      _, v1, v2 = ln.split_me(/\s+/, quote_char)
                      f = v1.to_sym
                      features << f
                      set_opt(f, v2.upcase) # record feature data type
                    elsif ln =~ /^@DATA/i
                      # data header
                      has_data = true
                    elsif has_data and has_class
                      # data section
                      if ln =~ /^{(.+)}$/ # sparse ARFF
                        feats = $1.split_me(/,\s*/, quote_char)
                        # the last "index value" pair carries the class label
                        label = feats.pop.split_me(/\s+/, quote_char)[1]
                        label = label.to_sym

                        fs = {}
                        listed = {} # feature indices given explicitly on this row
                        feats.each do |fi_fv|
                          fi, fv = fi_fv.split_me(/\s+/, quote_char)
                          fi = fi.to_i
                          add_feature_weka(fs, features[fi], fv)
                          listed[fi] = true
                        end

                        # features not listed on a sparse row get the value zero
                        features.each_with_index do |f0, i|
                          add_feature_weka(fs, f0, 0) unless listed.key?(i)
                        end

                        data[label] << fs
                      else # regular ARFF
                        feats = ln.split_me(/,\s*/, quote_char)
                        label = feats.pop.to_sym # class label is the last column

                        fs = {}
                        feats.each_with_index do |fv, i|
                          add_feature_weka(fs, features[i], fv)
                        end

                        data[label] << fs if label
                      end
                    end
                    # any other line (e.g. data before the headers) is ignored
                  end
                ensure
                  # close file, even when parsing raised
                  ifs.close unless ifs == $stdin
                end

                set_data(data)
                set_classes(classes)
                set_features(features)
                set_opt('COMMENTS', comments) if not comments.empty?
              end # data_from_weka
         
     | 
| 
      
 351 
     | 
    
         
            +
              
         
     | 
| 
      
 352 
     | 
    
         
            +
              
         
     | 
| 
      
 353 
     | 
    
         
            +
              #
         
     | 
| 
      
 354 
     | 
    
         
            +
              # write to WEKA ARFF file
         
     | 
| 
      
 355 
     | 
    
         
            +
              #
         
     | 
| 
      
 356 
     | 
    
         
            +
              # @param [String] fname file to write  
         
     | 
| 
      
 357 
     | 
    
         
            +
              #   :stdout => write to standard output instead of file
         
     | 
| 
      
 358 
     | 
    
         
            +
              # @param [Symbol] format sparse or regular ARFF  
         
     | 
| 
      
 359 
     | 
    
         
            +
              #   :sparse => sparse ARFF, otherwise regular ARFF
         
     | 
| 
      
 360 
     | 
    
         
            +
              #
         
     | 
| 
      
 361 
     | 
    
         
            +
              def data_to_weka(fname=:stdout, format=nil)
                # Writes comments, relation, attribute declarations and all samples
                # in ARFF layout to fname (or to $stdout when fname is :stdout).
                # Rows are sparse when format == :sparse, regular otherwise.
                # Returns nil.
                ofs = (fname == :stdout) ? $stdout : File.open(fname, 'w')

                begin
                  # comments
                  comments = get_opt('COMMENTS')
                  if comments
                    ofs.puts comments.join("\n")
                    ofs.puts
                  end

                  # relation (falls back to a generated name when none was recorded)
                  relation = get_opt('@RELATION')
                  ofs.puts "@RELATION #{relation || 'data_gen_by_FSelector'}"

                  ofs.puts

                  # feature attribute
                  each_feature do |f|
                    type = get_opt(f)
                    decl =
                      if type == 'NOMINAL'
                        "{#{get_feature_values(f).uniq.sort.join(',')}}"
                      elsif type
                        type
                      else # treat all other data types as string
                        "STRING"
                      end
                    ofs.puts "@ATTRIBUTE #{f} #{decl}"
                  end

                  # class attribute
                  ofs.puts "@ATTRIBUTE class {#{get_classes.join(',')}}"

                  ofs.puts

                  # data header
                  ofs.puts "@DATA"
                  each_sample do |k, s|
                    if format == :sparse # sparse ARFF
                      feats = get_features
                      ofs.print "{"
                      feats.each_with_index do |f, i|
                        if s.has_key? f
                          v = s[f]
                          # zero values are omitted; values without zero?
                          # (e.g. Strings) are always written
                          next if v.respond_to?(:zero?) && v.zero?
                          ofs.print "#{i} #{v},"
                        else # missing value
                          ofs.print "#{i} ?,"
                        end
                      end
                      # the class label uses the index right after the last feature
                      ofs.print "#{feats.size} #{k}"
                      ofs.puts "}"
                    else
                      each_feature do |f|
                        if s.has_key? f
                          ofs.print "#{s[f]},"
                        else # missing value
                          ofs.print "?,"
                        end
                      end
                      ofs.puts "#{k}"
                    end
                  end

                  nil
                ensure
                  # close file, even when writing raised
                  ofs.close unless ofs == $stdout
                end
              end
         
     | 
| 
      
 434 
     | 
    
         
            +
              
         
     | 
| 
      
 435 
     | 
    
         
            +
              private
         
     | 
| 
      
 436 
     | 
    
         
            +
              
         
     | 
| 
      
 437 
     | 
    
         
            +
              # handle and add each feature for WEKA format
         
     | 
| 
      
 438 
     | 
    
         
            +
              def add_feature_weka(fs, f, v)
                # Stores value v for feature f in the sample hash fs, converted
                # according to the data type recorded for f via get_opt.
                # Missing values ('?') and features with any other recorded
                # type are skipped, leaving fs untouched.
                return if v == '?' # missing value

                case get_opt(f)
                when 'INTEGER'
                  fs[f] = v.to_i
                when 'REAL', 'NUMERIC'
                  fs[f] = v.to_f
                when 'STRING', 'NOMINAL'
                  fs[f] = v
                when 'DATE' # convert into integer
                  # a numeric v (e.g. the 0 used for unlisted sparse entries)
                  # cannot go through DateTime.parse, which needs a String
                  fs[f] = if v.is_a?(Numeric)
                            v.to_i
                          else
                            (DateTime.parse(v)-DateTime.new(1970,1,1)).to_i
                          end
                end
              end # add_feature_weka
         
     | 
| 
      
 453 
     | 
    
         
            +
                 
         
     | 
| 
      
 454 
     | 
    
         
            +
             
         
     | 
| 
      
 455 
     | 
    
         
            +
            end # module
         
     |