RubyGems - bio-band - Versions diffs - 0.1.0 - Mend

bio-band 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (59) hide show

data/Gemfile +20 -0
data/Gemfile.lock +79 -0
data/Jarfile +9 -0
data/Jarfile.lock +10 -0
data/LICENSE.txt +20 -0
data/README.rdoc +54 -0
data/Rakefile +54 -0
data/VERSION +1 -0
data/bin/bio-band +83 -0
data/bio-band.gemspec +129 -0
data/ext/mkrf_conf.rb +74 -0
data/features/create_dataset.feature +12 -0
data/features/step_definitions/create_dataset.rb +40 -0
data/features/step_definitions/weka_classifiers.rb +42 -0
data/features/step_definitions/weka_clustering.rb +30 -0
data/features/step_definitions/weka_filters.rb +29 -0
data/features/step_definitions/weka_parsers.rb +45 -0
data/features/support/env.rb +3 -0
data/features/weka_classifiers.feature +16 -0
data/features/weka_clustering.feature +14 -0
data/features/weka_filters.feature +12 -0
data/features/weka_parsers.feature +18 -0
data/features/weka_pipeline.feature +13 -0
data/lib/bio-band.rb +10 -0
data/lib/bio-band/apache.rb +1 -0
data/lib/bio-band/apache/stat/inference.rb +145 -0
data/lib/bio-band/core.rb +6 -0
data/lib/bio-band/core/parser/parser.rb +23 -0
data/lib/bio-band/core/type/apache_matrices.rb +35 -0
data/lib/bio-band/core/type/attribute.rb +53 -0
data/lib/bio-band/core/type/instance.rb +10 -0
data/lib/bio-band/core/type/instances.rb +332 -0
data/lib/bio-band/core/type/utils.rb +31 -0
data/lib/bio-band/weka.rb +11 -0
data/lib/bio-band/weka/classifiers/bayes/bayes.rb +75 -0
data/lib/bio-band/weka/classifiers/bayes/bayes_utils.rb +42 -0
data/lib/bio-band/weka/classifiers/evaluation.rb +12 -0
data/lib/bio-band/weka/classifiers/functions/functions.rb +23 -0
data/lib/bio-band/weka/classifiers/functions/functions_utils.rb +39 -0
data/lib/bio-band/weka/classifiers/lazy/lazy.rb +23 -0
data/lib/bio-band/weka/classifiers/lazy/lazy_utils.rb +39 -0
data/lib/bio-band/weka/classifiers/trees/trees.rb +48 -0
data/lib/bio-band/weka/classifiers/trees/trees_utils.rb +42 -0
data/lib/bio-band/weka/clusterers/clusterers.rb +32 -0
data/lib/bio-band/weka/clusterers/clusterers_utils.rb +49 -0
data/lib/bio-band/weka/db/DatabaseUtils_mysql +280 -0
data/lib/bio-band/weka/db/DatabaseUtils_postgresql +594 -0
data/lib/bio-band/weka/db/db.rb +74 -0
data/lib/bio-band/weka/filters/supervised/attribute/attribute.rb +25 -0
data/lib/bio-band/weka/filters/supervised/instance/instance.rb +17 -0
data/lib/bio-band/weka/filters/supervised/supervised_utils.rb +32 -0
data/lib/bio-band/weka/filters/unsupervised/attribute/attribute.rb +70 -0
data/lib/bio-band/weka/filters/unsupervised/instance/instance.rb +48 -0
data/lib/bio-band/weka/filters/unsupervised/unsupervised_utils.rb +33 -0
data/resources/weather.csv +15 -0
data/resources/weather.numeric.arff +23 -0
data/spec/bio-band_spec.rb +7 -0
data/spec/spec_helper.rb +12 -0
metadata +302 -0

data/ext/mkrf_conf.rb ADDED Viewed

@@ -0,0 +1,74 @@
+path = File.expand_path(File.dirname(__FILE__))
+def command?(name)
+  `which #{name}`
+  $?.success?
+end
+module OS
+  def OS.windows?
+    (/cygwin|mswin|mingw|bccwin|wince|emx/ =~ RbConfig::CONFIG['host_os']) != nil
+  end
+  def OS.mac?
+   (/darwin/ =~ RbConfig::CONFIG['host_os']) != nil
+  end
+  def OS.unix?
+    !OS.windows?
+  end
+  def OS.linux?
+    OS.unix? and not OS.mac?
+  end
+end
+File.open(File.join(path,"Rakefile"),"w") do |rakefile|
+  if OS.windows? == true
+    puts "Sorry, still no support is provided for your OS!"
+  elsif OS.mac? == true
+    if command?("mvn")==false && command?("brew")==true
+      rakefile.write <<-RAKE
+        task :brew_install do
+          sh "brew install maven"
+        end
+        task :default => [:brew_install]
+      RAKE
+    elsif command?("brew")==false
+      rakefile.write <<-RAKE
+      task :ok_inst do
+        puts "Sorry, Maven could not be installed. Try installing 'brew' first"
+      end
+      task :default => [:ok_inst]
+RAKE
+    else
+      rakefile.write <<-RAKE
+    task :ok_inst do
+      puts "Maven has been detected on your system"
+    end
+    task :default => [:ok_inst]
+RAKE
+    end
+  elsif OS.linux? == true
+    if command?("mvn")==false
+      rakefile.write <<-RAKE
+  	    task :apt_install do
+          sh "sudo apt-get install maven2"
+        end
+        task :default => [:apt_install]
+      RAKE
+    else
+      rakefile.write <<-RAKE
+    task :ok_inst do
+      puts "Maven has been detected on your system"
+    end
+    task :default => [:ok_inst]
+RAKE
+    end
+  end
+end

data/features/create_dataset.feature ADDED Viewed

@@ -0,0 +1,12 @@
+Feature: Creation of an in-memory dataset
+	In order to perform calculations on a dataset
+	I want to easily build it and store it in memory
+	Scenario: creation of a dataset
+		Given a nominal attribute, named "assertion", with values "yes,no"
+		Given one numeric attribute, named "temperature"
+		Given another numeric attribute, names "days"
+		Given two data rows: "yes,100,30","no,100,0"
+		Then I want to build en empty dataset for my use
+		And I want to populate the dataset by row
+		And I want to print my dataset as a bidimensional Ruby Array

data/features/step_definitions/create_dataset.rb ADDED Viewed

@@ -0,0 +1,40 @@
+Given(/^a nominal attribute, named "(.*?)", with values "(.*?)"$/) do |arg1, arg2|
+  $first_att = arg1.to_sym
+  $values = arg2.split(',')
+end
+Given(/^one numeric attribute, named "(.*?)"$/) do |arg1|
+	$second_att = arg1.to_sym
+end
+Given(/^another numeric attribute, names "(.*?)"$/) do |arg1|
+	$third_att = arg1.to_sym
+end
+Given(/^two data rows: "(.*?)","(.*?)"$/) do |arg1, arg2|
+	@first_row = arg1.split(',')
+	@second_row = arg2.split(',')
+	@first_row[1] = @first_row[1].to_f
+	@first_row[2] = @first_row[2].to_f
+	@second_row[1] = @second_row[1].to_f
+	@second_row[2] = @second_row[2].to_f
+end
+Then(/^I want to build en empty dataset for my use$/) do
+	class Dataset < Core::Type::Instances::Base
+		nominal $first_att, $values
+  	numeric $second_att
+  	string $third_att
+  end
+  @my_instance = Dataset.new
+  @my_instance.summary
+end
+Then(/^I want to populate the dataset by row$/) do
+	@my_instance.populate_by_row([@first_row,@second_row])
+	@my_instance.summary
+end
+Then(/^I want to print my dataset as a bidimensional Ruby Array$/) do
+	puts @my_instance.to_a2d.inspect
+end

data/features/step_definitions/weka_classifiers.rb ADDED Viewed

@@ -0,0 +1,42 @@
+Given(/^the Weka "(.*?)" classifier$/) do |arg1|
+  @classifier = Weka::Classifier::Bayes::NaiveBayes.new
+end
+Then(/^I want to print a "(.*?)"$/) do |arg1|
+  @classifier.send arg1.to_sym
+end
+Then(/^I want to print an options list$/) do
+  @classifier.list_options
+end
+Given(/^the unsupervised Weka classifier "(.*?)"$/) do |arg1|
+  class My_classifier < Weka::Classifier::Bayes::NaiveBayes::Base
+  end
+end
+Then(/^I want to set option "(.*?)" for it$/) do |arg1|
+  My_classifier.set_options(arg1)
+end
+Then(/^I want to set the dataset parsed from "(.*?)"$/) do |arg1|
+  @arff = File.join('resources',arg1)
+  @dataset_ARFF = Core::Parser::parse_ARFF(@arff)
+  My_classifier.set_data(@dataset_ARFF)
+end
+Then(/^I want to print a summary for the dataset$/) do
+  @dataset_ARFF.summary
+end
+Then(/^I want to set the class index for attribute with index "(.*?)"$/) do |arg1|
+    My_classifier.set_class_index(arg1.to_i)
+end
+Then(/^I want to instantiate the classifier for my use$/) do
+  My_classifier.new
+end

data/features/step_definitions/weka_clustering.rb ADDED Viewed

@@ -0,0 +1,30 @@
+Given(/^the ARFF dataset stored in the file "(.*?)"$/) do |arg1|
+  @arff = File.join('resources',arg1)
+end
+Given(/^the SimpleKMeans algorithm implementation from Weka$/) do
+  @clusterer = Weka::Classifier::Bayes::NaiveBayes.new
+end
+Then(/^I want to parse the data from the file$/) do
+  @dataset_ARFF = Core::Parser::parse_ARFF(@arff)
+end
+Then(/^I want to list the options available for SimpleKMeans$/) do
+  puts @clusterer.list_options
+end
+Then(/^I want to set K = "(.*?)" as K\-means option$/) do |arg1|
+  class Clustering < Weka::Clusterer::SimpleKMeans::Base
+  end
+  Clustering.set_options "-N #{arg1}"
+end
+Then(/^I want to perform clustering on the parsed dataset$/) do
+  Clustering.set_data(@dataset_ARFF)
+  @clustered = Clustering.new
+end
+Then(/^I want to report result statistics$/) do
+  puts @clustered
+end

data/features/step_definitions/weka_filters.rb ADDED Viewed

@@ -0,0 +1,29 @@
+Given(/^the example ARFF file "(.*?)"$/) do |arff_file|
+	@arff = File.join('resources',arff_file)
+end
+Given(/^the Weka Attribute Add filter$/) do
+	@filter = Weka::Filter::Unsupervised::Attribute::Add.new
+end
+Then(/^I want to parse the file in order to create an Instances class object$/) do
+	@dataset_ARFF = Core::Parser::parse_ARFF(@arff)
+	@dataset_ARFF.summary
+end
+Then(/^I want to print the available filter options and usage$/) do
+	@filter.filter_options
+end
+Then(/^I want to set the option String "(.*?)"$/) do |arg1|
+	@filter.set_filter_options(arg1)
+	@filter.set_data(@dataset_ARFF)
+end
+Then(/^I want to add an attribute \(a column\) to the dataset using the Weka filter Add$/) do
+	@new_inst = @filter.use
+end
+Then(/^I want to print a "(.*?)" for the modified dataset$/) do |arg1|
+	@new_inst.send arg1.to_sym
+end

data/features/step_definitions/weka_parsers.rb ADDED Viewed

@@ -0,0 +1,45 @@
+Given /^the CSV file "(.*?)"$/ do |csv_file|
+	@csv = File.join('resources',csv_file)
+	@dataset_CSV = Core::Parser::parse_CSV(@csv)
+end
+Then /^I want to print to stdout the summary for the CSV parsed Instances object$/ do
+	@dataset_CSV.summary
+end
+Given /^the ARFF file "(.*?)"$/ do |arff_file|
+	@arff = File.join('resources',arff_file)
+	puts @arff
+	@dataset_ARFF = Core::Parser::parse_ARFF(@arff)
+end
+Then /^I want to print to stdout the summary for the ARFF parsed Instances object$/ do
+	@dataset_ARFF.summary
+end
+Given(/^the database "(.*?)"$/) do |arg1|
+  @db_connection = arg1
+end
+Given(/^a table named "(.*?)"$/) do |arg1|
+	@target_table = arg1
+end
+Then(/^I want to extract data from that table$/) do
+  @dataset = Weka::Db.query_mysql(@db_connection,'root','',"select * from #{@target_table}")
+end
+Then(/^I want to print to stdout the summary for the parsed Instances object$/) do
+  @dataset.summary
+end
+Then(/^I want to convert the data into a bidimensional Ruby Array$/) do
+	@dataset.to_a2d.should == [["sunny", 85.0, 85.0, "FALSE", "no"], ["sunny", 80.0, 90.0, "TRUE", "no"], ["overcast", 83.0, 86.0, "FALSE", "yes"],
+													 ["rainy", 70.0, 96.0, "FALSE", "yes"], ["rainy", 68.0, 80.0, "FALSE", "yes"], ["rainy", 65.0, 70.0, "TRUE", "no"],
+													 ["overcast", 64.0, 65.0, "TRUE", "yes"], ["sunny", 72.0, 95.0, "FALSE", "no"], ["sunny", 69.0, 70.0, "FALSE", "yes"],
+													 ["rainy", 75.0, 80.0, "FALSE", "yes"], ["sunny", 75.0, 70.0, "TRUE", "yes"], ["overcast", 72.0, 90.0, "TRUE", "yes"],
+													 ["overcast", 81.0, 75.0, "FALSE", "yes"], ["rainy", 71.0, 91.0, "TRUE", "no"]]
+end

data/features/support/env.rb ADDED Viewed

@@ -0,0 +1,3 @@
+# Cucumber support file: puts the gem's lib/ and resources/ directories on
+# the load path and loads the library under test.
+$LOAD_PATH << File.expand_path('../../../lib', __FILE__)
+$LOAD_PATH << File.expand_path('../../../resources/', __FILE__)
+# The gem's entry file is lib/bio-band.rb; no 'ruby_mining' file is shipped,
+# so the former `require 'ruby_mining'` could not load this source tree.
+require 'bio-band'

data/features/weka_classifiers.feature ADDED Viewed

@@ -0,0 +1,16 @@
+Feature: Using a Weka classifier
+	In order to classify an instance data
+	I want to use a Weka classifier
+	Scenario: Understand options and usage
+		Given the Weka "NaiveBayes" classifier
+		Then I want to print a "description"
+		And I want to print an options list
+	Scenario: Use a classifier on a data instance
+		Given the unsupervised Weka classifier "NaiveBayes"
+		Then I want to set option "-K" for it
+		And I want to set the dataset parsed from "weather.numeric.arff"
+		And I want to print a summary for the dataset
+		And I want to set the class index for attribute with index "0"
+		And I want to instantiate the classifier for my use

data/features/weka_clustering.feature ADDED Viewed

@@ -0,0 +1,14 @@
+Feature: Weka dataset clustering
+	In order to group similar data vectors in my dataset
+	I want to use Weka clustering algorithms
+	Scenario: use of Kmeans algorithm
+		Given the ARFF dataset stored in the file "weather.numeric.arff"
+    Given the SimpleKMeans algorithm implementation from Weka
+		Then I want to parse the data from the file
+    And I want to list the options available for SimpleKMeans
+    And I want to set K = "4" as K-means option
+    And I want to perform clustering on the parsed dataset
+    And I want to report result statistics

data/features/weka_filters.feature ADDED Viewed

@@ -0,0 +1,12 @@
+Feature: Filter a dataset (Weka Instances class)
+	In order to manipulate a dataset
+	I want to use the Weka Filter class on it
+	Scenario: Use of the Unsupervised Attribute filter 'Add'
+		Given the example ARFF file "weather.numeric.arff"
+		Given the Weka Attribute Add filter
+		Then I want to parse the file in order to create an Instances class object
+		And I want to print the available filter options and usage
+		And I want to set the option String "-T NUM -N dummy"
+		And I want to add an attribute (a column) to the dataset using the Weka filter Add
+		And I want to print a "summary" for the modified dataset

data/features/weka_parsers.feature ADDED Viewed

@@ -0,0 +1,18 @@
+Feature: Weka basic parsing capabilities
+	In order to perform calculations on a dataset
+	I want to import data from .ARFF, .CSV files, and external databases
+	Scenario: parsing a CSV file
+		Given the CSV file "weather.csv"
+		Then I want to print to stdout the summary for the CSV parsed Instances object
+	Scenario: parsing an ARFF file
+		Given the ARFF file "weather.numeric.arff"
+		Then I want to print to stdout the summary for the ARFF parsed Instances object
+	Scenario: parsing data from a mySQL table
+		Given the database "jdbc:mysql://localhost:3306/Gene_classes"
+		And a table named "test_weka"
+		Then I want to extract data from that table
+		And I want to print to stdout the summary for the parsed Instances object
+		And I want to convert the data into a bidimensional Ruby Array

data/features/weka_pipeline.feature ADDED Viewed

@@ -0,0 +1,13 @@
+Feature: Classification pipeline
+	In order to perform text-mining on a dataset
+	I want to parse the data, filter it and then classify it using a Bayesian classifier
+	Scenario: Use of Naive Bayes on a filtered data set
+		Given a file containing the training set data "ReutersGrain-train.arff"
+		Given a file containing the test set data "ReutersGrain-test.arff"
+		Then I want to parse them
+		And I want to filter them using the unsupervised filter "StringToWordVector"
+		And I want to build a "NaiveBayes" classifier using training set data
+		And I want to evaluate the performance of the classifier on the test test
+		And I want want to print to stdout a "summary" for the evaluation

data/lib/bio-band.rb ADDED Viewed

@@ -0,0 +1,10 @@
+start = File.absolute_path '.'
+Dir.chdir(File.join(File.dirname(__FILE__),".."))
+require 'jbundler'
+Dir.chdir start
+require File.join(File.dirname(__FILE__),'..','.jbundler','classpath.rb')
+require "java"
+require "bio-band/core"
+require "bio-band/weka"
+require "bio-band/apache"

data/lib/bio-band/apache.rb ADDED Viewed

@@ -0,0 +1 @@
+ require 'bio-band/apache/stat/inference.rb'

data/lib/bio-band/apache/stat/inference.rb ADDED Viewed

@@ -0,0 +1,145 @@
+require 'java'
+module Apache
+  module Stat
+  	module Inference
+  		java_import 'org.apache.commons.math3.stat.inference.ChiSquareTest'
+  		java_import 'org.apache.commons.math3.stat.inference.MannWhitneyUTest'
+  		java_import 'org.apache.commons.math3.stat.inference.OneWayAnova'
+  		java_import 'org.apache.commons.math3.stat.inference.TTest'
+  		java_import 'org.apache.commons.math3.stat.inference.WilcoxonSignedRankTest'
+  		java_import 'org.apache.commons.math3.stat.StatUtils'
+      java_import 'java.util.ArrayList'
+      # An implementation of the Wilcoxon signed-rank test
+      # * *Args*    :
+      #   - +Array1+ -> must be a RubyArray.
+      #   - +Array2+ -> must be a RubyArray.
+  		def self.wilcoxon_test(array_1,array_2)
+  			obj = WilcoxonSignedRankTest.new
+        first = Core::Utils::double_to_a(array_1)
+        second = Core::Utils::double_to_a(array_2)
+  			val = obj.wilcoxonSignedRank first, second
+        p_val = obj.wilcoxonSignedRankTest first, second, true.to_java(:boolean)
+  			return val,p_val
+  		end
+      # Utility class called by 'chi_square' method in this same package
+      class Chi_square
+        def self.chi_square_2d(array_2d)
+          obj = ChiSquareTest.new
+          val = obj.chi_square(array_2d.to_java(Java::long[]))
+          p_value =  obj.chi_square_test(array_2d.to_java(Java::long[]))
+          return val,p_value
+        end
+        def self.chi_square_two_arrays(expected,observed)
+          obj = ChiSquareTest.new
+          val = obj.chi_square(expected.to_java(:double),observed.to_java(:long))
+          p_value = obj.chi_square_test(expected.to_java(:double),observed.to_java(:long))
+          return val,p_value
+        end
+      end
+      # 1) Computes the Chi-Square statistic comparing observed and expected frequency counts.
+      # * *Args*    :
+      #   - +Array+ -> must be a bidimensional RubyArray.
+      # 2) Computes the Chi-Square statistic associated with a chi-square test of independence
+      #   based on the input counts array, viewed as a two-way table.
+      # * *Args*    :
+      #   - +Array1+ -> must be a RubyArray.
+      #   - +Array2+ -> must be a RubyArray.
+      def self.chi_square(*args)
+        if args.length == 2
+          Chi_square.chi_square_two_arrays(*args)
+        elsif args.length == 1
+          raise ArgumentError,"RubyArray must be bidimensional" unless args[0].is_2d?
+          Chi_square.chi_square_2d(*args)
+        else
+          raise ArgumentError, 'Function *args should be two RubyArrays or a bidimensional RubyArray'
+        end
+      end
+      def self.chi_square_dataset_compare(observed1,observed2)
+        obj = ChiSquareTest.new
+        val = obj.chiSquareDataSetsComparison(observed1.to_java(:long),observed2.to_java(:long))
+        p_value = obj.chiSquareTestDataSetsComparison(observed1.to_java(:long),observed2.to_java(:long))
+        return val,p_value
+      end
+      def mann_whitney_u(array1,array2)
+        obj = MannWhitneyUTest.new
+        first = array_1.to_java :double
+        second = array_2.to_java :double
+        value = mannWhitneyU first,second
+        p_value = mannWhitneyUTest first,second
+        return value,p_value
+      end
+      #Utility class called by 't_test' method in this same package
+      class T_test
+        def self.homoscedastic(array_1,array_2)
+          obj = TTest.new
+          first = array_1.to_java :double
+          second = array_2.to_java :double
+          value = obj.homoscedasticT(first,second)
+          p_value = obj.homoscedasticTTest(first,second)
+          return value, p_value
+        end
+        def self.paired(array_1,array_2)
+          obj = TTest.new
+          first = array_1.to_java :double
+          second =array_2.to_java :double
+          value = obj.pairedT(first,second)
+          p_value = obj.pairedTTest(first,second)
+          return value,p_value
+        end
+        def self.t(array_1,array_2)
+          obj = TTest.new
+          first = array_1.to_java :double
+          second =array_2.to_java :double
+          value = obj.t(first,second)
+          p_value =obj.tTest(first,second)
+          return value,p_value
+        end
+      end
+      # An implementation for Student's t-tests
+      # * *Args*    :
+      #   - +sample_1+ -> an array of numeric values representing a sample
+      #   - +sample_2+ -> an array of numeric values representing a sample
+      #   - +homoscedastic+ -> set to true for equal variance assumption
+      #   - +paired+ -> set to true if you want to perform a 'paired' t test
+      def self.t_test(sample_1,sample_2,homoscedastic=false,paired=false)
+        if homoscedastic == true
+          T_test.homoscedastic(sample_1,sample_2)
+        elsif paired == true
+          T_test.paired(sample_1,sample_2)
+        else
+          T_test.t(sample_1,sample_2)
+        end
+      end
+      # Implements one-way ANOVA (analysis of variance) statistics.
+      # Tests for differences between two or more categories of univariate data (for example,
+      # the body mass index of accountants, lawyers, doctors and computer programmers). When
+      # two categories are given, this is equivalent to the TTest.
+      # * *Args*    :
+      #   - +bidimensional_array+ -> a 2d RubyArray
+      def self.one_way_anova(bidimensional_array)
+        collection = ArrayList.new
+        bidimensional_array.each do |array|
+          collection.add(array.to_java :double)
+        end
+        obj = OneWayAnova.new
+        f_value = obj.anovaFValue(collection)
+        p_value = obj.anovaPValue(collection)
+        return f_value,p_value
+      end
+  	end
+  end
+end