ruby-band 0.1.11
Sign up to get free protection for your applications and to get access to all the features.
- data/.travis.yml +3 -0
- data/Gemfile +30 -0
- data/Gemfile.lock +119 -0
- data/Jarfile +9 -0
- data/Jarfile.lock +10 -0
- data/LICENSE.txt +22 -0
- data/README.md +321 -0
- data/README.rdoc +70 -0
- data/Rakefile +66 -0
- data/VERSION +1 -0
- data/band_server/client.rb +35 -0
- data/band_server/client_alt.rb +35 -0
- data/band_server/first_dataset.csv +15 -0
- data/band_server/second_dataset.csv +15 -0
- data/band_server/simple_server.rb +90 -0
- data/band_server/third_dataset.csv +15 -0
- data/band_server/uploads/first_dataset.csv +15 -0
- data/band_server/uploads/second_dataset.csv +15 -0
- data/band_server/uploads/third_dataset.csv +15 -0
- data/bin/ruby-band +83 -0
- data/ext/mkrf_conf.rb +74 -0
- data/features/create_dataset.feature +12 -0
- data/features/step_definitions/create_dataset.rb +39 -0
- data/features/step_definitions/weka_classifiers.rb +43 -0
- data/features/step_definitions/weka_clustering.rb +34 -0
- data/features/step_definitions/weka_filters.rb +32 -0
- data/features/step_definitions/weka_parsers.rb +46 -0
- data/features/step_definitions/weka_pipeline.rb +41 -0
- data/features/support/env.rb +3 -0
- data/features/weka_classifiers.feature +16 -0
- data/features/weka_clustering.feature +15 -0
- data/features/weka_filters.feature +12 -0
- data/features/weka_parsers.feature +18 -0
- data/features/weka_pipeline.feature +14 -0
- data/lib/ruby-band.rb +12 -0
- data/lib/ruby-band/apache.rb +2 -0
- data/lib/ruby-band/apache/stat/correlation.rb +42 -0
- data/lib/ruby-band/apache/stat/inference.rb +151 -0
- data/lib/ruby-band/apache/stat/regression.rb +22 -0
- data/lib/ruby-band/core.rb +6 -0
- data/lib/ruby-band/core/parser/parser.rb +27 -0
- data/lib/ruby-band/core/type/apache_matrices.rb +35 -0
- data/lib/ruby-band/core/type/attribute.rb +53 -0
- data/lib/ruby-band/core/type/instance.rb +10 -0
- data/lib/ruby-band/core/type/instances.rb +361 -0
- data/lib/ruby-band/core/type/utils.rb +31 -0
- data/lib/ruby-band/weka.rb +14 -0
- data/lib/ruby-band/weka/attribute_selection/attribute_selection_utils.rb +20 -0
- data/lib/ruby-band/weka/attribute_selection/evaluators.rb +58 -0
- data/lib/ruby-band/weka/attribute_selection/search.rb +52 -0
- data/lib/ruby-band/weka/classifiers/bayes/bayes.rb +86 -0
- data/lib/ruby-band/weka/classifiers/bayes/bayes_utils.rb +82 -0
- data/lib/ruby-band/weka/classifiers/evaluation.rb +13 -0
- data/lib/ruby-band/weka/classifiers/functions/functions.rb +177 -0
- data/lib/ruby-band/weka/classifiers/functions/functions_utils.rb +78 -0
- data/lib/ruby-band/weka/classifiers/lazy/lazy.rb +86 -0
- data/lib/ruby-band/weka/classifiers/lazy/lazy_utils.rb +83 -0
- data/lib/ruby-band/weka/classifiers/mi/mi.rb +191 -0
- data/lib/ruby-band/weka/classifiers/mi/mi_utils.rb +80 -0
- data/lib/ruby-band/weka/classifiers/rules/rules.rb +190 -0
- data/lib/ruby-band/weka/classifiers/rules/rules_utils.rb +81 -0
- data/lib/ruby-band/weka/classifiers/trees/trees.rb +110 -0
- data/lib/ruby-band/weka/classifiers/trees/trees_utils.rb +85 -0
- data/lib/ruby-band/weka/clusterers/clusterers.rb +99 -0
- data/lib/ruby-band/weka/clusterers/clusterers_utils.rb +86 -0
- data/lib/ruby-band/weka/db/DatabaseUtils_mysql +280 -0
- data/lib/ruby-band/weka/db/DatabaseUtils_postgresql +594 -0
- data/lib/ruby-band/weka/db/db.rb +74 -0
- data/lib/ruby-band/weka/filters/supervised/attribute/attribute.rb +55 -0
- data/lib/ruby-band/weka/filters/supervised/instance/instance.rb +17 -0
- data/lib/ruby-band/weka/filters/supervised/supervised_utils.rb +38 -0
- data/lib/ruby-band/weka/filters/unsupervised/attribute/attribute.rb +90 -0
- data/lib/ruby-band/weka/filters/unsupervised/instance/instance.rb +48 -0
- data/lib/ruby-band/weka/filters/unsupervised/unsupervised_utils.rb +38 -0
- data/resources/ReutersGrain-test.arff +611 -0
- data/resources/ReutersGrain-train.arff +1561 -0
- data/resources/weather.csv +15 -0
- data/resources/weather.numeric.arff +23 -0
- data/ruby-band.gemspec +178 -0
- data/spec/ruby-band_spec.rb +7 -0
- data/spec/spec_helper.rb +12 -0
- data/test/helper.rb +18 -0
- data/test/test_apacheCorrelation.rb +22 -0
- data/test/test_apacheInference.rb +46 -0
- data/test/test_ruby-band.rb +9 -0
- metadata +426 -0
data/.travis.yml
ADDED
data/Gemfile
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
source "http://rubygems.org"
|
2
|
+
# Add dependencies required to use your gem here.
|
3
|
+
# Example:
|
4
|
+
# gem "activesupport", ">= 2.3.5"
|
5
|
+
|
6
|
+
platforms :jruby do
|
7
|
+
gem "i18n", "0.6.1"
|
8
|
+
gem "activesupport", "3.2.13"
|
9
|
+
gem "rake"
|
10
|
+
gem "bio", ">= 1.4.2"
|
11
|
+
gem "jbundler", "0.4.3"
|
12
|
+
gem "ruport"
|
13
|
+
gem "json"
|
14
|
+
gem "multi_json"
|
15
|
+
gem "gherkin"
|
16
|
+
gem "git"
|
17
|
+
gem "shoulda", ">= 0"
|
18
|
+
gem "test-unit", ">= 0"
|
19
|
+
gem "rdoc", "~> 3.12"
|
20
|
+
gem "bundler", "~> 1.3.5"
|
21
|
+
gem "jeweler", "~> 1.8.4"
|
22
|
+
gem "simplecov", ">= 0"
|
23
|
+
gem "cucumber"
|
24
|
+
gem "rspec"
|
25
|
+
end
|
26
|
+
|
27
|
+
# Add dependencies to develop your gem here.
|
28
|
+
# Include everything needed to run rake, tests, features, etc.
|
29
|
+
#group :development do
|
30
|
+
#end
|
data/Gemfile.lock
ADDED
@@ -0,0 +1,119 @@
|
|
1
|
+
GEM
|
2
|
+
remote: http://rubygems.org/
|
3
|
+
specs:
|
4
|
+
activesupport (3.2.13)
|
5
|
+
i18n (= 0.6.1)
|
6
|
+
multi_json (~> 1.0)
|
7
|
+
addressable (2.3.5)
|
8
|
+
bio (1.4.3.0001)
|
9
|
+
builder (3.2.2)
|
10
|
+
color (1.4.2)
|
11
|
+
cucumber (1.3.8)
|
12
|
+
builder (>= 2.1.2)
|
13
|
+
diff-lcs (>= 1.1.3)
|
14
|
+
gherkin (~> 2.12.1)
|
15
|
+
multi_json (>= 1.7.5, < 2.0)
|
16
|
+
multi_test (>= 0.0.2)
|
17
|
+
diff-lcs (1.2.4)
|
18
|
+
faraday (0.8.8)
|
19
|
+
multipart-post (~> 1.2.0)
|
20
|
+
fastercsv (1.5.5)
|
21
|
+
gherkin (2.12.1-java)
|
22
|
+
multi_json (~> 1.3)
|
23
|
+
git (1.2.6)
|
24
|
+
github_api (0.10.1)
|
25
|
+
addressable
|
26
|
+
faraday (~> 0.8.1)
|
27
|
+
hashie (>= 1.2)
|
28
|
+
multi_json (~> 1.4)
|
29
|
+
nokogiri (~> 1.5.2)
|
30
|
+
oauth2
|
31
|
+
hashie (2.0.5)
|
32
|
+
highline (1.6.19)
|
33
|
+
httpauth (0.2.0)
|
34
|
+
i18n (0.6.1)
|
35
|
+
jbundler (0.4.3)
|
36
|
+
maven-tools (~> 0.32.1)
|
37
|
+
ruby-maven (~> 3.0.4)
|
38
|
+
jeweler (1.8.7)
|
39
|
+
builder
|
40
|
+
bundler (~> 1.0)
|
41
|
+
git (>= 1.2.5)
|
42
|
+
github_api (= 0.10.1)
|
43
|
+
highline (>= 1.6.15)
|
44
|
+
nokogiri (= 1.5.10)
|
45
|
+
rake
|
46
|
+
rdoc
|
47
|
+
json (1.8.0-java)
|
48
|
+
jwt (0.1.8)
|
49
|
+
multi_json (>= 1.5)
|
50
|
+
maven-tools (0.32.5)
|
51
|
+
multi_json (1.8.0)
|
52
|
+
multi_test (0.0.2)
|
53
|
+
multi_xml (0.5.5)
|
54
|
+
multipart-post (1.2.0)
|
55
|
+
nokogiri (1.5.10-java)
|
56
|
+
oauth2 (0.9.2)
|
57
|
+
faraday (~> 0.8)
|
58
|
+
httpauth (~> 0.2)
|
59
|
+
jwt (~> 0.1.4)
|
60
|
+
multi_json (~> 1.0)
|
61
|
+
multi_xml (~> 0.5)
|
62
|
+
rack (~> 1.2)
|
63
|
+
pdf-writer (1.1.8)
|
64
|
+
color (>= 1.4.0)
|
65
|
+
transaction-simple (~> 1.3)
|
66
|
+
rack (1.5.2)
|
67
|
+
rake (10.1.0)
|
68
|
+
rdoc (3.12.2)
|
69
|
+
json (~> 1.4)
|
70
|
+
rspec (2.14.1)
|
71
|
+
rspec-core (~> 2.14.0)
|
72
|
+
rspec-expectations (~> 2.14.0)
|
73
|
+
rspec-mocks (~> 2.14.0)
|
74
|
+
rspec-core (2.14.5)
|
75
|
+
rspec-expectations (2.14.2)
|
76
|
+
diff-lcs (>= 1.1.3, < 2.0)
|
77
|
+
rspec-mocks (2.14.3)
|
78
|
+
ruby-maven (3.0.4.1.4)
|
79
|
+
maven-tools (~> 0.32.3)
|
80
|
+
thor (>= 0.14.6, < 2.0)
|
81
|
+
ruport (1.6.3)
|
82
|
+
fastercsv
|
83
|
+
pdf-writer (= 1.1.8)
|
84
|
+
shoulda (3.5.0)
|
85
|
+
shoulda-context (~> 1.0, >= 1.0.1)
|
86
|
+
shoulda-matchers (>= 1.4.1, < 3.0)
|
87
|
+
shoulda-context (1.1.5)
|
88
|
+
shoulda-matchers (2.3.0)
|
89
|
+
activesupport (>= 3.0.0)
|
90
|
+
simplecov (0.7.1)
|
91
|
+
multi_json (~> 1.0)
|
92
|
+
simplecov-html (~> 0.7.1)
|
93
|
+
simplecov-html (0.7.1)
|
94
|
+
test-unit (2.5.5)
|
95
|
+
thor (0.18.1)
|
96
|
+
transaction-simple (1.4.0.2)
|
97
|
+
|
98
|
+
PLATFORMS
|
99
|
+
java
|
100
|
+
|
101
|
+
DEPENDENCIES
|
102
|
+
activesupport (= 3.2.13)
|
103
|
+
bio (>= 1.4.2)
|
104
|
+
bundler (~> 1.3.5)
|
105
|
+
cucumber
|
106
|
+
gherkin
|
107
|
+
git
|
108
|
+
i18n (= 0.6.1)
|
109
|
+
jbundler (= 0.4.3)
|
110
|
+
jeweler (~> 1.8.4)
|
111
|
+
json
|
112
|
+
multi_json
|
113
|
+
rake
|
114
|
+
rdoc (~> 3.12)
|
115
|
+
rspec
|
116
|
+
ruport
|
117
|
+
shoulda
|
118
|
+
simplecov
|
119
|
+
test-unit
|
data/Jarfile
ADDED
@@ -0,0 +1,9 @@
|
|
1
|
+
jar 'nz.ac.waikato.cms.weka:weka-stable','3.6.10'
|
2
|
+
jar 'org.apache.commons:commons-math3','3.0'
|
3
|
+
jar 'junit:junit','3.8.1'
|
4
|
+
jar 'mysql:mysql-connector-java','5.1.6'
|
5
|
+
jar 'postgresql:postgresql','9.1-901.jdbc4'
|
6
|
+
jar 'org.xerial:sqlite-jdbc','3.7.2'
|
7
|
+
jar 'hsqldb:hsqldb','1.8.0.7'
|
8
|
+
jar 'idb:idb','3.26'
|
9
|
+
jar 'mckoi:mckoi','0.93'
|
data/Jarfile.lock
ADDED
@@ -0,0 +1,10 @@
|
|
1
|
+
nz.ac.waikato.cms.weka:weka-stable:jar:3.6.10
|
2
|
+
net.sf.squirrel-sql.thirdparty-non-maven:java-cup:jar:0.11a
|
3
|
+
org.apache.commons:commons-math3:jar:3.0
|
4
|
+
junit:junit:jar:3.8.1
|
5
|
+
mysql:mysql-connector-java:jar:5.1.6
|
6
|
+
postgresql:postgresql:jar:9.1-901.jdbc4
|
7
|
+
org.xerial:sqlite-jdbc:jar:3.7.2
|
8
|
+
hsqldb:hsqldb:jar:1.8.0.7
|
9
|
+
idb:idb:jar:3.26
|
10
|
+
mckoi:mckoi:jar:0.93
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
This version of ruby-band is licensed under the BSD 2-clause license.
|
2
|
+
|
3
|
+
* http://sciruby.com
|
4
|
+
* http://github.com/sciruby/sciruby/wiki/License
|
5
|
+
|
6
|
+
You *must* read the Contributor Agreement before contributing code to the SciRuby Project. This is available online:
|
7
|
+
|
8
|
+
* http://github.com/sciruby/sciruby/wiki/Contributor-Agreement
|
9
|
+
|
10
|
+
-----
|
11
|
+
|
12
|
+
Copyright (c) 2010 - 2013, Ruby Science Foundation
|
13
|
+
All rights reserved.
|
14
|
+
|
15
|
+
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
|
16
|
+
|
17
|
+
* Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
|
18
|
+
|
19
|
+
* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
|
20
|
+
|
21
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
22
|
+
|
data/README.md
ADDED
@@ -0,0 +1,321 @@
|
|
1
|
+
# ruby-band
|
2
|
+
|
3
|
+
[![Build Status](https://travis-ci.org/arrigonialberto86/ruby-band.png)](https://travis-ci.org/arrigonialberto86/ruby-band)
|
4
|
+
|
5
|
+
|
6
|
+
Data mining and machine learning algorithms for Ruby
|
7
|
+
|
8
|
+
## Installation
|
9
|
+
|
10
|
+
Install the 'jbundler' and 'bundler' gems for JRuby before trying to install the
|
11
|
+
'ruby-band' gem. Maven is also required for .jars automatic download and
|
12
|
+
installation. On Ubuntu/Debian Maven should already be installed and on OSX
|
13
|
+
system you can get it from Brew
|
14
|
+
|
15
|
+
If you want to use 'ruby-band' APIs without installing the gem you need to run
|
16
|
+
command 'rake -T' once before requiring the gem in your script (this is
|
17
|
+
necessary for jbundler to download the '.jar' files and subsequently set the
|
18
|
+
Java classpath). Otherwise use:
|
19
|
+
|
20
|
+
gem install ruby-band
|
21
|
+
|
22
|
+
## Usage
|
23
|
+
|
24
|
+
### Dataset parsing
|
25
|
+
One central datatype of ruby-band is derived from the Weka counterpart (the class Weka.core.Instances). By instantiating this class, we obtain a matrix-like structure for storing an entire dataset. Ad-hoc methods were created to guarantee that 'Instances' class objects can be converted to other datatypes (e.g. Apache matrix) and back.
|
26
|
+
There are currently many ways to import data into ruby-band.
|
27
|
+
### Parsing data from ARFF/CSV files
|
28
|
+
You can simply parse an external Weka ARFF/CSV file by doing:
|
29
|
+
```ruby
|
30
|
+
require 'ruby-band'
|
31
|
+
dataset = Core::Parser.parse_ARFF(my_file.arff)
|
32
|
+
dataset = Core::Parser.parse_CSV(my_file.csv)
|
33
|
+
```
|
34
|
+
### In-memory dataset creation
|
35
|
+
Since the dataset type used by ruby-band is derived from Weka Instances class, we must define the domain of the data we want to insert into it. The attribute types supported by ruby-band are 'numeric', 'nominal', 'string' and 'date'. For this reason, each column in the dataset can contain only one data type to be valid.
|
36
|
+
If you want to build an in-memory dataset you can create an empty scaffold at first, then you populate it with your data, like this:
|
37
|
+
```ruby
|
38
|
+
require 'ruby-band'
|
39
|
+
# we create a dataset containing three columns (attributes)
|
40
|
+
dataset = Core::Type::Instances::Base.new do
|
41
|
+
nominal :first_attribute, ['yes','no']
|
42
|
+
nominal :second_attribute, ['maybe','perhaps']
|
43
|
+
numeric :third_attribute
|
44
|
+
end
|
45
|
+
```
|
46
|
+
we can populate 'by row' our matrix-like dataset using a bidimensional array:
|
47
|
+
```ruby
|
48
|
+
dataset.populate_by_row([['yes','maybe',6],['no','perhaps',21]])
|
49
|
+
```
|
50
|
+
Every row in the dataset above must meet these construction criteria: [a_nominal_value,b_nominal_value,c_numeric_value], in order to match the structure assigned during dataset initialization.
|
51
|
+
|
52
|
+
### How to operate on a dataset
|
53
|
+
|
54
|
+
The ruby-band `Core::Type::Instances` class offers a wide range of operations to easily access and modify a dataset. Some of them are very intuitive:
|
55
|
+
```ruby
|
56
|
+
require 'ruby-band'
|
57
|
+
dataset = Core::Parser.parse_ARFF(my_file.arff)
|
58
|
+
# we can now access the dataset
|
59
|
+
|
60
|
+
dataset.summary
|
61
|
+
dataset.n_col/.n_row/.dim # R-like functions
|
62
|
+
dataset.each_row/each_column {|row/column| function}
|
63
|
+
```
|
64
|
+
or we can modify it by doing:
|
65
|
+
```ruby
|
66
|
+
# to add a row
|
67
|
+
dataset.add_instance ['yes','maybe',21]
|
68
|
+
|
69
|
+
# to add a column
|
70
|
+
dataset.add_numeric_attribute 'my_numeric_attribute'
|
71
|
+
dataset.add_nominal_attribute 'my_nominal_attribute', ['ruby','is','fun']
|
72
|
+
```
|
73
|
+
In addition to these methods, ruby-band offers a wide range of filters to operate on the structure and the content of the Instances datasets.
|
74
|
+
|
75
|
+
### How to export a dataset
|
76
|
+
It is fairly easy to export a dataset to a CSV/ARFF file or to a MySQL table (with reference to the example above):
|
77
|
+
```ruby
|
78
|
+
dataset.to_ARFF my_output_file.arff
|
79
|
+
dataset.to_CSV my_output_file.csv
|
80
|
+
dataset.save_to_mysql 'jdbc:mysql://localhost:3306/DB_name', user_name, password, table_name
|
81
|
+
```
|
82
|
+
## Weka filters
|
83
|
+
In WEKA, filters are used to preprocess the data.
|
84
|
+
Each filter falls into one of the following two categories:
|
85
|
+
+ supervised – The filter requires a class attribute to be set.
|
86
|
+
+ unsupervised – A class attribute is not required to be present.
|
87
|
+
|
88
|
+
And into one of the two sub-categories:
|
89
|
+
+ attribute-based – Columns are processed, e.g., added or removed.
|
90
|
+
+ instance-based – Rows are processed, e.g., added or deleted.
|
91
|
+
|
92
|
+
As to the namespaces used for the filters available, they can be found here:
|
93
|
+
```ruby
|
94
|
+
Weka::Filter::Supervised::Attribute::my_filter.new
|
95
|
+
Weka::Filter::Supervised::Instance::my_filter.new
|
96
|
+
Weka::Filter::Unsupervised::Attribute::my_filter.new
|
97
|
+
Weka::Filter::Unsupervised::Instance::my_filter.new
|
98
|
+
```
|
99
|
+
|
100
|
+
These categories should make clear what the difference is between the two Discretize (Weka::Filter::Supervised:: || Unsupervised::Attribute::Discretize) filters in WEKA. The supervised one takes the class attribute and its distribution over the dataset into account, in order to determine the optimal number and size of bins, whereas the unsupervised one relies on a user-specified number of bins.
|
101
|
+
|
102
|
+
If you want to return a brief description with the required options for a selected filter class you only need to do this:
|
103
|
+
```ruby
|
104
|
+
filter = Weka::Filter::Unsupervised::Instance::my_filter.new
|
105
|
+
puts filter.description
|
106
|
+
puts filter.options_list
|
107
|
+
```
|
108
|
+
|
109
|
+
|
110
|
+
To apply a filter on a dataset we can use a very simple approach:
|
111
|
+
```ruby
|
112
|
+
dataset = Core::Parser::parse_ARFF('example_file.arff')
|
113
|
+
|
114
|
+
# filter instantiation
|
115
|
+
filter = Weka::Filter::Supervised::Attribute::my_filter.new
|
116
|
+
|
117
|
+
# input/options handling
|
118
|
+
filter.set do
|
119
|
+
data dataset
|
120
|
+
filter_options '-W'
|
121
|
+
end
|
122
|
+
|
123
|
+
# return a filtered dataset
|
124
|
+
filtered_dataset = filter.use
|
125
|
+
```
|
126
|
+
to list the available options for a given filter you can use the method `filter.options_list`
|
127
|
+
|
128
|
+
## Weka attribute selection
|
129
|
+
|
130
|
+
Preparing one’s data properly is a very important step for getting the best results. Reducing the number of attributes can not only help speed up the runtime of algorithms, but also help avoid “burying” the algorithm in a mass of attributes, when only a few are essential for building a good model.
|
131
|
+
|
132
|
+
There are three different types of evaluators in Weka at the moment: single attribute evaluators, attribute subset evaluators, attribute set evaluators. Most of the attribute selection schemes currently implemented are supervised, i.e., they require a dataset with a class attribute, but the usage (and the capabilities!) of each evaluator can be accessed by calling the method 'list_options' on an Evaluator (or Search) class object.
|
133
|
+
|
134
|
+
```ruby
|
135
|
+
# let's instantiate an evaluator
|
136
|
+
eval = Weka::Attribute_selection::Evaluator::CfsSubsetEval.new
|
137
|
+
|
138
|
+
puts eval.options_list
|
139
|
+
|
140
|
+
### -M Treat missing values as a separate value.
|
141
|
+
### -L Don't include locally predictive attributes.
|
142
|
+
|
143
|
+
puts eval.select_options '-M'
|
144
|
+
```
|
145
|
+
If we do not need to set any particular option for the evaluator and the search algorithm we can simply instantiate both classes:
|
146
|
+
|
147
|
+
```ruby
|
148
|
+
require 'ruby-band'
|
149
|
+
|
150
|
+
# Evaluator
|
151
|
+
eval = Weka::Attribute_selection::Evaluator::CfsSubsetEval.new
|
152
|
+
|
153
|
+
# Search algorithm
|
154
|
+
search = Weka::Attribute_selection::Search::GreedyStepwise.new
|
155
|
+
|
156
|
+
dataset = Core::Parser::parse_ARFF('weather.numeric.arff')
|
157
|
+
```
|
158
|
+
|
159
|
+
And then we filter our dataset using a supervised filter:
|
160
|
+
```ruby
|
161
|
+
filter = Weka::Filter::Supervised::Attribute::AttributeSelection.new
|
162
|
+
|
163
|
+
filter.set do
|
164
|
+
evaluator eval
|
165
|
+
search search
|
166
|
+
data dataset
|
167
|
+
end
|
168
|
+
|
169
|
+
filtered_dataset = filter.use
|
170
|
+
```
|
171
|
+
The returned Instances class object stores the results of the analysis we performed on the initial data.
|
172
|
+
|
173
|
+
## Classifiers
|
174
|
+
|
175
|
+
Classification and regression algorithms in WEKA are called “classifiers” and are located below the Weka::Classifier:: module. Currently, ruby-band only supports batch-trainable classifiers: this means they get trained on the whole dataset at once.
|
176
|
+
|
177
|
+
If you want to return a brief description with the required options for a selected classifier class you only need to do this:
|
178
|
+
```ruby
|
179
|
+
classifier = Weka::Classifier::Lazy::my_classifier.new
|
180
|
+
puts classifier.description
|
181
|
+
puts classifier.list_options
|
182
|
+
```
|
183
|
+
|
184
|
+
It is fairly easy to build a classifier using the ruby-band APIs:
|
185
|
+
```ruby
|
186
|
+
classifier = Weka::Classifier::Lazy::KStar::Base.new do
|
187
|
+
set_options '-M d'
|
188
|
+
set_data dataset
|
189
|
+
set_class_index 4
|
190
|
+
end
|
191
|
+
```
|
192
|
+
we can then evaluate the trained classifier using cross-validation:
|
193
|
+
```ruby
|
194
|
+
classifier.cross_validate(3) # ARG is 'folds' used by cross-validation
|
195
|
+
```
|
196
|
+
Alternatively, a test set can be used for evaluation by doing:
|
197
|
+
```ruby
|
198
|
+
test_set = Core::Parser::parse_ARFF 'some/where/file.arff'
|
199
|
+
test_set.set_class_index(0)
|
200
|
+
|
201
|
+
evaluator = Weka::Classifier::Evaluation.new $filtered_dataset
|
202
|
+
puts evaluator.evaluate_model(classifier,test_set)
|
203
|
+
```
|
204
|
+
|
205
|
+
### Classifying instances
|
206
|
+
In case you have an unlabeled dataset that you want to classify with your newly trained classifier, you can use the following code snippet.
|
207
|
+
```ruby
|
208
|
+
# 'unlabeled' is a dataset with class index set, but no class value
|
209
|
+
unlabeled.each_row_with_index do |instance,id|
|
210
|
+
label = classifier.classify_instance instance
|
211
|
+
unlabeled.instance(id).set_class_value label
|
212
|
+
end
|
213
|
+
|
214
|
+
unlabeled.to_ARFF/to_CSV 'my_file.arff' # save dataset with inserted class values
|
215
|
+
```
|
216
|
+
|
217
|
+
## Clusterers
|
218
|
+
|
219
|
+
Clustering is an unsupervised Machine Learning technique of finding patterns in the data, i.e., these algorithms work without class attributes. Classifiers, on the other hand, are supervised and need a class attribute. This section, similar to the one about classifiers, covers the following topics:
|
220
|
+
* Building a clusterer - batch (incremental must still be implemented) learning.
|
221
|
+
* Evaluating a clusterer - how to evaluate a built clusterer.
|
222
|
+
* Clustering instances - determining what clusters unknown instances belong to.
|
223
|
+
|
224
|
+
Clusterers, just like classifiers, are by design batch-trainable as well. They all can be built on data that is completely stored in memory. But a small subset of the cluster algorithms can also update the internal representation incrementally (this functionality must still be implemented, along with the ‘incremental’ mode for classifiers).
|
225
|
+
|
226
|
+
If you want to return a brief description and the options list for a selected clusterer class you only need to do this:
|
227
|
+
```ruby
|
228
|
+
clusterer = Weka::Clusterer::my_clusterer.new
|
229
|
+
puts clusterer.description
|
230
|
+
puts clusterer.list_options
|
231
|
+
```
|
232
|
+
|
233
|
+
This is an example of the usage for SimpleKMeans class:
|
234
|
+
```ruby
|
235
|
+
# load dataset
|
236
|
+
data_instance = Core::Parser::parse_ARFF 'some/where/file.arff'
|
237
|
+
kmeans = Weka::Clusterer::SimpleKMeans::Base.new do
|
238
|
+
set_data data_instance
|
239
|
+
set_options "-N 10 -c last"
|
240
|
+
end
|
241
|
+
# access description and available options
|
242
|
+
kmeans.list_options
|
243
|
+
kmeans.description
|
244
|
+
```
|
245
|
+
### Evaluating a clusterer
|
246
|
+
The evaluation on a built clusterer can be performed like this:
|
247
|
+
```ruby
|
248
|
+
puts kmeans.evaluate
|
249
|
+
# or, if you want to evaluate the model on a different dataset
|
250
|
+
puts clusterer.evaluate(dataset)
|
251
|
+
```
|
252
|
+
### Adding a 'cluster' attribute to a dataset
|
253
|
+
After performing clustering on a training set, we can use the clusterer to assign a 'cluster label' to a new dataset. In order to do this, we add a new 'cluster' attribute to the dataset, and we subsequently fill it with cluster assignments.
|
254
|
+
```ruby
|
255
|
+
# remember the 'data_instance' dataset from the previous example
|
256
|
+
filter = Weka::Filter::Unsupervised::Attribute::AddCluster.new
|
257
|
+
filter.set do
|
258
|
+
data data_instance
|
259
|
+
clusterer kmeans
|
260
|
+
end
|
261
|
+
# now fill the attribute with values
|
262
|
+
data_instance.each_row do |inst|
|
263
|
+
puts "Inst: #{inst.to_string}\t\t Cluster assignment #{kmeans.cluster_instance(inst)}"
|
264
|
+
end
|
265
|
+
```
|
266
|
+
|
267
|
+
## Classes to clusters
|
268
|
+
Datasets for supervised algorithms, like classifiers, can be used to evaluate a clusterer as well. This evaluation is called classes-to-clusters, as the clusters are mapped back onto the classes.
|
269
|
+
|
270
|
+
In this mode Weka first ignores the class attribute and generates the clustering. Then during the test phase it assigns classes to the clusters, based on the majority value of the class attribute within each cluster. Then it computes the classification error, based on this assignment.
|
271
|
+
```ruby
|
272
|
+
# parse dataset
|
273
|
+
dataset = Core::Parser::parse_CSV some_data.csv
|
274
|
+
|
275
|
+
# eliminate class values
|
276
|
+
filter = Weka::Filter::Unsupervised::Attribute::Remove.new
|
277
|
+
filter.set do
|
278
|
+
attribute_indices "#{dataset.class_index+1}"
|
279
|
+
data dataset
|
280
|
+
end
|
281
|
+
unlabeled_dataset = filter.use
|
282
|
+
|
283
|
+
# instantiate the clusterer
|
284
|
+
clusterer = Weka::Clusterer::EM.new { set_data unlabeled_dataset}
|
285
|
+
|
286
|
+
# evaluate the clusterer
|
287
|
+
puts clusterer.evaluate(dataset)
|
288
|
+
```
|
289
|
+
|
290
|
+
## Developers
|
291
|
+
|
292
|
+
To use the library
|
293
|
+
|
294
|
+
require 'ruby-band'
|
295
|
+
|
296
|
+
The API doc is online. For more code examples see also the test files in the
|
297
|
+
source tree.
|
298
|
+
|
299
|
+
## Project home page
|
300
|
+
|
301
|
+
For information on the source tree, documentation, issues and how to contribute,
|
302
|
+
see
|
303
|
+
|
304
|
+
http://github.com/arrigonialberto86/ruby-band
|
305
|
+
|
306
|
+
The BioRuby community is on IRC server: irc.freenode.org, channel: #bioruby.
|
307
|
+
|
308
|
+
## Cite
|
309
|
+
|
310
|
+
If you use this software, please cite one of
|
311
|
+
|
312
|
+
* [BioRuby: bioinformatics software for the Ruby programming
|
313
|
+
language](http://dx.doi.org/10.1093/bioinformatics/btq475)
|
314
|
+
* [Biogem: an effective tool-based approach for scaling up open source
|
315
|
+
software development in
|
316
|
+
bioinformatics](http://dx.doi.org/10.1093/bioinformatics/bts080)
|
317
|
+
|
318
|
+
|
319
|
+
## Copyright
|
320
|
+
|
321
|
+
Copyright (c) 2013 arrigonialberto86. See LICENSE.txt for further details.
|