weka 0.2.0-java → 0.3.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +3 -669
- data/lib/weka/core/attribute.rb +7 -7
- data/lib/weka/core/dense_instance.rb +29 -10
- data/lib/weka/core/instances.rb +6 -0
- data/lib/weka/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 949c1294da4acebd35a534c14a8ddd7ebc7ac126
|
4
|
+
data.tar.gz: de345d42f452a2846dbbbaded9bb0fbfd6fdd7b7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 359199eb33e50e51f673468271386d7668f661e495a75efc3b1afeeae9f7c52be5d19d5457106412f5131b2f237c41867c7c08c152553af18a7dbe017187edca
|
7
|
+
data.tar.gz: 7a0e3b343960027d9156b03c885f733f9e50aaf1cbbf9905dedd55ff51113add5e5212ba9dfb1f5b15907951316b5d61546fbe92e9cde37d7da708736e424ad8
|
data/README.md
CHANGED
@@ -23,14 +23,7 @@ Or install it yourself as:
|
|
23
23
|
|
24
24
|
## Usage
|
25
25
|
|
26
|
-
|
27
|
-
* [Filters](#filters)
|
28
|
-
* [Attribute selection](#attribute-selection)
|
29
|
-
* [Classifiers](#classifiers)
|
30
|
-
* [Clusterers](#clusterers)
|
31
|
-
* [Serializing objects](#serializing-objects)
|
32
|
-
|
33
|
-
Start using Weka's Machine Learning and Data Mining algorithms by requiring the gem:
|
26
|
+
Use Weka's Machine Learning and Data Mining algorithms by requiring the gem:
|
34
27
|
|
35
28
|
```ruby
|
36
29
|
require 'weka'
|
@@ -40,667 +33,8 @@ The weka gem tries to carry over the namespaces defined in Weka and enhances som
|
|
40
33
|
|
41
34
|
The idea behind keeping the namespaces is, that you can also use the [Weka documentation](http://weka.sourceforge.net/doc.dev/) for looking up functionality and classes.
|
42
35
|
|
43
|
-
|
44
|
-
|
45
|
-
| Namespace | Description |
|
46
|
-
|----------------------------|------------------------------------------------------------------|
|
47
|
-
| `Weka::Core` | defines base classes for loading, saving, creating, and editing a dataset |
|
48
|
-
| `Weka::Classifiers` | defines classifier classes in different sub-modules (`Bayes`, `Functions`, `Lazy`, `Meta`, `Rules`, and `Trees` ) |
|
49
|
-
| `Weka::Filters` | defines filter classes for processing datasets in the `Supervised` or `Unsupervised`, and `Attribute` or `Instance` sub-modules |
|
50
|
-
| `Weka::Clusterers` | defines clusterer classes |
|
51
|
-
| `Weka::AttributeSelection` | defines classes for selecting attributes from a dataset |
|
52
|
-
|
53
|
-
### Instances
|
54
|
-
|
55
|
-
Instances objects hold the dataset that is used to train a classifier or that
|
56
|
-
should be classified based on training data.
|
57
|
-
|
58
|
-
Instances can be loaded from files and saved to files.
|
59
|
-
Supported formats are *ARFF*, *CSV*, and *JSON*.
|
60
|
-
|
61
|
-
#### Loading Instances from a file
|
62
|
-
|
63
|
-
Instances can be loaded from ARFF, CSV, and JSON files:
|
64
|
-
|
65
|
-
```ruby
|
66
|
-
instances = Weka::Core::Instances.from_arff('weather.arff')
|
67
|
-
instances = Weka::Core::Instances.from_csv('weather.csv')
|
68
|
-
instances = Weka::Core::Instances.from_json('weather.json')
|
69
|
-
```
|
70
|
-
|
71
|
-
#### Creating Instances
|
72
|
-
|
73
|
-
Attributes of an Instances object can be defined in a block using the `with_attributes` method. The class attribute can be set by the `class_attribute: true` option on the fly with defining an attribute.
|
74
|
-
|
75
|
-
```ruby
|
76
|
-
# create instances with relation name 'weather' and attributes
|
77
|
-
instances = Weka::Core::Instances.new(relation_name: 'weather').with_attributes do
|
78
|
-
nominal :outlook, values: ['sunny', 'overcast', 'rainy']
|
79
|
-
numeric :temperature
|
80
|
-
numeric :humidity
|
81
|
-
nominal :windy, values: [true, false]
|
82
|
-
date :last_storm, 'yyyy-MM-dd'
|
83
|
-
nominal :play, values: [:yes, :no], class_attribute: true
|
84
|
-
end
|
85
|
-
```
|
86
|
-
|
87
|
-
You can also pass an array of Attributes on instantiating new Instances:
|
88
|
-
This is useful, if you want to create a new empty Instances object with the same
|
89
|
-
attributes as an already existing one:
|
90
|
-
|
91
|
-
```ruby
|
92
|
-
# Take attributes from existing instances
|
93
|
-
attributes = instances.attributes
|
94
|
-
|
95
|
-
# create an empty Instances object with the given attributes
|
96
|
-
test_instances = Weka::Core::Instances.new(attributes: attributes)
|
97
|
-
```
|
98
|
-
|
99
|
-
#### Saving Instances as files
|
100
|
-
|
101
|
-
You can save Instances as ARFF, CSV, or JSON file.
|
102
|
-
|
103
|
-
```ruby
|
104
|
-
instances.to_arff('weather.arff')
|
105
|
-
instances.to_csv('weather.csv')
|
106
|
-
instances.to_json('weather.json')
|
107
|
-
```
|
108
|
-
|
109
|
-
#### Adding additional attributes
|
110
|
-
|
111
|
-
You can add additional attributes to the Instances after its initialization.
|
112
|
-
All records that are already in the dataset will get an unknown value (`?`) for
|
113
|
-
the new attribute.
|
114
|
-
|
115
|
-
```ruby
|
116
|
-
instances.add_numeric_attribute(:pressure)
|
117
|
-
instances.add_nominal_attribute(:grandma_says, values: [:hm, :bad, :terrible])
|
118
|
-
instances.add_date_attribute(:last_rain, 'yyyy-MM-dd HH:mm')
|
119
|
-
```
|
120
|
-
|
121
|
-
#### Adding a data instance
|
122
|
-
|
123
|
-
You can add a data instance to the Instances by using the `add_instance` method:
|
124
|
-
|
125
|
-
```ruby
|
126
|
-
data = [:sunny, 70, 80, true, '2015-12-06', :yes, 1.1, :hm, '2015-12-24 20:00']
|
127
|
-
instances.add_instance(data)
|
128
|
-
|
129
|
-
# with custom weight:
|
130
|
-
instances.add_instance(data, weight: 2.0)
|
131
|
-
```
|
132
|
-
|
133
|
-
Multiple instances can be added with the `add_instances` method:
|
134
|
-
|
135
|
-
```ruby
|
136
|
-
data = [
|
137
|
-
[:sunny, 70, 80, true, '2015-12-06', :yes, 1.1, :hm, '2015-12-24 20:00'],
|
138
|
-
[:overcast, 80, 85, false, '2015-11-11', :no, 0.9, :bad, '2015-12-25 18:13']
|
139
|
-
]
|
140
|
-
|
141
|
-
instances.add_instances(data, weight: 2.0)
|
142
|
-
```
|
143
|
-
|
144
|
-
If the `weight` argument is not given, then a default weight of 1.0 is used.
|
145
|
-
The weight in `add_instances` is used for all the added instances.
|
146
|
-
|
147
|
-
#### Setting a class attribute
|
148
|
-
|
149
|
-
You can set an earlier defined attribute as the class attribute of the dataset.
|
150
|
-
This allows classifiers to use the class for building a classification model while training.
|
151
|
-
|
152
|
-
```ruby
|
153
|
-
instances.add_nominal_attribute(:size, values: ['L', 'XL'])
|
154
|
-
instances.class_attribute = :size
|
155
|
-
```
|
156
|
-
|
157
|
-
The added attribute can also be directly set as the class attribute:
|
158
|
-
|
159
|
-
```ruby
|
160
|
-
instances.add_nominal_attribute(:size, values: ['L', 'XL'], class_attribute: true)
|
161
|
-
```
|
162
|
-
|
163
|
-
Keep in mind that you can only assign existing attributes to be the class attribute.
|
164
|
-
The class attribute will not appear in the `instances.attributes` anymore and can be accessed with the `class_attribute` method.
|
165
|
-
|
166
|
-
|
167
|
-
#### Alias methods
|
168
|
-
|
169
|
-
`Weka::Core::Instances` has following alias methods:
|
170
|
-
|
171
|
-
| method | alias |
|
172
|
-
|-----------------------|-------------------------|
|
173
|
-
| `numeric` | `add_numeric_attribute` |
|
174
|
-
| `nominal` | `add_nominal_attribute` |
|
175
|
-
| `date` | `add_date_attribute` |
|
176
|
-
| `string` | `add_string_attribute` |
|
177
|
-
| `set_class_attribute` | `class_attribute=` |
|
178
|
-
| `with_attributes` | `add_attributes` |
|
179
|
-
|
180
|
-
The methods on the left side are meant to be used when defining
|
181
|
-
attributes in a block when using `#with_attributes` (or `#add_attributes`).
|
182
|
-
|
183
|
-
The alias methods are meant to be used for explicitly adding
|
184
|
-
attributes to an Instances object or defining its class attribute later on.
|
185
|
-
|
186
|
-
## Filters
|
187
|
-
|
188
|
-
Filters are used to preprocess datasets.
|
189
|
-
|
190
|
-
There are two categories of filters which are also reflected by the namespaces:
|
191
|
-
|
192
|
-
* *supervised* – The filter requires a class attribute to be set
|
193
|
-
* *unsupervised* – A class attribute is not required to be present
|
194
|
-
|
195
|
-
In each category there are two sub-categories:
|
196
|
-
|
197
|
-
* *attribute-based* – Attributes (columns) are processed
|
198
|
-
* *instance-based* – Instances (rows) are processed
|
199
|
-
|
200
|
-
Thus, Filter classes are organized in the following four namespaces:
|
201
|
-
|
202
|
-
```ruby
|
203
|
-
Weka::Filters::Supervised::Attribute
|
204
|
-
Weka::Filters::Supervised::Instance
|
205
|
-
|
206
|
-
Weka::Filters::Unsupervised::Attribute
|
207
|
-
Weka::Filters::Unsupervised::Instance
|
208
|
-
```
|
209
|
-
|
210
|
-
#### Filtering Instances
|
211
|
-
|
212
|
-
Filters can be used directly to filter Instances:
|
213
|
-
|
214
|
-
```ruby
|
215
|
-
# create filter
|
216
|
-
filter = Weka::Filters::Unsupervised::Attribute::Normalize.new
|
217
|
-
|
218
|
-
# filter instances
|
219
|
-
filtered_data = filter.filter(instances)
|
220
|
-
```
|
221
|
-
|
222
|
-
You can also apply a Filter on an Instances object:
|
223
|
-
|
224
|
-
```ruby
|
225
|
-
# create filter
|
226
|
-
filter = Weka::Filters::Unsupervised::Attribute::Normalize.new
|
227
|
-
|
228
|
-
# apply filter on instances
|
229
|
-
filtered_data = instances.apply_filter(filter)
|
230
|
-
```
|
231
|
-
|
232
|
-
With this approach, it is possible to chain multiple filters on a dataset:
|
233
|
-
|
234
|
-
```ruby
|
235
|
-
# create filters
|
236
|
-
include Weka::Filters::Unsupervised::Attribute
|
237
|
-
|
238
|
-
normalize = Normalize.new
|
239
|
-
discretize = Discretize.new
|
240
|
-
|
241
|
-
# apply a filter chain on instances
|
242
|
-
filtered_data = instances.apply_filter(normalize).apply_filter(discretize)
|
243
|
-
|
244
|
-
# or even shorter
|
245
|
-
filtered_data = instances.apply_filters(normalize, discretize)
|
246
|
-
```
|
247
|
-
|
248
|
-
#### Setting Filter options
|
249
|
-
|
250
|
-
Any Filter has several options. You can list a description of all options of a filter:
|
251
|
-
|
252
|
-
```ruby
|
253
|
-
puts Weka::Filters::Unsupervised::Attribute::Normalize.options
|
254
|
-
# -S <num> The scaling factor for the output range.
|
255
|
-
# (default: 1.0)
|
256
|
-
# -T <num> The translation of the output range.
|
257
|
-
# (default: 0.0)
|
258
|
-
# -unset-class-temporarily Unsets the class index temporarily before the filter is
|
259
|
-
# applied to the data.
|
260
|
-
# (default: no)
|
261
|
-
```
|
262
|
-
|
263
|
-
To get the default option set of a Filter you can run `.default_options`:
|
264
|
-
|
265
|
-
```ruby
|
266
|
-
Weka::Filters::Unsupervised::Attribute::Normalize.default_options
|
267
|
-
# => '-S 1.0 -T 0.0'
|
268
|
-
```
|
269
|
-
|
270
|
-
Options can be set while building a Filter:
|
271
|
-
|
272
|
-
```ruby
|
273
|
-
filter = Weka::Filters::Unsupervised::Attribute::Normalize.build do
|
274
|
-
use_options '-S 0.5'
|
275
|
-
end
|
276
|
-
```
|
277
|
-
|
278
|
-
Or they can be set or changed after you created the Filter:
|
279
|
-
|
280
|
-
```ruby
|
281
|
-
filter = Weka::Filters::Unsupervised::Attribute::Normalize.new
|
282
|
-
filter.use_options('-S 0.5')
|
283
|
-
```
|
284
|
-
|
285
|
-
## Attribute selection
|
286
|
-
|
287
|
-
Selecting attributes (features) from a set of instances is important
|
288
|
-
for getting the best result out of a classification or clustering.
|
289
|
-
Attribute selection reduces the number of attributes and thereby can speed up
|
290
|
-
the runtime of the algorithms.
|
291
|
-
It also avoids processing too many attributes when only a certain subset is essential
|
292
|
-
for building a good model.
|
293
|
-
|
294
|
-
For attribute selection you need to apply a search and an evaluation method on a dataset.
|
295
|
-
|
296
|
-
Search methods are defined in the `Weka::AttributeSelection::Search` module.
|
297
|
-
There are search methods for subset search and individual attribute search.
|
298
|
-
|
299
|
-
Evaluators are defined in the `Weka::AttributeSelection::Evaluator` module.
|
300
|
-
Corresponding to search method types there are two evaluator types for subset search and individual search.
|
301
|
-
|
302
|
-
The search methods and evaluators from each category can be combined to perform an attribute selection.
|
303
|
-
|
304
|
-
**Classes for attribute *subset* selection:**
|
305
|
-
|
306
|
-
| Search | Evaluators |
|
307
|
-
|-------------------------------|------------------------------|
|
308
|
-
| `BestFirst`, `GreedyStepwise` | `CfsSubset`, `WrapperSubset` |
|
309
|
-
|
310
|
-
**Classes for *individual* attribute selection:**
|
311
|
-
|
312
|
-
| Search | Evaluators |
|
313
|
-
|----------|------------|
|
314
|
-
| `Ranker` | `CorrelationAttribute`, `GainRatioAttribute`, `InfoGainAttribute`, `OneRAttribute`, `ReliefFAttribute`, `SymmetricalUncertAttribute` |
|
315
|
-
|
316
|
-
An attribute selection can either be performed with the `Weka::AttributeSelection::AttributeSelection` class:
|
317
|
-
|
318
|
-
```ruby
|
319
|
-
instances = Weka::Core::Instances.from_arff('weather.arff')
|
320
|
-
|
321
|
-
selection = Weka::AttributeSelection::AttributeSelection.new
|
322
|
-
selection.search = Weka::AttributeSelection::Search::Ranker.new
|
323
|
-
selection.evaluator = Weka::AttributeSelection::Evaluator::PricipalComponents.new
|
324
|
-
|
325
|
-
selection.select_attribute(instances)
|
326
|
-
puts selection.summary
|
327
|
-
```
|
328
|
-
|
329
|
-
Or you can use the supervised `AttributeSelection` filter to directly filter instances:
|
330
|
-
|
331
|
-
```ruby
|
332
|
-
instances = Weka::Core::Instances.from_arff('weather.arff')
|
333
|
-
search = Weka::AttributeSelection::Search::Ranker.new
|
334
|
-
evaluator = Weka::AttributeSelection::Evaluator::PricipalComponents.new
|
335
|
-
|
336
|
-
filter = Weka::Filters::Supervised::Attribute::AttributeSelection.build do
|
337
|
-
use_search search
|
338
|
-
use_evaluator evaluator
|
339
|
-
end
|
340
|
-
|
341
|
-
filtered_instances = instances.apply_filter(filter)
|
342
|
-
```
|
343
|
-
|
344
|
-
## Classifiers
|
345
|
-
|
346
|
-
Weka‘s classification and regression algorithms can be found in the `Weka::Classifiers`
|
347
|
-
namespace.
|
348
|
-
|
349
|
-
The classifier classes are organised in the following submodules:
|
350
|
-
|
351
|
-
```ruby
|
352
|
-
Weka::Classifiers::Bayes
|
353
|
-
Weka::Classifiers::Functions
|
354
|
-
Weka::Classifiers::Lazy
|
355
|
-
Weka::Classifiers::Meta
|
356
|
-
Weka::Classifiers::Rules
|
357
|
-
Weka::Classifiers::Trees
|
358
|
-
```
|
359
|
-
|
360
|
-
#### Getting information about a classifier
|
361
|
-
|
362
|
-
To get a description about the classifier class and its available options
|
363
|
-
you can use the class methods `.description` and `.options` on each classifier:
|
364
|
-
|
365
|
-
```ruby
|
366
|
-
puts Weka::Classifiers::Trees::RandomForest.description
|
367
|
-
# Class for constructing a forest of random trees.
|
368
|
-
# For more information see:
|
369
|
-
# Leo Breiman (2001). Random Forests. Machine Learning. 45(1):5-32.
|
370
|
-
|
371
|
-
puts Weka::Classifiers::Trees::RandomForest.options
|
372
|
-
# -I <number of trees> Number of trees to build.
|
373
|
-
# (default 100)
|
374
|
-
# -K <number of features> Number of features to consider (<1=int(log_2(#predictors)+1)).
|
375
|
-
# (default 0)
|
376
|
-
# ...
|
377
|
-
|
378
|
-
```
|
379
|
-
|
380
|
-
The default options that are used for a classifier can be displayed with:
|
381
|
-
|
382
|
-
```ruby
|
383
|
-
Weka::Classifiers::Trees::RandomForest.default_options
|
384
|
-
# => "-I 100 -K 0 -S 1 -num-slots 1"
|
385
|
-
```
|
386
|
-
|
387
|
-
#### Creating a new classifier
|
388
|
-
|
389
|
-
To build a new classifiers model based on training instances you can use
|
390
|
-
the following syntax:
|
391
|
-
|
392
|
-
```ruby
|
393
|
-
instances = Weka::Core::Instances.from_arff('weather.arff')
|
394
|
-
instances.class_attribute = :play
|
395
|
-
|
396
|
-
classifier = Weka::Classifiers::Trees::RandomForest.new
|
397
|
-
classifier.use_options('-I 200 -K 5')
|
398
|
-
classifier.train_with_instances(instances)
|
399
|
-
```
|
400
|
-
You can also build a classifier by using the block syntax:
|
401
|
-
|
402
|
-
```ruby
|
403
|
-
classifier = Weka::Classifiers::Trees::RandomForest.build do
|
404
|
-
use_options '-I 200 -K 5'
|
405
|
-
train_with_instances instances
|
406
|
-
end
|
407
|
-
|
408
|
-
```
|
409
|
-
|
410
|
-
#### Evaluating a classifier model
|
411
|
-
|
412
|
-
You can evaluate the trained classifier using [cross-validation](https://en.wikipedia.org/wiki/Cross-validation_(statistics)):
|
413
|
-
|
414
|
-
```ruby
|
415
|
-
# default number of folds is 3
|
416
|
-
evaluation = classifier.cross_validate
|
417
|
-
|
418
|
-
# with a custom number of folds
|
419
|
-
evaluation = classifier.cross_validate(folds: 10)
|
420
|
-
```
|
421
|
-
|
422
|
-
The cross-validation returns a `Weka::Classifiers::Evaluation` object which can be used to get details about the accuracy of the trained classification model:
|
423
|
-
|
424
|
-
```ruby
|
425
|
-
puts evaluation.summary
|
426
|
-
#
|
427
|
-
# Correctly Classified Instances 10 71.4286 %
|
428
|
-
# Incorrectly Classified Instances 4 28.5714 %
|
429
|
-
# Kappa statistic 0.3778
|
430
|
-
# Mean absolute error 0.4098
|
431
|
-
# Root mean squared error 0.4657
|
432
|
-
# Relative absolute error 87.4588 %
|
433
|
-
# Root relative squared error 96.2945 %
|
434
|
-
# Coverage of cases (0.95 level) 100 %
|
435
|
-
# Mean rel. region size (0.95 level) 96.4286 %
|
436
|
-
# Total Number of Instances 14
|
437
|
-
```
|
438
|
-
|
439
|
-
The evaluation holds detailed information about a number of different measures of interest,
|
440
|
-
like the [precision and recall](https://en.wikipedia.org/wiki/Precision_and_recall), the FP/FN/TP/TN-rates, [F-Measure](https://en.wikipedia.org/wiki/F1_score) and the areas under PRC and [ROC](https://en.wikipedia.org/wiki/Receiver_operating_characteristic) curve.
|
441
|
-
|
442
|
-
If your trained classifier should be evaluated against a set of *test instances*,
|
443
|
-
you can use `evaluate`:
|
444
|
-
|
445
|
-
```ruby
|
446
|
-
test_instances = Weka::Core::Instances.from_arff('test_data.arff')
|
447
|
-
test_instances.class_attribute = :play
|
448
|
-
|
449
|
-
evaluation = classifier.evaluate(test_instances)
|
450
|
-
```
|
451
|
-
|
452
|
-
#### Classifying new data
|
453
|
-
|
454
|
-
Each classifier implements either a `classify` method or a `distribution_for` method, or both.
|
455
|
-
|
456
|
-
The `classify` method takes a Weka::Core::DenseInstance or an Array of values as argument and returns the predicted class value:
|
457
|
-
|
458
|
-
```ruby
|
459
|
-
instances = Weka::Core::Instances.from_arff('unclassified_data.arff')
|
460
|
-
|
461
|
-
# with an instance as argument
|
462
|
-
instances.map do |instance|
|
463
|
-
classifier.classify(instance)
|
464
|
-
end
|
465
|
-
# => ['no', 'yes', 'yes', ...]
|
466
|
-
|
467
|
-
# with an Array of values as argument
|
468
|
-
classifier.classify [:sunny, 80, 80, :FALSE, '?']
|
469
|
-
# => 'yes'
|
470
|
-
```
|
471
|
-
|
472
|
-
The `distribution_for` method takes a Weka::Core::DenseInstance or an Array of values as argument as well and returns a hash with the distributions per class value:
|
473
|
-
|
474
|
-
```ruby
|
475
|
-
instances = Weka::Core::Instances.from_arff('unclassified_data.arff')
|
476
|
-
|
477
|
-
# with an instance as argument
|
478
|
-
classifier.distribution_for(instances.first)
|
479
|
-
# => { "yes" => 0.26, "no" => 0.74 }
|
480
|
-
|
481
|
-
# with an Array of values as argument
|
482
|
-
classifier.distribution_for [:sunny, 80, 80, :FALSE, '?']
|
483
|
-
# => { "yes" => 0.62, "no" => 0.38 }
|
484
|
-
```
|
485
|
-
|
486
|
-
### Clusterers
|
487
|
-
|
488
|
-
Clustering is an unsupervised machine learning technique which tries to find patterns in data and group sets of data. Clustering algorithms work without class attributes.
|
489
|
-
|
490
|
-
Weka‘s clustering algorithms can be found in the `Weka::Clusterers` namespace.
|
491
|
-
|
492
|
-
The following clusterer classes are available:
|
493
|
-
|
494
|
-
```ruby
|
495
|
-
Weka::Clusterers::Canopy
|
496
|
-
Weka::Clusterers::Cobweb
|
497
|
-
Weka::Clusterers::EM
|
498
|
-
Weka::Clusterers::FarthestFirst
|
499
|
-
Weka::Clusterers::HierarchicalClusterer
|
500
|
-
Weka::Clusterers::SimpleKMeans
|
501
|
-
```
|
502
|
-
|
503
|
-
#### Getting information about a clusterer
|
504
|
-
|
505
|
-
To get a description about the clusterer class and its available options
|
506
|
-
you can use the class methods `.description` and `.options` on each clusterer:
|
507
|
-
|
508
|
-
```ruby
|
509
|
-
puts Weka::Clusterers::SimpleKMeans.description
|
510
|
-
# Cluster data using the k means algorithm.
|
511
|
-
# ...
|
512
|
-
|
513
|
-
puts Weka::Clusterers::SimpleKMeans.options
|
514
|
-
# -N <num> Number of clusters.
|
515
|
-
# (default 2).
|
516
|
-
# -init Initialization method to use.
|
517
|
-
# 0 = random, 1 = k-means++, 2 = canopy, 3 = farthest first.
|
518
|
-
# (default = 0)
|
519
|
-
# ...
|
520
|
-
```
|
521
|
-
|
522
|
-
The default options that are used for a clusterer can be displayed with:
|
523
|
-
|
524
|
-
```ruby
|
525
|
-
Weka::Clusterers::SimpleKMeans.default_options
|
526
|
-
# => "-init 0 -max-candidates 100 -periodic-pruning 10000 -min-density 2.0 -t1 -1.25
|
527
|
-
# -t2 -1.0 -N 2 -A weka.core.EuclideanDistance -R first-last -I 500 -num-slots 1 -S 10"
|
528
|
-
```
|
529
|
-
|
530
|
-
#### Creating a new Clusterer
|
531
|
-
|
532
|
-
To build a new clusterer model based on training instances you can use the following syntax:
|
533
|
-
|
534
|
-
```ruby
|
535
|
-
instances = Weka::Core::Instances.from_arff('weather.arff')
|
536
|
-
|
537
|
-
clusterer = Weka::Clusterers::SimpleKMeans.new
|
538
|
-
clusterer.use_options('-N 3 -I 600')
|
539
|
-
clusterer.train_with_instances(instances)
|
540
|
-
```
|
541
|
-
|
542
|
-
You can also build a clusterer by using the block syntax:
|
543
|
-
|
544
|
-
```ruby
|
545
|
-
classifier = Weka::Clusterers::SimpleKMeans.build do
|
546
|
-
use_options '-N 5 -I 600'
|
547
|
-
train_with_instances instances
|
548
|
-
end
|
549
|
-
```
|
550
|
-
|
551
|
-
#### Evaluating a clusterer model
|
552
|
-
|
553
|
-
You can evaluate trained density-based clusterer using [cross-validation](https://en.wikipedia.org/wiki/Cross-validation_(statistics)) (The only density-based clusterer in the Weka lib is `EM` at the moment).
|
554
|
-
|
555
|
-
The cross-validation returns the cross-validated log-likelihood:
|
556
|
-
|
557
|
-
```ruby
|
558
|
-
# default number of folds is 3
|
559
|
-
log_likelihood = clusterer.cross_validate
|
560
|
-
# => -10.556166997137497
|
561
|
-
|
562
|
-
# with a custom number of folds
|
563
|
-
log_likelihood = clusterer.cross_validate(folds: 10)
|
564
|
-
# => -10.262696653333032
|
565
|
-
```
|
566
|
-
|
567
|
-
If your trained classifier should be evaluated against a set of *test instances*,
|
568
|
-
you can use `evaluate`.
|
569
|
-
The evaluation returns a `Weka::Clusterer::ClusterEvaluation` object which can be used to get details about the accuracy of the trained clusterer model:
|
570
|
-
|
571
|
-
```ruby
|
572
|
-
test_instances = Weka::Core::Instances.from_arff('test_data.arff')
|
573
|
-
evaluation = clusterer.evaluate(test_instances)
|
574
|
-
|
575
|
-
puts evaluation.summary
|
576
|
-
# EM
|
577
|
-
# ==
|
578
|
-
#
|
579
|
-
# Number of clusters: 2
|
580
|
-
# Number of iterations performed: 7
|
581
|
-
#
|
582
|
-
# Cluster
|
583
|
-
# Attribute 0 1
|
584
|
-
# (0.35) (0.65)
|
585
|
-
# ==============================
|
586
|
-
# outlook
|
587
|
-
# sunny 3.8732 3.1268
|
588
|
-
# overcast 1.7746 4.2254
|
589
|
-
# rainy 2.1889 4.8111
|
590
|
-
# [total] 7.8368 12.1632
|
591
|
-
# ...
|
592
|
-
```
|
593
|
-
|
594
|
-
#### Clustering new data
|
595
|
-
|
596
|
-
Similar to classifiers, clusterers come with either a `cluster` method or a `distribution_for` method which both take a Weka::Core::DenseInstance or an Array of values as argument.
|
597
|
-
|
598
|
-
The `cluster` method returns the index of the predicted cluster:
|
599
|
-
|
600
|
-
```ruby
|
601
|
-
instances = Weka::Core::Instances.from_arff('unlabeled_data.arff')
|
602
|
-
|
603
|
-
clusterer = Weka::Clusterers::Canopy.build
|
604
|
-
train_with_instances instances
|
605
|
-
end
|
606
|
-
|
607
|
-
# with an instance as argument
|
608
|
-
instances.map do |instance|
|
609
|
-
clusterer.cluster(instance)
|
610
|
-
end
|
611
|
-
# => [3, 3, 4, 0, 0, 1, 2, 3, 0, 0, 2, 2, 4, 1]
|
612
|
-
|
613
|
-
# with an Array of values as argument
|
614
|
-
clusterer.cluster [:sunny, 80, 80, :FALSE]
|
615
|
-
# => 4
|
616
|
-
```
|
617
|
-
|
618
|
-
The `distribution_for` method returns an Array with the distributions at the cluster‘s index:
|
619
|
-
|
620
|
-
```ruby
|
621
|
-
# with an instance as argument
|
622
|
-
clusterer.distribution_for(instances.first)
|
623
|
-
# => [0.17229465277140552, 0.1675583309853506, 0.15089102301329346, 0.3274056122786787, 0.18185038095127165]
|
624
|
-
|
625
|
-
# with an Array of values as argument
|
626
|
-
classifier.distribution_for [:sunny, 80, 80, :FALSE]
|
627
|
-
# => [0.21517055355632506, 0.16012256401406233, 0.17890840384466453, 0.2202344150907843, 0.2255640634941639]
|
628
|
-
```
|
629
|
-
|
630
|
-
#### Adding a cluster attribute to a dataset
|
631
|
-
|
632
|
-
After building and training a clusterer with training instances you can use the clusterer
|
633
|
-
in the unsupervised attribute filter `AddCluster` to assign a cluster to each instance of a dataset:
|
634
|
-
|
635
|
-
```ruby
|
636
|
-
filter = Weka::Filter::Unsupervised::Attribute::AddCluster.new
|
637
|
-
filter.clusterer = clusterer
|
638
|
-
|
639
|
-
instances = Weka::Core::Instances.from_arff('unlabeled_data.arff')
|
640
|
-
clustered_instances = instances.apply_filter(filter)
|
641
|
-
|
642
|
-
puts clustered_instances.to_s
|
643
|
-
```
|
644
|
-
|
645
|
-
`clustered_instances` now has a nominal `cluster` attribute as the last attribute.
|
646
|
-
The values of the cluster attribute are the *N* cluster names, e.g. with *N = 2* clusters, the ARFF representation looks like:
|
647
|
-
|
648
|
-
```
|
649
|
-
...
|
650
|
-
@attribute outlook {sunny,overcast,rainy}
|
651
|
-
@attribute temperature numeric
|
652
|
-
@attribute humidity numeric
|
653
|
-
@attribute windy {TRUE,FALSE}
|
654
|
-
@attribute cluster {cluster1,cluster2}
|
655
|
-
...
|
656
|
-
```
|
657
|
-
|
658
|
-
Each instance is now assigned to a cluster, e.g.:
|
659
|
-
|
660
|
-
```
|
661
|
-
...
|
662
|
-
@data
|
663
|
-
sunny,85,85,FALSE,cluster1
|
664
|
-
sunny,80,90,TRUE,cluster1
|
665
|
-
...
|
666
|
-
```
|
667
|
-
|
668
|
-
### Serializing Objects
|
669
|
-
|
670
|
-
You can serialize objects with the `Weka::Core::SerializationHelper` class:
|
671
|
-
|
672
|
-
```ruby
|
673
|
-
# writing an Object to a file:
|
674
|
-
Weka::Core::SerializationHelper.write('path/to/file.model', classifier)
|
675
|
-
|
676
|
-
# load an Object from a serialized file:
|
677
|
-
object = Weka::Core::SerializationHelper.read('path/to/file.model')
|
678
|
-
```
|
679
|
-
|
680
|
-
Instead of `.write` and `.read` you can also call the aliases `.serialize` and `.deserialize`.
|
681
|
-
|
682
|
-
Serialization can be helpful if the training of e.g. a classifier model takes
|
683
|
-
some minutes. Instead of running the whole training on instantiating a classifier you
|
684
|
-
can speed up this process tremendously by serializing a classifier once it was trained and later load it from the file again.
|
685
|
-
|
686
|
-
Classifiers, Clusterers, Instances and Filters also have a `#serialize` method
|
687
|
-
which you can use to directly serialize an Instance of these, e.g. for a Classifier:
|
688
|
-
|
689
|
-
```ruby
|
690
|
-
instances = Weka::Core::Instances.from_arff('weather.arff')
|
691
|
-
instances.class_attribute = :play
|
692
|
-
|
693
|
-
classifier = Weka::Core::Trees::RandomForest.build do
|
694
|
-
train_with_instances instances
|
695
|
-
end
|
696
|
-
|
697
|
-
# store trained model as binary file
|
698
|
-
classifier.serialize('randomforest.model')
|
699
|
-
|
700
|
-
# load Classifier from binary file
|
701
|
-
loaded_classifier = Weka::Core::SerializationHelper.deserialize('randomforest.model')
|
702
|
-
# => #<Java::WekaClassifiersTrees::RandomForest:0x197db331>
|
703
|
-
```
|
36
|
+
Please refer to [the gem‘s Wiki](https://github.com/paulgoetze/weka-jruby/wiki) for
|
37
|
+
detailed information about how to use weka with JRuby and some example code snippets.
|
704
38
|
|
705
39
|
## Development
|
706
40
|
|
data/lib/weka/core/attribute.rb
CHANGED
@@ -11,14 +11,14 @@ module Weka
|
|
11
11
|
# The order of the if statements is important here, because a date is also
|
12
12
|
# a numeric.
|
13
13
|
def internal_value_of(value)
|
14
|
-
if
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
index_of_value(value.to_s)
|
20
|
-
end
|
14
|
+
return value if value === Float::NAN
|
15
|
+
return Float::NAN if [nil, '?'].include?(value)
|
16
|
+
return parse_date(value.to_s) if date?
|
17
|
+
return value.to_f if numeric?
|
18
|
+
return index_of_value(value.to_s) if nominal?
|
21
19
|
end
|
22
20
|
end
|
21
|
+
|
22
|
+
Weka::Core::Attribute.__persistent__ = true
|
23
23
|
end
|
24
24
|
end
|
@@ -7,7 +7,11 @@ module Weka
|
|
7
7
|
java_import "java.text.SimpleDateFormat"
|
8
8
|
|
9
9
|
def initialize(data, weight: 1.0)
|
10
|
-
|
10
|
+
if data.kind_of?(Integer)
|
11
|
+
super(data)
|
12
|
+
else
|
13
|
+
super(weight, to_java_double(data))
|
14
|
+
end
|
11
15
|
end
|
12
16
|
|
13
17
|
def attributes
|
@@ -30,15 +34,7 @@ module Weka
|
|
30
34
|
|
31
35
|
def to_a
|
32
36
|
to_double_array.each_with_index.map do |value, index|
|
33
|
-
|
34
|
-
|
35
|
-
if attribute.date?
|
36
|
-
format_date(value, attribute.date_format)
|
37
|
-
elsif attribute.numeric?
|
38
|
-
value
|
39
|
-
elsif attribute.nominal?
|
40
|
-
attribute.value(value)
|
41
|
-
end
|
37
|
+
value_from(value, index)
|
42
38
|
end
|
43
39
|
end
|
44
40
|
|
@@ -47,6 +43,29 @@ module Weka
|
|
47
43
|
|
48
44
|
private
|
49
45
|
|
46
|
+
def to_java_double(values)
|
47
|
+
data = values.map do |value|
|
48
|
+
['?', nil].include?(value) ? Float::NAN : value
|
49
|
+
end
|
50
|
+
|
51
|
+
data.to_java(:double)
|
52
|
+
end
|
53
|
+
|
54
|
+
def value_from(value, index)
|
55
|
+
return '?' if value.nan?
|
56
|
+
return value if dataset.nil?
|
57
|
+
|
58
|
+
attribute = attribute_at(index)
|
59
|
+
|
60
|
+
if attribute.date?
|
61
|
+
format_date(value, attribute.date_format)
|
62
|
+
elsif attribute.numeric?
|
63
|
+
value
|
64
|
+
elsif attribute.nominal?
|
65
|
+
attribute.value(value)
|
66
|
+
end
|
67
|
+
end
|
68
|
+
|
50
69
|
def attribute_at(index)
|
51
70
|
return attributes[index] unless dataset.class_attribute_defined?
|
52
71
|
|
data/lib/weka/core/instances.rb
CHANGED
data/lib/weka/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: weka
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.3.0
|
5
5
|
platform: java
|
6
6
|
authors:
|
7
7
|
- Paul Götze
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-02-10 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: lock_jar
|