davidrichards-data_frame 0.0.18 → 0.0.19
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +16 -0
- data/VERSION.yml +1 -1
- data/bin/plain_frame +22 -0
- data/lib/data_frame.rb +2 -1
- data/lib/data_frame/arff.rb +43 -36
- data/lib/data_frame/core/column_management.rb +102 -0
- data/lib/data_frame/core/filter.rb +48 -0
- data/lib/data_frame/core/import.rb +112 -0
- data/lib/data_frame/core/pre_process.rb +61 -0
- data/lib/data_frame/core/saving.rb +29 -0
- data/lib/data_frame/core/training.rb +36 -0
- data/lib/data_frame/data_frame.rb +37 -241
- data/lib/data_frame/id3.rb +28 -0
- data/lib/data_frame/kmeans.rb +10 -0
- data/lib/data_frame/labels_from_uci.rb +48 -0
- data/lib/data_frame/mlp.rb +18 -0
- data/lib/data_frame/sbn.rb +18 -0
- data/lib/data_frame/transposable_array.rb +1 -1
- data/lib/ext/array.rb +11 -0
- data/spec/data_frame/arff_spec.rb +1 -0
- data/spec/data_frame/core/column_management_spec.rb +97 -0
- data/spec/data_frame/core/filter_spec.rb +88 -0
- data/spec/data_frame/core/import_spec.rb +41 -0
- data/spec/data_frame/core/pre_process_spec.rb +71 -0
- data/spec/data_frame/core/saving_spec.rb +61 -0
- data/spec/data_frame/core/training_spec.rb +51 -0
- data/spec/data_frame/data_frame_spec.rb +10 -226
- data/spec/data_frame/id3_spec.rb +22 -0
- data/spec/ext/array_spec.rb +13 -0
- data/spec/fixtures/discrete_testing.csv +4 -0
- data/spec/fixtures/discrete_training.csv +21 -0
- metadata +33 -6
data/README.rdoc
CHANGED
@@ -91,6 +91,22 @@ Data Frame can now create sub-models:
|
|
91
91
|
>> df.models
|
92
92
|
=> #<OpenStruct weekend=DataFrame rows: 179 labels: [:x, :y, :month, :day, :ffmc, :dmc, :dc, :isi, :temp, :rh, :wind, :rain, :area]>
|
93
93
|
|
94
|
+
== Utilities
|
95
|
+
|
96
|
+
I use data frame for a lot of things, and I've added some utilities for this gem in case you would like to as well. For instance, here is how I take the data in a data frame and load it into a neural network:
|
97
|
+
|
98
|
+
# Show mlp. Will probably need to add a row classifier for training and test data. Also, will probably want to
|
99
|
+
|
100
|
+
== CLI
|
101
|
+
|
102
|
+
There are some really interesting things that have good command-line shortcuts:
|
103
|
+
|
104
|
+
* Make
|
105
|
+
* A
|
106
|
+
* List
|
107
|
+
|
108
|
+
# Now add some demos
|
109
|
+
|
94
110
|
==Installation
|
95
111
|
|
96
112
|
sudo gem install davidrichards-data_frame
|
data/VERSION.yml
CHANGED
data/bin/plain_frame
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
#!/usr/bin/env ruby -wKU
# Opens an irb session with the data_frame library preloaded.
require 'yaml'
require 'optparse'

# The version is read from VERSION.yml one directory above this script,
# using its :major, :minor and :patch keys.
version_hash = YAML.load_file(File.join(File.dirname(__FILE__), %w(.. VERSION.yml)))
version = [:major, :minor, :patch].map { |part| version_hash[part].to_s }.join(".")
df_file = File.join(File.dirname(__FILE__), %w(.. lib data_frame))

# On mswin/mingw platforms the launcher is named irb.bat.
irb = RUBY_PLATFORM =~ /(?:mswin|mingw)/ ? 'irb.bat' : 'irb'

options = { :irb => irb }
OptionParser.new do |opt|
  opt.banner = "Usage: plain_frame [options]"
  # The argument is optional, so a bare --irb yields nil; keep the default then.
  opt.on("--irb=[#{irb}]", 'Invoke a different irb.') { |v| options[:irb] = v if v }
  opt.parse!(ARGV)
end

libs = " -r irb/completion -r #{df_file}"

puts "Loading Data Frame version: #{version}"

# Replaces this process with irb; nothing after this line runs.
exec "#{options[:irb]} #{libs} --simple-prompt"
|
data/lib/data_frame.rb
CHANGED
@@ -20,6 +20,7 @@ $:.unshift(File.dirname(__FILE__))
|
|
20
20
|
require 'data_frame/callback_array'
|
21
21
|
require 'data_frame/transposable_array'
|
22
22
|
require 'data_frame/parameter_capture'
|
23
|
-
require 'data_frame/arff'
|
24
23
|
require 'data_frame/data_frame'
|
25
24
|
require 'data_frame/model'
|
25
|
+
|
26
|
+
Dir.glob("#{File.dirname(__FILE__)}/data_frame/core/*.rb").each { |file| require file }
|
data/lib/data_frame/arff.rb
CHANGED
@@ -1,45 +1,52 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
# Used in arff, but generally useful.
|
5
|
-
def to_csv(include_header=true)
|
6
|
-
value = include_header ? self.labels.map{|e| e.to_s}.join(',') + "\n" : ''
|
7
|
-
self.items.inject(value) do |list, e|
|
8
|
-
list << e.map {|cell| cell.to_s}.join(',') + "\n"
|
9
|
-
end
|
10
|
-
end
|
1
|
+
module DF #:nodoc:
|
2
|
+
# Turns a data frame into ARFF-formatted content.
|
3
|
+
module ARFF
|
11
4
|
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
def arff_attributes
|
18
|
-
container = defined?(Dictionary) ? Dictionary.new : Hash.new
|
19
|
-
|
20
|
-
self.labels.inject(container) do |list, e|
|
21
|
-
list[e] = self.render_column(e).categories
|
22
|
-
end
|
23
|
-
end
|
24
|
-
|
25
|
-
def arff_formatted_attributes
|
26
|
-
self.labels.inject('') do |str, e|
|
27
|
-
val = "{" + self.render_column(e).categories.map{|x| x.to_s}.join(',') + "}"
|
28
|
-
str << "@attribute #{e} #{val}\n"
|
5
|
+
# Used in arff, but generally useful.
|
6
|
+
# Renders the frame as comma-separated text, one newline-terminated line
# per row. Cells are converted with to_s and joined as-is (no quoting).
#
# include_header - when true (the default) the first line lists the labels.
#
# Returns a String.
def to_csv(include_header=true)
  lines = self.items.map { |row| row.map { |cell| cell.to_s }.join(',') }
  lines.unshift(self.labels.map { |label| label.to_s }.join(',')) if include_header
  lines.map { |line| line + "\n" }.join
end
|
31
|
-
|
32
|
-
def
|
33
|
-
|
12
|
+
|
13
|
+
# Returns the ARFF document for this frame: the header produced by
# arff_header followed by the rows rendered as header-less CSV.
def to_arff
  header = arff_header
  rows = to_csv(false)
  header + rows
end
|
35
|
-
|
36
|
-
|
37
|
-
|
16
|
+
|
17
|
+
protected
|
18
|
+
# Maps every label to the categories of its column.
#
# Uses a Dictionary when that constant is defined, otherwise a plain Hash.
#
# Returns the filled container.
def arff_attributes
  container = defined?(Dictionary) ? Dictionary.new : Hash.new

  self.labels.inject(container) do |list, e|
    list[e] = self.render_column(e).categories
    # inject feeds the block's value into the next iteration, so hand back
    # the container rather than the categories that were just assigned.
    list
  end
end
|
25
|
+
|
26
|
+
# Builds the attribute section of the ARFF header: one
# "@attribute <label> {<cat>,<cat>,...}" line per label, each terminated
# by a newline.
#
# Returns a String (empty when there are no labels).
def arff_formatted_attributes
  lines = self.labels.map do |label|
    categories = self.render_column(label).categories.map { |category| category.to_s }
    "@attribute #{label} {#{categories.join(',')}}\n"
  end
  lines.join
end
|
32
|
+
|
33
|
+
# Name used on the @relation line: the frame's name passed through
# to_underscore_sym, or 'unamed_relation' when the frame has no name.
def arff_relation
  return 'unamed_relation' unless self.name
  self.name.to_underscore_sym.to_s
end
|
36
|
+
|
37
|
+
def arff_header
|
38
|
+
%[@relation #{arff_relation}
|
38
39
|
|
39
40
|
#{arff_formatted_attributes}
|
40
41
|
@data
|
41
42
|
]
|
42
|
-
|
43
|
-
|
44
|
-
|
43
|
+
end
|
44
|
+
|
45
|
+
alias :arff_items :to_csv
|
46
|
+
end
|
47
|
+
|
48
|
+
end
|
49
|
+
|
50
|
+
class DataFrame
|
51
|
+
include DF::ARFF
|
45
52
|
end
|
@@ -0,0 +1,102 @@
|
|
1
|
+
module DF #:nodoc:
  # Column-level editing for a data frame: appending, renaming, replacing,
  # dropping and sub-setting columns.
  #
  # The including class is expected to provide +labels+, +items+,
  # +render_column+, +validate_column+ and +import+.
  module ColumnManagement #:nodoc:

    # Moves an existing column to the right-most position, keeping its name.
    # The column is copied under a temporary name, the original is dropped,
    # and the copy is renamed back.
    #
    # Raises ArgumentError when the column does not exist.
    def move_to_last!(orig_name)
      raise ArgumentError, "Column not found" unless self.labels.include?(orig_name)
      new_name = (orig_name.to_s + "_a_unique_name").to_sym
      self.append!(new_name, self.render_column(orig_name))
      self.drop!(orig_name)
      self.rename!(orig_name, new_name)
    end

    # Renames a column. Arguments are in the order of alias: new_name, orig_name.
    # Both names are normalized with to_underscore_sym first.
    #
    # Raises ArgumentError when orig_name is missing or new_name is taken.
    def rename!(new_name, orig_name)
      new_name = new_name.to_underscore_sym
      orig_name = orig_name.to_underscore_sym
      raise ArgumentError, "Column not found" unless self.labels.include?(orig_name)
      raise ArgumentError, "Cannot name #{orig_name} to #{new_name}, that column already exists." if self.labels.include?(new_name)
      i = self.labels.index(orig_name)
      self.labels[i] = new_name
    end

    # Adds a unique column to the table.
    #
    # column_name - label for the new column (normalized with to_underscore_sym).
    # value       - an Array supplies one cell per row (by row index); anything
    #               else is used as the cell value for every row.
    #
    # Raises ArgumentError when the normalized name is already a label.
    def append!(column_name, value=nil)
      # Normalize before the duplicate check so that e.g. 'a' cannot be
      # appended next to an existing :a.
      column_name = column_name.to_underscore_sym
      raise ArgumentError, "Can't have duplicate column names" if self.labels.include?(column_name)
      self.labels << column_name
      if value.is_a?(Array)
        self.items.each_with_index do |item, i|
          item << value[i]
        end
      else
        self.items.each do |item|
          item << value
        end
      end
      # Only the sub arrays were changed, so the items container is told
      # explicitly that it has been modified.
      self.items.taint
    end

    # Replaces the contents of a column, either with the given values or,
    # when no values are given, with the result of passing each current
    # value to the block.
    #
    # Returns self.
    # Raises ArgumentError when neither values nor a block is supplied.
    def replace!(column, values=nil, &block)
      column = validate_column(column)
      if not values
        raise ArgumentError, "Must provide either values or a block" unless block
        values = self.send(column)
        values.map! {|e| block.call(e)}
      end
      replace_column!(column, values)
      self
    end

    # Replace a single column with an array of values.
    # It is helpful to have the values the same size as the rest of the data
    # frame.
    #
    # Rows are updated in place; afterwards @items is a plain Array holding
    # those same row objects.
    def replace_column!(column, values)
      column = validate_column(column)
      index = self.labels.index(column)
      list = []
      self.items.each_with_index do |item, i|
        item[index] = values[i]
        list << item
      end
      @items = list.dup
    end

    # Drop one or more columns. Unknown labels are ignored. Returns self.
    def drop!(*labels)
      labels.each do |label|
        drop_one!(label)
      end
      self
    end

    # Drop a single column, removing its cell from every row in place.
    # Returns self, or nil when the label is not present.
    def drop_one!(label)
      i = self.labels.index(label)
      return nil unless i
      self.items.each do |item|
        item.delete_at(i)
      end
      self.labels.delete_at(i)
      self
    end

    # Creates a new data frame, only with the specified columns (kept in this
    # frame's column order). This frame is left unchanged.
    def subset_from_columns(*cols)
      new_labels = self.labels.select { |label| cols.include?(label) }
      new_data_frame = DataFrame.new(*self.labels)
      # Import copies of the rows: drop! deletes cells in place, which would
      # otherwise remove them from this frame's rows as well.
      new_data_frame.import(self.items.map { |row| row.dup })
      self.labels.each do |label|
        new_data_frame.drop!(label) unless new_labels.include?(label)
      end
      new_data_frame
    end

  end
end

class DataFrame
  include DF::ColumnManagement
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
module DF #:nodoc:
  # Row filtering for a data frame.
  #
  # The including class is expected to provide +labels+, +items+,
  # +infer_class+ and +cast_row+.
  module Filter #:nodoc:

    # Takes a block to evaluate on each row and keeps only the rows for which
    # it returns a true value. The row can be converted into an OpenStruct or
    # a Hash for easier filter methods. Note, don't try this with a hash or
    # open struct unless you have facets available.
    #
    # Replaces @items with a plain Array of the surviving rows. Returns self.
    def filter!(as=Array, &block)
      as = infer_class(as)
      kept = []
      self.items.each do |row|
        kept << row if block.call(cast_row(row, as))
      end
      @items = kept
      self
    end

    # Non-destructive filter!: filters a clone and returns it.
    def filter(as=Array, &block)
      new_data_frame = self.clone
      new_data_frame.filter!(as, &block)
    end

    # Non-destructive filter_by_category!: filters a dup and returns it.
    def filter_by_category(hash)
      self.dup.filter_by_category!(hash)
    end

    # Keeps only the rows matching every criterion in the hash.
    #
    # hash - label => accepted value(s). A single value is wrapped in an
    #        Array; Arrays and Ranges are tested with include?. Keys that are
    #        not labels of this frame are skipped.
    #
    # Returns self, like filter!.
    def filter_by_category!(hash)
      hash.each do |key, value|
        key = key.to_underscore_sym
        next unless self.labels.include?(key)
        value = [value] unless value.is_a?(Array) or value.is_a?(Range)
        self.filter!(:hash) {|row| value.include?(row[key])}
      end
      self
    end

  end
end

class DataFrame
  include DF::Filter
end
|
@@ -0,0 +1,112 @@
|
|
1
|
+
module DF #:nodoc:
  # Loading rows into a data frame, from arrays or from CSV.
  module Import #:nodoc:

    # Shared CSV loading, mixed into both the class-level and the
    # instance-level import modules.
    module InferCSV #:nodoc:

      protected
        # Options handed to FCSV.parse unless overridden by the caller.
        def default_csv_opts; {:converters => :all}; end

        # Works out what obj refers to and parses it as CSV. Tried in order:
        # an existing file path, anything open can read, and finally the
        # String itself as CSV text.
        #
        # opts - merged over default_csv_opts and passed to FCSV.parse;
        #        :headers (default true) decides whether the first row is
        #        taken as the labels.
        #
        # Returns [labels, table], or nil when no contents could be found.
        def infer_csv_contents(obj, opts={})
          contents = File.read(obj) if File.exist?(obj)
          begin
            # NOTE(review): Kernel#open on a caller-supplied string can do more
            # than open a file (on many Ruby versions a leading "|" runs a
            # command, and URL support depends on open-uri hooking Kernel#open).
            # Do not pass untrusted input here - confirm against the Ruby
            # versions this gem targets.
            open(obj) {|f| contents = f.read} unless contents
          rescue
            nil
          end
          contents ||= obj if obj.is_a?(String)
          return nil unless contents
          table = FCSV.parse(contents, default_csv_opts.merge(opts))
          labels = opts.fetch(:headers, true) ? table.shift : []
          # Strip trailing blank rows. The table may run out entirely
          # (header-only or blank input), so stop once it is empty.
          table.pop while table.last and table.last.empty?
          [labels, table]
        end

    end # InferCSV

    module ClassMethods #:nodoc:

      include InferCSV

      # This is the neatest part of this neat gem.
      # DataFrame.from_csv can be called in a lot of ways:
      # DataFrame.from_csv(csv_contents)
      # DataFrame.from_csv(filename)
      # DataFrame.from_csv(url)
      # If you need to define converters for FasterCSV, do it before calling
      # this method:
      # FasterCSV::Converters[:special] = lambda{|f| f == 'foo' ? 'bar' : 'foo'}
      # DataFrame.from_csv('http://example.com/my_special_url.csv', :converters => :special)
      # This returns bar where 'foo' was found and 'foo' everywhere else.
      #
      # Returns the new data frame, or nil when no labels and table could be
      # read from obj.
      def from_csv(obj, opts={})
        labels, table = infer_csv_contents(obj, opts)
        name = infer_name_from_contents(obj, opts)
        return nil unless labels and table
        df = new(*labels)
        df.import(table)
        df.name = name
        df
      end

      protected

        # Only works for named sources, urls and files: the name is the last
        # path segment without its extension, titleized. Any failure while
        # deriving it yields nil.
        def infer_name_from_contents(obj, opts={})
          begin
            File.split(obj).last.split('.')[0..-2].join('.').titleize
          rescue
            nil
          end
        end

    end # Class Methods

    module InstanceMethods #:nodoc:

      include InferCSV

      # Appends a single row.
      def add_item(item)
        self.items << item
      end
      alias :add :add_item

      # Loads a batch of rows. Expects an array of arrays, else you don't
      # know what you have. A String is treated as header-less CSV (contents,
      # file name or anything infer_csv_contents understands).
      #
      # Returns true.
      # Raises ArgumentError for anything that cannot be imported.
      def import(rows)
        case rows
        when Array
          import_array(rows)
        when String
          _labels, table = infer_csv_contents(rows, :headers => false)
          import(table)
        else
          raise ArgumentError, "Don't know how to import data from #{rows.class}"
        end
        true
      end

      protected
        # Imports a table as an array of arrays.
        # If the array is one-dimensional and there is more than one label, it
        # imports only one row.
        def import_array(rows)
          raise ArgumentError, "Can only work with arrays" unless rows.is_a?(Array)
          if self.labels.size > 1 and rows.dimensions == 1
            self.add_item(rows)
          else
            rows.each do |row|
              self.add_item(row)
            end
          end
        end

    end # Instance Methods

  end
end

class DataFrame
  include DF::Import::InstanceMethods
  extend DF::Import::ClassMethods
end
|
@@ -0,0 +1,61 @@
|
|
1
|
+
module DF #:nodoc:
  # Derives new columns from categorical columns.
  #
  # The including class is expected to provide +render_column+ and +append!+;
  # rendered columns must respond to +categories+, +category+,
  # +category_map+ and +all_categories+.
  module PreProcess #:nodoc:

    # A weird name. This creates a column for every category in a column
    # and marks each row by its value: the new column "<col>_<category>"
    # holds true where the row belongs to that category and false elsewhere.
    #
    # columns - column names; Hashes mixed into the list are merged and
    #           treated as options. With :allow_overlap a row is tested
    #           against all_categories of its value, otherwise against its
    #           entry in category_map.
    def j_binary_ize!(*columns)
      # Allows to mix a hash with the columns.
      options = columns.find_all {|e| e.is_a?(Hash)}.inject({}) {|h, e| h.merge!(e)}
      columns.delete_if {|e| e.is_a?(Hash)}

      # Generates new columns
      columns.each do |col|
        values = render_column(col.to_underscore_sym)
        categories = values.categories
        # The per-row memberships do not depend on the category being
        # emitted, so they are computed once per column.
        if options[:allow_overlap]
          memberships = values.map {|e| values.all_categories(e)}
        else
          assigned = values.category_map
        end
        categories.each do |category|
          full_name = (col.to_s + "_" + category.to_s).to_sym
          if options[:allow_overlap]
            self.append!(full_name, memberships.map {|e| e.include?(category)})
          else
            self.append!(full_name, assigned.map {|e| e == category})
          end
        end
      end
    end

    # Adds a column, numerical_column_name, that encodes the nominal value of
    # each row numerically: every cell is an array with one slot per distinct
    # category (in order of first appearance), holding 1 in the slot of the
    # row's category and 0 elsewhere.
    def numericize!(*columns)
      columns.each do |col|
        values = render_column(col.to_underscore_sym)
        value_categories = values.map {|v| values.category(v)}

        # category => slot index
        category_map = {}
        value_categories.uniq.each_with_index {|c, i| category_map[c] = i}

        blank = Array.new(category_map.size, 0)
        new_values = value_categories.map do |category|
          a = blank.dup
          a[category_map[category]] = 1
          a
        end

        new_name = "numerical #{col.to_s}".to_underscore_sym
        self.append!(new_name, new_values)
      end
    end

  end
end

class DataFrame
  include DF::PreProcess
end
|