data_frame 0.1.8
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +122 -0
- data/VERSION.yml +4 -0
- data/bin/plain_frame +22 -0
- data/lib/data_frame.rb +26 -0
- data/lib/data_frame/arff.rb +52 -0
- data/lib/data_frame/callback_array.rb +152 -0
- data/lib/data_frame/core/column_management.rb +147 -0
- data/lib/data_frame/core/filter.rb +48 -0
- data/lib/data_frame/core/import.rb +113 -0
- data/lib/data_frame/core/pre_process.rb +69 -0
- data/lib/data_frame/core/saving.rb +29 -0
- data/lib/data_frame/core/training.rb +46 -0
- data/lib/data_frame/data_frame.rb +115 -0
- data/lib/data_frame/id3.rb +28 -0
- data/lib/data_frame/kmeans.rb +10 -0
- data/lib/data_frame/labels_from_uci.rb +48 -0
- data/lib/data_frame/mlp.rb +18 -0
- data/lib/data_frame/model.rb +22 -0
- data/lib/data_frame/parameter_capture.rb +50 -0
- data/lib/data_frame/sbn.rb +18 -0
- data/lib/data_frame/transposable_array.rb +23 -0
- data/lib/ext/array.rb +11 -0
- data/lib/ext/open_struct.rb +5 -0
- data/lib/ext/string.rb +5 -0
- data/lib/ext/symbol.rb +5 -0
- data/spec/data_frame/arff_spec.rb +48 -0
- data/spec/data_frame/callback_array_spec.rb +148 -0
- data/spec/data_frame/core/column_management_spec.rb +128 -0
- data/spec/data_frame/core/filter_spec.rb +88 -0
- data/spec/data_frame/core/import_spec.rb +41 -0
- data/spec/data_frame/core/pre_process_spec.rb +103 -0
- data/spec/data_frame/core/saving_spec.rb +61 -0
- data/spec/data_frame/core/training_spec.rb +72 -0
- data/spec/data_frame/data_frame_spec.rb +141 -0
- data/spec/data_frame/id3_spec.rb +22 -0
- data/spec/data_frame/model_spec.rb +36 -0
- data/spec/data_frame/parameter_capture_spec.rb +32 -0
- data/spec/data_frame/transposable_array_spec.rb +138 -0
- data/spec/data_frame_spec.rb +29 -0
- data/spec/ext/array_spec.rb +13 -0
- data/spec/fixtures/basic.csv +3 -0
- data/spec/fixtures/discrete_testing.csv +4 -0
- data/spec/fixtures/discrete_training.csv +21 -0
- data/spec/spec_helper.rb +8 -0
- metadata +128 -0
data/README.rdoc
ADDED
@@ -0,0 +1,122 @@
|
|
1
|
+
== Data Frame
|
2
|
+
|
3
|
+
This is a general data frame. Load arrays and labels into it, and you will have a very powerful set of tools on your data set.
|
4
|
+
|
5
|
+
==Usage
|
6
|
+
|
7
|
+
df = DataFrame.from_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/forest-fires/forestfires.csv')
|
8
|
+
df.labels
|
9
|
+
# => [:x, :y, :month, :day, :ffmc, :dmc, :dc, :isi, :temp, :rh, :wind, :rain, :area]
|
10
|
+
df.dmc
|
11
|
+
# => [26.2, 35.4, 43.7, 33.3, 51.3, 85.3,...]
|
12
|
+
df.dmc.max
|
13
|
+
# => 291.3
|
14
|
+
df.dmc.min
|
15
|
+
# => 1.1
|
16
|
+
df.dmc.mean
|
17
|
+
# => 110.872340425532
|
18
|
+
df.dmc.std
|
19
|
+
# => 64.0464822492543
|
20
|
+
df = DataFrame.new(:list, :of, :things)
|
21
|
+
# => #<DataFrame:0x24ec6e8 @items=[], @labels=[:list, :of, :things]>
|
22
|
+
df.labels
|
23
|
+
# => [:list, :of, :things]
|
24
|
+
df << [1,2,3]
|
25
|
+
# => [[1, 2, 3]]
|
26
|
+
df.import([[2,3,4],[5,6,7]])
|
27
|
+
# => [[2, 3, 4], [5, 6, 7]]
|
28
|
+
df.items
|
29
|
+
# => [[1, 2, 3], [2, 3, 4], [5, 6, 7]]
|
30
|
+
df.list
|
31
|
+
# => [1, 2, 5]
|
32
|
+
df.list.correlation(df.things)
|
33
|
+
# => 1.0
|
34
|
+
df.list
|
35
|
+
# => [1, 2, 5]
|
36
|
+
df.things
|
37
|
+
# => [3, 4, 7]
|
38
|
+
|
39
|
+
There are a few important features to know:
|
40
|
+
|
41
|
+
* DataFrame.from_csv works for a string, a filename, or a URL.
|
42
|
+
* FasterCSV parsing parameters can be passed to DataFrame.from_csv
|
43
|
+
* DataFrame looks for operations first on the column labels, then on the row labels, then on the items table. So don't name things :mean, :standard_deviation, :min, and that sort of thing.
|
44
|
+
* CallbackArray allows you to set a callback anytime an array is tainted or untainted (taint, shift, pop, clear, map!, that sort of thing). This is generally useful and will probably be copied into the Repositories gem.
|
45
|
+
* TransposableArray is a subclass of CallbackArray, demonstrating how to use it. It creates a very simple approach to memoization. It caches the transpose of the table and resets it whenever it is tainted.
|
46
|
+
|
47
|
+
To get your feet wet, you may want to play with data sets found here:
|
48
|
+
|
49
|
+
http://www.liaad.up.pt/~ltorgo/Regression/DataSets.html
|
50
|
+
|
51
|
+
== Transformations
|
52
|
+
|
53
|
+
A lot of the work in the data frame is to transform the actual table. You may need to drop columns, filter results, replace values in a column or create a new data frame based on the existing one. Here's how to do that:
|
54
|
+
|
55
|
+
> df = DataFrame.from_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/forest-fires/forestfires.csv')
|
56
|
+
# => DataFrame rows: 517 labels: [:x, :y, :month, :day, :ffmc, :dmc, :dc, :isi, :temp, :rh, :wind, :rain, :area]
|
57
|
+
> df.drop!(:ffmc)
|
58
|
+
# => DataFrame rows: 517 labels: [:x, :y, :month, :day, :dmc, :dc, :isi, :temp, :rh, :wind, :rain, :area]
|
59
|
+
> df.drop!(:dmc, :dc, :isi, :rh)
|
60
|
+
# => DataFrame rows: 517 labels: [:x, :y, :month, :day, :temp, :wind, :rain, :area]
|
61
|
+
> df.x
|
62
|
+
# => [7, 7, 7, 8, 8, 8, 8, 8, 8, 7, 7, 7, 6, 6, 6,...]
|
63
|
+
> df.replace!(:x) {|e| e * 3}
|
64
|
+
# => DataFrame rows: 517 labels: [:x, :y, :month, :day, :temp, :wind, :rain, :area]
|
65
|
+
> df.x
|
66
|
+
# => [21, 21, 21, 24, 24, 24, 24, 24, 24, 21, 21, 21, 18, 18, 18,...]
|
67
|
+
> df.filter!(:open_struct) {|row| row.x == 24}
|
68
|
+
# => DataFrame rows: 61 labels: [:x, :y, :month, :day, :temp, :wind, :rain, :area]
|
69
|
+
> df.x
|
70
|
+
# => [24, 24, 24, 24, 24, 24, 24, 24, 24,...]
|
71
|
+
> new_data_frame = df.subset_from_columns(:x, :y)
|
72
|
+
# => DataFrame rows: 61 labels: [:x, :y]
|
73
|
+
> new_data_frame.items
|
74
|
+
# => [[24, 6], [24, 6], [24, 6], [24, 6], ...]
|
75
|
+
|
76
|
+
|
77
|
+
Note: most of these transformations are not optimized. I'll work with things for a while before I try to optimize this library. However, I should say that I've used some fairly large data sets (thousands of rows) and have been fine with things so far.
|
78
|
+
|
79
|
+
== Models
|
80
|
+
|
81
|
+
Data Frame can now create sub-models:
|
82
|
+
|
83
|
+
>> df = DataFrame.from_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/forest-fires/forestfires.csv')
|
84
|
+
=> DataFrame rows: 517 labels: [:x, :y, :month, :day, :ffmc, :dmc, :dc, :isi, :temp, :rh, :wind, :rain, :area]
|
85
|
+
>> df.model(:weekend) do |m|
|
86
|
+
?> m.day %w(sat sun)
|
87
|
+
>> end
|
88
|
+
=> DataFrame rows: 179 labels: [:x, :y, :month, :day, :ffmc, :dmc, :dc, :isi, :temp, :rh, :wind, :rain, :area]
|
89
|
+
>> df.models.weekend.day.uniq
|
90
|
+
=> ["sat", "sun"]
|
91
|
+
>> df.models
|
92
|
+
=> #<OpenStruct weekend=DataFrame rows: 179 labels: [:x, :y, :month, :day, :ffmc, :dmc, :dc, :isi, :temp, :rh, :wind, :rain, :area]>
|
93
|
+
|
94
|
+
== Utilities
|
95
|
+
|
96
|
+
I use data frame for a lot of things, and I've added some utilities for this gem in case you would like to as well. For instance, here is how I take the data in a data frame and load it into a neural network:
|
97
|
+
|
98
|
+
# TODO: Show the MLP (neural network) example here. This will probably need a row classifier for splitting training and test data. Also, will probably want to ... (note left unfinished)
|
99
|
+
|
100
|
+
== CLI
|
101
|
+
|
102
|
+
There are some really interesting things that have good command-line shortcuts:
|
103
|
+
|
104
|
+
* Make
|
105
|
+
* A
|
106
|
+
* List
|
107
|
+
|
108
|
+
# Now add some demos
|
109
|
+
|
110
|
+
==Installation
|
111
|
+
|
112
|
+
sudo gem install davidrichards-data_frame
|
113
|
+
|
114
|
+
=== Dependencies
|
115
|
+
|
116
|
+
* ActiveSupport: sudo gem install activesupport
|
117
|
+
* JustEnumerableStats: sudo gem install davidrichards-just_enumerable_stats
|
118
|
+
* FasterCSV: sudo gem install fastercsv
|
119
|
+
|
120
|
+
==COPYRIGHT
|
121
|
+
|
122
|
+
Copyright (c) 2009 David Richards. See LICENSE for details.
|
data/VERSION.yml
ADDED
data/bin/plain_frame
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
#!/usr/bin/env ruby -wKU
# Starts an irb session with the data_frame library preloaded.
require 'yaml'
require 'optparse'

root = File.dirname(__FILE__)

# The version is assembled from the :major, :minor and :patch entries of VERSION.yml.
version_hash = YAML.load_file(File.join(root, %w(.. VERSION.yml)))
version = [:major, :minor, :patch].map { |part| version_hash[part].to_s }.join(".")
df_file = File.join(root, %w(.. lib data_frame))

# On mswin/mingw platforms irb is started through irb.bat.
irb = RUBY_PLATFORM =~ /(?:mswin|mingw)/ ? 'irb.bat' : 'irb'

options = { :irb => irb }
OptionParser.new do |opt|
  opt.banner = "Usage: plain_frame [options]"
  # The argument is optional, so a bare --irb yields nil; keep the default then.
  opt.on("--irb=[#{irb}]", 'Invoke a different irb.') { |v| options[:irb] = v || irb }
  opt.parse!(ARGV)
end

libs = " -r irb/completion -r #{df_file}"

puts "Loading Data Frame version: #{version}"

exec "#{options[:irb]} #{libs} --simple-prompt"
|
data/lib/data_frame.rb
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'activesupport'
|
3
|
+
require 'just_enumerable_stats'
|
4
|
+
require 'open-uri'
|
5
|
+
require 'fastercsv'
|
6
|
+
require 'ostruct'
|
7
|
+
|
8
|
+
# Use a Dictionary if available
|
9
|
+
begin
|
10
|
+
require 'facets/dictionary'
|
11
|
+
rescue LoadError => e
|
12
|
+
# Do nothing
|
13
|
+
end
|
14
|
+
|
15
|
+
|
16
|
+
Dir.glob("#{File.dirname(__FILE__)}/ext/*.rb").each { |file| require file }
|
17
|
+
|
18
|
+
$:.unshift(File.dirname(__FILE__))
|
19
|
+
|
20
|
+
require 'data_frame/callback_array'
|
21
|
+
require 'data_frame/transposable_array'
|
22
|
+
require 'data_frame/parameter_capture'
|
23
|
+
require 'data_frame/data_frame'
|
24
|
+
require 'data_frame/model'
|
25
|
+
|
26
|
+
Dir.glob("#{File.dirname(__FILE__)}/data_frame/core/*.rb").each { |file| require file }
|
@@ -0,0 +1,52 @@
|
|
1
|
+
module DF #:nodoc:
  # Turns a data frame into ARFF-formatted content.
  #
  # The including class has to provide +labels+, +items+, +name+ and
  # +render_column+; the object returned by +render_column+ has to respond
  # to +categories+.
  module ARFF

    # Used in arff, but generally useful.
    #
    # Renders every item as one comma-separated line, optionally preceded by
    # a header line built from the labels. Cells are written with +to_s+ and
    # are not quoted or escaped.
    def to_csv(include_header=true)
      header = include_header ? self.labels.map { |label| label.to_s }.join(',') + "\n" : ''
      rows = self.items.map { |row| row.map { |cell| cell.to_s }.join(',') + "\n" }
      header + rows.join
    end

    # The ARFF header followed by the data rows (without a CSV header line).
    def to_arff
      arff_header + to_csv(false)
    end

    protected

      # Maps every label to the categories of its column. The container is a
      # Dictionary when that class has been loaded, otherwise a Hash.
      def arff_attributes
        container = defined?(Dictionary) ? Dictionary.new : Hash.new

        self.labels.inject(container) do |list, e|
          list[e] = self.render_column(e).categories
          # Return the container itself: an assignment evaluates to the
          # assigned value, which would otherwise become the accumulator.
          list
        end
      end

      # One "@attribute <label> {<categories>}" line per label.
      def arff_formatted_attributes
        self.labels.map do |label|
          categories = self.render_column(label).categories.map { |x| x.to_s }.join(',')
          "@attribute #{label} {#{categories}}\n"
        end.join
      end

      # Relation name derived from +name+, with a fixed fallback when the
      # frame has no name.
      def arff_relation
        self.name ? self.name.to_underscore_sym.to_s : 'unamed_relation'
      end

      # Relation line, blank line, attribute lines, blank line, "@data" line.
      # NOTE(review): the literal's lines are kept flush left on the
      # assumption that the original had no leading whitespace inside it.
      def arff_header
        %[@relation #{arff_relation}

#{arff_formatted_attributes}
@data
]
      end

      alias :arff_items :to_csv
  end

end

class DataFrame
  include DF::ARFF
end
|
@@ -0,0 +1,152 @@
|
|
1
|
+
# This overloads the tainting methods in array with callbacks. So, I
# can block all changes to an array, or broadcast to observers after a
# change, or limit the size of an array. It really just opens up the
# array to one more dimension: change. Before and after change, stack up
# any activity to block or enhance the experience. There are also
# callbacks on untaint.
#
# Every mutating method below keeps Array's implementation under a
# nontainting_* alias and routes the public name through wrap_call, passing
# along whatever arguments and block the caller supplied.
class CallbackArray < Array

  include ActiveSupport::Callbacks
  define_callbacks :before_taint, :after_taint, :before_untaint, :after_untaint

  # Runs the before_taint callbacks and, when their result is truthy, calls
  # safe_method with the given arguments and block, marks the array as
  # tainted and runs the after_taint callbacks.
  #
  # Returns what safe_method returned, or nil when the call was skipped.
  def wrap_call(safe_method, *args, &block)
    callback_result = run_callbacks(:before_taint)
    if callback_result
      result = self.send(safe_method, *args, &block)
      self.orig_taint
      run_callbacks(:after_taint)
    end
    result
  end
  protected :wrap_call

  # Need the original taint for all tainting methods
  alias :orig_taint :taint
  def taint
    callback_result = run_callbacks(:before_taint)
    if callback_result
      result = self.orig_taint
      run_callbacks(:after_taint)
    end
    result
  end

  # No other method needs orig_untaint, so building this in the cleanest
  # way possible.
  orig_untaint = instance_method(:untaint)
  define_method(:untaint) {
    callback_result = run_callbacks(:before_untaint)
    if callback_result
      val = orig_untaint.bind(self).call
      run_callbacks(:after_untaint)
    end
    val
  }

  # Accepts every form Array#[]= does (index, range, or start and length).
  alias :nontainting_assign :[]=
  def []=(*args)
    wrap_call(:nontainting_assign, *args)
  end

  alias :nontainting_append :<<
  def <<(value)
    wrap_call(:nontainting_append, value)
  end

  # The optional block is handed to Array#delete.
  alias :nontainting_delete :delete
  def delete(value, &block)
    wrap_call(:nontainting_delete, value, &block)
  end

  # Takes any number of values, like Array#push.
  alias :nontainting_push :push
  def push(*values)
    wrap_call(:nontainting_push, *values)
  end

  # Takes the optional element count, like Array#pop.
  alias :nontainting_pop :pop
  def pop(*args)
    wrap_call(:nontainting_pop, *args)
  end

  # Takes the optional element count, like Array#shift.
  alias :nontainting_shift :shift
  def shift(*args)
    wrap_call(:nontainting_shift, *args)
  end

  # Takes any number of values, like Array#unshift.
  alias :nontainting_unshift :unshift
  def unshift(*values)
    wrap_call(:nontainting_unshift, *values)
  end

  alias :nontainting_map! :map!
  def map!(&block)
    wrap_call(:nontainting_map!, &block)
  end

  alias :nontainting_sort! :sort!
  def sort!(&block)
    wrap_call(:nontainting_sort!, &block)
  end

  alias :nontainting_reverse! :reverse!
  def reverse!
    wrap_call(:nontainting_reverse!)
  end

  alias :nontainting_collect! :collect!
  def collect!(&block)
    wrap_call(:nontainting_collect!, &block)
  end

  alias :nontainting_compact! :compact!
  def compact!
    wrap_call(:nontainting_compact!)
  end

  alias :nontainting_reject! :reject!
  def reject!(&block)
    wrap_call(:nontainting_reject!, &block)
  end

  alias :nontainting_slice! :slice!
  def slice!(*args)
    wrap_call(:nontainting_slice!, *args)
  end

  # Takes the optional recursion level, like Array#flatten!.
  alias :nontainting_flatten! :flatten!
  def flatten!(*args)
    wrap_call(:nontainting_flatten!, *args)
  end

  # The optional block is handed to Array#uniq!.
  alias :nontainting_uniq! :uniq!
  def uniq!(&block)
    wrap_call(:nontainting_uniq!, &block)
  end

  alias :nontainting_clear :clear
  def clear
    wrap_call(:nontainting_clear)
  end

end
|
@@ -0,0 +1,147 @@
|
|
1
|
+
module DF #:nodoc:
  # Column-level operations for a data frame: append, rename, replace, drop,
  # duplicate and subset.
  #
  # The including class has to provide +labels+, +items+, +columns+,
  # +render_column+, +validate_column+ and +import+.
  module ColumnManagement #:nodoc:

    # Moves a column to the last position while keeping its label: the values
    # are appended under a temporary label, the original column is dropped and
    # the temporary label is renamed back.
    #
    # Raises ArgumentError when orig_name is not one of the labels.
    def move_to_last!(orig_name)
      raise ArgumentError, "Column not found" unless self.labels.include?(orig_name)
      new_name = (orig_name.to_s + "_a_unique_name").to_sym
      self.append!(new_name, self.render_column(orig_name))
      self.drop!(orig_name)
      self.rename!(orig_name, new_name)
    end

    # In the order of alias: new_name, orig_name
    #
    # Raises ArgumentError when orig_name is missing or new_name is taken.
    def rename!(new_name, orig_name)
      new_name = new_name.to_underscore_sym
      orig_name = orig_name.to_underscore_sym
      raise ArgumentError, "Column not found" unless self.labels.include?(orig_name)
      raise ArgumentError, "Cannot name #{orig_name} to #{new_name}, that column already exists." if self.labels.include?(new_name)
      i = self.labels.index(orig_name)
      self.labels[i] = new_name
    end

    # Adds a unique column to the table. An Array value is spread over the
    # rows by position; any other value is appended to every row.
    #
    # Raises ArgumentError when the normalized label already exists.
    def append!(column_name, value=nil)
      # Normalize before the duplicate check so that the label compared is
      # the label that gets stored.
      column_name = column_name.to_underscore_sym
      raise ArgumentError, "Can't have duplicate column names" if self.labels.include?(column_name)
      self.labels << column_name
      if value.is_a?(Array)
        self.items.each_with_index { |item, i| item << value[i] }
      else
        self.items.each { |item| item << value }
      end
      # Only the sub arrays were changed, so the items array doesn't know
      # it's been changed; taint it before the columns are rebuilt (the
      # README describes the transpose cache as reset on taint).
      tainted = self.items.taint
      self.columns(true)
      tainted
    end

    # Replaces a column either with the given values or, when no values are
    # given, with the result of running the block over the current values.
    # Returns self.
    #
    # Raises ArgumentError when neither values nor a block is supplied.
    def replace!(column, values=nil, &block)
      column = validate_column(column)
      unless values
        raise ArgumentError, "Must provide either values or a block" unless block
        values = self.send(column)
        values.map! { |e| block.call(e) }
      end
      replace_column!(column, values)
      self
    end

    # Replace a single column with an array of values.
    # It is helpful to have the values the same size as the rest of the data
    # frame.
    def replace_column!(column, values)
      store_range_hashes
      column = validate_column(column)
      index = self.labels.index(column)
      @items.each_with_index do |item, i|
        item[index] = values[i]
      end

      # Make sure we recalculate things after changing a column
      self.items.taint
      @columns = nil
      self.columns
      restore_range_hashes

      # Return the items
      @items
    end

    # Drop one or more columns. Returns self.
    def drop!(*labels)
      labels.each { |label| drop_one!(label) }
      self
    end

    # Drop a single column. Returns nil when the label is unknown, self
    # otherwise.
    def drop_one!(label)
      i = self.labels.index(label)
      return nil unless i
      self.items.each { |item| item.delete_at(i) }
      self.labels.delete_at(i)
      # The rows were changed behind the items array's back; invalidate the
      # same way append! and replace_column! do.
      self.items.taint
      self.columns(true)
      self
    end

    # Creates a new data frame, only with the specified columns.
    def subset_from_columns(*cols)
      new_labels = self.labels.select { |label| cols.include?(label) }
      new_data_frame = DataFrame.new(*self.labels)
      # Hand over copies of the rows: drop! deletes cells in place, which
      # must not reach the rows of this frame.
      new_data_frame.import(self.items.map { |row| row.dup })
      self.labels.each do |label|
        new_data_frame.drop!(label) unless new_labels.include?(label)
      end
      new_data_frame
    end

    # Duplicates a column, the values only. This is useful when creating a
    # related column, such as values by category. The copy is labelled with
    # the first free numeric suffix. Returns false for an unknown column.
    def duplicate!(column_name)
      return false unless self.labels.include?(column_name)
      i = 1
      i += 1 while self.labels.include?(new_column_name(column_name, i))
      self.append!(new_column_name(column_name, i), self.render_column(column_name).dup)
      true
    end

    # Label for the i-th copy of a column, e.g. :x and 1 give :x1.
    def new_column_name(column_name, i)
      (column_name.to_s + i.to_s).to_sym
    end
    protected :new_column_name

    protected
      # Remembers the range_hash of every column, or nil when none is set.
      def store_range_hashes
        @stored_range_hashes = self.labels.inject({}) do |h, label|
          h[label] = self.render_column(label).range_hash
          h
        end
        @stored_range_hashes = nil if @stored_range_hashes.all? { |k, v| v.nil? }
      end

      # Re-applies the remembered range hashes. Returns false when nothing
      # was stored, true otherwise.
      def restore_range_hashes
        return false unless @stored_range_hashes
        @stored_range_hashes.each do |label, range_hash|
          self.render_column(label).set_categories(range_hash) if range_hash
        end
        true
      end

      # Re-applies the stored range hash for the column, when there is one,
      # and returns a copy of the column's category map.
      def category_map_from_stored_range_hash(column)
        if @stored_range_hashes && @stored_range_hashes.key?(column)
          self.render_column(column).set_categories(@stored_range_hashes[column])
        end
        self.render_column(column).category_map.dup
      end

  end
end

class DataFrame
  include DF::ColumnManagement
end
|