davidrichards-data_frame 0.0.18 → 0.0.19

Sign up to get free protection for your applications and to get access to all the features.
data/README.rdoc CHANGED
@@ -91,6 +91,22 @@ Data Frame can now create sub-models:
91
91
  >> df.models
92
92
  => #<OpenStruct weekend=DataFrame rows: 179 labels: [:x, :y, :month, :day, :ffmc, :dmc, :dc, :isi, :temp, :rh, :wind, :rain, :area]>
93
93
 
94
+ == Utilities
95
+
96
+ I use Data Frame for a lot of things, and I've added some utilities to this gem in case you would like to use them as well. For instance, here is how I take the data in a data frame and load it into a neural network:
97
+
98
+ # Show mlp. Will probably need to add a row classifier for training and test data. Also, will probably want to
99
+
100
+ == CLI
101
+
102
+ There are some really interesting things that have good command-line shortcuts:
103
+
104
+ * Make
105
+ * A
106
+ * List
107
+
108
+ # Now add some demos
109
+
94
110
  ==Installation
95
111
 
96
112
  sudo gem install davidrichards-data_frame
data/VERSION.yml CHANGED
@@ -1,4 +1,4 @@
1
1
  ---
2
2
  :major: 0
3
3
  :minor: 0
4
- :patch: 18
4
+ :patch: 20
data/bin/plain_frame ADDED
@@ -0,0 +1,22 @@
1
+ #!/usr/bin/env ruby -wKU
2
+ require 'yaml'
3
+
4
+ version_hash = YAML.load_file(File.join(File.dirname(__FILE__), %w(.. VERSION.yml)))
5
+ version = [version_hash[:major].to_s, version_hash[:minor].to_s, version_hash[:patch].to_s].join(".")
6
+ df_file = File.join(File.dirname(__FILE__), %w(.. lib data_frame))
7
+
8
+ irb = RUBY_PLATFORM =~ /(:?mswin|mingw)/ ? 'irb.bat' : 'irb'
9
+
10
+ require 'optparse'
11
+ options = { :irb => irb, :without_stored_procedures => false }
12
+ OptionParser.new do |opt|
13
+ opt.banner = "Usage: console [environment] [options]"
14
+ opt.on("--irb=[#{irb}]", 'Invoke a different irb.') { |v| options[:irb] = v }
15
+ opt.parse!(ARGV)
16
+ end
17
+
18
+ libs = " -r irb/completion -r #{df_file}"
19
+
20
+ puts "Loading Data Frame version: #{version}"
21
+
22
+ exec "#{options[:irb]} #{libs} --simple-prompt"
data/lib/data_frame.rb CHANGED
@@ -20,6 +20,7 @@ $:.unshift(File.dirname(__FILE__))
20
20
  require 'data_frame/callback_array'
21
21
  require 'data_frame/transposable_array'
22
22
  require 'data_frame/parameter_capture'
23
- require 'data_frame/arff'
24
23
  require 'data_frame/data_frame'
25
24
  require 'data_frame/model'
25
+
26
+ Dir.glob("#{File.dirname(__FILE__)}/data_frame/core/*.rb").each { |file| require file }
@@ -1,45 +1,52 @@
1
- # Turns a data frame into ARFF-formatted content.
2
- module ARFF
3
-
4
- # Used in arff, but generally useful.
5
- def to_csv(include_header=true)
6
- value = include_header ? self.labels.map{|e| e.to_s}.join(',') + "\n" : ''
7
- self.items.inject(value) do |list, e|
8
- list << e.map {|cell| cell.to_s}.join(',') + "\n"
9
- end
10
- end
1
+ module DF #:nodoc:
2
+ # Turns a data frame into ARFF-formatted content.
3
+ module ARFF
11
4
 
12
- def to_arff
13
- arff_header + to_csv(false)
14
- end
15
-
16
- protected
17
- def arff_attributes
18
- container = defined?(Dictionary) ? Dictionary.new : Hash.new
19
-
20
- self.labels.inject(container) do |list, e|
21
- list[e] = self.render_column(e).categories
22
- end
23
- end
24
-
25
- def arff_formatted_attributes
26
- self.labels.inject('') do |str, e|
27
- val = "{" + self.render_column(e).categories.map{|x| x.to_s}.join(',') + "}"
28
- str << "@attribute #{e} #{val}\n"
5
+ # Used in arff, but generally useful.
6
+ def to_csv(include_header=true)
7
+ value = include_header ? self.labels.map{|e| e.to_s}.join(',') + "\n" : ''
8
+ self.items.inject(value) do |list, e|
9
+ list << e.map {|cell| cell.to_s}.join(',') + "\n"
29
10
  end
30
11
  end
31
-
32
- def arff_relation
33
- self.name ? self.name.to_underscore_sym.to_s : 'unamed_relation'
12
+
13
+ def to_arff
14
+ arff_header + to_csv(false)
34
15
  end
35
-
36
- def arff_header
37
- %[@relation #{arff_relation}
16
+
17
+ protected
18
+ def arff_attributes
19
+ container = defined?(Dictionary) ? Dictionary.new : Hash.new
20
+
21
+ self.labels.inject(container) do |list, e|
22
+ list[e] = self.render_column(e).categories
23
+ end
24
+ end
25
+
26
+ def arff_formatted_attributes
27
+ self.labels.inject('') do |str, e|
28
+ val = "{" + self.render_column(e).categories.map{|x| x.to_s}.join(',') + "}"
29
+ str << "@attribute #{e} #{val}\n"
30
+ end
31
+ end
32
+
33
+ def arff_relation
34
+ self.name ? self.name.to_underscore_sym.to_s : 'unamed_relation'
35
+ end
36
+
37
+ def arff_header
38
+ %[@relation #{arff_relation}
38
39
 
39
40
  #{arff_formatted_attributes}
40
41
  @data
41
42
  ]
42
- end
43
-
44
- alias :arff_items :to_csv
43
+ end
44
+
45
+ alias :arff_items :to_csv
46
+ end
47
+
48
+ end
49
+
50
+ class DataFrame
51
+ include DF::ARFF
45
52
  end
@@ -0,0 +1,102 @@
1
+ module DF #:nodoc:
2
+ module ColumnManagement #:nodoc:
3
+
4
+ def move_to_last!(orig_name)
5
+ raise ArgumentError, "Column not found" unless self.labels.include?(orig_name)
6
+ new_name = (orig_name.to_s + "_a_unique_name").to_sym
7
+ self.append!(new_name, self.render_column(orig_name))
8
+ self.drop!(orig_name)
9
+ self.rename!(orig_name, new_name)
10
+ end
11
+
12
+ # In the order of alias: new_name, orig_name
13
+ def rename!(new_name, orig_name)
14
+ new_name = new_name.to_underscore_sym
15
+ orig_name = orig_name.to_underscore_sym
16
+ raise ArgumentError, "Column not found" unless self.labels.include?(orig_name)
17
+ raise ArgumentError, "Cannot name #{orig_name} to #{new_name}, that column already exists." if self.labels.include?(new_name)
18
+ i = self.labels.index(orig_name)
19
+ self.labels[i] = new_name
20
+ end
21
+
22
+ # Adds a unique column to the table
23
+ def append!(column_name, value=nil)
24
+ raise ArgumentError, "Can't have duplicate column names" if self.labels.include?(column_name)
25
+ self.labels << column_name.to_underscore_sym
26
+ if value.is_a?(Array)
27
+ self.items.each_with_index do |item, i|
28
+ item << value[i]
29
+ end
30
+ else
31
+ self.items.each do |item|
32
+ item << value
33
+ end
34
+ end
35
+ # Because we are tainting the sub arrays, the TaintableArray doesn't know it's been changed.
36
+ self.items.taint
37
+ end
38
+
39
+ def replace!(column, values=nil, &block)
40
+ column = validate_column(column)
41
+ if not values
42
+ values = self.send(column)
43
+ values.map! {|e| block.call(e)}
44
+ end
45
+ replace_column!(column, values)
46
+ self
47
+ end
48
+
49
+ # Replace a single column with an array of values.
50
+ # It is helpful to have the values the same size as the rest of the data
51
+ # frame.
52
+ def replace_column!(column, values)
53
+ column = validate_column(column)
54
+ index = self.labels.index(column)
55
+ list = []
56
+ self.items.each_with_index do |item, i|
57
+ consolidated = item
58
+ consolidated[index] = values[i]
59
+ list << consolidated
60
+ end
61
+ @items = list.dup
62
+ end
63
+
64
+ # Drop one or more columns
65
+ def drop!(*labels)
66
+ labels.each do |label|
67
+ drop_one!(label)
68
+ end
69
+ self
70
+ end
71
+
72
+ # Drop a single column
73
+ def drop_one!(label)
74
+ i = self.labels.index(label)
75
+ return nil unless i
76
+ self.items.each do |item|
77
+ item.delete_at(i)
78
+ end
79
+ self.labels.delete_at(i)
80
+ self
81
+ end
82
+
83
+ # Creates a new data frame, only with the specified columns.
84
+ def subset_from_columns(*cols)
85
+ new_labels = self.labels.inject([]) do |list, label|
86
+ list << label if cols.include?(label)
87
+ list
88
+ end
89
+ new_data_frame = DataFrame.new(*self.labels)
90
+ new_data_frame.import(self.items)
91
+ self.labels.each do |label|
92
+ new_data_frame.drop!(label) unless new_labels.include?(label)
93
+ end
94
+ new_data_frame
95
+ end
96
+
97
+ end
98
+ end
99
+
100
+ class DataFrame
101
+ include DF::ColumnManagement
102
+ end
@@ -0,0 +1,48 @@
1
+ module DF #:nodoc:
2
+ module Filter #:nodoc:
3
+
4
+ # Takes a block to evaluate on each row. The row can be converted into
5
+ # an OpenStruct or a Hash for easier filter methods. Note, don't try this
6
+ # with a hash or open struct unless you have facets available.
7
+ def filter!(as=Array, &block)
8
+ as = infer_class(as)
9
+ items = []
10
+ self.items.each do |row|
11
+ value = block.call(cast_row(row, as))
12
+ items << row if value
13
+ end
14
+ @items = items.dup
15
+ self
16
+ end
17
+
18
+ def filter(as=Array, &block)
19
+ new_data_frame = self.clone
20
+ new_data_frame.filter!(as, &block)
21
+ end
22
+
23
+ def filter_by_category(hash)
24
+ new_data_frame = self.dup
25
+ hash.each do |key, value|
26
+ key = key.to_underscore_sym
27
+ next unless self.labels.include?(key)
28
+ value = [value] unless value.is_a?(Array) or value.is_a?(Range)
29
+ new_data_frame.filter!(:hash) {|row| value.include?(row[key])}
30
+ end
31
+ new_data_frame
32
+ end
33
+
34
+ def filter_by_category!(hash)
35
+ hash.each do |key, value|
36
+ key = key.to_underscore_sym
37
+ next unless self.labels.include?(key)
38
+ value = [value] unless value.is_a?(Array) or value.is_a?(Range)
39
+ self.filter!(:hash) {|row| value.include?(row[key])}
40
+ end
41
+ end
42
+
43
+ end
44
+ end
45
+
46
+ class DataFrame
47
+ include DF::Filter
48
+ end
@@ -0,0 +1,112 @@
1
+ module DF #:nodoc:
2
+ module Import #:nodoc:
3
+
4
+ module InferCSV #:nodoc:
5
+
6
+ protected
7
+ def default_csv_opts; {:converters => :all}; end
8
+
9
+ def infer_csv_contents(obj, opts={})
10
+ contents = File.read(obj) if File.exist?(obj)
11
+ begin
12
+ open(obj) {|f| contents = f.read} unless contents
13
+ rescue
14
+ nil
15
+ end
16
+ contents ||= obj if obj.is_a?(String)
17
+ return nil unless contents
18
+ table = FCSV.parse(contents, default_csv_opts.merge(opts))
19
+ labels = opts.fetch(:headers, true) ? table.shift : []
20
+ while table.last.empty?
21
+ table.pop
22
+ end
23
+ [labels, table]
24
+ end
25
+
26
+ end # InferCSV
27
+
28
+ module ClassMethods #:nodoc:
29
+
30
+ include InferCSV
31
+
32
+ # This is the neatest part of this neat gem.
33
+ # DataFrame.from_csv can be called in a lot of ways:
34
+ # DataFrame.from_csv(csv_contents)
35
+ # DataFrame.from_csv(filename)
36
+ # DataFrame.from_csv(url)
37
+ # If you need to define converters for FasterCSV, do it before calling
38
+ # this method:
39
+ # FasterCSV::Converters[:special] = lambda{|f| f == 'foo' ? 'bar' : 'foo'}
40
+ # DataFrame.from_csv('http://example.com/my_special_url.csv', :converters => :special)
41
+ # This returns bar where 'foo' was found and 'foo' everywhere else.
42
+ def from_csv(obj, opts={})
43
+ labels, table = infer_csv_contents(obj, opts)
44
+ name = infer_name_from_contents(obj, opts)
45
+ return nil unless labels and table
46
+ df = new(*labels)
47
+ df.import(table)
48
+ df.name = name
49
+ df
50
+ end
51
+
52
+ protected
53
+
54
+ # Only works for named sources: URLs and files
55
+ def infer_name_from_contents(obj, opts={})
56
+ begin
57
+ File.split(obj).last.split('.')[0..-2].join('.').titleize
58
+ rescue
59
+ nil
60
+ end
61
+ end
62
+
63
+ end # Class Methods
64
+
65
+ module InstanceMethods #:nodoc:
66
+
67
+ include InferCSV
68
+
69
+ def add_item(item)
70
+ self.items << item
71
+ end
72
+ alias :add :add_item
73
+
74
+ # Loads a batch of rows. Expects an array of arrays, else you don't
75
+ # know what you have.
76
+ def import(rows)
77
+ case rows
78
+ when Array
79
+ import_array(rows)
80
+ when String
81
+ labels, table = infer_csv_contents(rows, :headers => false)
82
+ import(table)
83
+ else
84
+ raise ArgumentError, "Don't know how to import data from #{rows.class}"
85
+ end
86
+ true
87
+ end
88
+
89
+ protected
90
+ # Imports a table as an array of arrays.
91
+ # If the array is one-dimensional and there is more than one label, it
92
+ # imports only one row.
93
+ def import_array(rows)
94
+ raise ArgumentError, "Can only work with arrays" unless rows.is_a?(Array)
95
+ if self.labels.size > 1 and rows.dimensions == 1
96
+ self.add_item(rows)
97
+ else
98
+ rows.each do |row|
99
+ self.add_item(row)
100
+ end
101
+ end
102
+ end
103
+
104
+ end # Instance Methods
105
+
106
+ end
107
+ end
108
+
109
+ class DataFrame
110
+ include DF::Import::InstanceMethods
111
+ extend DF::Import::ClassMethods
112
+ end
@@ -0,0 +1,61 @@
1
+ module DF #:nodoc:
2
+ module PreProcess #:nodoc:
3
+ # A weird name. This creates a column for every category in a column
4
+ # and marks each row by its value
5
+ def j_binary_ize!(*columns)
6
+ # Allows an options hash to be mixed in with the column names.
7
+ options = columns.find_all {|e| e.is_a?(Hash)}.inject({}) {|h, e| h.merge!(e)}
8
+ columns.delete_if {|e| e.is_a?(Hash)}
9
+
10
+ # Generates new columns
11
+ columns.each do |col|
12
+ values = render_column(col.to_underscore_sym)
13
+ values.categories.each do |category|
14
+ full_name = (col.to_s + "_" + category.to_s).to_sym
15
+ if options[:allow_overlap]
16
+ category_map = values.inject([]) do |list, e|
17
+ list << values.all_categories(e)
18
+ end
19
+ self.append!(full_name, category_map.map{|e| e.include?(category)})
20
+ else
21
+ self.append!(full_name, values.category_map.map{|e| e == category})
22
+ end
23
+ end
24
+ end
25
+ end
26
+
27
+ # Adds a column, numerical_<column_name>, that shows the same data as a
28
+ # nominal value, but as a number.
29
+ def numericize!(*columns)
30
+ columns.each do |col|
31
+ values = render_column(col.to_underscore_sym)
32
+ categories = values.categories
33
+ value_categories = values.map {|v| values.category(v)}
34
+
35
+ i = 0
36
+ category_map = value_categories.uniq.inject({}) do |h, c|
37
+ h[c] = i
38
+ i += 1
39
+ h
40
+ end
41
+
42
+ blank = Array.new(category_map.size, 0)
43
+ reverse_category_map = category_map.inject({}) {|h, e| h[e.last] = e.first; h}
44
+
45
+ new_values = values.inject([]) do |list, val|
46
+ a = blank.dup
47
+ a[category_map[values.category(val)]] = 1
48
+ list << a
49
+ end
50
+
51
+ new_name = "numerical #{col.to_s}".to_underscore_sym
52
+ self.append!(new_name, new_values)
53
+ end
54
+ end
55
+
56
+ end
57
+ end
58
+
59
+ class DataFrame
60
+ include DF::PreProcess
61
+ end