davidrichards-data_frame 0.0.18 → 0.0.19

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.rdoc CHANGED
@@ -91,6 +91,22 @@ Data Frame can now create sub-models:
91
91
  >> df.models
92
92
  => #<OpenStruct weekend=DataFrame rows: 179 labels: [:x, :y, :month, :day, :ffmc, :dmc, :dc, :isi, :temp, :rh, :wind, :rain, :area]>
93
93
 
94
+ == Utilities
95
+
96
+ I use data frame for a lot of things, and I've added some utilities for this gem in case you would like to as well. For instance, here is how I take the data in a data frame and load it into a neural network:
97
+
98
+ # Show mlp. Will probably need to add a row classifier for training and test data. Also, will probably want to
99
+
100
+ == CLI
101
+
102
+ There are some really interesting things that have good command-line shortcuts:
103
+
104
+ * Make
105
+ * A
106
+ * List
107
+
108
+ # Now add some demos
109
+
94
110
  ==Installation
95
111
 
96
112
  sudo gem install davidrichards-data_frame
data/VERSION.yml CHANGED
@@ -1,4 +1,4 @@
1
1
  ---
2
2
  :major: 0
3
3
  :minor: 0
4
- :patch: 18
4
+ :patch: 20
data/bin/plain_frame ADDED
@@ -0,0 +1,22 @@
1
#!/usr/bin/env ruby
# Starts an irb session with the data_frame library already required.
#
# The interpreter flags (-wKU) are no longer on the shebang line: on Linux,
# env receives "ruby -wKU" as one program name and the script cannot start.
require 'yaml'
require 'optparse'

bin_dir = File.dirname(__FILE__)

# The version is read from the VERSION.yml that sits one directory above bin/.
version_hash = YAML.load_file(File.join(bin_dir, %w(.. VERSION.yml)))
version = [version_hash[:major].to_s, version_hash[:minor].to_s, version_hash[:patch].to_s].join(".")
df_file = File.join(bin_dir, %w(.. lib data_frame))

# Windows builds of Ruby ship irb as a batch file.
# (?: ... ) is the non-capturing group; the previous (:? ... ) was a typo.
irb = RUBY_PLATFORM =~ /(?:mswin|mingw)/ ? 'irb.bat' : 'irb'

options = { :irb => irb }
OptionParser.new do |opt|
  opt.banner = "Usage: plain_frame [options]"
  opt.on("--irb=[#{irb}]", 'Invoke a different irb.') { |v| options[:irb] = v }
  opt.parse!(ARGV)
end

libs = " -r irb/completion -r #{df_file}"

puts "Loading Data Frame version: #{version}"

# exec replaces this process with irb; nothing after this line runs.
exec "#{options[:irb]} #{libs} --simple-prompt"
data/lib/data_frame.rb CHANGED
@@ -20,6 +20,7 @@ $:.unshift(File.dirname(__FILE__))
20
20
  require 'data_frame/callback_array'
21
21
  require 'data_frame/transposable_array'
22
22
  require 'data_frame/parameter_capture'
23
- require 'data_frame/arff'
24
23
  require 'data_frame/data_frame'
25
24
  require 'data_frame/model'
25
+
26
+ Dir.glob("#{File.dirname(__FILE__)}/data_frame/core/*.rb").each { |file| require file }
@@ -1,45 +1,52 @@
1
- # Turns a data frame into ARFF-formatted content.
2
- module ARFF
3
-
4
- # Used in arff, but generally useful.
5
- def to_csv(include_header=true)
6
- value = include_header ? self.labels.map{|e| e.to_s}.join(',') + "\n" : ''
7
- self.items.inject(value) do |list, e|
8
- list << e.map {|cell| cell.to_s}.join(',') + "\n"
9
- end
10
- end
1
+ module DF #:nodoc:
2
+ # Turns a data frame into ARFF-formatted content.
3
+ module ARFF
11
4
 
12
- def to_arff
13
- arff_header + to_csv(false)
14
- end
15
-
16
- protected
17
- def arff_attributes
18
- container = defined?(Dictionary) ? Dictionary.new : Hash.new
19
-
20
- self.labels.inject(container) do |list, e|
21
- list[e] = self.render_column(e).categories
22
- end
23
- end
24
-
25
- def arff_formatted_attributes
26
- self.labels.inject('') do |str, e|
27
- val = "{" + self.render_column(e).categories.map{|x| x.to_s}.join(',') + "}"
28
- str << "@attribute #{e} #{val}\n"
5
# Renders the frame as comma-separated text, one line per row.
# When include_header is true, the first line lists the labels.
# Used in arff, but generally useful.
def to_csv(include_header=true)
  output = ''
  if include_header
    output << self.labels.map { |label| label.to_s }.join(',') << "\n"
  end
  self.items.each do |row|
    output << row.map { |cell| cell.to_s }.join(',') << "\n"
  end
  output
end
31
-
32
- def arff_relation
33
- self.name ? self.name.to_underscore_sym.to_s : 'unamed_relation'
12
+
13
# Builds the ARFF text: the header block followed by the rows as CSV
# (without a label line).
def to_arff
  header = arff_header
  rows = to_csv(false)
  header + rows
end
35
-
36
- def arff_header
37
- %[@relation #{arff_relation}
16
+
17
+ protected
18
# Maps every label to the categories of its rendered column.
# The result is a Dictionary when that class is defined, otherwise a Hash.
def arff_attributes
  container = defined?(Dictionary) ? Dictionary.new : Hash.new

  self.labels.inject(container) do |list, e|
    list[e] = self.render_column(e).categories
    # inject passes the block's value to the next iteration, so return the
    # container itself rather than the categories that were just assigned.
    list
  end
end
25
+
26
# Produces one "@attribute <label> {cat1,cat2,...}" line per label, using the
# categories of each rendered column.
def arff_formatted_attributes
  declarations = ''
  self.labels.each do |label|
    names = self.render_column(label).categories.map { |category| category.to_s }
    declarations << "@attribute #{label} {#{names.join(',')}}\n"
  end
  declarations
end
32
+
33
# Name used on the @relation line: the frame's name converted with
# to_underscore_sym, or a fixed placeholder when the frame has no name.
def arff_relation
  return 'unamed_relation' unless self.name
  self.name.to_underscore_sym.to_s
end
36
+
37
+ def arff_header
38
+ %[@relation #{arff_relation}
38
39
 
39
40
  #{arff_formatted_attributes}
40
41
  @data
41
42
  ]
42
- end
43
-
44
- alias :arff_items :to_csv
43
+ end
44
+
45
+ alias :arff_items :to_csv
46
+ end
47
+
48
+ end
49
+
50
+ class DataFrame
51
+ include DF::ARFF
45
52
  end
@@ -0,0 +1,102 @@
1
+ module DF #:nodoc:
2
+ module ColumnManagement #:nodoc:
3
+
4
# Moves a column to the end of the frame: appends a copy under a temporary
# name, drops the original, then renames the copy back to the original name.
# Raises ArgumentError when the column is not among the labels.
def move_to_last!(orig_name)
  unless self.labels.include?(orig_name)
    raise ArgumentError, "Column not found"
  end
  placeholder = "#{orig_name}_a_unique_name".to_sym
  column_values = self.render_column(orig_name)
  self.append!(placeholder, column_values)
  self.drop!(orig_name)
  # rename! takes the new name first, then the existing one.
  self.rename!(orig_name, placeholder)
end
11
+
12
# Renames a column in place. Arguments are in the order of alias:
# new_name first, then orig_name. Both are normalized with to_underscore_sym.
# Raises ArgumentError when orig_name is missing or new_name is already used.
def rename!(new_name, orig_name)
  target = new_name.to_underscore_sym
  source = orig_name.to_underscore_sym
  position = self.labels.index(source)
  raise ArgumentError, "Column not found" if position.nil?
  if self.labels.include?(target)
    raise ArgumentError, "Cannot name #{source} to #{target}, that column already exists."
  end
  self.labels[position] = target
end
21
+
22
# Adds a unique column to the table.
#
# column_name is normalized with to_underscore_sym before the duplicate check,
# so a name that normalizes to an existing label is rejected.
# value may be an Array (element i goes to row i) or a single object that is
# copied into every row; it defaults to nil.
# Raises ArgumentError when the normalized label already exists.
def append!(column_name, value=nil)
  column_name = column_name.to_underscore_sym
  raise ArgumentError, "Can't have duplicate column names" if self.labels.include?(column_name)
  self.labels << column_name
  if value.is_a?(Array)
    self.items.each_with_index do |item, i|
      item << value[i]
    end
  else
    self.items.each do |item|
      item << value
    end
  end
  # Because we are tainting the sub arrays, the TaintableArray doesn't know it's been changed.
  self.items.taint
end
38
+
39
# Replaces the contents of one column and returns self.
#
# Either pass the replacement values directly, or pass a block: the block is
# applied to each value returned by the column's accessor and the results
# become the new column.
# Raises ArgumentError when neither values nor a block is given.
def replace!(column, values=nil, &block)
  unless values || block
    raise ArgumentError, "Provide replacement values or a block"
  end
  column = validate_column(column)
  if not values
    values = self.send(column)
    values.map! {|e| block.call(e)}
  end
  replace_column!(column, values)
  self
end
48
+
49
# Replace a single column with an array of values.
# Row i receives values[i], so it is helpful to have the values the same size
# as the rest of the data frame. Rows are updated in place and @items is
# reassigned to a fresh array holding them.
def replace_column!(column, values)
  column = validate_column(column)
  position = self.labels.index(column)
  updated = []
  self.items.each_with_index do |row, row_number|
    row[position] = values[row_number]
    updated.push(row)
  end
  @items = updated.dup
end
63
+
64
# Drop one or more columns, delegating each label to drop_one!.
# Returns self.
def drop!(*labels)
  labels.each { |label| drop_one!(label) }
  self
end
71
+
72
# Drop a single column: removes the label and the matching cell of every row.
# Returns nil when the label is unknown, otherwise self.
def drop_one!(label)
  position = self.labels.index(label)
  return nil if position.nil?
  self.items.each { |row| row.delete_at(position) }
  self.labels.delete_at(position)
  self
end
82
+
83
# Creates a new data frame, only with the specified columns.
# The new frame is built with every label, loaded with this frame's items, and
# then has each unrequested column dropped; kept columns stay in their
# existing order.
def subset_from_columns(*cols)
  kept = self.labels.select { |label| cols.include?(label) }
  copy = DataFrame.new(*self.labels)
  copy.import(self.items)
  self.labels.each do |label|
    copy.drop!(label) unless kept.include?(label)
  end
  copy
end
96
+
97
+ end
98
+ end
99
+
100
+ class DataFrame
101
+ include DF::ColumnManagement
102
+ end
@@ -0,0 +1,48 @@
1
+ module DF #:nodoc:
2
+ module Filter #:nodoc:
3
+
4
# Keeps only the rows for which the block returns a truthy value, and
# returns self. Each row is handed to the block after cast_row converts it
# to the class resolved by infer_class, so it can be an OpenStruct or a Hash
# for easier filter methods. Note, don't try this with a hash or open struct
# unless you have facets available.
def filter!(as=Array, &block)
  as = infer_class(as)
  survivors = []
  self.items.each do |row|
    survivors << row if block.call(cast_row(row, as))
  end
  @items = survivors.dup
  self
end
17
+
18
# Non-destructive counterpart of filter!: runs the same filter on a clone of
# this frame and returns the result.
def filter(as=Array, &block)
  self.clone.filter!(as, &block)
end
22
+
23
# Returns a filtered copy of the frame. Each key of hash names a column
# (normalized with to_underscore_sym) and each value is the accepted value,
# or an Array/Range of accepted values. Keys that are not labels are ignored.
def filter_by_category(hash)
  copy = self.dup
  hash.each do |key, value|
    label = key.to_underscore_sym
    if self.labels.include?(label)
      allowed = (value.is_a?(Array) || value.is_a?(Range)) ? value : [value]
      copy.filter!(:hash) { |row| allowed.include?(row[label]) }
    end
  end
  copy
end
33
+
34
# In-place version of filter_by_category. Each key of hash names a column
# (normalized with to_underscore_sym) and each value is the accepted value,
# or an Array/Range of accepted values. Keys that are not labels are ignored.
# Returns self, like filter! and drop!, so calls can be chained.
def filter_by_category!(hash)
  hash.each do |key, value|
    key = key.to_underscore_sym
    next unless self.labels.include?(key)
    value = [value] unless value.is_a?(Array) or value.is_a?(Range)
    self.filter!(:hash) {|row| value.include?(row[key])}
  end
  self
end
42
+
43
+ end
44
+ end
45
+
46
+ class DataFrame
47
+ include DF::Filter
48
+ end
@@ -0,0 +1,112 @@
1
+ module DF #:nodoc:
2
+ module Import #:nodoc:
3
+
4
+ module InferCSV #:nodoc:
5
+
6
+ protected
7
+ def default_csv_opts; {:converters => :all}; end
8
+
9
# Resolves obj to CSV text and parses it into [labels, table].
#
# obj is tried, in order, as an existing file path, as something open can
# read, and finally as the CSV text itself when it is a String.
# opts are merged over default_csv_opts and passed to FCSV.parse; unless
# :headers is given as false/nil, the first parsed row is taken as the labels.
# Trailing empty rows are removed. Returns nil when no contents were found.
def infer_csv_contents(obj, opts={})
  contents = File.read(obj) if File.exist?(obj)
  begin
    # NOTE(review): Kernel#open treats a leading "|" as a command to run on
    # Ruby versions that still support it; do not pass untrusted strings here.
    open(obj) {|f| contents = f.read} unless contents
  rescue
    # Best effort only: obj may simply be raw CSV text.
    nil
  end
  contents ||= obj if obj.is_a?(String)
  return nil unless contents
  table = FCSV.parse(contents, default_csv_opts.merge(opts))
  labels = opts.fetch(:headers, true) ? table.shift : []
  # Guard against running off an empty table (header-only or empty input).
  table.pop while table.last && table.last.empty?
  [labels, table]
end
25
+
26
+ end # InferCSV
27
+
28
+ module ClassMethods #:nodoc:
29
+
30
+ include InferCSV
31
+
32
# This is the neatest part of this neat gem.
# DataFrame.from_csv can be called in a lot of ways:
#   DataFrame.from_csv(csv_contents)
#   DataFrame.from_csv(filename)
#   DataFrame.from_csv(url)
# If you need to define converters for FasterCSV, do it before calling
# this method:
#   FasterCSV::Converters[:special] = lambda{|f| f == 'foo' ? 'bar' : 'foo'}
#   DataFrame.from_csv('http://example.com/my_special_url.csv', :converters => :special)
# This returns bar where 'foo' was found and 'foo' everywhere else.
#
# Returns the populated frame, named by infer_name_from_contents, or nil when
# the contents could not be resolved into labels and rows.
def from_csv(obj, opts={})
  parsed_labels, parsed_rows = infer_csv_contents(obj, opts)
  inferred_name = infer_name_from_contents(obj, opts)
  return nil unless parsed_labels && parsed_rows
  frame = new(*parsed_labels)
  frame.import(parsed_rows)
  frame.name = inferred_name
  frame
end
51
+
52
+ protected
53
+
54
# Derives a name from the last path segment of obj: the final extension is
# removed and the remainder is titleized. Only works for named sources (urls
# and files); any error while deriving the name yields nil.
def infer_name_from_contents(obj, opts={})
  basename = File.split(obj).last
  stem = basename.split('.')[0..-2].join('.')
  stem.titleize
rescue
  nil
end
62
+
63
+ end # Class Methods
64
+
65
+ module InstanceMethods #:nodoc:
66
+
67
+ include InferCSV
68
+
69
+ def add_item(item)
70
+ self.items << item
71
+ end
72
+ alias :add :add_item
73
+
74
# Loads a batch of rows. Expects an array of arrays, else you don't
# know what you have. A String is resolved through infer_csv_contents
# (without a header row) and the resulting table is imported.
# Returns true; raises ArgumentError for any other kind of input.
def import(rows)
  if rows.is_a?(Array)
    import_array(rows)
  elsif rows.is_a?(String)
    _labels, table = infer_csv_contents(rows, :headers => false)
    import(table)
  else
    raise ArgumentError, "Don't know how to import data from #{rows.class}"
  end
  true
end
88
+
89
+ protected
90
# Imports a table as an array of arrays.
# If the array is one-dimensional and there is more than one label, it
# imports only one row; otherwise every element is added as its own row.
# Raises ArgumentError when rows is not an Array.
def import_array(rows)
  raise ArgumentError, "Can only work with arrays" unless rows.is_a?(Array)
  single_row = self.labels.size > 1 && rows.dimensions == 1
  return self.add_item(rows) if single_row
  rows.each { |row| self.add_item(row) }
end
103
+
104
+ end # Instance Methods
105
+
106
+ end
107
+ end
108
+
109
+ class DataFrame
110
+ include DF::Import::InstanceMethods
111
+ extend DF::Import::ClassMethods
112
+ end
@@ -0,0 +1,61 @@
1
+ module DF #:nodoc:
2
+ module PreProcess #:nodoc:
3
# A weird name. This creates a column for every category in a column
# and marks each row by its value: the new column "<col>_<category>" holds
# true/false per row.
#
# Hashes may be mixed in with the column names; they are merged into the
# options. With :allow_overlap set, a row is marked when the category is
# among values.all_categories(value); otherwise it is marked when the row's
# entry in values.category_map equals the category.
def j_binary_ize!(*columns)
  # Allows to mix a hash with the columns.
  hashes, names = columns.partition { |arg| arg.is_a?(Hash) }
  options = {}
  hashes.each { |extra| options.merge!(extra) }

  # Generates new columns
  names.each do |col|
    values = render_column(col.to_underscore_sym)
    values.categories.each do |category|
      full_name = "#{col}_#{category}".to_sym
      flags =
        if options[:allow_overlap]
          memberships = values.map { |e| values.all_categories(e) }
          memberships.map { |found| found.include?(category) }
        else
          values.category_map.map { |e| e == category }
        end
      self.append!(full_name, flags)
    end
  end
end
26
+
27
# Adds a column, numerical_column_name, that encodes the nominal data of a
# column numerically: each row gets an array of 0s with a single 1 at the
# position assigned to the value's category (one position per distinct
# category, in order of first appearance).
def numericize!(*columns)
  columns.each do |col|
    values = render_column(col.to_underscore_sym)
    # Look each value's category up once and reuse it below.
    value_categories = values.map {|v| values.category(v)}

    # category => slot index
    category_map = {}
    value_categories.uniq.each_with_index {|c, i| category_map[c] = i}

    width = category_map.size
    new_values = value_categories.map do |c|
      encoded = Array.new(width, 0)
      encoded[category_map[c]] = 1
      encoded
    end

    new_name = "numerical #{col.to_s}".to_underscore_sym
    self.append!(new_name, new_values)
  end
end
55
+
56
+ end
57
+ end
58
+
59
+ class DataFrame
60
+ include DF::PreProcess
61
+ end