data_frame 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. data/README.rdoc +122 -0
  2. data/VERSION.yml +4 -0
  3. data/bin/plain_frame +22 -0
  4. data/lib/data_frame.rb +26 -0
  5. data/lib/data_frame/arff.rb +52 -0
  6. data/lib/data_frame/callback_array.rb +152 -0
  7. data/lib/data_frame/core/column_management.rb +147 -0
  8. data/lib/data_frame/core/filter.rb +48 -0
  9. data/lib/data_frame/core/import.rb +113 -0
  10. data/lib/data_frame/core/pre_process.rb +69 -0
  11. data/lib/data_frame/core/saving.rb +29 -0
  12. data/lib/data_frame/core/training.rb +46 -0
  13. data/lib/data_frame/data_frame.rb +115 -0
  14. data/lib/data_frame/id3.rb +28 -0
  15. data/lib/data_frame/kmeans.rb +10 -0
  16. data/lib/data_frame/labels_from_uci.rb +48 -0
  17. data/lib/data_frame/mlp.rb +18 -0
  18. data/lib/data_frame/model.rb +22 -0
  19. data/lib/data_frame/parameter_capture.rb +50 -0
  20. data/lib/data_frame/sbn.rb +18 -0
  21. data/lib/data_frame/transposable_array.rb +23 -0
  22. data/lib/ext/array.rb +11 -0
  23. data/lib/ext/open_struct.rb +5 -0
  24. data/lib/ext/string.rb +5 -0
  25. data/lib/ext/symbol.rb +5 -0
  26. data/spec/data_frame/arff_spec.rb +48 -0
  27. data/spec/data_frame/callback_array_spec.rb +148 -0
  28. data/spec/data_frame/core/column_management_spec.rb +128 -0
  29. data/spec/data_frame/core/filter_spec.rb +88 -0
  30. data/spec/data_frame/core/import_spec.rb +41 -0
  31. data/spec/data_frame/core/pre_process_spec.rb +103 -0
  32. data/spec/data_frame/core/saving_spec.rb +61 -0
  33. data/spec/data_frame/core/training_spec.rb +72 -0
  34. data/spec/data_frame/data_frame_spec.rb +141 -0
  35. data/spec/data_frame/id3_spec.rb +22 -0
  36. data/spec/data_frame/model_spec.rb +36 -0
  37. data/spec/data_frame/parameter_capture_spec.rb +32 -0
  38. data/spec/data_frame/transposable_array_spec.rb +138 -0
  39. data/spec/data_frame_spec.rb +29 -0
  40. data/spec/ext/array_spec.rb +13 -0
  41. data/spec/fixtures/basic.csv +3 -0
  42. data/spec/fixtures/discrete_testing.csv +4 -0
  43. data/spec/fixtures/discrete_training.csv +21 -0
  44. data/spec/spec_helper.rb +8 -0
  45. metadata +128 -0
@@ -0,0 +1,48 @@
1
module DF #:nodoc:
  module Filter #:nodoc:

    # Keeps only the rows for which the block returns a truthy value,
    # modifying this data frame in place.
    #
    # as::    how each row is handed to the block. It is resolved with
    #         infer_class and the row is converted with cast_row, so the
    #         default Array passes the raw row; :hash, Hash or OpenStruct
    #         pass a keyed view of it.
    # block:: called once per row with the converted row.
    #
    # Returns self.
    def filter!(as=Array, &block)
      as = infer_class(as)
      kept = []
      self.items.each do |row|
        kept << row if block.call(cast_row(row, as))
      end
      # The surviving rows are stored as a fresh plain Array.
      @items = kept
      self
    end

    # Non-destructive version of filter!: filters a clone and returns it,
    # leaving the receiver's rows untouched.
    def filter(as=Array, &block)
      self.clone.filter!(as, &block)
    end

    # Returns a copy of the data frame restricted to the rows matching
    # every column => value pair in +hash+. See filter_by_category! for
    # the matching rules.
    def filter_by_category(hash)
      self.dup.filter_by_category!(hash)
    end

    # Restricts the data frame, in place, to the rows matching every
    # column => value pair in +hash+.
    #
    # * Keys are normalized with to_underscore_sym; keys that are not
    #   labels of this data frame are ignored.
    # * A value may be a single object, an Array of accepted values or a
    #   Range; a row matches when its cell is included in it.
    #
    # Returns self, like filter!, so calls can be chained.
    def filter_by_category!(hash)
      hash.each do |key, value|
        column = key.to_underscore_sym
        next unless self.labels.include?(column)
        accepted = (value.is_a?(Array) || value.is_a?(Range)) ? value : [value]
        self.filter!(:hash) { |row| accepted.include?(row[column]) }
      end
      self
    end

  end
end

class DataFrame
  include DF::Filter
end
@@ -0,0 +1,113 @@
1
module DF #:nodoc:
  module Import #:nodoc:

    module InferCSV #:nodoc:

      # Matches strings that begin with a URI scheme such as "http://".
      URL_PATTERN = %r{\A[a-z][a-z0-9+.\-]*://}i

      protected

      # Options always handed to FCSV.parse; caller options override them.
      def default_csv_opts; {:converters => :all}; end

      # Turns +obj+ (a file name, a URL or raw CSV text) into
      # [labels, table], where table is an array of rows.
      #
      # opts:: passed on to FCSV.parse. :headers (default true) decides
      #        whether the first parsed row is taken as the labels.
      #
      # Trailing empty rows are dropped. Returns nil when no contents
      # could be obtained from +obj+.
      def infer_csv_contents(obj, opts={})
        contents = read_csv_source(obj)
        return nil unless contents
        table = FCSV.parse(contents, default_csv_opts.merge(opts))
        labels = opts.fetch(:headers, true) ? table.shift : []
        # table.last is nil once the table is exhausted (empty or
        # header-only input), so guard before asking whether it is empty.
        table.pop while table.last && table.last.empty?
        [labels, table]
      end

      # Resolves +obj+ to CSV text: an existing file is read, a URL is
      # fetched (best effort), and any other String is taken to be the
      # CSV text itself. Returns nil for anything else.
      def read_csv_source(obj)
        return File.read(obj) if File.exist?(obj)
        return nil unless obj.is_a?(String)
        fetched = nil
        if obj =~ URL_PATTERN
          begin
            fetched = open_remote(obj)
          rescue
            # Best effort: an unreachable URL falls through to being
            # treated as literal text, as before.
            fetched = nil
          end
        end
        fetched || obj
      end

      # Fetches a URL. Kernel#open is only reached for URL-looking strings:
      # given arbitrary text it would run anything starting with "|" as a
      # shell command. URI.open is preferred when open-uri provides it.
      def open_remote(url)
        if defined?(URI) && URI.respond_to?(:open)
          URI.open(url) { |f| f.read }
        else
          open(url) { |f| f.read }
        end
      end

    end # InferCSV

    module ClassMethods #:nodoc:

      include InferCSV

      # This is the neatest part of this neat gem.
      # DataFrame.from_csv can be called in a lot of ways:
      #   DataFrame.from_csv(csv_contents)
      #   DataFrame.from_csv(filename)
      #   DataFrame.from_csv(url)
      # If you need to define converters for FasterCSV, do it before calling
      # this method:
      #   FasterCSV::Converters[:special] = lambda{|f| f == 'foo' ? 'bar' : 'foo'}
      #   DataFrame.from_csv('http://example.com/my_special_url.csv', :converters => :special)
      # This returns bar where 'foo' was found and 'foo' everywhere else.
      #
      # Returns the new data frame, or nil when no labels and rows could
      # be read from +obj+.
      def from_csv(obj, opts={})
        labels, table = infer_csv_contents(obj, opts)
        name = infer_name_from_contents(obj, opts)
        return nil unless labels && table
        df = new(*labels)
        df.import(table)
        df.name = name
        df
      end

      protected

      # Derives a name from the last path segment of +obj+ with its
      # extension removed. Only meaningful for named sources (URLs and
      # files). Returns nil if anything in the chain raises, including
      # when String#titleize (not core Ruby) is unavailable.
      def infer_name_from_contents(obj, opts={})
        begin
          File.split(obj).last.split('.')[0..-2].join('.').titleize
        rescue
          nil
        end
      end

    end # Class Methods

    module InstanceMethods #:nodoc:

      include InferCSV

      # Appends a single row to the data frame.
      def add_item(item)
        self.items << item
      end
      alias :add :add_item

      # Loads a batch of rows. Expects an array of arrays, or a String
      # (CSV text, file name or URL) that is parsed without a header row.
      # Raises ArgumentError for anything else. Returns true.
      def import(rows)
        case rows
        when Array
          import_array(rows)
        when String
          _labels, table = infer_csv_contents(rows, :headers => false)
          import(table)
        else
          raise ArgumentError, "Don't know how to import data from #{rows.class}"
        end
        true
      end

      protected

      # Imports a table as an array of arrays.
      # If the array is one-dimensional and there is more than one label, it
      # imports only one row. An empty array imports nothing.
      def import_array(rows)
        raise ArgumentError, "Can only work with arrays" unless rows.is_a?(Array)
        # Without this guard an empty table could be taken for a single
        # (empty) row by the one-dimensional branch below.
        return if rows.empty?
        if self.labels.size > 1 && rows.dimensions == 1
          self.add_item(rows)
        else
          rows.each do |row|
            self.add_item(row)
          end
        end
      end

    end # Instance Methods

  end
end

class DataFrame
  include DF::Import::InstanceMethods
  extend DF::Import::ClassMethods
end
@@ -0,0 +1,69 @@
1
module DF #:nodoc:
  module PreProcess #:nodoc:

    # A weird name. For every category found in each given column this
    # appends a boolean column named "<column>_<category>" that marks, row
    # by row, whether the row belongs to that category.
    #
    # columns:: column names; any Hash mixed into the list is merged into
    #           the options.
    #
    # Options:
    # :allow_overlap:: when truthy, a row is marked for every category
    #                  reported by all_categories for its value, instead
    #                  of only the single one from category_map.
    def j_binary_ize!(*columns)
      # Allows to mix a hash with the columns.
      options = {}
      names = []
      columns.each do |arg|
        if arg.is_a?(Hash)
          options.merge!(arg)
        else
          names << arg
        end
      end

      # Generates new columns
      names.each do |col|
        values = render_column(col.to_underscore_sym)
        categories = values.categories
        # The per-row membership does not depend on the category being
        # rendered, so it is computed once per column.
        memberships = if options[:allow_overlap]
          values.map { |e| values.all_categories(e) }
        else
          values.category_map
        end

        categories.each do |category|
          full_name = (col.to_s + "_" + category.to_s).to_sym
          flags = if options[:allow_overlap]
            memberships.map { |found| found.include?(category) }
          else
            memberships.map { |found| found == category }
          end
          self.append!(full_name, flags)
        end
      end
    end

    # Appends a column named numerical_<column name> for each given column.
    # Each cell of the new column is an indicator array: all zeros except
    # for a 1 at the position of the row's category, where positions follow
    # the order in which categories first appear in the column.
    def numericize!(*columns)
      columns.each do |col|
        values = render_column(col.to_underscore_sym)
        row_categories = values.map { |v| values.category(v) }

        # category => position, numbered in order of first appearance.
        position = {}
        row_categories.each do |c|
          position[c] = position.size unless position.key?(c)
        end

        new_values = row_categories.map do |c|
          indicator = Array.new(position.size, 0)
          indicator[position[c]] = 1
          indicator
        end

        new_name = "numerical #{col.to_s}".to_underscore_sym
        self.append!(new_name, new_values)
      end
    end

    # Replaces each given column with the result of
    # category_map_from_stored_range_hash for it. The range hashes are
    # stored first and restored once all columns have been replaced.
    def categorize!(*cs)
      store_range_hashes
      cs.each do |column|
        self.replace!(column, category_map_from_stored_range_hash(column))
      end
      restore_range_hashes
    end

  end
end

class DataFrame
  include DF::PreProcess
end
@@ -0,0 +1,29 @@
1
module DF #:nodoc:
  module Saving #:nodoc:

    # Writes the data frame to +filename+ as CSV.
    #
    # Options:
    # :only, :subset::               a column or list of columns to keep
    #                                (both are applied when both are given)
    # :filter, :filter_by_category:: a hash handed to filter_by_category
    # :include_header::              passed to to_csv; defaults to true
    #
    # Examples:
    #   df.save('/tmp/some_filename.csv')
    #   df.save('/tmp/some_filename.csv', :include_header => false) # No header information is saved
    #   df.save('/tmp/some_filename.csv', :only => [:list, :of, :columns])
    #   df.save('/tmp/some_filename.csv', :subset => [:list, :of, :columns])
    #   df.save('/tmp/some_filename.csv',
    #     :filter => {:column_name => :category_value,
    #     :another_column_name => (range..values)}) # Filter by category
    #
    # Returns whatever the underlying write call returns.
    def save(filename, opts={})
      frame = self

      # Column selection first, then row filtering.
      [:only, :subset].each do |key|
        frame = frame.subset_from_columns(*Array(opts[key])) if opts[key]
      end
      [:filter, :filter_by_category].each do |key|
        frame = frame.filter_by_category(opts[key]) if opts[key]
      end

      with_header = opts.fetch(:include_header, true)
      File.open(filename, "w") do |file|
        file.write frame.to_csv(with_header)
      end
    end

  end
end

class DataFrame
  include DF::Saving
end
@@ -0,0 +1,46 @@
1
module Training #:nodoc:

  # Returns a cached random selection of rows to train on.
  #
  # Options:
  # :reset::      discard the cached training and test sets first
  # :n::          number of rows to pick
  # :proportion:: share of the rows to pick when :n is absent (default 0.8)
  #
  # The requested size is clamped to 0..items.size and, because the
  # selection holds each distinct row at most once, to the number of
  # distinct rows. Rows are drawn through random_next.
  def training_set(opts={})
    if opts[:reset]
      @training_set = nil
      @test_set = nil
    end
    return @training_set if @training_set

    items_size = self.items.size
    proportion = opts.fetch(:proportion, 0.8)
    n = opts[:n] || (items_size * proportion).to_i
    n = items_size if n > items_size
    n = 0 if n < 0

    # Duplicate rows collapse under uniq! below, so asking for more rows
    # than there are distinct ones would make the loop spin forever.
    distinct = (0...items_size).map { |i| self.items[i] }.uniq.size
    n = distinct if n > distinct

    @training_set = []
    while n > @training_set.size
      # Top up to n draws, then drop repeated draws and try again.
      @training_set << random_next(items_size) while n > @training_set.size
      @training_set.uniq!
    end
    @training_set
  end


  # Returns the cached complement of the training set: the items with the
  # training set's rows excluded via exclusive_not. Pass :reset => true to
  # recompute it (the training set itself is not reset here).
  def test_set(opts={})
    @test_set = nil if opts[:reset]
    return @test_set if @test_set
    @test_set = self.items.exclusive_not(self.training_set)
  end

  protected

  # Picks one row at random from the first +n+ items.
  def random_next(n)
    self.items[rand(n)]
  end

end

class DataFrame
  include Training
end
@@ -0,0 +1,115 @@
1
# This allows me to have named columns and optionally named rows in a
# data frame, to work calculations (usually on the columns), to
# transpose the matrix and store the transposed matrix until the object
# is tainted.
class DataFrame

  def inspect
    "DataFrame rows: #{self.rows.size} labels: #{self.labels.inspect}"
  end

  # The labels of the data items
  attr_reader :labels
  alias :variables :labels

  # The items stored in the frame
  attr_accessor :items

  # An optional name, useful for arff files
  attr_accessor :name

  # Accepts the column labels either as separate arguments or as a single
  # Array. Every label is normalized with to_underscore_sym.
  def initialize(*labels)
    labels = labels.first if labels.size == 1 && labels.first.is_a?(Array)
    @labels = labels.map {|e| e.to_underscore_sym }
    @items = TransposableArray.new
  end

  # The optional row labels; an empty array until assigned.
  def row_labels
    @row_labels ||= []
  end

  # Assigns the row labels. Raises ArgumentError unless given an Array.
  def row_labels=(ary)
    raise ArgumentError, "Row labels must be an array" unless ary.is_a?(Array)
    @row_labels = ary
  end

  # The rows as an array of arrays, an alias for items.
  alias :rows :items

  # Return the row, given its row label, or nil if the label is unknown.
  def render_row(sym)
    i = self.row_labels.index(sym)
    return nil unless i
    @items[i]
  end

  # Return the column, given its name, or nil if the name is unknown.
  def render_column(sym)
    i = @labels.index(sym.to_underscore_sym)
    return nil unless i
    @items.transpose[i]
  end

  # The columns as a Dictionary or Hash
  # This is cached, call columns(true) to reset the cache.
  def columns(reset=false)
    @columns = nil if reset
    return @columns if @columns

    # Dictionary is used only when that constant has been loaded.
    container = defined?(Dictionary) ? Dictionary.new : Hash.new
    @items.transpose.each_with_index do |col, i|
      container[@labels[i]] = col
    end
    @columns = container
  end
  alias :to_hash :columns
  alias :to_dictionary :columns

  # Resolves unknown messages, in this order: a column label returns that
  # column, a row label returns that row, and anything the items respond
  # to is forwarded to them.
  def method_missing(sym, *args, &block)
    if self.labels.include?(sym)
      render_column(sym)
    elsif self.row_labels.include?(sym)
      render_row(sym)
    elsif @items.respond_to?(sym)
      @items.send(sym, *args, &block)
    else
      super
    end
  end

  # Keeps respond_to? and method in agreement with method_missing.
  # Tolerates instances whose labels or items have not been set yet.
  def respond_to_missing?(sym, include_private=false)
    return true if @labels && @labels.include?(sym)
    return true if self.row_labels.include?(sym)
    return true if !@items.nil? && @items.respond_to?(sym)
    super
  end

  protected

  # Returns +column+ as a Symbol, raising ArgumentError unless it is one
  # of the labels. Uses to_sym, not to_underscore_sym.
  def validate_column(column)
    column = column.to_sym
    raise ArgumentError, "Must provide the name of an existing column.  Provided #{column.inspect}, needed to provide one of #{self.labels.inspect}" unless self.labels.include?(column)
    column
  end

  # Resolves a Symbol or String such as :hash to the class it names;
  # anything else is returned unchanged. Relies on String#classify and
  # String#constantize, which are not core Ruby.
  def infer_class(obj)
    obj = obj.to_s if obj.is_a?(Symbol)
    obj = obj.classify.constantize if obj.is_a?(String)
    obj
  end

  # Presents +row+ as an instance of +as+: a Hash or OpenStruct keyed by
  # the labels, the row itself for Array, or as.new(*row) for any other
  # class.
  def cast_row(row, as)
    if as == Array
      row
    elsif as == Hash
      labeled_row(row)
    elsif defined?(OpenStruct) && as == OpenStruct
      # Built through the public constructor rather than by writing to
      # OpenStruct's internal table.
      OpenStruct.new(labeled_row(row))
    else
      as.new(*row)
    end
  end

  # Pairs every label with the cell at the same position in +row+.
  def labeled_row(row)
    pairs = {}
    self.labels.each_with_index do |label, i|
      pairs[label] = row[i]
    end
    pairs
  end
end
@@ -0,0 +1,28 @@
1
module DF #:nodoc:
  # Uses Ilya Grigorik's ID3 decision_tree gem. The gem is optional: it is
  # loaded when available, and create_id3 raises LoadError with an install
  # hint when it is not. Nothing is installed automatically.
  module ID3

    # Makes sure the decisiontree gem is loaded. Returns true on success
    # and raises LoadError, with an install hint, when it is unavailable.
    def self.load_decisiontree
      return true if defined?(::DecisionTree)
      begin
        gem 'decisiontree'
        require 'decisiontree'
      rescue LoadError => e
        # A missing gem surfaces as LoadError, which a bare rescue
        # (StandardError) never catches.
        raise LoadError, "DF::ID3 needs the decisiontree gem (try `gem install decisiontree`): #{e.message}"
      end
      true
    end

    # Load eagerly when the gem is present, but do not make the rest of
    # the library unusable when it is not; create_id3 reports the problem.
    begin
      load_decisiontree
    rescue LoadError
      nil
    end

    # Builds a DecisionTree::ID3Tree from the labels and training data,
    # stores it and returns it.
    #
    # dependent_column:: currently unused.
    # opts[:default]::   default passed to the tree (1 when absent).
    #
    # Raises LoadError when the decisiontree gem is unavailable.
    #
    # NOTE(review): relies on a training_data method that is not defined
    # in this file - confirm where it comes from.
    def create_id3(dependent_column, opts={})
      DF::ID3.load_decisiontree
      # Need to put the dependent column in the last column
      # Probably have other pre processing as well.
      default = opts.fetch(:default, 1)
      @id3 = DecisionTree::ID3Tree.new(self.labels, self.training_data, default, :discrete)
    end

    # The tree built by the last create_id3 call, or nil if none was built.
    def id3
      @id3
    end
  end
end

class DataFrame
  include DF::ID3
end