mikon 0.1.0.rc1

Sign up to get free protection for your applications and to get access to all the features.
data/lib/mikon.rb ADDED
@@ -0,0 +1,9 @@
1
+ require 'nmatrix'
2
+ require_relative "mikon/version"
3
+ require_relative "mikon/stats"
4
+ require_relative "mikon/plot"
5
+ require_relative "mikon/pivot"
6
+ require_relative "mikon/core/array"
7
+ require_relative "mikon/core/index"
8
+ require_relative "mikon/core/series"
9
+ require_relative "mikon/core/dataframe"
@@ -0,0 +1,139 @@
1
+ module Mikon
2
+
3
# Internal one-dimensional data container used for DataFrame columns.
# Numeric Arrays are stored in an NMatrix; any other Array is stored in a
# Mikon::ArrayWrapper (and the instance is extended with UseArray), because
# NMatrix instances whose dtype is :object frequently cause segmentation faults.
# Its statistical methods (e.g. #median) are compatible with Statsample::Vector's.
# @example
#   Mikon::DArray.new([1, 2, 3]) #-> #<Mikon::DArray:0xbacfc99c @data=[1, 2, 3], @dtype=:int32>
#
class DArray
  include Enumerable, Mikon::Stats
  attr_reader :dtype, :data

  # @param [NMatrix|Array] source values to wrap
  # @param [Hash] options forwarded to NMatrix.new when source is an all-numeric Array
  # @raise [RuntimeError] if source is neither an Array nor a one-dimensional NMatrix
  def initialize(source, options={})
    case source
    when Array
      if source.all? { |el| el.is_a?(Numeric) }
        @data = NMatrix.new([source.length], source, options)
      else
        #
        # NMatrix instance whose dtype is :object frequently causes Segmentation Fault
        # @example
        #   df = DataFrame.new({a: ["a", "b"], b: [1, 2]})
        #   df[:a].to_html #-> Segmentation Fault
        #
        # so non-numeric values are kept in a plain Array subclass instead.
        extend UseArray
        @data = Mikon::ArrayWrapper.new(source)
      end
    when NMatrix
      unless source.shape.length == 1 && source.shape.first.is_a?(Numeric)
        raise "Matrix shape is not valid"
      end
      @data = source
    else
      raise "Non-acceptable Argument Error"
    end
    @dtype = @data.dtype
  end

  # Iterate over the stored values.
  def each(&block)
    @data.each(&block)
  end

  # Fold the values starting from init.
  # Delegates to NMatrix#inject_rank over dimension 0 and unwraps the first
  # element of its result.
  # @param init initial accumulator
  def reduce(init, &block)
    @data.inject_rank(0, init, &block).first
  end

  # Grow the array to `length` elements, padding the tail with 0 (destructive).
  # @param [Integer] length the new length
  # @raise [RuntimeError] if length is smaller than the current length
  def expand(length)
    raise "The argument 'length' should be greater than length of now." if length < self.length
    # The original called `expand` recursively here instead of using `length`.
    padding = Array.new(length - self.length, 0)
    @data = NMatrix.new([length], @data.to_a + padding, dtype: @dtype)
  end

  # @return [Integer] number of stored elements
  def length
    @data.shape.first
  end

  # @param pos position passed straight to the underlying storage
  def [](pos)
    @data[pos]
  end

  # @return [Mikon::DArray] a new array with the values sorted
  def sort
    Mikon::DArray.new(@data.sort)
  end

  # @return [Mikon::DArray|Enumerator] a new array sorted by the block's result
  def sort_by(&block)
    return self.to_enum(:sort_by) unless block_given?
    Mikon::DArray.new(@data.sort_by(&block))
  end

  # @return [Mikon::DArray] a new array with the values in reverse order
  def reverse
    len = self.length
    Mikon::DArray.new(@data.map.with_index { |_val, i| @data[len - i - 1] })
  end

  # Element-wise addition/subtraction with another DArray.
  # Any other operand is handed to the ancestors; `super` needs explicit
  # arguments inside define_method (the implicit form raises RuntimeError).
  [:+, :-].each do |op|
    define_method(op) do |arg|
      if arg.is_a?(DArray)
        DArray.new(arg.coerce(@data).inject(op))
      else
        super(arg)
      end
    end
  end

  # Multiplication/division/modulo by a scalar.
  # Any other operand is handed to the ancestors (see the note on +/-).
  [:*, :/, :%].each do |op|
    define_method(op) do |arg|
      if arg.is_a?(Numeric)
        DArray.new(@data.send(op, arg))
      else
        super(arg)
      end
    end
  end

  # @param [NMatrix|Array] other raw storage of the other operand
  # @return [Array] the pair [other, @data] when both are the same kind of storage
  def coerce(other)
    if [NMatrix, Array].any? { |cls| other.is_a?(cls) && @data.is_a?(cls) }
      return other, @data
    else
      super
    end
  end

  # @return [Array] the values as a plain Array
  def to_a
    @data.to_a
  end

  # Replace NaN values with fill_value (destructive).
  # Values that cannot be converted with #to_f are left untouched.
  # @param fill_value replacement for NaN
  def fillna(fill_value=0)
    filled = @data.map do |val|
      val.respond_to?(:to_f) && val.to_f.nan? ? fill_value : val
    end
    # Array#map returns a plain Array; re-wrap it so #dtype and
    # #sorted_indices stay available on Array-backed instances.
    @data = @data.is_a?(Mikon::ArrayWrapper) ? Mikon::ArrayWrapper.new(filled) : filled
  end
end
115
+
116
# Array subclass that offers the small part of the NMatrix interface
# DArray relies on (#dtype and #sorted_indices).
class ArrayWrapper < Array
  # @return [Array<Integer>] positions of the elements, ordered by element value
  def sorted_indices
    each_index.sort_by { |pos| self[pos] }
  end

  # @return [Symbol] always :object
  def dtype
    :object
  end
end
125
+
126
# Overrides mixed into an object whose @data is an Array (or Array subclass)
# rather than an NMatrix.
module UseArray
  # Grow @data to `length` elements, padding the tail with 0 (destructive).
  # The class of @data is preserved.
  # @param [Integer] length the new length
  # @raise [RuntimeError] if length is smaller than the current length
  def expand(length)
    raise "The argument 'length' should be greater than length of now." if length < @data.length
    # Array.new builds the padding; Kernel#Array (used originally) takes one argument only.
    padding = Array.new(length - @data.length, 0)
    @data = @data.class.new(@data + padding)
  end

  # @return [Integer] number of stored elements
  def length
    @data.length
  end

  # Fold the values starting from init.
  # @param init initial accumulator
  def reduce(init, &block)
    @data.reduce(init, &block)
  end
end
139
+ end
@@ -0,0 +1,400 @@
1
require 'cgi'
require 'csv'
require 'formatador'
require 'json'
require 'securerandom'
5
+
6
+ module Mikon
7
+
8
# The main data structure in Mikon gem.
# A DataFrame consists of labels (column names), an index (row names) and
# data (one Mikon::DArray per column).
class DataFrame
  attr_reader :name, :index, :labels

  # @param [Array|Hash] source one of:
  #   an Array of Mikon::DArray, Mikon::Row, Hash (one per row) or Array (one per column),
  #   or a Hash mapping labels to Arrays or to Mikon::Series
  # @param [Hash] options
  #   :name [String] name of the frame (default: a random UUID)
  #   :index [Array|Symbol] row names, or the label of the column to use as row names
  #   :labels [Array] column names
  # @raise [RuntimeError] if source or options[:index] has an unsupported shape
  def initialize(source, options={})
    case source
    when Array then _load_array(source)
    when Hash then _load_hash(source)
    else raise "Non-acceptable Arguments Error"
    end

    # Copy so that later label edits never touch the caller's array.
    @labels = options[:labels].dup unless options[:labels].nil?
    @name = options.fetch(:name) { SecureRandom.uuid }
    _apply_index(options[:index]) unless options[:index].nil?

    _check_if_valid
  end

  # Normalize the internal state: pad columns to a common length, create a
  # default index and default/symbolized labels.
  # @raise [RuntimeError] if the index length differs from the column length
  def _check_if_valid
    # All array should have the same length (0 when there is no column)
    length = @data.map { |darr| darr.length }.max || 0
    @data.each { |darr| darr.expand(length) if darr.length < length }

    # DataFrame should have index object
    @index = (0...length).to_a if @index.nil?
    raise "index should have the same length as arrays" if @index.length != length

    # Labels should be an instance of Symbol
    if @labels.nil?
      @labels = @data.each_index.map { |i| i.to_s.to_sym }
    elsif @labels.any? { |label| !label.is_a?(Symbol) }
      @labels = @labels.map { |label| label.to_sym }
    end
  end

  # @return [Integer] the number of rows
  def length
    @index.length
  end

  # Create Mikon::DataFrame from a csv/tsv file
  # @param [String] path path to csv
  # @param options
  #   :col_sep [String] string to separate by
  #   :headers [Array] headers
  #   :name, :index, :labels are passed to DataFrame.new; every other key is passed to CSV
  # @yield [csv] the parsed CSV table, before it is converted
  # @raise [ArgumentError] if options[:headers] is false
  def self.from_csv(path, options={})
    csv_defaults = {
      :col_sep => ',',
      :headers => true,
      :converters => :numeric,
      :header_converters => :symbol,
    }
    frame_keys = [:name, :index, :labels]

    # CSV rejects options it does not know, so keep the two option sets apart.
    frame_options = options.select { |key, _| frame_keys.include?(key) }
    csv_options = csv_defaults.merge(options.reject { |key, _| frame_keys.include?(key) })
    raise ArgumentError, "options[:headers] should be set" if csv_options[:headers] == false
    csv_options.delete(:header_converters) if csv_options[:headers].is_a?(Array)

    csv = CSV.read(path, **csv_options)
    yield csv if block_given?

    hash = {}
    csv.by_col.each { |label, arr| hash[label] = arr }

    self.new(hash, frame_options)
  end

  # Accessor for column and rows
  # @example
  #   df = DataFrame.new({a: [1, 2, 3], b: [2, 3, 4]})
  #   df[0..1].to_json #-> {a: [1, 2], b: [2, 3]}
  #   df[:a] #-> <Mikon::Series>
  # @param [Range|Symbol] arg a Range of index values (row names) or a column label
  # @return [Mikon::DataFrame|Mikon::Series|nil] nil for any other argument type
  def [](arg)
    case arg
    when Range
      index = @index.select { |i| arg.include?(i) }
      Mikon::DataFrame.new(index.map { |i| self.row(i) }, {index: index})
    when Symbol
      self.column(arg)
    end
  end

  # Access column with its name
  # @raise [RuntimeError] if there is no such column
  def column(label)
    pos = @labels.index(label)
    raise "There is no column named " + label.to_s if pos.nil?
    Mikon::Series.new(label, @data[pos], index: @index)
  end

  # same as head of Linux: the first num rows, by position
  def head(num)
    Mikon::DataFrame.new(each_row.first(num))
  end

  # same as tail of Linux: the last num rows, by position
  def tail(num)
    Mikon::DataFrame.new(each_row.to_a.last(num))
  end

  # Compatible with Nyaplot::DataFrame.to_json
  # @return [String] JSON array with one object per row
  def to_json(*args)
    each_row.map(&:to_hash).to_json(*args)
  end

  # IRuby notebook automatically call this method
  # @param [Integer] threshold rows after this position are elided (the last row is always shown)
  # @return [String] HTML table; labels, index and cell text are HTML-escaped
  def to_html(threshold=50)
    last = self.length - 1
    parts = ["<html><table><tr><td></td>"]
    @labels.each { |label| parts << "<th>" + CGI.escapeHTML(label.to_s) + "</th>" }
    parts << "</tr>"
    each_row.with_index do |row, pos|
      next if pos > threshold && pos != last
      parts << "<tr><th>" + CGI.escapeHTML(@index[pos].to_s) + "</th>"
      @labels.each { |label| parts << "<td>" + CGI.escapeHTML(row[label].to_s) + "</td>" }
      parts << "</tr>"
      parts << "<tr><th>...</th>" + "<td>...</td>" * @labels.length + "</tr>" if pos == threshold
    end
    parts << "</table></html>"
    parts.join
  end

  # Hands the rows (index under the "" key) to Formatador.display_table and
  # returns whatever that call returns.
  # @param [Integer] threshold rows after this position are elided (the last row is always shown)
  def to_s(threshold=50)
    last = self.length - 1
    arr = []
    each_row.with_index do |row, pos|
      next if pos > threshold && pos != last
      arr.push({"" => @index[pos]}.merge(row.to_hash))
      if pos == threshold
        arr.push(@labels.each_with_object({"" => "..."}) { |label, memo| memo[label] = "..." })
      end
    end
    Formatador.display_table(arr)
  end

  # Select rows using Mikon::Row DSL and create new DataFrame
  # @example
  #   df = Mikon::DataFrame.new({a: [1,2,3], b: [2,3,4]})
  #   df.select{a%2==0}[:a].to_a #-> [2]
  #
  def select(&block)
    return self.to_enum(:select) unless block_given?
    rows = each_row.select { |row| row.instance_eval(&block) }
    Mikon::DataFrame.new(rows)
  end

  alias_method :filter, :select

  # Iterate rows using Mikon::Row DSL
  def each(&block)
    return self.to_enum(:each) unless block_given?
    self.each_row do |row|
      row.instance_eval(&block)
    end
    self
  end

  # Iterate rows using Mikon::Row DSL and return new Mikon::Series
  def map(&block)
    return self.to_enum(:map) unless block_given?
    arr = []
    self.each_row do |row|
      arr.push(row.instance_eval(&block))
    end
    Mikon::Series.new(:new_series, arr, index: @index.clone)
  end

  alias_method :collect, :map

  # Mikon::Row DSL
  def all?(&block)
    self.each_row { |row| return false unless row.instance_eval(&block) }
    true
  end

  # Mikon::Row DSL
  def any?(&block)
    self.each_row { |row| return true if row.instance_eval(&block) }
    false
  end

  # Sort using Mikon::Row DSL
  # @param [Bool] ascending default true
  # @return [Mikon::DataFrame] a new, reordered frame
  #
  def sort_by(ascending=true, &block)
    return self.to_enum(:sort_by, ascending) unless block_given?
    order = self.map(&block).to_darr.sorted_indices
    order = order.reverse unless ascending
    _reorder(order)
  end

  # Sort by label
  # @param [Symbol] label column name to sort by
  # @param [Bool] ascending default true
  # @return [Mikon::DataFrame] a new, reordered frame
  # @raise [RuntimeError] if there is no such column
  #
  def sort(label, ascending=true)
    pos = @labels.index(label)
    raise "No column named " + label.to_s if pos.nil?
    order = @data[pos].sorted_indices
    order = order.reverse unless ascending
    _reorder(order)
  end

  # Insert column using Mikon::Row DSL or raw Array (destructive).
  # A column whose label already exists is replaced.
  # @param [label] Symbol the name of new column (optional)
  # @param [Array|Series|DArray] the content of new column (optional)
  # @example
  #   df = Mikon::DataFrame.new({a: [1,2,3], b: [2,3,4]})
  #   df.insert_column(:c){a + b}.to_json #-> {a: [1,2,3], b: [2,3,4], c: [3,5,7]}
  #   df.insert_column(:d, [1, 2, 3]).to_json #-> {a: [1,2,3], b: [2,3,4], c: [3,5,7], d: [1,2,3]}
  #   df.insert_column((df[:d]*2).name(:e)) #-> {a: [1,2,3], b: [2,3,4], c: [3,5,7], d: [1,2,3], e: [2,4,6]
  # @raise [ArgumentError] if the arguments match none of the forms above
  # @raise [RuntimeError] if the new column is longer than the frame
  # @return [Mikon::DataFrame] self
  #
  def insert_column(*args, &block)
    if block_given?
      name = args[0]
      # Evaluate the block before touching any state, so a failing block
      # leaves the frame unchanged.
      column = Mikon::DArray.new(each_row.map { |row| row.instance_eval(&block) })
    elsif args[0].is_a?(Symbol)
      name = args[0]
      column =
        case args[1]
        when Mikon::DArray then args[1]
        when Mikon::Series then args[1].to_darr
        when Array then Mikon::DArray.new(args[1])
        else raise ArgumentError
        end
    elsif args[0].is_a?(Mikon::Series)
      name = args[0].name
      column = args[0].to_darr
    else
      raise ArgumentError, "expected a label with a block/content, or a Mikon::Series"
    end
    raise ArgumentError, "the name of new column is required" if name.nil?
    raise "index should have the same length as arrays" if column.length > self.length

    _store_column(name, column)
    _check_if_valid
    self
  end

  # Access row using index
  # @raise [RuntimeError] if there is no row with that index
  def row(index)
    pos = @index.index(index)
    raise "There is no row indexed " + index.to_s if pos.nil?
    # Rows get their own copy of the labels so editing a row cannot alter the frame.
    Mikon::Row.new(@labels.dup, @data.map { |column| column[pos] }, index)
  end

  # Iterate row
  def each_row(&block)
    return self.to_enum(:each_row) unless block_given?
    @index.each_with_index do |row_name, i|
      row_arr = @data.map { |darr| darr[i] }
      # Rows get their own copy of the labels so editing a row cannot alter the frame.
      block.call(Mikon::Row.new(@labels.dup, row_arr, row_name))
    end
  end

  # Replace NaN with specified value (destructive)
  # @param [Float|Fixnum] value new value to replace NaN
  def fillna(value=0)
    @data.each { |darr| darr.fillna(value) }
    self
  end

  # Delete column (destructive)
  # @return [Mikon::DArray] the removed column
  # @raise [RuntimeError] if there is no such column
  def delete(label)
    pos = @labels.index(label)
    raise "there is no column named " + label.to_s if pos.nil?
    @labels.delete_at(pos)
    @data.delete_at(pos)
  end

  private

  # Fill @data (and @labels/@index where the source provides them) from an Array source.
  def _load_array(source)
    if source.all? { |el| el.is_a?(Mikon::Series) }
      raise "NotImplementedError"
    elsif source.all? { |el| el.is_a?(Mikon::DArray) }
      @data = source.dup
    elsif source.all? { |el| el.is_a?(Mikon::Row) }
      @labels = source.first.labels.dup
      @index = source.map { |row| row.index }
      @data = source.map { |row| row.to_hash.values }.transpose.map do |arr|
        Mikon::DArray.new(arr)
      end
    elsif source.all? { |el| el.is_a?(Hash) }
      @labels = source.first.keys
      @data = source.map { |hash| hash.values }.transpose.map do |arr|
        Mikon::DArray.new(arr)
      end
    elsif source.all? { |el| el.is_a?(Array) }
      @data = source.map { |arr| Mikon::DArray.new(arr) }
    else
      raise "Non-acceptable Arguments Error"
    end
  end

  # Fill @data and @labels from a Hash source (label => Array or label => Series).
  def _load_hash(source)
    if source.values.all? { |val| val.is_a?(Array) }
      @labels = source.keys
      @data = source.values.map { |arr| Mikon::DArray.new(arr) }
    elsif source.values.all? { |val| val.is_a?(Mikon::Series) }
      # Uses Series#to_darr, as #insert_column does.
      # NOTE(review): the index of the series is not taken over - confirm that is wanted.
      @labels = source.keys
      @data = source.values.map { |series| series.to_darr }
    else
      raise "Non-acceptable Arguments Error"
    end
  end

  # Apply options[:index]: either an Array of row names, or the label of a
  # column that is removed from the data and used as row names.
  def _apply_index(index)
    case index
    when Symbol
      raise "labels should be set" if @labels.nil?
      # Labels are symbolized only later, so compare by name.
      pos = @labels.index { |label| label.to_s == index.to_s }
      raise "There is no column named " + index.to_s if pos.nil?
      @labels.delete_at(pos)
      # Keep the index a plain Array; the rest of the class calls Array methods on it.
      @index = @data.delete_at(pos).to_a
    when Array
      @index = index
    else
      raise "Invalid index type"
    end
  end

  # Build a new frame whose n-th row is the row at position order[n].
  def _reorder(order)
    data = @data.map { |darr| Mikon::DArray.new(order.map { |i| darr[i] }) }
    index = order.map { |i| @index[i] }
    Mikon::DataFrame.new(data, {index: index, labels: @labels.dup})
  end

  # Add the column under name, replacing an existing column with the same label.
  def _store_column(name, column)
    label = name.to_sym
    pos = @labels.index(label)
    if pos.nil?
      @labels.push(label)
      @data.push(column)
    else
      @data[pos] = column
    end
  end
end
358
+
359
# Row class for internal use.
# A row pairs column labels with the values of one row and lets blocks
# evaluated with instance_eval refer to values by label.
class Row
  attr_reader :labels, :arr, :index

  # @param [Array] labels column labels
  # @param [Array] arr values, in the same order as labels
  # @param index the name of this row
  def initialize(labels, arr, index)
    # Copies: #[]= edits both arrays and must not leak into the caller's
    # (labels are otherwise shared with the owning DataFrame and other rows).
    @labels = labels.dup
    @arr = arr.dup
    @index = index
  end

  # @return the value stored under name, or nil if there is no such label
  def [](name)
    pos = @labels.index(name)
    pos.nil? ? nil : @arr[pos]
  end

  # Set the value stored under name, appending a new label if needed.
  def []=(name, val)
    pos = @labels.index(name)
    if pos.nil?
      @labels.push(name)
      @arr.push(val)
    else
      @arr[pos] = val
    end
  end

  # Resolve a bare label to its value. Labels that collide with a real
  # method of Row (e.g. :index, :labels) are not reachable this way.
  # @example
  #   row = Row.new([:a, :b, :c], [1, 2, 3], :example_row)
  #   puts row.instance_eval { a * b * c} #-> 6
  def method_missing(name, *args)
    pos = args.empty? ? @labels.index(name) : nil
    pos.nil? ? super : @arr[pos]
  end

  # Keep respond_to? consistent with method_missing.
  def respond_to_missing?(name, include_private = false)
    @labels.include?(name) || super
  end

  # @return [Hash] label => value
  def to_hash
    @labels.each_with_index.each_with_object({}) do |(label, i), memo|
      memo[label] = @arr[i]
    end
  end
end
400
+ end