mikon 0.1.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/mikon.rb ADDED
@@ -0,0 +1,9 @@
1
+ require 'nmatrix'
2
+ require_relative "mikon/version"
3
+ require_relative "mikon/stats"
4
+ require_relative "mikon/plot"
5
+ require_relative "mikon/pivot"
6
+ require_relative "mikon/core/array"
7
+ require_relative "mikon/core/index"
8
+ require_relative "mikon/core/series"
9
+ require_relative "mikon/core/dataframe"
@@ -0,0 +1,139 @@
1
+ module Mikon
2
+
3
+ # Internal data structure to wrap NMatrix
4
+ # Its statistical methods (e.g. #median) are compatible with Statsample::Vector's.
5
+ # @example
6
+ # Mikon::DArray.new([1, 2, 3]) #-> #<Mikon::DArray:0xbacfc99c @data=[1, 2, 3], @dtype=:int32>
7
+ #
8
+ class DArray
9
+ include Enumerable, Mikon::Stats
10
+ attr_reader :dtype, :data
11
+
12
# Wrap a one-dimensional sequence of values.
# @param [NMatrix|Array] source values to wrap
# @param [Hash] options forwarded to NMatrix.new when source is a numeric Array
def initialize(source, options={})
  @data =
    case source
    when Array
      if source.all? { |value| value.is_a?(Numeric) }
        NMatrix.new([source.length], source, options)
      else
        #
        # NMatrix instance whose dtype is :object frequently causes Segmentation Fault
        # @example
        #   df = DataFrame.new({a: ["a", "b"], b: [1, 2]})
        #   df[:a].to_html #-> Segmentation Fault
        #
        # so non-numeric input is kept in a plain Array wrapper instead:
        # NMatrix.new([source.length], source, options.merge({:dtype => :object}))
        extend UseArray
        Mikon::ArrayWrapper.new(source)
      end
    when NMatrix
      dims = source.shape
      raise "Matrix shape is not valid" unless dims.length == 1 && dims.first.is_a?(Numeric)
      source
    else
      raise "Non-acceptable Argument Error"
    end
  @dtype = @data.dtype
end
42
+
43
# Iterate over the wrapped values by delegating to the backing store.
def each(&block)
  storage = @data
  storage.each(&block)
end
46
+
47
# Fold the values with the given block, starting from init.
# Delegates to the backing store's inject_rank along dimension 0 and
# unwraps the first element of its result.
def reduce(init, &block)
  folded = @data.inject_rank(0, init, &block)
  folded.first
end
50
+
51
# Grow the array to the given length, padding the new tail with 0 (destructive).
# The previous implementation passed `[expand]` as the shape, which re-invoked
# this method without arguments and therefore always raised.
# @param [Integer] length the new length; must not be smaller than the current one
# @return [NMatrix] the new backing matrix
# @raise [RuntimeError] when length is smaller than the current length
def expand(length)
  current = @data.shape.first
  raise "The argument 'length' should be greater than length of now." if length < current
  padded = @data.to_a + Array.new(length - current, 0)
  # Keep the original dtype so the cached @dtype stays accurate.
  @data = NMatrix.new([length], padded, dtype: @data.dtype)
end
56
+
57
# Number of stored values: the first entry of the backing store's shape.
def length
  dims = @data.shape
  dims.first
end
60
+
61
# Fetch the value stored at the given position.
def [](pos)
  storage = @data
  storage[pos]
end
64
+
65
# Return a new DArray holding the values in sorted order.
def sort
  ordered = @data.sort
  Mikon::DArray.new(ordered)
end
68
+
69
# Return a new DArray sorted by the block's result.
# Without a block an Enumerator is returned instead.
def sort_by(&block)
  if block_given?
    Mikon::DArray.new(@data.sort_by(&block))
  else
    self.to_enum(:sort_by)
  end
end
73
+
74
# Return a new DArray with the values in reverse order.
# The last position is computed once instead of calling #length per element
# (the original assigned an unused local and recomputed the length in the loop).
def reverse
  last = length - 1
  Mikon::DArray.new(@data.map.with_index { |_value, i| @data[last - i] })
end
78
+
79
# Element-wise addition and subtraction between two DArrays.
# Any other operand is handed to the superclass implementation.
[:+, :-].each do |op|
  define_method(op) do |arg|
    if arg.is_a?(DArray)
      # arg.coerce returns [our data, arg's data]; folding with op applies it pairwise.
      DArray.new(arg.coerce(@data).inject(op))
    else
      # A bare `super` is not allowed inside define_method (implicit argument
      # passing raises RuntimeError), so the argument is forwarded explicitly.
      super(arg)
    end
  end
end
88
+
89
# Multiplication, division and modulo of every value by a Numeric.
# Any other operand is handed to the superclass implementation.
[:*, :/, :%].each do |op|
  define_method(op) do |arg|
    if arg.is_a?(Numeric)
      DArray.new(@data.send(op, arg))
    else
      # A bare `super` is not allowed inside define_method (implicit argument
      # passing raises RuntimeError), so the argument is forwarded explicitly.
      super(arg)
    end
  end
end
98
+
99
# Pair another container with this array's backing store.
# @param [NMatrix|Array] other
# @return [Array] [other, backing store] when both are NMatrix or both are Array;
#   otherwise the call is passed on to super
def coerce(other)
  same_backend = [NMatrix, Array].any? { |klass| other.is_a?(klass) && @data.is_a?(klass) }
  return super unless same_backend
  [other, @data]
end
106
+
107
# Convert the wrapped values to an Array via the backing store.
def to_a
  storage = @data
  storage.to_a
end
110
+
111
# Replace every NaN with fill_value (destructive).
# Only values that can report NaN themselves (e.g. Float) are inspected, so
# Symbols and other objects without #to_f no longer raise NoMethodError.
# @param [Numeric] fill_value replacement for NaN entries
# @return the new backing store
def fillna(fill_value=0)
  filled = @data.map { |val| (val.respond_to?(:nan?) && val.nan?) ? fill_value : val }
  # Array#map on an Array subclass returns a plain Array; rebuild the wrapper
  # so methods defined on the subclass stay available.
  @data = @data.is_a?(Array) ? @data.class.new(filled) : filled
end
114
+ end
115
+
116
# Plain-Array backing store used for non-numeric data.
class ArrayWrapper < Array
  # Always :object.
  def dtype
    :object
  end

  # Positions of the elements listed in ascending order of their values.
  def sorted_indices
    each_with_index.sort_by { |value, _pos| value }.map { |_value, pos| pos }
  end
end
125
+
126
# Overrides for objects whose @data is an Array (wrapper) instead of an NMatrix.
module UseArray
  # Grow @data to the given length, padding the tail with 0 (destructive).
  # The original called Kernel#Array with two arguments, which always raised.
  # @param [Integer] length the new length; must not be smaller than the current one
  # @return [Array] the new backing array, of the same class as the old one
  # @raise [RuntimeError] when length is smaller than the current length
  def expand(length)
    shortfall = length - @data.length
    raise "The argument 'length' should be greater than length of now." if shortfall < 0
    # Array#+ returns a plain Array, so rebuild the original (wrapper) class.
    @data = @data.class.new(@data + Array.new(shortfall, 0))
  end

  # Number of stored values.
  def length
    @data.length
  end

  # Fold the values with the block, starting from init.
  # The original referenced an undefined local `int` here.
  def reduce(init, &block)
    @data.reduce(init, &block)
  end
end
139
+ end
@@ -0,0 +1,400 @@
1
require 'cgi/escape'
require 'csv'
require 'json'
require 'securerandom'

require 'formatador'
5
+
6
+ module Mikon
7
+
8
+ # The main data structure in Mikon gem.
9
+ # DataFrame consists of labels (column names), an index (row names), and data (columns).
10
+ class DataFrame
11
+
12
# Build a DataFrame from column-oriented or row-oriented data.
# @param [Array|Hash] source accepted shapes:
#   Array of Mikon::DArray or Array of Array (each element is a column),
#   Array of Mikon::Row or Array of Hash (each element is a row),
#   Hash of label => Array (each value is a column)
# @param [Hash] options
#   :name [String] name of the frame (defaults to a random UUID)
#   :index [Symbol|Array] label of the column to use as index, or explicit index values
#   :labels [Array] column labels; override labels derived from source
# @raise [RuntimeError] when source or options[:index] has an unsupported form
def initialize(source, options={})
  options = {
    name: SecureRandom.uuid(),
    index: nil,
    labels: nil
  }.merge(options)

  case source
  when Array
    case
    when source.all? {|el| el.is_a?(Mikon::Series)}
      raise "NotImplementedError"

    when source.all? {|el| el.is_a?(Mikon::DArray)}
      # Copy the container so later column insertions/removals do not
      # modify the caller's array.
      @data = source.dup

    when source.all? {|el| el.is_a?(Mikon::Row)}
      # Rows handed out by a DataFrame carry that frame's label array; copy it
      # so adding a column here cannot relabel the frame the rows came from.
      @labels = source.first.labels.dup
      @index = source.map {|row| row.index}
      @data = source.map {|row| row.to_hash.values}.transpose.map do |arr|
        Mikon::DArray.new(arr)
      end

    when source.all? {|el| el.is_a?(Hash)}
      @labels = source.first.keys
      @data = source.map {|hash| hash.values}.transpose.map do |arr|
        Mikon::DArray.new(arr)
      end

    when source.all? {|el| el.is_a?(Array)}
      @data = source.map do |arr|
        Mikon::DArray.new(arr)
      end

    else raise "Non-acceptable Arguments Error"
    end

  when Hash
    case
    when source.values.all? {|val| val.is_a?(Array)}
      @labels = source.keys
      @data = source.values.map do |arr|
        Mikon::DArray.new(arr)
      end

    when source.values.all? {|val| val.is_a?(Mikon::Series)}
      # This branch used to be empty, leaving @data nil and failing later with
      # an unrelated NoMethodError. Fail explicitly, like the Array-of-Series case.
      raise "NotImplementedError"

    else raise "Non-acceptable Arguments Error"
    end

  else raise "Non-acceptable Arguments Error"
  end

  # Copy caller-supplied labels: this frame mutates its label list in place.
  @labels = options[:labels].dup unless options[:labels].nil?
  @name = options[:name]

  unless (index = options[:index]).nil?
    if index.is_a?(Symbol)
      raise "labels should be set" if @labels.nil?
      pos = @labels.index(index)
      raise "There is no column named " + index.to_s if pos.nil?
      # Remove the column by position (not by equality) and store the index as
      # a plain Array, the type the rest of the class works with.
      @labels = @labels.dup
      @labels.delete_at(pos)
      @index = @data.delete_at(pos).to_a
    elsif index.is_a?(Array)
      @index = index
    else
      raise "Invalid index type"
    end
  end

  _check_if_valid
end
82
+
83
# Normalize internal state: equal column lengths, an index, Symbol labels.
# @raise [RuntimeError] when the index length differs from the column length
def _check_if_valid
  # Every column is expanded to the length of the longest one
  longest = @data.map(&:length).max
  @data.each do |column|
    next unless column.length < longest
    column.expand(longest)
  end

  # Fall back to a 0-based positional index
  @index = (0...longest).to_a if @index.nil?
  raise "index should have the same length as arrays" unless @index.length == longest

  # Labels default to :"0", :"1", ... and are converted to Symbols otherwise
  if @labels.nil?
    @labels = @data.each_index.map { |i| i.to_s.to_sym }
  elsif !@labels.all? { |label| label.is_a?(Symbol) }
    @labels = @labels.map(&:to_sym)
  end
end
99
+
100
# Number of rows, taken from the length of the first column.
def length
  first_column = @data.first
  first_column.length
end
104
+
105
# Create Mikon::DataFrame from a csv/tsv file
# @param [String] path path to csv
# @param [Hash] options
#   :name, :index, :labels are passed to DataFrame.new; every other key is
#   passed to CSV, on top of these defaults:
#   :col_sep [String] string to separate by (default ',')
#   :headers [true|Array] header row flag or explicit headers (default true)
#   :converters (default :numeric)
#   :header_converters (default :symbol; dropped when :headers is an Array)
# @yield [CSV::Table] the parsed table, before it is converted
# @raise [ArgumentError] when options[:headers] is false
def self.from_csv(path, options={})
  csv_defaults = {
    :col_sep => ',',
    :headers => true,
    :converters => :numeric,
    :header_converters => :symbol,
  }
  frame_keys = [:name, :index, :labels]

  # CSV rejects unknown options, so DataFrame options must not reach it.
  frame_options = options.select { |key, _| frame_keys.include?(key) }
  csv_options = csv_defaults.merge(options.reject { |key, _| frame_keys.include?(key) })

  raise ArgumentError, "options[:headers] should be set" if csv_options[:headers] == false
  csv_options.delete(:header_converters) if csv_options[:headers].is_a?(Array)

  csv = CSV.read(path, **csv_options)
  yield csv if block_given?

  columns = {}
  csv.by_col.each { |label, values| columns[label] = values }

  self.new(columns, frame_options)
end
132
+
133
# Accessor for columns and rows
# @param [Range|Symbol] arg a Range selects the rows whose index value falls
#   inside it and returns a new DataFrame; a Symbol returns that column.
#   Any other argument yields nil.
# @example
#   df = DataFrame.new({a: [1, 2, 3], b: [2, 3, 4]})
#   df[0..1] #-> <Mikon::DataFrame> holding the rows indexed 0 and 1
#   df[:a] #-> <Mikon::Series>
def [](arg)
  case arg
  when Range
    picked = @index.select { |label| arg.include?(label) }
    rows = picked.map { |label| self.row(label) }
    Mikon::DataFrame.new(rows, {index: picked})
  when Symbol
    self.column(arg)
  end
end
148
+
149
# Access column with its name
# @param [Symbol] label column name
# @return [Mikon::Series] the column, sharing this frame's index
# @raise [RuntimeError] when there is no such column
def column(label)
  pos = @labels.index(label)
  # Interpolate: concatenating a Symbol label to a String raised TypeError
  # instead of the intended error.
  raise "There is no column named #{label}" if pos.nil?
  Mikon::Series.new(label, @data[pos], index: @index)
end
155
+
156
# same as head of Linux: a new DataFrame holding the first num rows.
# Rows are taken by position; the previous version sliced with the Range
# 0..num-1 over index *values*, which only worked for the default 0-based index.
# @param [Integer] num number of rows to keep
def head(num)
  Mikon::DataFrame.new(each_row.first(num), {index: @index.to_a.first(num)})
end
160
+
161
# same as tail of Linux: a new DataFrame holding the last num rows.
# Rows are taken by position; the previous version sliced with a Range over
# index *values*, which only worked for the default 0-based index.
# @param [Integer] num number of rows to keep
def tail(num)
  Mikon::DataFrame.new(each_row.to_a.last(num), {index: @index.to_a.last(num)})
end
166
+
167
# Compatible with Nyaplot::DataFrame.to_json
# Serializes the frame as a JSON array with one object per row.
def to_json(*args)
  each_row.map(&:to_hash).to_json
end
175
+
176
# IRuby notebook automatically call this method
# Renders the frame as an HTML table. Rows after position `threshold` are
# hidden, except the last one; an ellipsis row marks the gap.
# Labels, index values and cells are HTML-escaped, the <html> element is
# closed, and the ellipsis row is only emitted when rows are actually hidden.
# @param [Integer] threshold position of the last row shown before the gap
# @return [String]
def to_html(threshold=50)
  last = length - 1
  html = String.new("<html><table><tr><td></td>")
  @labels.each { |label| html << "<th>" << CGI.escapeHTML(label.to_s) << "</th>" }
  html << "</tr>"
  each_row.with_index do |row, pos|
    next if pos > threshold && pos != last
    html << "<tr><th>" << CGI.escapeHTML(@index[pos].to_s) << "</th>"
    @labels.each { |label| html << "<td>" << CGI.escapeHTML(row[label].to_s) << "</td>" }
    html << "</tr>"
    # Rows threshold+1 .. last-1 are skipped; show the gap only if it is non-empty.
    if pos == threshold && last > threshold + 1
      html << "<tr><th>...</th>" << "<td>...</td>" * @labels.length << "</tr>"
    end
  end
  html << "</table></html>"
end
190
+
191
# Display the frame as a text table through Formatador.
# Rows after position `threshold` are left out, except the last one; an
# ellipsis row is appended right after the row at `threshold`.
# @param [Integer] threshold position of the last row shown before the gap
def to_s(threshold=50)
  table = []
  self.each_row.each_with_index do |row, pos|
    hidden = pos > threshold && pos != self.length-1
    unless hidden
      table << {"" => @index[pos]}.merge(row.to_hash)
      if pos == threshold
        ellipsis = {"" => "..."}
        @labels.each { |label| ellipsis[label] = "..." }
        table << ellipsis
      end
    end
  end
  Formatador.display_table(table.reject(&:nil?))
end
202
+
203
# Select rows using Mikon::Row DSL and create new DataFrame
# The block is evaluated in the context of each row; rows for which it is
# truthy are kept. Without a block an Enumerator is returned.
# NOTE(review): when no row matches, DataFrame.new receives an empty Array,
# which the constructor visible in this file rejects — confirm desired behavior.
# @example
#   df = Mikon::DataFrame.new({a: [1,2,3], b: [2,3,4]})
#   df.select{a%2==0}[:a].to_a #-> [2]
#
def select(&block)
  return self.to_enum(:select) unless block_given?
  rows = each_row.select { |row| row.instance_eval(&block) }
  Mikon::DataFrame.new(rows)
end
219
+
220
+ alias_method :filter, :select
221
+
222
# Evaluate the block in the context of every row (Mikon::Row DSL).
# @return [self], or an Enumerator when no block is given
def each(&block)
  if block_given?
    each_row { |row| row.instance_eval(&block) }
    self
  else
    self.to_enum(:each)
  end
end
230
+
231
# Evaluate the block in the context of every row (Mikon::Row DSL) and collect
# the results into a new Mikon::Series named :new_series, indexed by a copy
# of this frame's index. Without a block an Enumerator is returned.
def map(&block)
  return self.to_enum(:map) unless block_given?
  values = each_row.map { |row| row.instance_eval(&block) }
  Mikon::Series.new(:new_series, values, index: @index.clone)
end
240
+
241
+ alias_method :collect, :map
242
+
243
# Mikon::Row DSL: true when the block is truthy for every row.
def all?(&block)
  each_row.all? { |row| row.instance_eval(&block) }
end
248
+
249
# Mikon::Row DSL: true when the block is truthy for at least one row.
def any?(&block)
  each_row.any? { |row| row.instance_eval(&block) }
end
254
+
255
# Sort using Mikon::Row DSL
# The block is evaluated for each row to obtain its sort key; a new DataFrame
# with reordered columns and index is returned. Without a block an Enumerator
# is returned.
# @param [Bool] ascending default true
#
def sort_by(ascending=true, &block)
  return self.to_enum(:sort_by) unless block_given?
  order = map(&block).to_darr.sorted_indices
  order.reverse! unless ascending
  # rank[i] is the position row i takes in the sorted frame. Precomputing it
  # replaces an `order.index(i)` scan per element (quadratic) with a lookup.
  rank = []
  order.each_with_index { |original_pos, sorted_pos| rank[original_pos] = sorted_pos }
  data = @data.map { |darr| darr.sort_by.with_index { |_val, i| rank[i] } }
  index = @index.sort_by.with_index { |_val, i| rank[i] }
  Mikon::DataFrame.new(data, {index: index, labels: @labels})
end
266
+
267
# Sort by label
# @param [Symbol] label column name to sort by
# @param [Bool] ascending default true
# @return the result of #sort_by for the computed row order
# @raise [RuntimeError] when there is no such column
#
def sort(label, ascending=true)
  pos = @labels.index(label)
  # The original message lacked the space between the text and the label.
  raise "No column named #{label}" if pos.nil?
  order = @data[pos].sorted_indices
  order.reverse! unless ascending
  # rank[i] is the target position of row i; a lookup table avoids scanning
  # `order` once per row.
  rank = []
  order.each_with_index { |original_pos, sorted_pos| rank[original_pos] = sorted_pos }
  sort_by.with_index { |_row, i| rank[i] }
end
278
+
279
# Insert column using Mikon::Row DSL or raw Array (destructive)
# Accepted call forms:
#   insert_column(name) { block }      block is evaluated per row (Row DSL)
#   insert_column(name, content)       content is an Array, Series or DArray
#   insert_column(series)              the Series' own name is used
# A column whose label already exists is replaced rather than duplicated.
# @return [self]
# @raise [ArgumentError] when the arguments match none of the forms above
# @example
#   df = Mikon::DataFrame.new({a: [1,2,3], b: [2,3,4]})
#   df.insert_column(:c){a + b}             # adds c: [3,5,7]
#   df.insert_column(:d, [1, 2, 3])         # adds d: [1,2,3]
#
def insert_column(*args, &block)
  if block_given?
    name = args[0]
    raise ArgumentError, "column name is required" if name.nil?
    column = Mikon::DArray.new(each_row.map { |row| row.instance_eval(&block) })
  elsif args[0].is_a?(Symbol)
    name = args[0]
    column =
      case args[1]
      when Mikon::DArray then args[1]
      when Mikon::Series then args[1].to_darr
      when Array then Mikon::DArray.new(args[1])
      else raise ArgumentError
      end
  elsif args[0].is_a?(Mikon::Series)
    name = args[0].name
    column = args[0].to_darr
  else
    # Previously this case pushed a nil label without any data.
    raise ArgumentError, "expected a column name or a Mikon::Series"
  end

  pos = @labels.index(name)
  if pos.nil?
    @labels.push(name)
    @data.push(column)
  else
    @data[pos] = column
  end

  _check_if_valid
  self
end
323
+
324
# Access row using index
# @param index the index value (row name) to look up
# @return [Mikon::Row] holds a copy of the labels, so Row#[]= with a new name
#   cannot add a label to this frame without matching data
# @raise [RuntimeError] when no row has the given index value
def row(index)
  pos = @index.index(index)
  # Without this check a missing index fell through to column[nil].
  raise "There is no row indexed #{index}" if pos.nil?
  values = @data.map { |column| column[pos] }
  Mikon::Row.new(@labels.dup, values, index)
end
330
+
331
# Yield one Mikon::Row per index entry, in index order.
# Each row is built from the values at the same position in every column.
# Without a block an Enumerator is returned.
def each_row(&block)
  return self.to_enum(:each_row) unless block_given?
  @index.each.with_index do |_label, pos|
    cells = @data.map { |darr| darr[pos] }
    block.call(Mikon::Row.new(@labels, cells, @index[pos]))
  end
end
340
+
341
# Replace NaN with the given value in every column (destructive)
# @param [Float|Fixnum] value new value to replace NaN
# @return [self]
def fillna(value=0)
  @data.each do |column|
    column.fillna(value)
  end
  self
end
347
+
348
# Delete the column with the given label (destructive)
# @return the removed column
# @raise [RuntimeError] when there is no such column
def delete(label)
  pos = @labels.index(label)
  raise "there is no column named " + label.to_s unless pos
  # Remove the label first, then the data at the same position.
  removed = [@labels, @data].map { |list| list.delete_at(pos) }
  removed.last
end
355
+
356
+ attr_reader :name, :index, :labels
357
+ end
358
+
359
# Row class for internal use
# Pairs a list of labels with the values of one row. Values can be read by
# label through #[] or as method calls, which is what the instance_eval based
# row DSL relies on.
class Row
  attr_reader :labels, :arr, :index

  # @param [Array] labels column labels
  # @param [Array] arr values, in the same order as labels
  # @param index the index value (row name) of this row
  def initialize(labels, arr, index)
    @labels = labels
    @arr = arr
    @index = index
  end

  # @return the value stored under name, or nil when there is no such label
  def [](name)
    pos = @labels.index(name)
    pos.nil? ? nil : @arr[pos]
  end

  # Overwrite the value stored under name, or append a new label/value pair.
  # Note that this mutates the label array passed to the constructor.
  def []=(name, val)
    pos = @labels.index(name)
    if pos.nil?
      @labels.push(name)
      @arr.push(val)
    else
      @arr[pos] = val
    end
  end

  # Resolve argument-less calls named after a label to that label's value.
  # Labels that collide with real methods of this class (e.g. :index) are
  # not reachable this way; use #[] for those.
  # @example
  #   row = Row.new([:a, :b, :c], [1, 2, 3], :example_row)
  #   puts row.instance_eval { a * b * c} #-> 6
  def method_missing(name, *args, &block)
    pos = args.empty? ? @labels.index(name) : nil
    pos.nil? ? super : @arr[pos]
  end

  # Keep respond_to? in line with method_missing.
  def respond_to_missing?(name, include_private = false)
    @labels.include?(name) || super
  end

  # @return [Hash] label => value
  def to_hash
    Hash[@labels.zip(@arr)]
  end
end
400
+ end