daru 0.1.5 → 0.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (106) hide show
  1. checksums.yaml +5 -5
  2. data/.github/ISSUE_TEMPLATE.md +18 -0
  3. data/.gitignore +1 -0
  4. data/.rubocop.yml +21 -7
  5. data/.travis.yml +10 -5
  6. data/CONTRIBUTING.md +15 -10
  7. data/History.md +124 -2
  8. data/README.md +37 -9
  9. data/ReleasePolicy.md +20 -0
  10. data/benchmarks/db_loading.rb +34 -0
  11. data/benchmarks/statistics.rb +6 -6
  12. data/benchmarks/where_clause.rb +1 -1
  13. data/benchmarks/where_vs_filter.rb +1 -1
  14. data/daru.gemspec +17 -41
  15. data/lib/daru.rb +10 -13
  16. data/lib/daru/accessors/gsl_wrapper.rb +1 -1
  17. data/lib/daru/accessors/nmatrix_wrapper.rb +2 -0
  18. data/lib/daru/category.rb +29 -15
  19. data/lib/daru/configuration.rb +34 -0
  20. data/lib/daru/core/group_by.rb +158 -77
  21. data/lib/daru/core/merge.rb +12 -3
  22. data/lib/daru/core/query.rb +20 -4
  23. data/lib/daru/dataframe.rb +692 -118
  24. data/lib/daru/date_time/index.rb +14 -11
  25. data/lib/daru/date_time/offsets.rb +9 -1
  26. data/lib/daru/extensions/which_dsl.rb +55 -0
  27. data/lib/daru/formatters/table.rb +3 -5
  28. data/lib/daru/index/categorical_index.rb +4 -4
  29. data/lib/daru/index/index.rb +131 -42
  30. data/lib/daru/index/multi_index.rb +118 -10
  31. data/lib/daru/io/csv/converters.rb +21 -0
  32. data/lib/daru/io/io.rb +105 -33
  33. data/lib/daru/io/sql_data_source.rb +10 -0
  34. data/lib/daru/iruby/templates/dataframe.html.erb +4 -51
  35. data/lib/daru/iruby/templates/dataframe_mi.html.erb +3 -56
  36. data/lib/daru/iruby/templates/dataframe_mi_tbody.html.erb +35 -0
  37. data/lib/daru/iruby/templates/dataframe_mi_thead.html.erb +21 -0
  38. data/lib/daru/iruby/templates/dataframe_tbody.html.erb +28 -0
  39. data/lib/daru/iruby/templates/dataframe_thead.html.erb +21 -0
  40. data/lib/daru/iruby/templates/vector.html.erb +3 -25
  41. data/lib/daru/iruby/templates/vector_mi.html.erb +3 -34
  42. data/lib/daru/iruby/templates/vector_mi_tbody.html.erb +26 -0
  43. data/lib/daru/iruby/templates/vector_mi_thead.html.erb +8 -0
  44. data/lib/daru/iruby/templates/vector_tbody.html.erb +17 -0
  45. data/lib/daru/iruby/templates/vector_thead.html.erb +8 -0
  46. data/lib/daru/maths/arithmetic/vector.rb +38 -2
  47. data/lib/daru/maths/statistics/dataframe.rb +28 -30
  48. data/lib/daru/maths/statistics/vector.rb +295 -41
  49. data/lib/daru/plotting/gruff/dataframe.rb +13 -15
  50. data/lib/daru/plotting/nyaplot/category.rb +1 -1
  51. data/lib/daru/plotting/nyaplot/dataframe.rb +15 -4
  52. data/lib/daru/plotting/nyaplot/vector.rb +1 -2
  53. data/lib/daru/vector.rb +308 -96
  54. data/lib/daru/version.rb +1 -1
  55. data/profile/vector_new.rb +9 -0
  56. data/spec/accessors/gsl_wrapper_spec.rb +38 -35
  57. data/spec/accessors/nmatrix_wrapper_spec.rb +25 -22
  58. data/spec/category_spec.rb +24 -20
  59. data/spec/core/group_by_spec.rb +238 -4
  60. data/spec/core/merge_spec.rb +1 -1
  61. data/spec/core/query_spec.rb +65 -50
  62. data/spec/daru_spec.rb +22 -0
  63. data/spec/dataframe_spec.rb +473 -16
  64. data/spec/date_time/date_time_index_helper_spec.rb +72 -0
  65. data/spec/date_time/index_spec.rb +34 -16
  66. data/spec/date_time/offsets_spec.rb +14 -0
  67. data/spec/extensions/rserve_spec.rb +1 -1
  68. data/spec/extensions/which_dsl_spec.rb +38 -0
  69. data/spec/fixtures/boolean_converter_test.csv +5 -0
  70. data/spec/fixtures/duplicates.csv +32 -0
  71. data/spec/fixtures/eciresults.html +394 -0
  72. data/spec/fixtures/empty_rows_test.csv +17 -0
  73. data/spec/fixtures/macau.html +3691 -0
  74. data/spec/fixtures/macd_data.csv +150 -0
  75. data/spec/fixtures/matrix_test.csv +55 -55
  76. data/spec/fixtures/moneycontrol.html +6812 -0
  77. data/spec/fixtures/string_converter_test.csv +5 -0
  78. data/spec/fixtures/test_xls.xls +0 -0
  79. data/spec/fixtures/test_xls_2.xls +0 -0
  80. data/spec/fixtures/url_test.txt~ +0 -0
  81. data/spec/fixtures/valid_markup.html +62 -0
  82. data/spec/fixtures/wiki_climate.html +1243 -0
  83. data/spec/fixtures/wiki_table_info.html +631 -0
  84. data/spec/formatters/table_formatter_spec.rb +29 -0
  85. data/spec/index/categorical_index_spec.rb +33 -33
  86. data/spec/index/index_spec.rb +160 -41
  87. data/spec/index/multi_index_spec.rb +143 -33
  88. data/spec/io/io_spec.rb +246 -2
  89. data/spec/io/sql_data_source_spec.rb +31 -41
  90. data/spec/iruby/dataframe_spec.rb +17 -19
  91. data/spec/iruby/vector_spec.rb +26 -28
  92. data/spec/maths/arithmetic/dataframe_spec.rb +1 -1
  93. data/spec/maths/arithmetic/vector_spec.rb +18 -0
  94. data/spec/maths/statistics/vector_spec.rb +153 -15
  95. data/spec/plotting/gruff/category_spec.rb +3 -3
  96. data/spec/plotting/gruff/dataframe_spec.rb +14 -4
  97. data/spec/plotting/gruff/vector_spec.rb +9 -9
  98. data/spec/plotting/nyaplot/category_spec.rb +5 -9
  99. data/spec/plotting/nyaplot/dataframe_spec.rb +95 -47
  100. data/spec/plotting/nyaplot/vector_spec.rb +5 -11
  101. data/spec/shared/vector_display_spec.rb +12 -14
  102. data/spec/spec_helper.rb +30 -7
  103. data/spec/support/matchers.rb +5 -0
  104. data/spec/vector_spec.rb +306 -72
  105. metadata +96 -55
  106. data/spec/fixtures/stock_data.csv +0 -500
@@ -17,17 +17,17 @@ module Daru
17
17
  end
18
18
  end
19
19
 
20
- def initialize left_df, right_df, opts={}
20
+ def initialize left_df, right_df, opts={} # rubocop:disable Metrics/AbcSize -- quick-fix for issue #171
21
21
  init_opts(opts)
22
22
  validate_on!(left_df, right_df)
23
23
  key_sanitizer = ->(h) { sanitize_merge_keys(h.values_at(*on)) }
24
24
 
25
25
  @left = df_to_a(left_df)
26
- @left.sort_by!(&key_sanitizer)
26
+ @left.sort! { |a, b| safe_compare(a.values_at(*on), b.values_at(*on)) }
27
27
  @left_key_values = @left.map(&key_sanitizer)
28
28
 
29
29
  @right = df_to_a(right_df)
30
- @right.sort_by!(&key_sanitizer)
30
+ @right.sort! { |a, b| safe_compare(a.values_at(*on), b.values_at(*on)) }
31
31
  @right_key_values = @right.map(&key_sanitizer)
32
32
 
33
33
  @left_keys, @right_keys = merge_keys(left_df, right_df, on)
@@ -246,6 +246,15 @@ module Daru
246
246
  raise ArgumentError, "Both dataframes expected to have #{on.inspect} field"
247
247
  end
248
248
  end
249
+
250
+ def safe_compare(left_array, right_array)
251
+ left_array.zip(right_array).map { |l, r|
252
+ next 0 if l.nil? && r.nil?
253
+ next 1 if r.nil?
254
+ next -1 if l.nil?
255
+ l <=> r
256
+ }.reject(&:zero?).first || 0
257
+ end
249
258
  end
250
259
 
251
260
  module Merge
@@ -9,13 +9,13 @@ module Daru
9
9
  end
10
10
 
11
11
  def & other
12
- BoolArray.new @barry.zip(other.barry).map { |b, o| b && o }
12
+ BoolArray.new(@barry.zip(other.barry).map { |b, o| b && o })
13
13
  end
14
14
 
15
15
  alias :and :&
16
16
 
17
17
  def | other
18
- BoolArray.new @barry.zip(other.barry).map { |b, o| b || o }
18
+ BoolArray.new(@barry.zip(other.barry).map { |b, o| b || o })
19
19
  end
20
20
 
21
21
  alias :or :|
@@ -39,11 +39,11 @@ module Daru
39
39
 
40
40
  class << self
41
41
  def apply_scalar_operator operator, data, other
42
- BoolArray.new data.map { |d| !!d.send(operator, other) }
42
+ BoolArray.new(data.map { |d| !!d.send(operator, other) if d.respond_to?(operator) })
43
43
  end
44
44
 
45
45
  def apply_vector_operator operator, vector, other
46
- BoolArray.new vector.zip(other).map { |d, o| !!d.send(operator, o) }
46
+ BoolArray.new(vector.zip(other).map { |d, o| !!d.send(operator, o) })
47
47
  end
48
48
 
49
49
  def df_where data_frame, bool_array
@@ -70,6 +70,22 @@ module Daru
70
70
  resultant_dv
71
71
  end
72
72
 
73
+ def vector_apply_where dv, bool_array
74
+ _data, new_index = fetch_new_data_and_index dv, bool_array
75
+ all_index = dv.index
76
+ all_data = all_index.map { |idx| new_index.include?(idx) ? yield(dv[idx]) : dv[idx] }
77
+
78
+ resultant_dv = Daru::Vector.new all_data,
79
+ index: dv.index.class.new(all_index),
80
+ dtype: dv.dtype,
81
+ type: dv.type,
82
+ name: dv.name
83
+
84
+ # Preserve categories order for category vector
85
+ resultant_dv.categories = dv.categories if dv.category?
86
+ resultant_dv
87
+ end
88
+
73
89
  private
74
90
 
75
91
  def fetch_new_data_and_index dv, bool_array
@@ -10,7 +10,10 @@ module Daru
10
10
  include Daru::Maths::Arithmetic::DataFrame
11
11
  include Daru::Maths::Statistics::DataFrame
12
12
  # TODO: Remove this line, but it's causing errors for an unknown reason
13
- include Daru::Plotting::DataFrame::NyaplotLibrary if Daru.has_nyaplot?
13
+ Daru.has_nyaplot?
14
+
15
+ attr_accessor(*Configuration::INSPECT_OPTIONS_KEYS)
16
+
14
17
  extend Gem::Deprecate
15
18
 
16
19
  class << self
@@ -20,7 +23,7 @@ module Daru
20
23
  #
21
24
  # == Arguments
22
25
  #
23
- # * path - Path of the file to load specified as a String.
26
+ # * path - Local path / Remote URL of the file to load specified as a String.
24
27
  #
25
28
  # == Options
26
29
  #
@@ -63,7 +66,7 @@ module Daru
63
66
 
64
67
  # Read a database query and returns a Dataset
65
68
  #
66
- # @param dbh [DBI::DatabaseHandle] A DBI connection to be used to run the query
69
+ # @param dbh [DBI::DatabaseHandle, String] A DBI connection OR path to a SQLite3 database.
67
70
  # @param query [String] The query to be executed
68
71
  #
69
72
  # @return A dataframe containing the data resulting from the query
@@ -72,6 +75,11 @@ module Daru
72
75
  #
73
76
  # dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
74
77
  # Daru::DataFrame.from_sql(dbh, "SELECT * FROM test")
78
+ #
79
+ # #Alternatively
80
+ #
81
+ # require 'dbi'
82
+ # Daru::DataFrame.from_sql("path/to/sqlite.db", "SELECT * FROM test")
75
83
  def from_sql dbh, query
76
84
  Daru::IO.from_sql dbh, query
77
85
  end
@@ -79,7 +87,7 @@ module Daru
79
87
  # Read a dataframe from AR::Relation
80
88
  #
81
89
  # @param relation [ActiveRecord::Relation] An AR::Relation object from which data is loaded
82
- # @params fields [Array] Field names to be loaded (optional)
90
+ # @param fields [Array] Field names to be loaded (optional)
83
91
  #
84
92
  # @return A dataframe containing the data loaded from the relation
85
93
  #
@@ -112,6 +120,49 @@ module Daru
112
120
  Daru::IO.from_plaintext path, fields
113
121
  end
114
122
 
123
+ # Read the table data from a remote html file. Please note that this module
124
+ # works only for static table elements on a HTML page, and won't work in
125
+ # cases where the data is being loaded into the HTML table by Javascript.
126
+ #
127
+ # By default - all <th> tag elements in the first proper row are considered
128
+ # as the order, and all the <th> tag elements in the first column are
129
+ # considered as the index.
130
+ #
131
+ # == Arguments
132
+ #
133
+ # * path [String] - URL of the target HTML file.
134
+ # * fields [Hash] -
135
+ #
136
+ # +:match+ - A *String* to match and choose a particular table(s) from multiple tables of a HTML page.
137
+ #
138
+ # +:order+ - An *Array* which would act as the user-defined order, to override the parsed *Daru::DataFrame*.
139
+ #
140
+ # +:index+ - An *Array* which would act as the user-defined index, to override the parsed *Daru::DataFrame*.
141
+ #
142
+ # +:name+ - A *String* that manually assigns a name to the scraped *Daru::DataFrame*, for user's preference.
143
+ #
144
+ # == Returns
145
+ # An Array of +Daru::DataFrame+s, with each dataframe corresponding to a
146
+ # HTML table on that webpage.
147
+ #
148
+ # == Usage
149
+ # dfs = Daru::DataFrame.from_html("http://www.moneycontrol.com/", match: "Sun Pharma")
150
+ # dfs.count
151
+ # # => 4
152
+ #
153
+ # dfs.first
154
+ # #
155
+ # # => <Daru::DataFrame(5x4)>
156
+ # # Company Price Change Value (Rs
157
+ # # 0 Sun Pharma 502.60 -65.05 2,117.87
158
+ # # 1 Reliance 1356.90 19.60 745.10
159
+ # # 2 Tech Mahin 379.45 -49.70 650.22
160
+ # # 3 ITC 315.85 6.75 621.12
161
+ # # 4 HDFC 1598.85 50.95 553.91
162
+ def from_html path, fields={}
163
+ Daru::IO.from_html path, fields
164
+ end
165
+
115
166
  # Create DataFrame by specifying rows as an Array of Arrays or Array of
116
167
  # Daru::Vector objects.
117
168
  def rows source, opts={}
@@ -229,6 +280,17 @@ module Daru
229
280
  # Default to *true*.
230
281
  #
231
282
  # == Usage
283
+ #
284
+ # df = Daru::DataFrame.new
285
+ # # =>
286
+ # # <Daru::DataFrame(0x0)>
287
+ # # Creates an empty DataFrame with no rows or columns.
288
+ #
289
+ # df = Daru::DataFrame.new({}, order: [:a, :b])
290
+ # #<Daru::DataFrame(0x2)>
291
+ # a b
292
+ # # Creates a DataFrame with no rows and columns :a and :b
293
+ #
232
294
  # df = Daru::DataFrame.new({a: [1,2,3,4], b: [6,7,8,9]}, order: [:b, :a],
233
295
  # index: [:a, :b, :c, :d], name: :spider_man)
234
296
  #
@@ -239,26 +301,67 @@ module Daru
239
301
  # # b 7 2
240
302
  # # c 8 3
241
303
  # # d 9 4
242
- def initialize source, opts={} # rubocop:disable Metrics/MethodLength
304
+ #
305
+ # df = Daru::DataFrame.new([[1,2,3,4],[6,7,8,9]], name: :bat_man)
306
+ #
307
+ # # =>
308
+ # # #<Daru::DataFrame: bat_man (4x2)>
309
+ # # 0 1
310
+ # # 0 1 6
311
+ # # 1 2 7
312
+ # # 2 3 8
313
+ # # 3 4 9
314
+ #
315
+ # # Dataframe having Index name
316
+ #
317
+ # df = Daru::DataFrame.new({a: [1,2,3,4], b: [6,7,8,9]}, order: [:b, :a],
318
+ # index: Daru::Index.new([:a, :b, :c, :d], name: 'idx_name'),
319
+ # name: :spider_man)
320
+ #
321
+ # # =>
322
+ # # <Daru::DataFrame:80766980 @name = spider_man @size = 4>
323
+ # # idx_name b a
324
+ # # a 6 1
325
+ # # b 7 2
326
+ # # c 8 3
327
+ # # d 9 4
328
+ #
329
+ #
330
+ # idx = Daru::Index.new [100, 99, 101, 1, 2], name: "s1"
331
+ # => #<Daru::Index(5): s1 {100, 99, 101, 1, 2}>
332
+ #
333
+ # df = Daru::DataFrame.new({b: [11,12,13,14,15], a: [1,2,3,4,5],
334
+ # c: [11,22,33,44,55]},
335
+ # order: [:a, :b, :c],
336
+ # index: idx)
337
+ # # =>
338
+ # #<Daru::DataFrame(5x3)>
339
+ # # s1 a b c
340
+ # # 100 1 11 11
341
+ # # 99 2 12 22
342
+ # # 101 3 13 33
343
+ # # 1 4 14 44
344
+ # # 2 5 15 55
345
+
346
+ def initialize source={}, opts={} # rubocop:disable Metrics/MethodLength
243
347
  vectors, index = opts[:order], opts[:index] # FIXME: just keyword arges after Ruby 2.1
244
348
  @data = []
245
349
  @name = opts[:name]
246
350
 
247
351
  case source
248
- when ->(s) { s.empty? }
249
- @vectors = Index.coerce vectors
250
- @index = Index.coerce index
251
- create_empty_vectors
352
+ when [], {}
353
+ create_empty_vectors(vectors, index)
252
354
  when Array
253
355
  initialize_from_array source, vectors, index, opts
254
356
  when Hash
255
357
  initialize_from_hash source, vectors, index, opts
358
+ when ->(s) { s.empty? } # TODO: likely want to remove this case
359
+ create_empty_vectors(vectors, index)
256
360
  end
257
361
 
258
362
  set_size
259
363
  validate
260
364
  update
261
- self.plotting_library = Daru.plotting_library
262
365
  end
263
366
 
264
367
  def plotting_library= lib
@@ -271,11 +374,18 @@ module Daru
271
374
  )
272
375
  end
273
376
  else
274
- raise ArguementError, "Plotting library #{lib} not supported. "\
377
+ raise ArgumentError, "Plotting library #{lib} not supported. "\
275
378
  'Supported libraries are :nyaplot and :gruff'
276
379
  end
277
380
  end
278
381
 
382
+ # this method is overwritten: see Daru::DataFrame#plotting_library=
383
+ def plot(*args, **options, &b)
384
+ init_plotting_library
385
+
386
+ plot(*args, **options, &b)
387
+ end
388
+
279
389
  # Access row or vector. Specify name of row/vector followed by axis(:row, :vector).
280
390
  # Defaults to *:vector*. Use of this method is not recommended for accessing
281
391
  # rows. Use df.row[:a] for accessing row with index ':a'.
@@ -285,7 +395,7 @@ module Daru
285
395
  end
286
396
 
287
397
  # Retrieve rows by positions
288
- # @param [Array<Integer>] *positions positions of rows to retrive
398
+ # @param [Array<Integer>] positions of rows to retrieve
289
399
  # @return [Daru::Vector, Daru::DataFrame] vector for single position and dataframe for multiple positions
290
400
  # @example
291
401
  # df = Daru::DataFrame.new({
@@ -303,19 +413,17 @@ module Daru
303
413
  validate_positions(*positions, nrows)
304
414
 
305
415
  if positions.is_a? Integer
306
- return Daru::Vector.new @data.map { |vec| vec.at(*positions) },
307
- index: @vectors
416
+ row = get_rows_for([positions])
417
+ Daru::Vector.new row, index: @vectors
308
418
  else
309
- new_rows = @data.map { |vec| vec.at(*original_positions) }
310
- return Daru::DataFrame.new new_rows,
311
- index: @index.at(*original_positions),
312
- order: @vectors
419
+ new_rows = get_rows_for(original_positions)
420
+ Daru::DataFrame.new new_rows, index: @index.at(*original_positions), order: @vectors
313
421
  end
314
422
  end
315
423
 
316
424
  # Set rows by positions
317
425
  # @param [Array<Integer>] positions positions of rows to set
318
- # @vector [Array, Daru::Vector] vector vector to be assigned
426
+ # @param [Array, Daru::Vector] vector vector to be assigned
319
427
  # @example
320
428
  # df = Daru::DataFrame.new({
321
429
  # a: [1, 2, 3],
@@ -348,7 +456,7 @@ module Daru
348
456
  end
349
457
 
350
458
  # Retrieve vectors by positions
351
- # @param [Array<Integer>] *positions positions of vectors to retrive
459
+ # @param [Array<Integer>] positions of vectors to retrieve
352
460
  # @return [Daru::Vector, Daru::DataFrame] vector for single position and dataframe for multiple positions
353
461
  # @example
354
462
  # df = Daru::DataFrame.new({
@@ -432,13 +540,24 @@ module Daru
432
540
  end
433
541
 
434
542
  def add_row row, index=nil
435
- self.row[index || @size] = row
543
+ self.row[*(index || @size)] = row
436
544
  end
437
545
 
438
546
  def add_vector n, vector
439
547
  self[n] = vector
440
548
  end
441
549
 
550
+ def insert_vector n, name, source
551
+ raise ArgumentError unless source.is_a? Array
552
+ vector = Daru::Vector.new(source, index: @index, name: @name)
553
+ @data << vector
554
+ @vectors = @vectors.add name
555
+ ordr = @vectors.dup.to_a
556
+ elmnt = ordr.pop
557
+ ordr = ordr.insert n, elmnt
558
+ self.order=ordr
559
+ end
560
+
442
561
  # Access a row or set/create a row. Refer #[] and #[]= docs for details.
443
562
  #
444
563
  # == Usage
@@ -448,6 +567,20 @@ module Daru
448
567
  Daru::Accessors::DataFrameByRow.new(self)
449
568
  end
450
569
 
570
+ # Extract a dataframe given row indexes or positions
571
+ # @param keys [Array] can be positions (if by_position is true) or indexes (if by_position is false)
572
+ # @return [Daru::DataFrame]
573
+ def get_sub_dataframe(keys, by_position: true)
574
+ return Daru::DataFrame.new({}) if keys == []
575
+
576
+ keys = @index.pos(*keys) unless by_position
577
+
578
+ sub_df = row_at(*keys)
579
+ sub_df = sub_df.to_df.transpose if sub_df.is_a?(Daru::Vector)
580
+
581
+ sub_df
582
+ end
583
+
451
584
  # Duplicate the DataFrame entirely.
452
585
  #
453
586
  # == Arguments
@@ -457,7 +590,7 @@ module Daru
457
590
  def dup vectors_to_dup=nil
458
591
  vectors_to_dup = @vectors.to_a unless vectors_to_dup
459
592
 
460
- src = vectors_to_dup.map { |vec| @data[@vectors[vec]].dup }
593
+ src = vectors_to_dup.map { |vec| @data[@vectors.pos(vec)].dup }
461
594
  new_order = Daru::Index.new(vectors_to_dup)
462
595
 
463
596
  Daru::DataFrame.new src, order: new_order, index: @index.dup, name: @name, clone: true
@@ -506,8 +639,8 @@ module Daru
506
639
  deprecate :dup_only_valid, :reject_values, 2016, 10
507
640
 
508
641
  # Returns a dataframe in which rows with any of the mentioned values
509
- # are ignored.
510
- # @param [Array] *values values to reject to form the new dataframe
642
+ # are ignored.
643
+ # @param [Array] values to reject to form the new dataframe
511
644
  # @return [Daru::DataFrame] Data Frame with only rows which doesn't
512
645
  # contain the mentioned values
513
646
  # @example
@@ -544,7 +677,7 @@ module Daru
544
677
  # b: [:a, :b, nil, Float::NAN, nil, 3, 5, 8],
545
678
  # c: ['a', Float::NAN, 3, 4, 3, 5, nil, 7]
546
679
  # }, index: 11..18)
547
- # df
680
+ # df.replace_values nil, Float::NAN
548
681
  # # => #<Daru::DataFrame(8x3)>
549
682
  # # a b c
550
683
  # # 11 1 a a
@@ -560,6 +693,89 @@ module Daru
560
693
  self
561
694
  end
562
695
 
696
+ # Rolling fillna
697
+ # replace all Float::NAN and nil values with the preceding or following value
698
+ #
699
+ # @param direction [Symbol] (:forward, :backward) whether replacement value is preceding or following
700
+ #
701
+ # @example
702
+ # df = Daru::DataFrame.new({
703
+ # a: [1, 2, 3, nil, Float::NAN, nil, 1, 7],
704
+ # b: [:a, :b, nil, Float::NAN, nil, 3, 5, nil],
705
+ # c: ['a', Float::NAN, 3, 4, 3, 5, nil, 7]
706
+ # })
707
+ #
708
+ # => #<Daru::DataFrame(8x3)>
709
+ # a b c
710
+ # 0 1 a a
711
+ # 1 2 b NaN
712
+ # 2 3 nil 3
713
+ # 3 nil NaN 4
714
+ # 4 NaN nil 3
715
+ # 5 nil 3 5
716
+ # 6 1 5 nil
717
+ # 7 7 nil 7
718
+ #
719
+ # 2.3.3 :068 > df.rolling_fillna(:forward)
720
+ # => #<Daru::DataFrame(8x3)>
721
+ # a b c
722
+ # 0 1 a a
723
+ # 1 2 b a
724
+ # 2 3 b 3
725
+ # 3 3 b 4
726
+ # 4 3 b 3
727
+ # 5 3 3 5
728
+ # 6 1 5 5
729
+ # 7 7 5 7
730
+ #
731
+ def rolling_fillna!(direction=:forward)
732
+ @data.each { |vec| vec.rolling_fillna!(direction) }
733
+ self
734
+ end
735
+
736
+ def rolling_fillna(direction=:forward)
737
+ dup.rolling_fillna!(direction)
738
+ end
739
+
740
+ # Return unique rows by vector specified or all vectors
741
+ #
742
+ # @param vtrs [String, Symbol] vector name(s) that should be considered
743
+ #
744
+ # @example
745
+ #
746
+ # => #<Daru::DataFrame(6x2)>
747
+ # a b
748
+ # 0 1 a
749
+ # 1 2 b
750
+ # 2 3 c
751
+ # 3 4 d
752
+ # 2 3 c
753
+ # 3 4 f
754
+ #
755
+ # 2.3.3 :> df.uniq
756
+ # => #<Daru::DataFrame(5x2)>
757
+ # a b
758
+ # 0 1 a
759
+ # 1 2 b
760
+ # 2 3 c
761
+ # 3 4 d
762
+ # 3 4 f
763
+ #
764
+ # 2.3.3 :> df.uniq(:a)
765
+ # => #<Daru::DataFrame(4x2)>
766
+ # a b
767
+ # 0 1 a
768
+ # 1 2 b
769
+ # 2 3 c
770
+ # 3 4 d
771
+ #
772
+ def uniq(*vtrs)
773
+ vecs = vtrs.empty? ? vectors.to_a : Array(vtrs)
774
+ grouped = group_by(vecs)
775
+ indexes = grouped.groups.values.map { |v| v[0] }.sort
776
+ row[*indexes]
777
+ end
778
+
563
779
  # Iterate over each index of the DataFrame.
564
780
  def each_index &block
565
781
  return to_enum(:each_index) unless block_given?
@@ -679,7 +895,7 @@ module Daru
679
895
  # * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
680
896
  # Default to :vector.
681
897
  def map! axis=:vector, &block
682
- if axis == :vector || axis == :column
898
+ if %i[vector column].include?(axis)
683
899
  map_vectors!(&block)
684
900
  elsif axis == :row
685
901
  map_rows!(&block)
@@ -807,6 +1023,18 @@ module Daru
807
1023
  self
808
1024
  end
809
1025
 
1026
+ def apply_method(method, keys: nil, by_position: true)
1027
+ df = keys ? get_sub_dataframe(keys, by_position: by_position) : self
1028
+
1029
+ case method
1030
+ when Symbol then df.send(method)
1031
+ when Proc then method.call(df)
1032
+ when Array then method.map(&:to_proc).map { |proc| proc.call(df) } # works with Array of both Symbol and/or Proc
1033
+ else raise
1034
+ end
1035
+ end
1036
+ alias :apply_method_on_sub_df :apply_method
1037
+
810
1038
  # Retrieves a Daru::Vector, based on the result of calculation
811
1039
  # performed on each row.
812
1040
  def collect_rows &block
@@ -913,7 +1141,7 @@ module Daru
913
1141
 
914
1142
  # creates a new vector with the data of a given field which the block returns true
915
1143
  def filter_vector vec, &block
916
- Daru::Vector.new each_row.select(&block).map { |row| row[vec] }
1144
+ Daru::Vector.new(each_row.select(&block).map { |row| row[vec] })
917
1145
  end
918
1146
 
919
1147
  # Iterates over each row and retains it in a new DataFrame if the block returns
@@ -934,9 +1162,9 @@ module Daru
934
1162
  dup.tap { |df| df.keep_vector_if(&block) }
935
1163
  end
936
1164
 
937
- # Test each row with one or more tests. Each test is a Proc with the form
938
- # *Proc.new {|row| row[:age] > 0}*
939
- #
1165
+ # Test each row with one or more tests.
1166
+ # @param tests [Proc] Each test is a Proc with the form
1167
+ # *Proc.new {|row| row[:age] > 0}*
940
1168
  # The function returns an array with all errors.
941
1169
  #
942
1170
  # FIXME: description here is too sparse. As far as I can get,
@@ -1031,14 +1259,14 @@ module Daru
1031
1259
  alias :vector_missing_values :missing_values_rows
1032
1260
 
1033
1261
  def has_missing_data?
1034
- !!@data.any? { |vec| vec.include_values?(*Daru::MISSING_VALUES) }
1262
+ @data.any? { |vec| vec.include_values?(*Daru::MISSING_VALUES) }
1035
1263
  end
1036
1264
  alias :flawed? :has_missing_data?
1037
1265
  deprecate :has_missing_data?, :include_values?, 2016, 10
1038
1266
  deprecate :flawed?, :include_values?, 2016, 10
1039
1267
 
1040
1268
  # Check if any of given values occur in the data frame
1041
- # @param [Array] *values values to check for
1269
+ # @param [Array] values to check for
1042
1270
  # @return [true, false] true if any of the given values occur in the
1043
1271
  # dataframe, false otherwise
1044
1272
  # @example
@@ -1119,7 +1347,7 @@ module Daru
1119
1347
  # row[:a] < 3 and row[:b] == 'b'
1120
1348
  # end #=> true
1121
1349
  def any? axis=:vector, &block
1122
- if axis == :vector || axis == :column
1350
+ if %i[vector column].include?(axis)
1123
1351
  @data.any?(&block)
1124
1352
  elsif axis == :row
1125
1353
  each_row do |row|
@@ -1141,7 +1369,7 @@ module Daru
1141
1369
  # row[:a] < 10
1142
1370
  # end #=> true
1143
1371
  def all? axis=:vector, &block
1144
- if axis == :vector || axis == :column
1372
+ if %i[vector column].include?(axis)
1145
1373
  @data.all?(&block)
1146
1374
  elsif axis == :row
1147
1375
  each_row.all?(&block)
@@ -1169,13 +1397,60 @@ module Daru
1169
1397
 
1170
1398
  alias :last :tail
1171
1399
 
1172
- # Returns a vector with sum of all vectors specified in the argument.
1173
- # If vecs parameter is empty, sum all numeric vector.
1174
- def vector_sum vecs=nil
1400
+ # Sum all numeric/specified vectors in the DataFrame.
1401
+ #
1402
+ # Returns a new vector containing the sum of all numeric
1403
+ # or specified vectors of the DataFrame. By default, if the vector
1404
+ # contains a nil, the sum is nil.
1405
+ # With :skipnil argument set to true, nil values are assumed to be
1406
+ # 0 (zero) and the sum vector is returned.
1407
+ #
1408
+ # @param args [Array] List of vectors to sum. Default is nil in which case
1409
+ # all numeric vectors are summed.
1410
+ #
1411
+ # @option opts [Boolean] :skipnil Consider nils as 0. Default is false.
1412
+ #
1413
+ # @return Vector with sum of all vectors specified in the argument.
1414
+ # If vecs parameter is empty, sum all numeric vectors.
1415
+ #
1416
+ # @example
1417
+ # df = Daru::DataFrame.new({
1418
+ # a: [1, 2, nil],
1419
+ # b: [2, 1, 3],
1420
+ # c: [1, 1, 1]
1421
+ # })
1422
+ # => #<Daru::DataFrame(3x3)>
1423
+ # a b c
1424
+ # 0 1 2 1
1425
+ # 1 2 1 1
1426
+ # 2 nil 3 1
1427
+ # df.vector_sum [:a, :c]
1428
+ # => #<Daru::Vector(3)>
1429
+ # 0 2
1430
+ # 1 3
1431
+ # 2 nil
1432
+ # df.vector_sum
1433
+ # => #<Daru::Vector(3)>
1434
+ # 0 4
1435
+ # 1 4
1436
+ # 2 nil
1437
+ # df.vector_sum skipnil: true
1438
+ # => #<Daru::Vector(3)>
1439
+ # c
1440
+ # 0 4
1441
+ # 1 4
1442
+ # 2 4
1443
+ #
1444
+ def vector_sum(*args)
1445
+ defaults = {vecs: nil, skipnil: false}
1446
+ options = args.last.is_a?(::Hash) ? args.pop : {}
1447
+ options = defaults.merge(options)
1448
+ vecs = args[0] || options[:vecs]
1449
+ skipnil = args[1] || options[:skipnil]
1450
+
1175
1451
  vecs ||= numeric_vectors
1176
1452
  sum = Daru::Vector.new [0]*@size, index: @index, name: @name, dtype: @dtype
1177
-
1178
- vecs.inject(sum) { |memo, n| memo + self[n] }
1453
+ vecs.inject(sum) { |memo, n| self[n].add(memo, skipnil: skipnil) }
1179
1454
  end
1180
1455
 
1181
1456
  # Calculate mean of the rows of the dataframe.
@@ -1220,11 +1495,10 @@ module Daru
1220
1495
  # # ["foo", "two", 3]=>[2, 4]}
1221
1496
  def group_by *vectors
1222
1497
  vectors.flatten!
1223
- # FIXME: wouldn't it better to do vectors - @vectors here and
1224
- # raise one error with all non-existent vector names?.. - zverok, 2016-05-18
1225
- vectors.each { |v|
1226
- raise(ArgumentError, "Vector #{v} does not exist") unless has_vector?(v)
1227
- }
1498
+ missing = vectors - @vectors.to_a
1499
+ unless missing.empty?
1500
+ raise(ArgumentError, "Vector(s) missing: #{missing.join(', ')}")
1501
+ end
1228
1502
 
1229
1503
  vectors = [@vectors.first] if vectors.empty?
1230
1504
 
@@ -1234,7 +1508,7 @@ module Daru
1234
1508
  def reindex_vectors new_vectors
1235
1509
  unless new_vectors.is_a?(Daru::Index)
1236
1510
  raise ArgumentError, 'Must pass the new index of type Index or its '\
1237
- "subclasses, not #{new_index.class}"
1511
+ "subclasses, not #{new_vectors.class}"
1238
1512
  end
1239
1513
 
1240
1514
  cl = Daru::DataFrame.new({}, order: new_vectors, index: @index, name: @name)
@@ -1272,14 +1546,52 @@ module Daru
1272
1546
  df
1273
1547
  end
1274
1548
 
1549
+ module SetSingleIndexStrategy
1550
+ def self.uniq_size(df, col)
1551
+ df[col].uniq.size
1552
+ end
1553
+
1554
+ def self.new_index(df, col)
1555
+ Daru::Index.new(df[col].to_a)
1556
+ end
1557
+
1558
+ def self.delete_vector(df, col)
1559
+ df.delete_vector(col)
1560
+ end
1561
+ end
1562
+
1563
+ module SetMultiIndexStrategy
1564
+ def self.uniq_size(df, cols)
1565
+ df[*cols].uniq.size
1566
+ end
1567
+
1568
+ def self.new_index(df, cols)
1569
+ Daru::MultiIndex.from_arrays(df[*cols].map_vectors(&:to_a)).tap do |mi|
1570
+ mi.name = cols
1571
+ mi
1572
+ end
1573
+ end
1574
+
1575
+ def self.delete_vector(df, cols)
1576
+ df.delete_vectors(*cols)
1577
+ end
1578
+ end
1579
+
1275
1580
  # Set a particular column (or columns) as the new index of the DF
1276
- def set_index new_index, opts={}
1277
- raise ArgumentError, 'All elements in new index must be unique.' if
1278
- @size != self[new_index].uniq.size
1581
+ def set_index new_index_col, opts={}
1582
+ if new_index_col.respond_to?(:to_a)
1583
+ strategy = SetMultiIndexStrategy
1584
+ new_index_col = new_index_col.to_a
1585
+ else
1586
+ strategy = SetSingleIndexStrategy
1587
+ end
1279
1588
 
1280
- self.index = Daru::Index.new(self[new_index].to_a)
1281
- delete_vector(new_index) unless opts[:keep]
1589
+ uniq_size = strategy.uniq_size(self, new_index_col)
1590
+ raise ArgumentError, 'All elements in new index must be unique.' if
1591
+ @size != uniq_size
1282
1592
 
1593
+ self.index = strategy.new_index(self, new_index_col)
1594
+ strategy.delete_vector(self, new_index_col) unless opts[:keep]
1283
1595
  self
1284
1596
  end
1285
1597
 
@@ -1317,11 +1629,24 @@ module Daru
1317
1629
  end
1318
1630
  end
1319
1631
 
1632
+ def reset_index
1633
+ index_df = index.to_df
1634
+ names = index.name
1635
+ names = [names] unless names.instance_of?(Array)
1636
+ new_vectors = names + vectors.to_a
1637
+ self.index = index_df.index
1638
+ names.each do |name|
1639
+ self[name] = index_df[name]
1640
+ end
1641
+ self.order = new_vectors
1642
+ self
1643
+ end
1644
+
1320
1645
  # Reassign index with a new index of type Daru::Index or any of its subclasses.
1321
1646
  #
1322
1647
  # @param [Daru::Index] idx New index object on which the rows of the dataframe
1323
1648
  # are to be indexed.
1324
- # @example Reassgining index of a DataFrame
1649
+ # @example Reassigning index of a DataFrame
1325
1650
  # df = Daru::DataFrame.new({a: [1,2,3,4], b: [11,22,33,44]})
1326
1651
  # df.index.to_a #=> [0,1,2,3]
1327
1652
  #
@@ -1337,7 +1662,7 @@ module Daru
1337
1662
 
1338
1663
  # Reassign vectors with a new index of type Daru::Index or any of its subclasses.
1339
1664
  #
1340
- # @param [Daru::Index] idx The new index object on which the vectors are to
1665
+ # @param new_index [Daru::Index] idx The new index object on which the vectors are to
1341
1666
  # be indexed. Must of the same size as ncols.
1342
1667
  # @example Reassigning vectors of a DataFrame
1343
1668
  # df = Daru::DataFrame.new({a: [1,2,3,4], b: [:a,:b,:c,:d], c: [11,22,33,44]})
@@ -1377,13 +1702,31 @@ module Daru
1377
1702
  # df.rename_vectors :a => :alpha, :c => :gamma
1378
1703
  # df.vectors.to_a #=> [:alpha, :b, :gamma]
1379
1704
  def rename_vectors name_map
1380
- existing_targets = name_map.select { |k,v| k != v }.values & vectors.to_a
1705
+ existing_targets = name_map.reject { |k,v| k == v }.values & vectors.to_a
1381
1706
  delete_vectors(*existing_targets)
1382
1707
 
1383
1708
  new_names = vectors.to_a.map { |v| name_map[v] ? name_map[v] : v }
1384
1709
  self.vectors = Daru::Index.new new_names
1385
1710
  end
1386
1711
 
1712
+ # Renames the vectors and returns itself
1713
+ #
1714
+ # == Arguments
1715
+ #
1716
+ # * name_map - A hash where the keys are the exising vector names and
1717
+ # the values are the new names. If a vector is renamed
1718
+ # to a vector name that is already in use, the existing
1719
+ # one is overwritten.
1720
+ #
1721
+ # == Usage
1722
+ #
1723
+ # df = Daru::DataFrame.new({ a: [1,2,3,4], b: [:a,:b,:c,:d], c: [11,22,33,44] })
1724
+ # df.rename_vectors! :a => :alpha, :c => :gamma # df
1725
+ def rename_vectors! name_map
1726
+ rename_vectors(name_map)
1727
+ self
1728
+ end
1729
+
1387
1730
  # Return the indexes of all the numeric vectors. Will include vectors with nils
1388
1731
  # alongwith numbers.
1389
1732
  def numeric_vectors
@@ -1408,27 +1751,24 @@ module Daru
1408
1751
  Daru::DataFrame.new(arry, clone: cln, order: order, index: @index)
1409
1752
  end
1410
1753
 
1411
- # Generate a summary of this DataFrame with ReportBuilder.
1412
- def summary(method=:to_text)
1413
- ReportBuilder.new(no_title: true).add(self).send(method)
1414
- end
1415
-
1416
- def report_building(b) # :nodoc: #
1417
- b.section(name: @name) do |g|
1418
- g.text "Number of rows: #{nrows}"
1419
- @vectors.each do |v|
1420
- g.text "Element:[#{v}]"
1421
- g.parse_element(self[v])
1422
- end
1754
+ # Generate a summary of this DataFrame based on individual vectors in the DataFrame
1755
+ # @return [String] String containing the summary of the DataFrame
1756
+ def summary
1757
+ summary = "= #{name}"
1758
+ summary << "\n Number of rows: #{nrows}"
1759
+ @vectors.each do |v|
1760
+ summary << "\n Element:[#{v}]\n"
1761
+ summary << self[v].summary(1)
1423
1762
  end
1763
+ summary
1424
1764
  end
1425
1765
 
1426
1766
  # Sorts a dataframe (ascending/descending) in the given pripority sequence of
1427
1767
  # vectors, with or without a block.
1428
1768
  #
1429
- # @param order [Array] The order of vector names in which the DataFrame
1769
+ # @param vector_order [Array] The order of vector names in which the DataFrame
1430
1770
  # should be sorted.
1431
- # @param [Hash] opts The options to sort with.
1771
+ # @param opts [Hash] opts The options to sort with.
1432
1772
  # @option opts [TrueClass,FalseClass,Array] :ascending (true) Sort in ascending
1433
1773
  # or descending order. Specify Array corresponding to *order* for multiple
1434
1774
  # sort orders.
@@ -1597,12 +1937,11 @@ module Daru
1597
1937
 
1598
1938
  new_fields = (@vectors.to_a + other_df.vectors.to_a)
1599
1939
  new_fields = ArrayHelper.recode_repeated(new_fields)
1600
-
1601
1940
  DataFrame.new({}, order: new_fields).tap do |df_new|
1602
1941
  (0...nrows).each do |i|
1603
1942
  df_new.add_row row[i].to_a + other_df.row[i].to_a
1604
1943
  end
1605
-
1944
+ df_new.index = @index if @index == other_df.index
1606
1945
  df_new.update
1607
1946
  end
1608
1947
  end
@@ -1783,7 +2122,9 @@ module Daru
1783
2122
  end
1784
2123
 
1785
2124
  # Convert to html for IRuby.
1786
- def to_html threshold=30
2125
+ def to_html(threshold=Daru.max_rows)
2126
+ table_thead = to_html_thead
2127
+ table_tbody = to_html_tbody(threshold)
1787
2128
  path = if index.is_a?(MultiIndex)
1788
2129
  File.expand_path('../iruby/templates/dataframe_mi.html.erb', __FILE__)
1789
2130
  else
@@ -1792,8 +2133,29 @@ module Daru
1792
2133
  ERB.new(File.read(path).strip).result(binding)
1793
2134
  end
1794
2135
 
2136
+ def to_html_thead
2137
+ table_thead_path =
2138
+ if index.is_a?(MultiIndex)
2139
+ File.expand_path('../iruby/templates/dataframe_mi_thead.html.erb', __FILE__)
2140
+ else
2141
+ File.expand_path('../iruby/templates/dataframe_thead.html.erb', __FILE__)
2142
+ end
2143
+ ERB.new(File.read(table_thead_path).strip).result(binding)
2144
+ end
2145
+
2146
+ def to_html_tbody(threshold=Daru.max_rows)
2147
+ threshold ||= @size
2148
+ table_tbody_path =
2149
+ if index.is_a?(MultiIndex)
2150
+ File.expand_path('../iruby/templates/dataframe_mi_tbody.html.erb', __FILE__)
2151
+ else
2152
+ File.expand_path('../iruby/templates/dataframe_tbody.html.erb', __FILE__)
2153
+ end
2154
+ ERB.new(File.read(table_tbody_path).strip).result(binding)
2155
+ end
2156
+
1795
2157
  def to_s
1796
- to_html
2158
+ "#<#{self.class}#{': ' + @name.to_s if @name}(#{nrows}x#{ncols})>"
1797
2159
  end
1798
2160
 
1799
2161
  # Method for updating the metadata (i.e. missing value positions) of the
@@ -1815,7 +2177,7 @@ module Daru
1815
2177
 
1816
2178
  # Write this DataFrame to a CSV file.
1817
2179
  #
1818
- # == Arguements
2180
+ # == Arguments
1819
2181
  #
1820
2182
  # * filename - Path of CSV file where the DataFrame is to be saved.
1821
2183
  #
@@ -1899,15 +2261,15 @@ module Daru
1899
2261
  end
1900
2262
 
1901
2263
  # Pretty print in a nice table format for the command line (irb/pry/iruby)
1902
- def inspect spacing=10, threshold=15
1903
- row_headers = index.is_a?(MultiIndex) ? index.sparse_tuples : index.to_a
2264
+ def inspect spacing=Daru.spacing, threshold=Daru.max_rows
1904
2265
  name_part = @name ? ": #{@name} " : ''
2266
+ spacing = [headers.to_a.map(&:length).max, spacing].max
1905
2267
 
1906
- "#<#{self.class}#{name_part}(#{nrows}x#{ncols})>\n" +
2268
+ "#<#{self.class}#{name_part}(#{nrows}x#{ncols})>#{$INPUT_RECORD_SEPARATOR}" +
1907
2269
  Formatters::Table.format(
1908
2270
  each_row.lazy,
1909
2271
  row_headers: row_headers,
1910
- headers: vectors,
2272
+ headers: headers,
1911
2273
  threshold: threshold,
1912
2274
  spacing: spacing
1913
2275
  )
@@ -1927,7 +2289,7 @@ module Daru
1927
2289
  end
1928
2290
 
1929
2291
  # Converts the specified non category type vectors to category type vectors
1930
- # @param [Array] *names names of non category type vectors to be converted
2292
+ # @param [Array] names of non category type vectors to be converted
1931
2293
  # @return [Daru::DataFrame] data frame in which specified vectors have been
1932
2294
  # converted to category type
1933
2295
  # @example
@@ -1992,7 +2354,7 @@ module Daru
1992
2354
  # # 2 3]
1993
2355
  def split_by_category cat_name
1994
2356
  cat_dv = self[cat_name]
1995
- raise ArguementError, "#{cat_name} is not a category vector" unless
2357
+ raise ArgumentError, "#{cat_name} is not a category vector" unless
1996
2358
  cat_dv.category?
1997
2359
 
1998
2360
  cat_dv.categories.map do |cat|
@@ -2002,8 +2364,128 @@ module Daru
2002
2364
  end
2003
2365
  end
2004
2366
 
2367
+ # @param indexes [Array] index(s) at which row tuples are retrieved
2368
+ # @return [Array] returns array of row tuples at given index(s)
2369
+ # @example Using Daru::Index
2370
+ # df = Daru::DataFrame.new({
2371
+ # a: [1, 2, 3],
2372
+ # b: ['a', 'a', 'b']
2373
+ # })
2374
+ #
2375
+ # df.access_row_tuples_by_indexs(1,2)
2376
+ # # => [[2, "a"], [3, "b"]]
2377
+ #
2378
+ # df.index = Daru::Index.new([:one,:two,:three])
2379
+ # df.access_row_tuples_by_indexs(:one,:three)
2380
+ # # => [[1, "a"], [3, "b"]]
2381
+ #
2382
+ # @example Using Daru::MultiIndex
2383
+ # mi_idx = Daru::MultiIndex.from_tuples [
2384
+ # [:a,:one,:bar],
2385
+ # [:a,:one,:baz],
2386
+ # [:b,:two,:bar],
2387
+ # [:a,:two,:baz],
2388
+ # ]
2389
+ # df_mi = Daru::DataFrame.new({
2390
+ # a: 1..4,
2391
+ # b: 'a'..'d'
2392
+ # }, index: mi_idx )
2393
+ #
2394
+ # df_mi.access_row_tuples_by_indexs(:b, :two, :bar)
2395
+ # # => [[3, "c"]]
2396
+ # df_mi.access_row_tuples_by_indexs(:a)
2397
+ # # => [[1, "a"], [2, "b"], [4, "d"]]
2398
+ def access_row_tuples_by_indexs *indexes
2399
+ return get_sub_dataframe(indexes, by_position: false).map_rows(&:to_a) if
2400
+ @index.is_a?(Daru::MultiIndex)
2401
+ positions = @index.pos(*indexes)
2402
+ if positions.is_a? Numeric
2403
+ row = get_rows_for([positions])
2404
+ row.first.is_a?(Array) ? row : [row]
2405
+ else
2406
+ new_rows = get_rows_for(indexes, by_position: false)
2407
+ indexes.map { |index| new_rows.map { |r| r[index] } }
2408
+ end
2409
+ end
2410
+
2411
+ # Function to use for aggregating the data.
2412
+ #
2413
+ # @param options [Hash] options for column, you want in resultant dataframe
2414
+ #
2415
+ # @return [Daru::DataFrame]
2416
+ #
2417
+ # @example
2418
+ # df = Daru::DataFrame.new(
2419
+ # {col: [:a, :b, :c, :d, :e], num: [52,12,07,17,01]})
2420
+ # => #<Daru::DataFrame(5x2)>
2421
+ # col num
2422
+ # 0 a 52
2423
+ # 1 b 12
2424
+ # 2 c 7
2425
+ # 3 d 17
2426
+ # 4 e 1
2427
+ #
2428
+ # df.aggregate(num_100_times: ->(df) { (df.num*100).first })
2429
+ # => #<Daru::DataFrame(5x1)>
2430
+ # num_100_ti
2431
+ # 0 5200
2432
+ # 1 1200
2433
+ # 2 700
2434
+ # 3 1700
2435
+ # 4 100
2436
+ #
2437
+ # When we have duplicate index :
2438
+ #
2439
+ # idx = Daru::CategoricalIndex.new [:a, :b, :a, :a, :c]
2440
+ # df = Daru::DataFrame.new({num: [52,12,07,17,01]}, index: idx)
2441
+ # => #<Daru::DataFrame(5x1)>
2442
+ # num
2443
+ # a 52
2444
+ # b 12
2445
+ # a 7
2446
+ # a 17
2447
+ # c 1
2448
+ #
2449
+ # df.aggregate(num: :mean)
2450
+ # => #<Daru::DataFrame(3x1)>
2451
+ # num
2452
+ # a 25.3333333
2453
+ # b 12
2454
+ # c 1
2455
+ #
2456
+ # Note: `GroupBy` class `aggregate` method uses this `aggregate` method
2457
+ # internally.
2458
+ def aggregate(options={}, multi_index_level=-1)
2459
+ if block_given?
2460
+ positions_tuples, new_index = yield(@index) # note: use of yield is private for now
2461
+ else
2462
+ positions_tuples, new_index = group_index_for_aggregation(@index, multi_index_level)
2463
+ end
2464
+
2465
+ colmn_value = aggregate_by_positions_tuples(options, positions_tuples)
2466
+
2467
+ Daru::DataFrame.new(colmn_value, index: new_index, order: options.keys)
2468
+ end
2469
+
2470
+ def group_by_and_aggregate(*group_by_keys, **aggregation_map)
2471
+ group_by(*group_by_keys).aggregate(aggregation_map)
2472
+ end
2473
+
2005
2474
  private
2006
2475
 
2476
+ # Will lazily load the plotting library being used for this dataframe
2477
+ def init_plotting_library
2478
+ self.plotting_library = Daru.plotting_library
2479
+ end
2480
+
2481
+ def headers
2482
+ Daru::Index.new(Array(index.name) + @vectors.to_a)
2483
+ end
2484
+
2485
+ def row_headers
2486
+ index.is_a?(MultiIndex) ? index.sparse_tuples : index.to_a
2487
+ end
2488
+
2007
2489
  def convert_categorical_vectors names
2008
2490
  names.map do |n|
2009
2491
  next unless self[n].category?
@@ -2034,7 +2516,7 @@ module Daru
2034
2516
  end
2035
2517
 
2036
2518
  def dispatch_to_axis(axis, method, *args, &block)
2037
- if axis == :vector || axis == :column
2519
+ if %i[vector column].include?(axis)
2038
2520
  send("#{method}_vector", *args, &block)
2039
2521
  elsif axis == :row
2040
2522
  send("#{method}_row", *args, &block)
@@ -2044,7 +2526,7 @@ module Daru
2044
2526
  end
2045
2527
 
2046
2528
  def dispatch_to_axis_pl(axis, method, *args, &block)
2047
- if axis == :vector || axis == :column
2529
+ if %i[vector column].include?(axis)
2048
2530
  send("#{method}_vectors", *args, &block)
2049
2531
  elsif axis == :row
2050
2532
  send("#{method}_rows", *args, &block)
@@ -2053,7 +2535,7 @@ module Daru
2053
2535
  end
2054
2536
  end
2055
2537
 
2056
- AXES = [:row, :vector].freeze
2538
+ AXES = %i[row vector].freeze
2057
2539
 
2058
2540
  def extract_axis names, default=:vector
2059
2541
  if AXES.include?(names.last)
@@ -2065,7 +2547,7 @@ module Daru
2065
2547
 
2066
2548
  def access_vector *names
2067
2549
  if names.first.is_a?(Range)
2068
- dup(@vectors[names.first])
2550
+ dup(@vectors.subset(names.first))
2069
2551
  elsif @vectors.is_a?(MultiIndex)
2070
2552
  access_vector_multi_index(*names)
2071
2553
  else
@@ -2087,14 +2569,16 @@ module Daru
2087
2569
 
2088
2570
  def access_vector_single_index *names
2089
2571
  if names.count < 2
2090
- pos = @vectors[names.first]
2091
-
2572
+ begin
2573
+ pos = @vectors.is_a?(Daru::DateTimeIndex) ? @vectors[names.first] : @vectors.pos(names.first)
2574
+ rescue IndexError
2575
+ raise IndexError, "Specified vector #{names.first} does not exist"
2576
+ end
2092
2577
  return @data[pos] if pos.is_a?(Numeric)
2093
-
2094
2578
  names = pos
2095
2579
  end
2096
2580
 
2097
- new_vectors = names.map { |name| [name, @data[@vectors[name]]] }.to_h
2581
+ new_vectors = names.map { |name| [name, @data[@vectors.pos(name)]] }.to_h
2098
2582
 
2099
2583
  order = names.is_a?(Array) ? Daru::Index.new(names) : names
2100
2584
  Daru::DataFrame.new(new_vectors, order: order,
@@ -2105,19 +2589,30 @@ module Daru
2105
2589
  positions = @index.pos(*indexes)
2106
2590
 
2107
2591
  if positions.is_a? Numeric
2108
- return Daru::Vector.new populate_row_for(positions),
2109
- index: @vectors,
2110
- name: indexes.first
2592
+ row = get_rows_for([positions])
2593
+ Daru::Vector.new row, index: @vectors, name: indexes.first
2111
2594
  else
2112
- new_rows = @data.map { |vec| vec[*indexes] }
2113
- return Daru::DataFrame.new new_rows,
2114
- index: @index.subset(*indexes),
2115
- order: @vectors
2595
+ new_rows = get_rows_for(indexes, by_position: false)
2596
+ Daru::DataFrame.new new_rows, index: @index.subset(*indexes), order: @vectors
2116
2597
  end
2117
2598
  end
2118
2599
 
2119
- def populate_row_for pos
2120
- @data.map { |vector| vector.at(*pos) }
2600
+ # @param keys [Array] can be an array of positions (if by_position is true) or indexes (if by_position if false)
2601
+ # because of coercion by Daru::Vector#at and Daru::Vector#[], can return either an Array of
2602
+ # values (representing a row) or an array of Vectors (that can be seen as rows)
2603
+ def get_rows_for(keys, by_position: true)
2604
+ raise unless keys.is_a?(Array)
2605
+
2606
+ if by_position
2607
+ pos = keys
2608
+ @data.map { |vector| vector.at(*pos) }
2609
+ else
2610
+ # TODO: for now (2018-07-27), it is different than using
2611
+ # get_rows_for(@index.pos(*keys))
2612
+ # because Daru::Vector#at and Daru::Vector#[] don't handle Daru::MultiIndex the same way
2613
+ indexes = keys
2614
+ @data.map { |vec| vec[*indexes] }
2615
+ end
2121
2616
  end
2122
2617
 
2123
2618
  def insert_or_modify_vector name, vector
@@ -2126,7 +2621,7 @@ module Daru
2126
2621
  if @index.empty?
2127
2622
  insert_vector_in_empty name, vector
2128
2623
  else
2129
- vec = prepare_vector_for_insert name, vector
2624
+ vec = prepare_for_insert name, vector
2130
2625
 
2131
2626
  assign_or_add_vector name, vec
2132
2627
  end
@@ -2173,25 +2668,35 @@ module Daru
2173
2668
  @data.map! { |v| v.empty? ? v.reindex(@index) : v }
2174
2669
  end
2175
2670
 
2176
- def prepare_vector_for_insert name, vector
2177
- if vector.is_a?(Daru::Vector)
2178
- # so that index-by-index assignment is avoided when possible.
2179
- return vector.dup if vector.index == @index
2180
-
2181
- Daru::Vector.new([], name: coerce_name(name), index: @index).tap { |v|
2182
- @index.each do |idx|
2183
- v[idx] = vector.index.include?(idx) ? vector[idx] : nil
2184
- end
2185
- }
2671
+ def prepare_for_insert name, arg
2672
+ if arg.is_a? Daru::Vector
2673
+ prepare_vector_for_insert name, arg
2674
+ elsif arg.respond_to?(:to_a)
2675
+ prepare_enum_for_insert name, arg
2186
2676
  else
2187
- # FIXME: No spec checks this case... And SizeError is not a thing - zverok, 2016-05-08
2188
- if @size != vector.size
2189
- raise SizeError,
2190
- "Specified vector of length #{vector.size} cannot be inserted in DataFrame of size #{@size}"
2677
+ prepare_value_for_insert name, arg
2678
+ end
2679
+ end
2680
+
2681
+ def prepare_vector_for_insert name, vector
2682
+ # so that index-by-index assignment is avoided when possible.
2683
+ return vector.dup if vector.index == @index
2684
+ Daru::Vector.new([], name: coerce_name(name), index: @index).tap { |v|
2685
+ @index.each do |idx|
2686
+ v[idx] = vector.index.include?(idx) ? vector[idx] : nil
2191
2687
  end
2688
+ }
2689
+ end
2192
2690
 
2193
- Daru::Vector.new(vector, name: coerce_name(name), index: @index)
2691
+ def prepare_enum_for_insert name, enum
2692
+ if @size != enum.size
2693
+ raise "Specified vector of length #{enum.size} cannot be inserted in DataFrame of size #{@size}"
2194
2694
  end
2695
+ Daru::Vector.new(enum, name: coerce_name(name), index: @index)
2696
+ end
2697
+
2698
+ def prepare_value_for_insert name, value
2699
+ Daru::Vector.new(Array(value) * @size, name: coerce_name(name), index: @index)
2195
2700
  end
2196
2701
 
2197
2702
  def insert_or_modify_row indexes, vector
@@ -2208,7 +2713,10 @@ module Daru
2208
2713
  set_size
2209
2714
  end
2210
2715
 
2211
- def create_empty_vectors
2716
+ def create_empty_vectors(vectors, index)
2717
+ @vectors = Index.coerce vectors
2718
+ @index = Index.coerce index
2719
+
2212
2720
  @data = @vectors.map do |name|
2213
2721
  Daru::Vector.new([], name: coerce_name(name), index: @index)
2214
2722
  end
@@ -2250,7 +2758,7 @@ module Daru
2250
2758
  end
2251
2759
 
2252
2760
  def create_vectors_index_with vectors, source
2253
- vectors = source.keys.sort_by(&:to_s) if vectors.nil?
2761
+ vectors = source.keys if vectors.nil?
2254
2762
 
2255
2763
  @vectors =
2256
2764
  if vectors.is_a?(Index) || vectors.is_a?(MultiIndex)
@@ -2276,8 +2784,10 @@ module Daru
2276
2784
 
2277
2785
  case source.first
2278
2786
  when Array
2787
+ vectors ||= (0..source.size-1).to_a
2279
2788
  initialize_from_array_of_arrays source, vectors, index, opts
2280
2789
  when Vector
2790
+ vectors ||= (0..source.size-1).to_a
2281
2791
  initialize_from_array_of_vectors source, vectors, index, opts
2282
2792
  when Hash
2283
2793
  initialize_from_array_of_hashes source, vectors, index, opts
@@ -2295,9 +2805,7 @@ module Daru
2295
2805
  @index = Index.coerce(index || source[0].size)
2296
2806
  @vectors = Index.coerce(vectors)
2297
2807
 
2298
- @data = @vectors.each_with_index.map do |_vec,idx|
2299
- Daru::Vector.new(source[idx], index: @index, name: vectors[idx])
2300
- end
2808
+ update_data source, vectors
2301
2809
  end
2302
2810
 
2303
2811
  def initialize_from_array_of_vectors source, vectors, index, opts
@@ -2528,7 +3036,6 @@ module Daru
2528
3036
 
2529
3037
  # Raises IndexError when one of the positions is not a valid position
2530
3038
  def validate_positions *positions, size
2531
- positions = [positions] if positions.is_a? Integer
2532
3039
  positions.each do |pos|
2533
3040
  raise IndexError, "#{pos} is not a valid position." if pos >= size
2534
3041
  end
@@ -2546,6 +3053,73 @@ module Daru
2546
3053
  end
2547
3054
  end
2548
3055
 
3056
+ def update_data source, vectors
3057
+ @data = @vectors.each_with_index.map do |_vec, idx|
3058
+ Daru::Vector.new(source[idx], index: @index, name: vectors[idx])
3059
+ end
3060
+ end
3061
+
3062
+ def aggregate_by_positions_tuples(options, positions_tuples)
3063
+ agg_over_vectors_only, options = cast_aggregation_options(options)
3064
+
3065
+ if agg_over_vectors_only
3066
+ options.map do |vect_name, method|
3067
+ vect = self[vect_name]
3068
+
3069
+ positions_tuples.map do |positions|
3070
+ vect.apply_method_on_sub_vector(method, keys: positions)
3071
+ end
3072
+ end
3073
+ else
3074
+ methods = options.values
3075
+
3076
+ # note: because we aggregate over rows, we don't have to re-get sub-dfs for each method (which is expensive)
3077
+ rows = positions_tuples.map do |positions|
3078
+ apply_method_on_sub_df(methods, keys: positions)
3079
+ end
3080
+
3081
+ rows.transpose
3082
+ end
3083
+ end
3084
+
3085
+ # convert operations over sub-vectors to operations over sub-dfs when it improves perf
3086
+ # note: we don't always "cast" because aggregation over a single vector / a few vector is faster
3087
+ # than aggregation over (sub-)dfs
3088
+ def cast_aggregation_options(options)
3089
+ vects, non_vects = options.keys.partition { |k| @vectors.include?(k) }
3090
+
3091
+ over_vectors = true
3092
+
3093
+ if non_vects.any?
3094
+ options = options.clone
3095
+
3096
+ vects.each do |name|
3097
+ proc_on_vect = options[name].to_proc
3098
+ options[name] = ->(sub_df) { proc_on_vect.call(sub_df[name]) }
3099
+ end
3100
+
3101
+ over_vectors = false
3102
+ end
3103
+
3104
+ [over_vectors, options]
3105
+ end
3106
+
3107
+ def group_index_for_aggregation(index, multi_index_level=-1)
3108
+ case index
3109
+ when Daru::MultiIndex
3110
+ groups_by_pos = Daru::Core::GroupBy.get_positions_group_for_aggregation(index, multi_index_level)
3111
+
3112
+ new_index = Daru::MultiIndex.from_tuples(groups_by_pos.keys).coerce_index
3113
+ pos_tuples = groups_by_pos.values
3114
+ when Daru::Index, Daru::CategoricalIndex
3115
+ new_index = Array(index).uniq
3116
+ pos_tuples = new_index.map { |idx| [*index.pos(idx)] }
3117
+ else raise
3118
+ end
3119
+
3120
+ [pos_tuples, new_index]
3121
+ end
3122
+
2549
3123
  # coerce ranges, integers and array in appropriate ways
2550
3124
  def coerce_positions *positions, size
2551
3125
  if positions.size == 1
@@ -2555,7 +3129,7 @@ module Daru
2555
3129
  when Range
2556
3130
  size.times.to_a[positions.first]
2557
3131
  else
2558
- raise ArgumentError, 'Unkown position type.'
3132
+ raise ArgumentError, 'Unknown position type.'
2559
3133
  end
2560
3134
  else
2561
3135
  positions