daru_lite 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (149) hide show
  1. checksums.yaml +7 -0
  2. data/.github/ISSUE_TEMPLATE.md +18 -0
  3. data/.github/workflows/ci.yml +33 -0
  4. data/.gitignore +10 -0
  5. data/.rspec +2 -0
  6. data/.rubocop.yml +27 -0
  7. data/.rubocop_todo.yml +137 -0
  8. data/CONTRIBUTING.md +47 -0
  9. data/Gemfile +2 -0
  10. data/History.md +4 -0
  11. data/LICENSE +24 -0
  12. data/README.md +218 -0
  13. data/Rakefile +69 -0
  14. data/ReleasePolicy.md +20 -0
  15. data/benchmarks/TradeoffData.csv +65 -0
  16. data/benchmarks/csv_reading.rb +22 -0
  17. data/benchmarks/dataframe_creation.rb +39 -0
  18. data/benchmarks/db_loading.rb +34 -0
  19. data/benchmarks/duplicating.rb +45 -0
  20. data/benchmarks/group_by.rb +32 -0
  21. data/benchmarks/joining.rb +52 -0
  22. data/benchmarks/row_access.rb +41 -0
  23. data/benchmarks/row_assign.rb +36 -0
  24. data/benchmarks/sorting.rb +51 -0
  25. data/benchmarks/statistics.rb +28 -0
  26. data/benchmarks/vector_access.rb +31 -0
  27. data/benchmarks/vector_assign.rb +42 -0
  28. data/benchmarks/where_clause.rb +48 -0
  29. data/benchmarks/where_vs_filter.rb +28 -0
  30. data/daru_lite.gemspec +55 -0
  31. data/images/README.md +5 -0
  32. data/images/con0.png +0 -0
  33. data/images/con1.png +0 -0
  34. data/images/init0.png +0 -0
  35. data/images/init1.png +0 -0
  36. data/images/man0.png +0 -0
  37. data/images/man1.png +0 -0
  38. data/images/man2.png +0 -0
  39. data/images/man3.png +0 -0
  40. data/images/man4.png +0 -0
  41. data/images/man5.png +0 -0
  42. data/images/man6.png +0 -0
  43. data/lib/daru_lite/accessors/array_wrapper.rb +109 -0
  44. data/lib/daru_lite/accessors/dataframe_by_row.rb +25 -0
  45. data/lib/daru_lite/accessors/mdarray_wrapper.rb +7 -0
  46. data/lib/daru_lite/category.rb +929 -0
  47. data/lib/daru_lite/configuration.rb +34 -0
  48. data/lib/daru_lite/core/group_by.rb +403 -0
  49. data/lib/daru_lite/core/merge.rb +270 -0
  50. data/lib/daru_lite/core/query.rb +109 -0
  51. data/lib/daru_lite/dataframe.rb +3080 -0
  52. data/lib/daru_lite/date_time/index.rb +569 -0
  53. data/lib/daru_lite/date_time/offsets.rb +397 -0
  54. data/lib/daru_lite/exceptions.rb +2 -0
  55. data/lib/daru_lite/extensions/which_dsl.rb +53 -0
  56. data/lib/daru_lite/formatters/table.rb +52 -0
  57. data/lib/daru_lite/helpers/array.rb +53 -0
  58. data/lib/daru_lite/index/categorical_index.rb +201 -0
  59. data/lib/daru_lite/index/index.rb +374 -0
  60. data/lib/daru_lite/index/multi_index.rb +374 -0
  61. data/lib/daru_lite/io/csv/converters.rb +21 -0
  62. data/lib/daru_lite/io/io.rb +294 -0
  63. data/lib/daru_lite/io/sql_data_source.rb +97 -0
  64. data/lib/daru_lite/iruby/helpers.rb +38 -0
  65. data/lib/daru_lite/iruby/templates/dataframe.html.erb +5 -0
  66. data/lib/daru_lite/iruby/templates/dataframe_mi.html.erb +5 -0
  67. data/lib/daru_lite/iruby/templates/dataframe_mi_tbody.html.erb +35 -0
  68. data/lib/daru_lite/iruby/templates/dataframe_mi_thead.html.erb +21 -0
  69. data/lib/daru_lite/iruby/templates/dataframe_tbody.html.erb +28 -0
  70. data/lib/daru_lite/iruby/templates/dataframe_thead.html.erb +21 -0
  71. data/lib/daru_lite/iruby/templates/multi_index.html.erb +12 -0
  72. data/lib/daru_lite/iruby/templates/vector.html.erb +5 -0
  73. data/lib/daru_lite/iruby/templates/vector_mi.html.erb +5 -0
  74. data/lib/daru_lite/iruby/templates/vector_mi_tbody.html.erb +26 -0
  75. data/lib/daru_lite/iruby/templates/vector_mi_thead.html.erb +8 -0
  76. data/lib/daru_lite/iruby/templates/vector_tbody.html.erb +17 -0
  77. data/lib/daru_lite/iruby/templates/vector_thead.html.erb +8 -0
  78. data/lib/daru_lite/maths/arithmetic/dataframe.rb +91 -0
  79. data/lib/daru_lite/maths/arithmetic/vector.rb +117 -0
  80. data/lib/daru_lite/maths/statistics/dataframe.rb +202 -0
  81. data/lib/daru_lite/maths/statistics/vector.rb +1019 -0
  82. data/lib/daru_lite/monkeys.rb +56 -0
  83. data/lib/daru_lite/vector.rb +1678 -0
  84. data/lib/daru_lite/version.rb +3 -0
  85. data/lib/daru_lite.rb +99 -0
  86. data/profile/_base.rb +23 -0
  87. data/profile/df_to_a.rb +10 -0
  88. data/profile/filter.rb +13 -0
  89. data/profile/joining.rb +13 -0
  90. data/profile/sorting.rb +12 -0
  91. data/profile/vector_each_with_index.rb +9 -0
  92. data/profile/vector_new.rb +9 -0
  93. data/spec/accessors/array_wrapper_spec.rb +3 -0
  94. data/spec/category_spec.rb +1741 -0
  95. data/spec/core/group_by_spec.rb +655 -0
  96. data/spec/core/merge_spec.rb +179 -0
  97. data/spec/core/query_spec.rb +347 -0
  98. data/spec/daru_lite_spec.rb +22 -0
  99. data/spec/dataframe_spec.rb +4330 -0
  100. data/spec/date_time/data_spec.rb +197 -0
  101. data/spec/date_time/date_time_index_helper_spec.rb +72 -0
  102. data/spec/date_time/index_spec.rb +588 -0
  103. data/spec/date_time/offsets_spec.rb +465 -0
  104. data/spec/extensions/which_dsl_spec.rb +38 -0
  105. data/spec/fixtures/bank2.dat +200 -0
  106. data/spec/fixtures/boolean_converter_test.csv +5 -0
  107. data/spec/fixtures/countries.json +7794 -0
  108. data/spec/fixtures/duplicates.csv +32 -0
  109. data/spec/fixtures/eciresults.html +394 -0
  110. data/spec/fixtures/empties.dat +2 -0
  111. data/spec/fixtures/empty_rows_test.csv +17 -0
  112. data/spec/fixtures/macau.html +3691 -0
  113. data/spec/fixtures/macd_data.csv +150 -0
  114. data/spec/fixtures/matrix_test.csv +100 -0
  115. data/spec/fixtures/moneycontrol.html +6812 -0
  116. data/spec/fixtures/music_data.tsv +2501 -0
  117. data/spec/fixtures/repeated_fields.csv +7 -0
  118. data/spec/fixtures/sales-funnel.csv +18 -0
  119. data/spec/fixtures/scientific_notation.csv +4 -0
  120. data/spec/fixtures/string_converter_test.csv +5 -0
  121. data/spec/fixtures/strings.dat +2 -0
  122. data/spec/fixtures/test_xls.xls +0 -0
  123. data/spec/fixtures/test_xls_2.xls +0 -0
  124. data/spec/fixtures/url_test.txt~ +0 -0
  125. data/spec/fixtures/valid_markup.html +62 -0
  126. data/spec/fixtures/wiki_climate.html +1243 -0
  127. data/spec/fixtures/wiki_table_info.html +631 -0
  128. data/spec/formatters/table_formatter_spec.rb +137 -0
  129. data/spec/helpers_spec.rb +8 -0
  130. data/spec/index/categorical_index_spec.rb +170 -0
  131. data/spec/index/index_spec.rb +417 -0
  132. data/spec/index/multi_index_spec.rb +680 -0
  133. data/spec/io/io_spec.rb +373 -0
  134. data/spec/io/sql_data_source_spec.rb +56 -0
  135. data/spec/iruby/dataframe_spec.rb +170 -0
  136. data/spec/iruby/helpers_spec.rb +49 -0
  137. data/spec/iruby/multi_index_spec.rb +37 -0
  138. data/spec/iruby/vector_spec.rb +105 -0
  139. data/spec/maths/arithmetic/dataframe_spec.rb +148 -0
  140. data/spec/maths/arithmetic/vector_spec.rb +165 -0
  141. data/spec/maths/statistics/dataframe_spec.rb +178 -0
  142. data/spec/maths/statistics/vector_spec.rb +756 -0
  143. data/spec/monkeys_spec.rb +42 -0
  144. data/spec/shared/vector_display_spec.rb +213 -0
  145. data/spec/spec_helper.rb +87 -0
  146. data/spec/support/database_helper.rb +30 -0
  147. data/spec/support/matchers.rb +5 -0
  148. data/spec/vector_spec.rb +2293 -0
  149. metadata +571 -0
@@ -0,0 +1,3080 @@
1
+ require 'daru_lite/accessors/dataframe_by_row'
2
+ require 'daru_lite/maths/arithmetic/dataframe'
3
+ require 'daru_lite/maths/statistics/dataframe'
4
+ require 'daru_lite/io/io'
5
+
6
+ module DaruLite
7
+ class DataFrame # rubocop:disable Metrics/ClassLength
8
+ include DaruLite::Maths::Arithmetic::DataFrame
9
+ include DaruLite::Maths::Statistics::DataFrame
10
+
11
+ attr_accessor(*Configuration::INSPECT_OPTIONS_KEYS)
12
+
13
+ extend Gem::Deprecate
14
+
15
+ class << self
16
+ # Load data from a CSV file. Specify an optional block to grab the CSV
17
+ # object and pre-condition it (for example use the `convert` or
18
+ # `header_convert` methods).
19
+ #
20
+ # == Arguments
21
+ #
22
+ # * path - Local path / Remote URL of the file to load specified as a String.
23
+ #
24
+ # == Options
25
+ #
26
+ # Accepts the same options as the DaruLite::DataFrame constructor and CSV.open()
27
+ # and uses those to eventually construct the resulting DataFrame.
28
+ #
29
+ # == Verbose Description
30
+ #
31
+ # You can specify all the options to the `.from_csv` function that you
32
+ # do to the Ruby `CSV.read()` function, since this is what is used internally.
33
+ #
34
+ # For example, if the columns in your CSV file are separated by something
35
+ # other than commas, you can use the `:col_sep` option. If you want to
36
+ # convert numeric values to numbers and not keep them as strings, you can
37
+ # use the `:converters` option and set it to `:numeric`.
38
+ #
39
+ # The `.from_csv` function uses the following defaults for reading CSV files
40
+ # (that are passed into the `CSV.read()` function):
41
+ #
42
+ # {
43
+ # :col_sep => ',',
44
+ # :converters => :numeric
45
+ # }
46
def from_csv(path, opts = {}, &block)
  # Pure delegation: path, options and the optional block are handed to
  # DaruLite::IO.from_csv unchanged.
  DaruLite::IO.from_csv(path, opts, &block)
end
49
+
50
+ # Read data from an Excel file into a DataFrame.
51
+ #
52
+ # == Arguments
53
+ #
54
+ # * path - Path of the file to be read.
55
+ #
56
+ # == Options
57
+ #
58
+ # *:worksheet_id - ID of the worksheet that is to be read.
59
def from_excel(path, opts = {}, &block)
  # Pure delegation: path, options and the optional block are handed to
  # DaruLite::IO.from_excel unchanged.
  DaruLite::IO.from_excel(path, opts, &block)
end
62
+
63
+ # Read a database query and return the result as a DataFrame
64
+ #
65
+ # @param dbh [DBI::DatabaseHandle, String] A DBI connection OR Path to a SQlite3 database.
66
+ # @param query [String] The query to be executed
67
+ #
68
+ # @return A dataframe containing the data resulting from the query
69
+ #
70
+ # USE:
71
+ #
72
+ # dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
73
+ # DaruLite::DataFrame.from_sql(dbh, "SELECT * FROM test")
74
+ #
75
+ # #Alternatively
76
+ #
77
+ # require 'dbi'
78
+ # DaruLite::DataFrame.from_sql("path/to/sqlite.db", "SELECT * FROM test")
79
def from_sql(dbh, query)
  # Pure delegation to DaruLite::IO.from_sql with the same two arguments.
  DaruLite::IO.from_sql(dbh, query)
end
82
+
83
+ # Read a dataframe from AR::Relation
84
+ #
85
+ # @param relation [ActiveRecord::Relation] An AR::Relation object from which data is loaded
86
+ # @param fields [Array] Field names to be loaded (optional)
87
+ #
88
+ # @return A dataframe containing the data loaded from the relation
89
+ #
90
+ # USE:
91
+ #
92
+ # # When Post model is defined as:
93
+ # class Post < ActiveRecord::Base
94
+ # scope :active, -> { where.not(published_at: nil) }
95
+ # end
96
+ #
97
+ # # You can load active posts into a dataframe by:
98
+ # DaruLite::DataFrame.from_activerecord(Post.active, :title, :published_at)
99
def from_activerecord(relation, *fields)
  # Pure delegation: the relation and every requested field name are
  # forwarded to DaruLite::IO.from_activerecord.
  DaruLite::IO.from_activerecord(relation, *fields)
end
102
+
103
+ # Read the database from a plaintext file. For this method to work,
104
+ # the data should be present in a plain text file in columns. See
105
+ # spec/fixtures/bank2.dat for an example.
106
+ #
107
+ # == Arguments
108
+ #
109
+ # * path - Path of the file to be read.
110
+ # * fields - Vector names of the resulting database.
111
+ #
112
+ # == Usage
113
+ #
114
+ # df = DaruLite::DataFrame.from_plaintext 'spec/fixtures/bank2.dat', [:v1,:v2,:v3,:v4,:v5,:v6]
115
def from_plaintext(path, fields)
  # Pure delegation to DaruLite::IO.from_plaintext with the same arguments.
  DaruLite::IO.from_plaintext(path, fields)
end
118
+
119
+ # Create DataFrame by specifying rows as an Array of Arrays or Array of
120
+ # DaruLite::Vector objects.
121
def rows(source, opts = {})
  # Every row must have as many elements as the first one.
  unless source.all? { |v| v.size == source.first.size }
    raise SizeError, 'All vectors must have same length'
  end

  # Work on a copy: filling in the default :order must not leak into the
  # caller's options hash (and must not fail when that hash is frozen).
  opts = opts.dup
  opts[:order] ||= guess_order(source)

  if ArrayHelper.array_of?(source, Array) || source.empty?
    # Rows given as plain arrays: transpose them into columns.
    DataFrame.new(source.transpose, opts)
  elsif ArrayHelper.array_of?(source, Vector)
    from_vector_rows(source, opts)
  else
    raise ArgumentError, "Can't create DataFrame from #{source}"
  end
end
135
+
136
+ # Generates a new dataset, using three vectors
137
+ # - Rows
138
+ # - Columns
139
+ # - Values
140
+ #
141
+ # For example, you have these values
142
+ #
143
+ # x y v
144
+ # a a 0
145
+ # a b 1
146
+ # b a 1
147
+ # b b 0
148
+ #
149
+ # You obtain
150
+ # id a b
151
+ # a 0 1
152
+ # b 1 0
153
+ #
154
+ # Useful to process outputs from databases
155
def crosstab_by_assignation(rows, columns, values)
  unless rows.size == columns.size && rows.size == values.size
    raise 'Three vectors should be equal size'
  end

  # Each column starts as a hash with one nil cell per distinct row factor.
  table = Hash.new do |hash, column|
    blank_cells = {}
    rows.factors.each { |factor| blank_cells[factor] = nil }
    hash[column] = blank_cells
  end

  columns.zip(rows, values).each do |column, row, value|
    table[column][row] = value
  end

  # FIXME: in fact, WITHOUT this line you'll obtain more "right"
  # data: with vectors having "rows" as an index...
  flattened = table.transform_values(&:values)
  flattened[:_id] = rows.factors

  DataFrame.new(flattened)
end
171
+
172
+ private
173
+
174
def guess_order(source)
  head = source.first
  if head.is_a?(Vector) # assume that all are Vectors
    head.index.to_a
  elsif head.is_a?(Array)
    # Positional names "0", "1", ... one per element of the first row.
    (0...head.size).map(&:to_s)
  end
end
182
+
183
def from_vector_rows(source, opts)
  # Row labels: the vector's own name, or its position when it has none.
  labels = source.each_with_index.map { |vector, position| vector.name || position }
  labels = ArrayHelper.recode_repeated(labels)

  frame = DataFrame.new({}, opts)
  source.each_with_index do |vector, position|
    frame[labels[position] || position, :row] = vector
  end
  frame
end
194
+ end
195
+
196
+ # The vectors (columns) index of the DataFrame
197
+ attr_reader :vectors
198
+ # TOREMOVE
199
+ attr_reader :data
200
+
201
+ # The index of the rows of the DataFrame
202
+ attr_reader :index
203
+
204
+ # The name of the DataFrame
205
+ attr_reader :name
206
+
207
+ # The number of rows present in the DataFrame
208
+ attr_reader :size
209
+
210
+ # DataFrame basically consists of an Array of Vector objects.
211
+ # These objects are indexed by row and column by vectors and index Index objects.
212
+ #
213
+ # == Arguments
214
+ #
215
+ # * source - Source from which the DataFrame is to be initialized. Can be a Hash
216
+ # of names and vectors (array or DaruLite::Vector), an array of arrays or
217
+ # array of DaruLite::Vectors.
218
+ #
219
+ # == Options
220
+ #
221
+ # +:order+ - An *Array*/*DaruLite::Index*/*DaruLite::MultiIndex* containing the order in
222
+ # which Vectors should appear in the DataFrame.
223
+ #
224
+ # +:index+ - An *Array*/*DaruLite::Index*/*DaruLite::MultiIndex* containing the order
225
+ # in which rows of the DataFrame will be named.
226
+ #
227
+ # +:name+ - A name for the DataFrame.
228
+ #
229
+ # +:clone+ - Specify as *true* or *false*. When set to false, and Vector
230
+ # objects are passed for the source, the Vector objects will not be duplicated
231
+ # when creating the DataFrame. Will have no effect if Array is passed in
232
+ # the source, or if the passed DaruLite::Vectors have different indexes.
233
+ # Default to *true*.
234
+ #
235
+ # == Usage
236
+ #
237
+ # df = DaruLite::DataFrame.new
238
+ # # =>
239
+ # # <DaruLite::DataFrame(0x0)>
240
+ # # Creates an empty DataFrame with no rows or columns.
241
+ #
242
+ # df = DaruLite::DataFrame.new({}, order: [:a, :b])
243
+ # #<DaruLite::DataFrame(0x2)>
244
+ # a b
245
+ # # Creates a DataFrame with no rows and columns :a and :b
246
+ #
247
+ # df = DaruLite::DataFrame.new({a: [1,2,3,4], b: [6,7,8,9]}, order: [:b, :a],
248
+ # index: [:a, :b, :c, :d], name: :spider_man)
249
+ #
250
+ # # =>
251
+ # # <DaruLite::DataFrame:80766980 @name = spider_man @size = 4>
252
+ # # b a
253
+ # # a 6 1
254
+ # # b 7 2
255
+ # # c 8 3
256
+ # # d 9 4
257
+ #
258
+ # df = DaruLite::DataFrame.new([[1,2,3,4],[6,7,8,9]], name: :bat_man)
259
+ #
260
+ # # =>
261
+ # # #<DaruLite::DataFrame: bat_man (4x2)>
262
+ # # 0 1
263
+ # # 0 1 6
264
+ # # 1 2 7
265
+ # # 2 3 8
266
+ # # 3 4 9
267
+ #
268
+ # # Dataframe having Index name
269
+ #
270
+ # df = DaruLite::DataFrame.new({a: [1,2,3,4], b: [6,7,8,9]}, order: [:b, :a],
271
+ # index: DaruLite::Index.new([:a, :b, :c, :d], name: 'idx_name'),
272
+ # name: :spider_man)
273
+ #
274
+ # # =>
275
+ # # <DaruLite::DataFrame:80766980 @name = spider_man @size = 4>
276
+ # # idx_name b a
277
+ # # a 6 1
278
+ # # b 7 2
279
+ # # c 8 3
280
+ # # d 9 4
281
+ #
282
+ #
283
+ # idx = DaruLite::Index.new [100, 99, 101, 1, 2], name: "s1"
284
+ # => #<DaruLite::Index(5): s1 {100, 99, 101, 1, 2}>
285
+ #
286
+ # df = DaruLite::DataFrame.new({b: [11,12,13,14,15], a: [1,2,3,4,5],
287
+ # c: [11,22,33,44,55]},
288
+ # order: [:a, :b, :c],
289
+ # index: idx)
290
+ # # =>
291
+ # #<DaruLite::DataFrame(5x3)>
292
+ # # s1 a b c
293
+ # # 100 1 11 11
294
+ # # 99 2 12 22
295
+ # # 101 3 13 33
296
+ # # 1 4 14 44
297
+ # # 2 5 15 55
298
+
299
def initialize(source = {}, opts = {})
  # FIXME: just keyword args after Ruby 2.1
  order = opts[:order]
  row_index = opts[:index]
  @data = []
  @name = opts[:name]

  # Empty sources are matched first; the later branches therefore only see
  # non-empty Arrays and Hashes.
  case source
  when [], {} then create_empty_vectors(order, row_index)
  when Array  then initialize_from_array(source, order, row_index, opts)
  when Hash   then initialize_from_hash(source, order, row_index, opts)
  end

  set_size
  validate
  update
end
318
+
319
+ # Access row or vector. Specify name of row/vector followed by axis(:row, :vector).
320
+ # Defaults to *:vector*. Use of this method is not recommended for accessing
321
+ # rows. Use df.row[:a] for accessing row with index ':a'.
322
def [](*names)
  # The axis is extracted before +names+ is splatted into the dispatch call.
  requested_axis = extract_axis(names, :vector)
  dispatch_to_axis(requested_axis, :access, *names)
end
326
+
327
+ # Retrieve rows by positions
328
+ # @param [Array<Integer>] positions of rows to retrieve
329
+ # @return [DaruLite::Vector, DaruLite::DataFrame] vector for single position and dataframe for multiple positions
330
+ # @example
331
+ # df = DaruLite::DataFrame.new({
332
+ # a: [1, 2, 3],
333
+ # b: ['a', 'b', 'c']
334
+ # })
335
+ # df.row_at 1, 2
336
+ # # => #<DaruLite::DataFrame(2x2)>
337
+ # # a b
338
+ # # 1 2 b
339
+ # # 2 3 c
340
def row_at(*positions)
  requested = positions
  positions = coerce_positions(*positions, nrows)
  validate_positions(*positions, nrows)

  # Coerced positions that are not a single Integer produce a DataFrame
  # built from the positions as originally given.
  unless positions.is_a?(Integer)
    selected_rows = get_rows_for(requested)
    return DaruLite::DataFrame.new(selected_rows, index: @index.at(*requested), order: @vectors)
  end

  DaruLite::Vector.new(get_rows_for([positions]), index: @vectors)
end
353
+
354
+ # Set rows by positions
355
+ # @param [Array<Integer>] positions positions of rows to set
356
+ # @param [Array, DaruLite::Vector] vector vector to be assigned
357
+ # @example
358
+ # df = DaruLite::DataFrame.new({
359
+ # a: [1, 2, 3],
360
+ # b: ['a', 'b', 'c']
361
+ # })
362
+ # df.set_row_at [0, 1], ['x', 'x']
363
+ # df
364
+ # #=> #<DaruLite::DataFrame(3x2)>
365
+ # # a b
366
+ # # 0 x x
367
+ # # 1 x x
368
+ # # 2 3 c
369
def set_row_at(positions, vector)
  validate_positions(*positions, nrows)

  # A DaruLite::Vector is reindexed against the column index; anything else
  # is wrapped in a new Vector.
  row_values = vector.is_a?(DaruLite::Vector) ? vector.reindex(@vectors) : DaruLite::Vector.new(vector)

  unless row_values.size == @vectors.size
    raise SizeError, 'Vector length should match row length'
  end

  # Write the matching element of the row into every column.
  @data.each_with_index do |column, column_position|
    column.set_at(positions, row_values.at(column_position))
  end
  @index = @data[0].index
  set_size
end
387
+
388
+ # Retrieve vectors by positions
389
+ # @param [Array<Integer>] positions of vectors to retrieve
390
+ # @return [DaruLite::Vector, DaruLite::DataFrame] vector for single position and dataframe for multiple positions
391
+ # @example
392
+ # df = DaruLite::DataFrame.new({
393
+ # a: [1, 2, 3],
394
+ # b: ['a', 'b', 'c']
395
+ # })
396
+ # df.at 0
397
+ # # => #<DaruLite::Vector(3)>
398
+ # # a
399
+ # # 0 1
400
+ # # 1 2
401
+ # # 2 3
402
def at(*positions)
  # A trailing axis marker is always removed; only :row redirects to #row_at.
  return row_at(*positions) if AXES.include?(positions.last) && positions.pop == :row

  requested = positions
  positions = coerce_positions(*positions, ncols)
  validate_positions(*positions, ncols)

  return @data[positions].dup if positions.is_a?(Integer)

  copies = positions.map { |pos| @data[pos].dup }
  DaruLite::DataFrame.new(copies, index: @index, order: @vectors.at(*requested), name: @name)
end
421
+
422
+ # Set vectors by positions
423
+ # @param [Array<Integer>] positions positions of vectors to set
424
+ # @param [Array, DaruLite::Vector] vector vector to be assigned
425
+ # @example
426
+ # df = DaruLite::DataFrame.new({
427
+ # a: [1, 2, 3],
428
+ # b: ['a', 'b', 'c']
429
+ # })
430
+ # df.set_at [0], ['x', 'y', 'z']
431
+ # df
432
+ # #=> #<DaruLite::DataFrame(3x2)>
433
+ # # a b
434
+ # # 0 x a
435
+ # # 1 y b
436
+ # # 2 z c
437
def set_at(positions, vector)
  # Row-wise assignment: forward everything except the trailing :row marker.
  # A slice is used instead of #pop so the caller's array is left untouched.
  return set_row_at(positions[0...-1], vector) if positions.last == :row

  validate_positions(*positions, ncols)
  vector =
    if vector.is_a? DaruLite::Vector
      # Align the incoming vector with this frame's row index.
      vector.reindex @index
    else
      DaruLite::Vector.new vector
    end

  raise SizeError, 'Vector length should match index length' if
    vector.size != @index.size

  # NOTE(review): every listed position receives the same Vector object, so
  # the columns alias each other; confirm whether that is intended.
  positions.each { |pos| @data[pos] = vector }
end
456
+
457
+ # Insert a new row/vector of the specified name or modify a previous row.
458
+ # Instead of using this method directly, use df.row[:a] = [1,2,3] to set/create
459
+ # a row ':a' to [1,2,3], or df.vector[:vec] = [1,2,3] for vectors.
460
+ #
461
+ # In case a DaruLite::Vector is specified after the equality sign, the indexes
462
+ # of the vector will be matched against the row/vector indexes of the DataFrame
463
+ # before an insertion is performed. Unmatched indexes will be set to nil.
464
def []=(*args)
  # The last argument is the value; what precedes it are the names plus an
  # optional axis, which extract_axis receives.
  *names, vector = args
  axis = extract_axis(names)

  dispatch_to_axis(axis, :insert_or_modify, names, vector)
end
471
+
472
# Assign +row+ through the row accessor under +index+; when no index is
# given the current @size is used as the key.
def add_row(row, index = nil)
  key = index || @size
  self.row[*key] = row
end
475
+
476
# Assign +vector+ under the name +n+; shorthand for `self[n] = vector`.
def add_vector(n, vector)
  self[n] = vector
end
479
+
480
# Build a vector from the Array +source+, register it under +name+ and move
# it to column position +n+.
#
# @param n [Integer] position the new vector should occupy in the order
# @param name [Object] name of the new vector
# @param source [Array] values of the new vector
# @raise [ArgumentError] if +source+ is not an Array
def insert_vector(n, name, source)
  raise ArgumentError, "source must be an Array, got #{source.class}" unless source.is_a? Array

  # The vector is named after its column, not after the DataFrame (@name).
  vector = DaruLite::Vector.new(source, index: @index, name: name)
  @data << vector
  @vectors = @vectors.add name

  # The new name was appended last; move it to position n and reorder.
  ordr = @vectors.dup.to_a
  elmnt = ordr.pop
  ordr.insert n, elmnt
  self.order = ordr
end
491
+
492
+ # Access a row or set/create a row. Refer #[] and #[]= docs for details.
493
+ #
494
+ # == Usage
495
+ # df.row[:a] # access row named ':a'
496
+ # df.row[:b] = [1,2,3] # set row ':b' to [1,2,3]
497
def row
  # A fresh accessor wrapping this DataFrame is built on every call.
  DaruLite::Accessors::DataFrameByRow.new(self)
end
500
+
501
+ # Extract a dataframe given row indexes or positions
502
+ # @param keys [Array] can be positions (if by_position is true) or indexes (if by_position is false)
503
+ # @return [DaruLite::Dataframe]
504
def get_sub_dataframe(keys, by_position: true)
  return DaruLite::DataFrame.new({}) if keys == []

  # Index labels are translated to positions first when requested.
  positions = by_position ? keys : @index.pos(*keys)
  selection = row_at(*positions)

  # A Vector result from #row_at is converted so a DataFrame is returned.
  selection.is_a?(DaruLite::Vector) ? selection.to_df.transpose : selection
end
514
+
515
+ # Duplicate the DataFrame entirely.
516
+ #
517
+ # == Arguments
518
+ #
519
+ # * +vectors_to_dup+ - An Array specifying the names of Vectors to
520
+ # be duplicated. Will duplicate the entire DataFrame if not specified.
521
def dup(vectors_to_dup = nil)
  names = vectors_to_dup || @vectors.to_a

  # Copy each requested vector, looked up by its position in @vectors.
  copies = names.map { |name| @data[@vectors.pos(name)].dup }

  DaruLite::DataFrame.new(
    copies,
    order: DaruLite::Index.new(names),
    index: @index.dup,
    name: @name,
    clone: true
  )
end
529
+
530
+ # Only clone the structure of the DataFrame.
531
def clone_structure
  # Same vectors, index and name as this frame, but an empty data source.
  layout = { order: @vectors.dup, index: @index.dup, name: @name }
  DaruLite::DataFrame.new([], layout)
end
534
+
535
+ # Returns a 'view' of the DataFrame, i.e the object ID's of vectors are
536
+ # preserved.
537
+ #
538
+ # == Arguments
539
+ #
540
+ # +vectors_to_clone+ - Names of vectors to clone. Optional. Will return
541
+ # a view of the whole data frame otherwise.
542
def clone(*vectors_to_clone)
  # Accept both clone(:a, :b) and clone([:a, :b]).
  vectors_to_clone.flatten! if ArrayHelper.array_of?(vectors_to_clone, Array)
  vectors_to_clone = @vectors.to_a if vectors_to_clone.empty?

  view = vectors_to_clone.each_with_object({}) do |name, columns|
    columns[name] = self[name]
  end
  # clone: false is passed so the constructor is asked not to copy the vectors.
  DaruLite::DataFrame.new(view, clone: false, order: vectors_to_clone, name: @name)
end
549
+
550
+ # Returns a 'shallow' copy of DataFrame if missing data is not present,
551
+ # or a full copy of only valid data if missing data is present.
552
def clone_only_valid
  # No missing values present: a plain #clone is returned.
  return clone unless include_values?(*DaruLite::MISSING_VALUES)

  reject_values(*DaruLite::MISSING_VALUES)
end
559
+
560
+ # Creates a new duplicate dataframe containing only rows
561
+ # without a single missing value.
562
def dup_only_valid(vecs = nil)
  # Collect the row indexes holding a missing value in any vector. flat_map
  # returns [] when there are no vectors and builds a new array, so the
  # arrays returned by Vector#indexes are never modified.
  rows_with_nil = @data.flat_map { |vec| vec.indexes(*DaruLite::MISSING_VALUES) }
                       .uniq

  row_indexes = @index.to_a
  # Select the surviving rows from self, or from a copy limited to +vecs+.
  (vecs.nil? ? self : dup(vecs)).row[*(row_indexes - rows_with_nil)]
end
570
+ deprecate :dup_only_valid, :reject_values, 2016, 10
571
+
572
+ # Returns a dataframe in which rows with any of the mentioned values
573
+ # are ignored.
574
+ # @param [Array] values to reject to form the new dataframe
575
+ # @return [DaruLite::DataFrame] Data Frame with only rows which doesn't
576
+ # contain the mentioned values
577
+ # @example
578
+ # df = DaruLite::DataFrame.new({
579
+ # a: [1, 2, 3, nil, Float::NAN, nil, 1, 7],
580
+ # b: [:a, :b, nil, Float::NAN, nil, 3, 5, 8],
581
+ # c: ['a', Float::NAN, 3, 4, 3, 5, nil, 7]
582
+ # }, index: 11..18)
583
+ # df.reject_values nil, Float::NAN
584
+ # # => #<DaruLite::DataFrame(2x3)>
585
+ # # a b c
586
+ # # 11 1 a a
587
+ # # 18 7 8 7
588
def reject_values(*values)
  rejected = @data.flat_map { |vec| vec.positions(*values) }
  kept = size.times.to_a - rejected

  return row_at(*kept) unless kept.size == 1

  # Handle the case when positions size is 1 and #row_at wouldn't return a df
  only = kept.first
  row_at(only..only)
end
599
+
600
+ # Replace specified values with given value
601
+ # @param [Array] old_values values to replace with new value
602
+ # @param [object] new_value new value to replace with
603
+ # @return [DaruLite::DataFrame] Data Frame itself with old values replace
604
+ # with new value
605
+ # @example
606
+ # df = DaruLite::DataFrame.new({
607
+ # a: [1, 2, 3, nil, Float::NAN, nil, 1, 7],
608
+ # b: [:a, :b, nil, Float::NAN, nil, 3, 5, 8],
609
+ # c: ['a', Float::NAN, 3, 4, 3, 5, nil, 7]
610
+ # }, index: 11..18)
611
+ # df.replace_values nil, Float::NAN
612
+ # # => #<DaruLite::DataFrame(8x3)>
613
+ # # a b c
614
+ # # 11 1 a a
615
+ # # 12 2 b NaN
616
+ # # 13 3 NaN 3
617
+ # # 14 NaN NaN 4
618
+ # # 15 NaN NaN 3
619
+ # # 16 NaN 3 5
620
+ # # 17 1 5 NaN
621
+ # # 18 7 8 7
622
def replace_values(old_values, new_value)
  # Each vector performs the replacement itself; the frame is returned so
  # calls can be chained.
  @data.each do |column|
    column.replace_values(old_values, new_value)
  end
  self
end
626
+
627
+ # Rolling fillna
628
+ # replace all Float::NAN and NIL values with the preceding or following value
629
+ #
630
+ # @param direction [Symbol] (:forward, :backward) whether replacement value is preceding or following
631
+ #
632
+ # @example
633
+ # df = DaruLite::DataFrame.new({
634
+ # a: [1, 2, 3, nil, Float::NAN, nil, 1, 7],
635
+ # b: [:a, :b, nil, Float::NAN, nil, 3, 5, nil],
636
+ # c: ['a', Float::NAN, 3, 4, 3, 5, nil, 7]
637
+ # })
638
+ #
639
+ # => #<DaruLite::DataFrame(8x3)>
640
+ # a b c
641
+ # 0 1 a a
642
+ # 1 2 b NaN
643
+ # 2 3 nil 3
644
+ # 3 nil NaN 4
645
+ # 4 NaN nil 3
646
+ # 5 nil 3 5
647
+ # 6 1 5 nil
648
+ # 7 7 nil 7
649
+ #
650
+ # 2.3.3 :068 > df.rolling_fillna(:forward)
651
+ # => #<DaruLite::DataFrame(8x3)>
652
+ # a b c
653
+ # 0 1 a a
654
+ # 1 2 b a
655
+ # 2 3 b 3
656
+ # 3 3 b 4
657
+ # 4 3 b 3
658
+ # 5 3 3 5
659
+ # 6 1 5 5
660
+ # 7 7 5 7
661
+ #
662
def rolling_fillna!(direction = :forward)
  # In-place variant: every vector fills its own gaps, then the frame itself
  # is returned.
  @data.each do |column|
    column.rolling_fillna!(direction)
  end
  self
end
666
+
667
# Non-destructive counterpart of #rolling_fillna!: the fill is applied to
# the result of #dup and that result is returned.
def rolling_fillna(direction = :forward)
  copy = dup
  copy.rolling_fillna!(direction)
end
670
+
671
+ # Return unique rows by vector specified or all vectors
672
+ #
673
+ # @param vtrs [String][Symbol] vector names(s) that should be considered
674
+ #
675
+ # @example
676
+ #
677
+ # => #<DaruLite::DataFrame(6x2)>
678
+ # a b
679
+ # 0 1 a
680
+ # 1 2 b
681
+ # 2 3 c
682
+ # 3 4 d
683
+ # 2 3 c
684
+ # 3 4 f
685
+ #
686
+ # 2.3.3 :> df.uniq
687
+ # => #<DaruLite::DataFrame(5x2)>
688
+ # a b
689
+ # 0 1 a
690
+ # 1 2 b
691
+ # 2 3 c
692
+ # 3 4 d
693
+ # 3 4 f
694
+ #
695
+ # 2.3.3 :> df.uniq(:a)
696
+ # => #<DaruLite::DataFrame(4x2)>
697
+ # a b
698
+ # 0 1 a
699
+ # 1 2 b
700
+ # 2 3 c
701
+ # 3 4 d
702
+ #
703
def uniq(*vtrs)
  keys = vtrs.empty? ? vectors.to_a : vtrs

  # Keep the first row index of every group, in ascending order.
  first_of_each_group = group_by(keys).groups.values.map(&:first).sort
  row[*first_of_each_group]
end
709
+
710
+ # Iterate over each index of the DataFrame.
711
def each_index(&block)
  if block
    @index.each(&block)
    self
  else
    # Without a block, hand back an Enumerator over this same method.
    to_enum(:each_index)
  end
end
718
+
719
+ # Iterate over each vector
720
def each_vector(&block)
  if block
    @data.each(&block)
    self
  else
    # Without a block, hand back an Enumerator over this same method.
    to_enum(:each_vector)
  end
end
727
+
728
+ alias each_column each_vector
729
+
730
+ # Iterate over each vector along with the name of the vector
731
def each_vector_with_index
  return to_enum(:each_vector_with_index) unless block_given?

  # @vectors[name] gives the slot in @data where that vector is stored.
  @vectors.each { |name| yield(@data[@vectors[name]], name) }

  self
end
740
+
741
+ alias each_column_with_index each_vector_with_index
742
+
743
+ # Iterate over each row
744
def each_row
  return to_enum(:each_row) unless block_given?

  # Rows are fetched by position, one per entry in the index.
  (0...@index.size).each { |position| yield row_at(position) }

  self
end
753
+
754
# Yield every row together with its index label; an Enumerator is returned
# when no block is given.
def each_row_with_index
  return to_enum(:each_row_with_index) unless block_given?

  @index.each { |label| yield(access_row(label), label) }

  self
end
763
+
764
+ # Iterate over each row or vector of the DataFrame. Specify axis
765
+ # by passing :vector or :row as the argument. Default to :vector.
766
+ #
767
+ # == Description
768
+ #
769
+ # `#each` works exactly like Array#each. The default mode for `each`
770
+ # is to iterate over the columns of the DataFrame. To iterate over
771
+ # rows you must pass the axis, i.e `:row` as an argument.
772
+ #
773
+ # == Arguments
774
+ #
775
+ # * +axis+ - The axis to iterate over. Can be :vector (or :column)
776
+ # or :row. Default to :vector.
777
def each(axis = :vector, &block)
  # The axis and the block are passed on to dispatch_to_axis for :each.
  dispatch_to_axis(axis, :each, &block)
end
780
+
781
+ # Iterate over a row or vector and return results in a DaruLite::Vector.
782
+ # Specify axis with :vector or :row. Default to :vector.
783
+ #
784
+ # == Description
785
+ #
786
+ # The #collect iterator works similar to #map, the only difference
787
+ # being that it returns a DaruLite::Vector comprising of the results of
788
+ # each block run. The resultant Vector has the same index as that
789
+ # of the axis over which collect has iterated. It also accepts the
790
+ # optional axis argument.
791
+ #
792
+ # == Arguments
793
+ #
794
+ # * +axis+ - The axis to iterate over. Can be :vector (or :column)
795
+ # or :row. Default to :vector.
796
def collect(axis = :vector, &block)
  # Axis handling lives in dispatch_to_axis_pl; this only names the operation.
  dispatch_to_axis_pl(axis, :collect, &block)
end
799
+
800
+ # Map over each vector or row of the data frame according to
801
+ # the argument specified. Will return an Array of the resulting
802
+ # elements. To map over each row/vector and get a DataFrame,
803
+ # see #recode.
804
+ #
805
+ # == Description
806
+ #
807
+ # The #map iterator works like Array#map. The value returned by
808
+ # each run of the block is added to an Array and the Array is
809
+ # returned. This method also accepts an axis argument, like #each.
810
+ # The default is :vector.
811
+ #
812
+ # == Arguments
813
+ #
814
+ # * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
815
+ # Default to :vector.
816
def map(axis = :vector, &block)
  # Axis handling lives in dispatch_to_axis_pl; this only names the operation.
  dispatch_to_axis_pl(axis, :map, &block)
end
819
+
820
+ # Destructive map. Modifies the DataFrame. Each run of the block
821
+ # must return a DaruLite::Vector. You can specify the axis to map over
822
+ # as the argument. Default to :vector.
823
+ #
824
+ # == Arguments
825
+ #
826
+ # * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
827
+ # Default to :vector.
828
def map!(axis = :vector, &block)
  case axis
  when :vector, :column then map_vectors!(&block)
  when :row then map_rows!(&block)
  else
    # An unknown axis used to fall through and return nil without touching
    # anything; report it the same way #any? and #all? do.
    raise ArgumentError, "Unidentified axis #{axis}"
  end
end
835
+
836
+ # Maps over the DataFrame and returns a DataFrame. Each run of the
837
+ # block must return a DaruLite::Vector object. You can specify the axis
838
+ # to map over. Default to :vector.
839
+ #
840
+ # == Description
841
+ #
842
+ # Recode works similarly to #map, but an important difference between
843
+ # the two is that recode returns a modified DaruLite::DataFrame instead
844
+ # of an Array. For this reason, #recode expects that every run of the
845
+ # block to return a DaruLite::Vector.
846
+ #
847
+ # Just like map and each, recode also accepts an optional _axis_ argument.
848
+ #
849
+ # == Arguments
850
+ #
851
+ # * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
852
+ # Default to :vector.
853
def recode(axis = :vector, &block)
  # Axis handling lives in dispatch_to_axis_pl; this only names the operation.
  dispatch_to_axis_pl(axis, :recode, &block)
end
856
+
857
+ # Retain vectors or rows if the block returns a truthy value.
858
+ #
859
+ # == Description
860
+ #
861
+ # For filtering out certain rows/vectors based on their values,
862
+ # use the #filter method. By default it iterates over vectors and
863
+ # keeps those vectors for which the block returns true. It accepts
864
+ # an optional axis argument which lets you specify whether you want
865
+ # to iterate over vectors or rows.
866
+ #
867
+ # == Arguments
868
+ #
869
+ # * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
870
+ # Default to :vector.
871
+ #
872
+ # == Usage
873
+ #
874
+ # # Filter vectors
875
+ #
876
+ # df.filter do |vector|
877
+ # vector.type == :numeric and vector.median < 50
878
+ # end
879
+ #
880
+ # # Filter rows
881
+ #
882
+ # df.filter(:row) do |row|
883
+ # row[:a] + row[:d] < 100
884
+ # end
885
def filter(axis = :vector, &block)
  # Axis handling lives in dispatch_to_axis_pl; this only names the operation.
  dispatch_to_axis_pl(axis, :filter, &block)
end
888
+
889
# Works on a duplicate of the receiver: every vector of the duplicate is
# replaced by the block's result for it (passed through should_be_vector!).
# Returns the duplicate, or an Enumerator when no block is given.
def recode_vectors
  return to_enum(:recode_vectors) unless block_given?

  copy = dup
  copy.each_vector_with_index do |vector, name|
    copy[*name] = should_be_vector!(yield(vector))
  end
  copy
end
898
+
899
# Works on a duplicate of the receiver: every row of the duplicate is
# replaced by the block's result for it (passed through should_be_vector!).
# Returns the duplicate, or an Enumerator when no block is given.
def recode_rows
  return to_enum(:recode_rows) unless block_given?

  copy = dup
  copy.each_row_with_index do |current_row, label|
    copy.row[label] = should_be_vector!(yield(current_row))
  end
  copy
end
908
+
909
+ # Map each vector and return an Array.
910
def map_vectors(&block)
  # No block: return an Enumerator; otherwise map straight over @data.
  block ? @data.map(&block) : to_enum(:map_vectors)
end
915
+
916
+ # Destructive form of #map_vectors
917
def map_vectors!
  return to_enum(:map_vectors!) unless block_given?

  # Walk a copy of the names so the assignments below cannot disturb the walk.
  names = vectors.dup
  names.each do |name|
    replacement = yield(self[name])
    self[name] = should_be_vector!(replacement)
  end

  self
end
926
+
927
+ # Map vectors along with the index.
928
def map_vectors_with_index(&block)
  if block
    each_vector_with_index.map(&block)
  else
    to_enum(:map_vectors_with_index)
  end
end
933
+
934
+ # Map each row
935
def map_rows(&block)
  if block
    each_row.map(&block)
  else
    to_enum(:map_rows)
  end
end
940
+
941
# Map each (row, index label) pair and return the block results in an Array.
# Returns an Enumerator when no block is given.
def map_rows_with_index(&block)
  if block
    each_row_with_index.map(&block)
  else
    to_enum(:map_rows_with_index)
  end
end
946
+
947
# Destructive form of #map_rows: each row is replaced by the block's result
# for it (passed through should_be_vector!). Returns self, or an Enumerator
# when no block is given.
def map_rows!
  return to_enum(:map_rows!) unless block_given?

  # Walk a copy of the index so the assignments below cannot disturb the walk.
  labels = index.dup
  labels.each do |label|
    replacement = yield(row[label])
    row[label] = should_be_vector!(replacement)
  end

  self
end
956
+
957
# Apply +method+ to the DataFrame, or to a sub-DataFrame when +keys+ is given.
#
# == Arguments
#
# * +method+ - A Symbol (sent to the frame), a Proc (called with the frame),
#   or an Array mixing both (returns an Array of the individual results).
# * +keys+ - Optional selection handed to get_sub_dataframe.
# * +by_position+ - Forwarded to get_sub_dataframe together with +keys+.
#
# Raises RuntimeError when +method+ is none of the supported kinds.
def apply_method(method, keys: nil, by_position: true)
  df = keys ? get_sub_dataframe(keys, by_position: by_position) : self

  case method
  when Symbol then df.send(method)
  when Proc then method.call(df)
  # to_proc lets a single Array mix Symbols and Procs.
  when Array then method.map(&:to_proc).map { |proc| proc.call(df) }
  else
    # A bare `raise` here produced the uninformative "unhandled exception";
    # keep RuntimeError but say what went wrong.
    raise "Unsupported method #{method.inspect}: expected a Symbol, a Proc or an Array of them"
  end
end
alias apply_method_on_sub_df apply_method
968
+
969
+ # Retrieves a DaruLite::Vector, based on the result of calculation
970
+ # performed on each row.
971
def collect_rows(&block)
  return to_enum(:collect_rows) unless block

  results = each_row.map(&block)
  # The resulting vector is indexed like the rows of this frame.
  DaruLite::Vector.new(results, index: @index)
end
976
+
977
# Like #collect_rows, but the block also receives the row's index label.
# Returns a DaruLite::Vector indexed by @index, or an Enumerator without a block.
def collect_row_with_index(&block)
  return to_enum(:collect_row_with_index) unless block

  results = each_row_with_index.map(&block)
  DaruLite::Vector.new(results, index: @index)
end
982
+
983
+ # Retrieves a DaruLite::Vector, based on the result of calculation
984
+ # performed on each vector.
985
def collect_vectors(&block)
  return to_enum(:collect_vectors) unless block

  results = each_vector.map(&block)
  # The resulting vector is indexed by the vector names of this frame.
  DaruLite::Vector.new(results, index: @vectors)
end
990
+
991
# Like #collect_vectors, but the block also receives the vector's name.
# Returns a DaruLite::Vector indexed by @vectors, or an Enumerator without a block.
def collect_vector_with_index(&block)
  return to_enum(:collect_vector_with_index) unless block

  results = each_vector_with_index.map(&block)
  DaruLite::Vector.new(results, index: @vectors)
end
996
+
997
+ # Generate a matrix, based on vector names of the DataFrame.
998
+ #
999
+ # @return {::Matrix}
1000
+ # :nocov:
1001
+ # FIXME: Even not trying to cover this: I can't get, how it is expected
1002
+ # to work.... -- zverok
1003
def collect_matrix
  return to_enum(:collect_matrix) unless block_given?

  names = vectors.to_a
  # One cell per ordered pair of vector names: the block gets
  # (row name, column name) and its value becomes the cell.
  cells = names.map do |row_name|
    names.map { |col_name| yield row_name, col_name }
  end

  Matrix.rows(cells)
end
1015
+ # :nocov:
1016
+
1017
+ # Delete a vector
1018
def delete_vector(vector)
  unless @vectors.include?(vector)
    raise IndexError, "Vector #{vector} does not exist."
  end

  # Drop the data first (the position lookup needs the old names), then
  # rebuild the names without the deleted one.
  position = @vectors[vector]
  @data.delete_at(position)
  remaining = @vectors.to_a - [vector]
  @vectors = DaruLite::Index.new(remaining)

  self
end
1026
+
1027
+ # Deletes a list of vectors
1028
def delete_vectors(*vectors)
  # The splat already yields an Array, so each name goes straight to #delete_vector.
  vectors.each { |name| delete_vector(name) }

  self
end
1033
+
1034
+ # Delete a row
1035
def delete_row(index)
  label = named_index_for(index)

  unless @index.include?(label)
    raise IndexError, "Index #{index} does not exist."
  end

  @index = DaruLite::Index.new(@index.to_a - [label])
  each_vector { |vector| vector.delete_at(label) }

  # The return value is whatever set_size returns.
  set_size
end
1047
+
1048
+ # Creates a DataFrame with the random data, of n size.
1049
+ # If n not given, uses original number of rows.
1050
+ #
1051
+ # @return {DaruLite::DataFrame}
1052
def bootstrap(n = nil)
  n ||= nrows
  DaruLite::DataFrame.new({}, order: @vectors).tap do |df_boot|
    n.times do
      # Draw among the rows that actually exist. The previous rand(n)
      # could ask for a row beyond the data whenever n exceeded nrows.
      df_boot.add_row(row[rand(nrows)])
    end
    df_boot.update
  end
end
1061
+
1062
# Destructively removes every row for which the block returns a falsy value.
# The rows to drop are collected first and deleted afterwards.
def keep_row_if
  doomed = @index.reject { |label| yield access_row(label) }
  doomed.each { |label| delete_row(label) }
end
1067
+
1068
# Destructively removes every vector for which the block, called with the
# vector and its name, returns a falsy value.
def keep_vector_if
  @vectors.each do |name|
    keep = yield(@data[@vectors[name]], name)
    delete_vector(name) unless keep
  end
end
1073
+
1074
+ # Creates a new vector with the data of a given field, taken from the rows for which the block returns true
1075
def filter_vector(vec, &block)
  matching_rows = each_row.select(&block)
  values = matching_rows.map { |matching| matching[vec] }
  DaruLite::Vector.new(values)
end
1078
+
1079
+ # Iterates over each row and retains it in a new DataFrame if the block returns
1080
+ # true for that row.
1081
def filter_rows
  return to_enum(:filter_rows) unless block_given?

  # One block result per index entry, in index order, handed to #where.
  mask = @index.map { |label| yield access_row(label) }
  where(mask)
end
1088
+
1089
+ # Iterates over each vector and retains it in a new DataFrame if the block returns
1090
+ # true for that vector.
1091
def filter_vectors(&block)
  return to_enum(:filter_vectors) unless block

  # Prune a duplicate so the receiver itself is left alone.
  copy = dup
  copy.keep_vector_if(&block)
  copy
end
1096
+
1097
+ # Test each row with one or more tests.
1098
+ # @param tests [Proc] Each test is a Proc with the form
1099
+ # *Proc.new {|row| row[:age] > 0}*
1100
+ # The function returns an array with all errors.
1101
+ #
1102
+ # FIXME: description here is too sparse. As far as I can get,
1103
+ # it should tell something about that each test is [descr, fields, block],
1104
+ # and that first value may be column name to output. - zverok, 2016-05-18
1105
def verify(*tests)
  # An optional leading Symbol is the id passed on to verify_error_message;
  # it defaults to the first vector name.
  id = tests.first.is_a?(Symbol) ? tests.shift : @vectors.first

  per_row = each_row_with_index.map do |current_row, label|
    # The last element of each test is the callable; a test fails when it
    # returns a falsy value for the row.
    failed = tests.reject { |*_, check| check.call(current_row) }
    failed.map { |test| verify_error_message current_row, test, id, label }
  end

  per_row.flatten
end
1113
+
1114
+ # DSL for yielding each row and returning a DaruLite::Vector based on the
1115
+ # value each run of the block returns.
1116
+ #
1117
+ # == Usage
1118
+ #
1119
+ # a1 = DaruLite::Vector.new([1, 2, 3, 4, 5, 6, 7])
1120
+ # a2 = DaruLite::Vector.new([10, 20, 30, 40, 50, 60, 70])
1121
+ # a3 = DaruLite::Vector.new([100, 200, 300, 400, 500, 600, 700])
1122
+ # ds = DaruLite::DataFrame.new({ :a => a1, :b => a2, :c => a3 })
1123
+ # total = ds.vector_by_calculation { a + b + c }
1124
+ # # <DaruLite::Vector:82314050 @name = nil @size = 7 >
1125
+ # # nil
1126
+ # # 0 111
1127
+ # # 1 222
1128
+ # # 2 333
1129
+ # # 3 444
1130
+ # # 4 555
1131
+ # # 5 666
1132
+ # # 6 777
1133
def vector_by_calculation(&block)
  # The block runs with each row as self, so names inside it resolve on the row.
  values = each_row.map { |current_row| current_row.instance_eval(&block) }

  DaruLite::Vector.new(values, index: @index)
end
1138
+
1139
+ # Reorder the vectors in a dataframe
1140
+ # @param [Array] order_array new order of the vectors
1141
+ # @example
1142
+ # df = DaruLite::DataFrame({
1143
+ # a: [1, 2, 3],
1144
+ # b: [4, 5, 6]
1145
+ # }, order: [:a, :b])
1146
+ # df.order = [:b, :a]
1147
+ # df
1148
+ # # => #<DaruLite::DataFrame(3x2)>
1149
+ # # b a
1150
+ # # 0 4 1
1151
+ # # 1 5 2
1152
+ # # 2 6 3
1153
def order=(order_array)
  # The new order must name exactly the current vectors.
  unless order_array.sort == vectors.to_a.sort
    raise ArgumentError, 'Invalid order'
  end

  # Re-run the constructor on this object with the requested order.
  initialize(to_h, order: order_array)
end
1159
+
1160
+ # Return the dataframe with its vectors rotated, so that the vector at position count is now
1161
+ # the first vector of the dataframe.
1162
+ # If the dataframe has only one vector, it is returned without any change.
1163
+ # @param count => Integer, the vector at position count will be the first vector of the dataframe.
1164
+ # @example
1165
+ # df = DaruLite::DataFrame({
1166
+ # a: [1, 2, 3],
1167
+ # b: [4, 5, 6],
1168
+ # total: [5, 7, 9],
1169
+ # })
1170
+ # df.rotate_vectors(-1)
1171
+ # df
1172
+ # # => #<DaruLite::DataFrame(3x3)>
1173
+ # # total b a
1174
+ # # 0 5 4 1
1175
+ # # 1 7 5 2
1176
+ # # 2 9 6 3
1177
def rotate_vectors(count = -1)
  # Nothing to rotate unless vectors.many? is true.
  if vectors.many?
    self.order = vectors.to_a.rotate(count)
  end

  self
end
1183
+
1184
+ # Returns a vector, based on a string with a calculation based
1185
+ # on vector.
1186
+ #
1187
+ # The calculation will be eval'ed, so you can put any variable
1188
+ # or expression valid on ruby.
1189
+ #
1190
+ # For example:
1191
+ # a = DaruLite::Vector.new [1,2]
1192
+ # b = DaruLite::Vector.new [3,4]
1193
+ # ds = DaruLite::DataFrame.new({:a => a,:b => b})
1194
+ # ds.compute("a+b")
1195
+ # => Vector [4,6]
1196
def compute(text = nil, &block)
  # A block takes precedence; +text+ is optional so compute { ... } works
  # without a dummy string argument.
  return instance_eval(&block) if block

  raise ArgumentError, 'compute needs a String expression or a block' if text.nil?

  # NOTE(security): +text+ is evaluated as Ruby code in the context of this
  # object. Never pass untrusted input here.
  instance_eval(text)
end
1201
+
1202
+ # Return a vector with the number of missing values in each row.
1203
+ #
1204
+ # == Arguments
1205
+ #
1206
+ # * +missing_values+ - An Array of the values that should be
1207
+ # treated as 'missing'. The default missing value is *nil*.
1208
def missing_values_rows(missing_values = [nil])
  # For each row, count the positions holding one of the missing values.
  counts = each_row.map { |current_row| current_row.indexes(*missing_values).size }

  DaruLite::Vector.new(counts, index: @index, name: "#{@name}_missing_rows")
end
1215
+
1216
+ # TODO: remove next version
1217
+ alias vector_missing_values missing_values_rows
1218
+
1219
# True when any vector contains one of DaruLite::MISSING_VALUES.
def has_missing_data?
  missing = DaruLite::MISSING_VALUES
  @data.any? { |vector| vector.include_values?(*missing) }
end
1222
+ alias flawed? has_missing_data?
1223
+ deprecate :has_missing_data?, :include_values?, 2016, 10
1224
+ deprecate :flawed?, :include_values?, 2016, 10
1225
+
1226
+ # Check if any of given values occur in the data frame
1227
+ # @param [Array] values to check for
1228
+ # @return [true, false] true if any of the given values occur in the
1229
+ # dataframe, false otherwise
1230
+ # @example
1231
+ # df = DaruLite::DataFrame.new({
1232
+ # a: [1, 2, 3, nil, Float::NAN, nil, 1, 7],
1233
+ # b: [:a, :b, nil, Float::NAN, nil, 3, 5, 8],
1234
+ # c: ['a', Float::NAN, 3, 4, 3, 5, nil, 7]
1235
+ # }, index: 11..18)
1236
+ # df.include_values? nil
1237
+ # # => true
1238
def include_values?(*values)
  # The question is passed to each vector; one positive answer is enough.
  @data.any? { |vector| vector.include_values?(*values) }
end
1241
+
1242
+ # Return a nested hash using vector names as keys and an array constructed of
1243
+ # hashes with other values. If block provided, is used to provide the
1244
+ # values, with parameters +row+ of dataset, +current+ last hash on
1245
+ # hierarchy and +name+ of the key to include
1246
def nest(*tree_keys, &block)
  # Accept both nest(:a, :b) and nest([:a, :b]).
  tree_keys = tree_keys.first if tree_keys.first.is_a?(Array)
  *branch_keys, leaf_key = tree_keys

  each_row.each_with_object({}) do |current_row, root|
    # Walk (creating as needed) one hash level per branch key.
    node = branch_keys.reduce(root) { |level, key| level[current_row[key]] ||= {} }
    name = current_row[leaf_key]

    if block
      node[name] = yield(current_row, node, name)
    else
      node[name] ||= []
      # Store the row without the columns that were used as tree keys.
      node[name].push(current_row.to_h.delete_if { |key, _value| tree_keys.include?(key) })
    end
  end
end
1263
+
1264
# For every row, adds up the string lengths (via to_s) of the values in the
# given vectors; all vectors are used when +vecs+ is nil. The result comes
# from #collect_rows.
def vector_count_characters(vecs = nil)
  names = vecs || @vectors.to_a

  collect_rows do |current_row|
    names.sum { |name| current_row[name].to_s.size }
  end
end
1271
+
1272
# Splits the vector +name+ with split_by_separator(sep) and stores every
# resulting (key, vector) pair under the Symbol "<name><join><key>".
def add_vectors_by_split(name, join = '-', sep = DaruLite::SPLIT_TOKEN)
  pieces = self[name].split_by_separator(sep)
  pieces.each { |key, vector| self[:"#{name}#{join}#{key}"] = vector }
end
1277
+
1278
+ # Return the number of rows and columns of the DataFrame in an Array.
1279
def shape
  row_count = nrows
  column_count = ncols
  [row_count, column_count]
end
1282
+
1283
+ # The number of rows
1284
def nrows
  # One row per entry of the row index.
  @index.size
end
1287
+
1288
+ # The number of vectors
1289
def ncols
  # One column per entry of the vector index.
  @vectors.size
end
1292
+
1293
+ # Check if a vector is present
1294
def has_vector?(vector)
  # Membership is decided by the vector index alone.
  @vectors.include?(vector)
end
1297
+
1298
+ # Works like Array#any?.
1299
+ #
1300
+ # @param [Symbol] axis (:vector) The axis to iterate over. Can be :vector or
1301
+ # :row. A DaruLite::Vector object is yielded in the block.
1302
+ # @example Using any?
1303
+ # df = DaruLite::DataFrame.new({a: [1,2,3,4,5], b: ['a', 'b', 'c', 'd', 'e']})
1304
+ # df.any?(:row) do |row|
1305
+ # row[:a] < 3 and row[:b] == 'b'
1306
+ # end #=> true
1307
def any?(axis = :vector, &block)
  case axis
  when :vector, :column
    @data.any?(&block)
  when :row
    # Leave as soon as one row satisfies the block.
    each_row { |current_row| return true if yield(current_row) }
    false
  else
    raise ArgumentError, "Unidentified axis #{axis}"
  end
end
1319
+
1320
+ # Works like Array#all?
1321
+ #
1322
+ # @param [Symbol] axis (:vector) The axis to iterate over. Can be :vector or
1323
+ # :row. A DaruLite::Vector object is yielded in the block.
1324
+ # @example Using all?
1325
+ # df = DaruLite::DataFrame.new({a: [1,2,3,4,5], b: ['a', 'b', 'c', 'd', 'e']})
1326
+ # df.all?(:row) do |row|
1327
+ # row[:a] < 10
1328
+ # end #=> true
1329
def all?(axis = :vector, &block)
  case axis
  when :vector, :column then @data.all?(&block)
  when :row then each_row.all?(&block)
  else raise ArgumentError, "Unidentified axis #{axis}"
  end
end
1338
+
1339
+ # The first ten elements of the DataFrame
1340
+ #
1341
+ # @param [Fixnum] quantity (10) The number of elements to display from the top.
1342
def head(quantity = 10)
  last_position = quantity - 1
  row.at(0..last_position)
end
1345
+
1346
+ alias first head
1347
+
1348
+ # The last ten elements of the DataFrame
1349
+ #
1350
+ # @param [Fixnum] quantity (10) The number of elements to display from the bottom.
1351
def tail(quantity = 10)
  # Never reach further back than the first row.
  start = -[quantity, size].min
  row.at(start..-1)
end
1355
+
1356
+ alias last tail
1357
+
1358
+ # Sum all numeric/specified vectors in the DataFrame.
1359
+ #
1360
+ # Returns a new vector containing the sum of all numeric
1361
+ # or specified vectors of the DataFrame. By default, if the vector
1362
+ # contains a nil, the sum is nil.
1363
+ # With :skipnil argument set to true, nil values are assumed to be
1364
+ # 0 (zero) and the sum vector is returned.
1365
+ #
1366
+ # @param args [Array] List of vectors to sum. Default is nil in which case
1367
+ # all numeric vectors are summed.
1368
+ #
1369
+ # @option opts [Boolean] :skipnil Consider nils as 0. Default is false.
1370
+ #
1371
+ # @return Vector with sum of all vectors specified in the argument.
1372
+ # If vecs parameter is empty, sum all numeric vector.
1373
+ #
1374
+ # @example
1375
+ # df = DaruLite::DataFrame.new({
1376
+ # a: [1, 2, nil],
1377
+ # b: [2, 1, 3],
1378
+ # c: [1, 1, 1]
1379
+ # })
1380
+ # => #<DaruLite::DataFrame(3x3)>
1381
+ # a b c
1382
+ # 0 1 2 1
1383
+ # 1 2 1 1
1384
+ # 2 nil 3 1
1385
+ # df.vector_sum [:a, :c]
1386
+ # => #<DaruLite::Vector(3)>
1387
+ # 0 2
1388
+ # 1 3
1389
+ # 2 nil
1390
+ # df.vector_sum
1391
+ # => #<DaruLite::Vector(3)>
1392
+ # 0 4
1393
+ # 1 4
1394
+ # 2 nil
1395
+ # df.vector_sum skipnil: true
1396
+ # => #<DaruLite::Vector(3)>
1397
+ # c
1398
+ # 0 4
1399
+ # 1 4
1400
+ # 2 4
1401
+ #
1402
def vector_sum(*args)
  # A trailing Hash carries the options; positional arguments take precedence.
  options = { vecs: nil, skipnil: false }
  options = options.merge(args.pop) if args.last.is_a?(::Hash)

  vecs = args[0] || options[:vecs] || numeric_vectors
  skipnil = args[1] || options[:skipnil]

  # Start from an all-zero vector and fold every requested vector into it.
  zero = DaruLite::Vector.new [0] * @size, index: @index, name: @name, dtype: @dtype
  vecs.inject(zero) { |memo, n| self[n].add(memo, skipnil: skipnil) }
end
1413
+
1414
+ # Calculate mean of the rows of the dataframe.
1415
+ #
1416
+ # == Arguments
1417
+ #
1418
+ # * +max_missing+ - The maximum number of elements in the row that can be
1419
+ # missing for the mean calculation to happen. Defaults to 0.
1420
def vector_mean(max_missing = 0)
  # FIXME: in vector_sum we preserve created vector dtype, but
  # here we are not. Is this by design or ...? - zverok, 2016-05-18
  mean_vec = DaruLite::Vector.new [0] * @size, index: @index, name: "mean_#{@name}"

  each_row_with_index do |current_row, label|
    # Rows with more missing values than allowed get nil instead of a mean.
    missing = current_row.indexes(*DaruLite::MISSING_VALUES).size
    mean_vec[label] = missing > max_missing ? nil : current_row.mean
  end

  mean_vec
end
1429
+
1430
+ # Group elements by vector to perform operations on them. Returns a
1431
+ # DaruLite::Core::GroupBy object. See the DaruLite::Core::GroupBy docs for a detailed
1432
+ # list of possible operations.
1433
+ #
1434
+ # == Arguments
1435
+ #
1436
+ # * vectors - An Array containing names of vectors to group by.
1437
+ #
1438
+ # == Usage
1439
+ #
1440
+ # df = DaruLite::DataFrame.new({
1441
+ # a: %w{foo bar foo bar foo bar foo foo},
1442
+ # b: %w{one one two three two two one three},
1443
+ # c: [1 ,2 ,3 ,1 ,3 ,6 ,3 ,8],
1444
+ # d: [11 ,22 ,33 ,44 ,55 ,66 ,77 ,88]
1445
+ # })
1446
+ # df.group_by([:a,:b,:c]).groups
1447
+ # #=> {["bar", "one", 2]=>[1],
1448
+ # # ["bar", "three", 1]=>[3],
1449
+ # # ["bar", "two", 6]=>[5],
1450
+ # # ["foo", "one", 1]=>[0],
1451
+ # # ["foo", "one", 3]=>[6],
1452
+ # # ["foo", "three", 8]=>[7],
1453
+ # # ["foo", "two", 3]=>[2, 4]}
1454
def group_by(*vectors)
  # Accept both group_by(:a, :b) and group_by([:a, :b]).
  vectors.flatten!

  unknown = vectors - @vectors.to_a
  unless unknown.empty?
    raise(ArgumentError, "Vector(s) missing: #{unknown.join(', ')}")
  end

  # With no names given, group by the first vector.
  vectors = [@vectors.first] if vectors.empty?

  DaruLite::Core::GroupBy.new(self, vectors)
end
1463
+
1464
# Builds a new DataFrame whose vectors follow +new_vectors+ (a
# DaruLite::Index). Names this frame already has keep their data; the
# others are filled with nils.
#
# Raises ArgumentError when +new_vectors+ is not a DaruLite::Index.
def reindex_vectors(new_vectors)
  unless new_vectors.is_a?(DaruLite::Index)
    raise ArgumentError, 'Must pass the new index of type Index or its ' \
                         "subclasses, not #{new_vectors.class}"
  end

  result = DaruLite::DataFrame.new({}, order: new_vectors, index: @index, name: @name)
  new_vectors.each do |name|
    result[name] = @vectors.include?(name) ? self[name] : Array.new(nrows)
  end
  result
end
1475
+
1476
# Returns the data of vector +v+ as an Array, or an Array of +size+ nils
# when this frame has no such vector.
def get_vector_anyways(v)
  if @vectors.include?(v)
    self[v].to_a
  else
    Array.new(size)
  end
end
1479
+
1480
+ # Concatenate another DataFrame along corresponding columns.
1481
+ # If columns do not exist in both dataframes, they are filled with nils
1482
def concat(other_df)
  names = (@vectors.to_a + other_df.vectors.to_a).uniq

  # For every name: this frame's values followed by the other frame's
  # values (nils where a frame lacks the vector).
  columns = names.map do |name|
    own_values = get_vector_anyways(name).dup
    own_values.concat(other_df.get_vector_anyways(name))
  end

  DaruLite::DataFrame.new(columns, order: names)
end
1491
+
1492
+ # Concatenates another DataFrame as #concat.
1493
+ # Additionally it tries to preserve the index. If the indices contain
1494
+ # common elements, #union will overwrite the according rows in the
1495
+ # first dataframe.
1496
def union(other_df)
  own_labels = @index.to_a
  other_labels = other_df.index.to_a

  # Rows whose label also occurs in other_df are left out here, so the
  # versions coming from other_df are the ones that end up in the result.
  kept_labels = own_labels - other_labels
  df = row[*kept_labels]
  df = df.concat(other_df)

  # #concat puts the kept rows first and other_df's rows after them, so the
  # index is built in that same order. The previous uniq(own + other)
  # ordering attached labels to the wrong rows whenever a shared label
  # came before an unshared one in this frame's index.
  df.index = DaruLite::Index.new(kept_labels + other_labels)
  df
end
1504
+
1505
# Strategy used by #set_index when a single column becomes the index.
module SetSingleIndexStrategy
  class << self
    # Number of distinct values in the column.
    def uniq_size(df, col)
      df[col].uniq.size
    end

    # Plain index built from the column's values.
    def new_index(df, col)
      DaruLite::Index.new(df[col].to_a)
    end

    # Removes the column from the frame.
    def delete_vector(df, col)
      df.delete_vector(col)
    end
  end
end
1518
+
1519
# Strategy used by #set_index when categorical: true is requested.
module SetCategoricalIndexStrategy
  class << self
    # Categorical index built from the column's values.
    def new_index(df, col)
      DaruLite::CategoricalIndex.new(df[col].to_a)
    end

    # Removes the column from the frame.
    def delete_vector(df, col)
      df.delete_vector(col)
    end
  end
end
1528
+
1529
# Strategy used by #set_index when several columns become the index.
module SetMultiIndexStrategy
  class << self
    # Number of distinct entries after #uniq on the selected columns.
    def uniq_size(df, cols)
      df[*cols].uniq.size
    end

    # MultiIndex built from the selected columns and named after them.
    def new_index(df, cols)
      arrays = df[*cols].map_vectors(&:to_a)
      multi_index = DaruLite::MultiIndex.from_arrays(arrays)
      multi_index.name = cols
      multi_index
    end

    # Removes all the selected columns from the frame.
    def delete_vector(df, cols)
      df.delete_vectors(*cols)
    end
  end
end
1544
+
1545
+ # Set a particular column (or columns) as the new index of the DataFrame
1546
def set_index(new_index_col, keep: false, categorical: false)
  strategy =
    if categorical
      SetCategoricalIndexStrategy
    elsif new_index_col.respond_to?(:to_a)
      # Several columns were given: they become a multi index.
      new_index_col = new_index_col.to_a
      SetMultiIndexStrategy
    else
      SetSingleIndexStrategy
    end

  # Uniqueness is only enforced for non-categorical indexes.
  if !categorical && @size != strategy.uniq_size(self, new_index_col)
    raise ArgumentError, 'All elements in new index must be unique.'
  end

  self.index = strategy.new_index(self, new_index_col)
  strategy.delete_vector(self, new_index_col) unless keep
  self
end
1565
+
1566
+ # Change the index of the DataFrame and preserve the labels of the previous
1567
+ # indexing. New index can be DaruLite::Index or any of its subclasses.
1568
+ #
1569
+ # @param [DaruLite::Index] new_index The new Index for reindexing the DataFrame.
1570
+ # @example Reindexing DataFrame
1571
+ # df = DaruLite::DataFrame.new({a: [1,2,3,4], b: [11,22,33,44]},
1572
+ # index: ['a','b','c','d'])
1573
+ # #=>
1574
+ # ##<DaruLite::DataFrame:83278130 @name = b19277b8-c548-41da-ad9a-2ad8c060e273 @size = 4>
1575
+ # # a b
1576
+ # # a 1 11
1577
+ # # b 2 22
1578
+ # # c 3 33
1579
+ # # d 4 44
1580
+ # df.reindex DaruLite::Index.new(['b', 0, 'a', 'g'])
1581
+ # #=>
1582
+ # ##<DaruLite::DataFrame:83177070 @name = b19277b8-c548-41da-ad9a-2ad8c060e273 @size = 4>
1583
+ # # a b
1584
+ # # b 2 22
1585
+ # # 0 nil nil
1586
+ # # a 1 11
1587
+ # # g nil nil
1588
def reindex(new_index)
  unless new_index.is_a?(DaruLite::Index)
    raise ArgumentError, 'Must pass the new index of type Index or its ' \
                         "subclasses, not #{new_index.class}"
  end

  result = DaruLite::DataFrame.new({}, order: @vectors, index: new_index, name: @name)
  new_index.each do |label|
    # Known labels keep their row; unknown ones get a row of nils.
    result.row[label] = @index.include?(label) ? row[label] : Array.new(ncols)
  end
  result
end
1599
+
1600
# Turns the current index into ordinary vectors placed before the existing
# ones: the index is converted with to_df, the frame takes over the index
# of that conversion, and one vector per index name is added. Returns self.
def reset_index
  index_df = index.to_df
  names = index.name
  names = [names] unless names.instance_of?(Array)
  # Capture the final order before the frame is modified.
  new_order = names + vectors.to_a

  self.index = index_df.index
  names.each { |name| self[name] = index_df[name] }
  self.order = new_order

  self
end
1612
+
1613
+ # Reassign index with a new index of type DaruLite::Index or any of its subclasses.
1614
+ #
1615
+ # @param [DaruLite::Index] idx New index object on which the rows of the dataframe
1616
+ # are to be indexed.
1617
+ # @example Reassigning index of a DataFrame
1618
+ # df = DaruLite::DataFrame.new({a: [1,2,3,4], b: [11,22,33,44]})
1619
+ # df.index.to_a #=> [0,1,2,3]
1620
+ #
1621
+ # df.index = DaruLite::Index.new(['a','b','c','d'])
1622
+ # df.index.to_a #=> ['a','b','c','d']
1623
+ # df.row['a'].to_a #=> [1,11]
1624
def index=(idx)
  coerced = Index.coerce(idx)
  @index = coerced
  # Every vector is handed the same index object.
  @data.each { |vector| vector.index = coerced }

  self
end
1630
+
1631
+ # Reassign vectors with a new index of type DaruLite::Index or any of its subclasses.
1632
+ #
1633
+ # @param new_index [DaruLite::Index] idx The new index object on which the vectors are to
1634
+ # be indexed. Must of the same size as ncols.
1635
+ # @example Reassigning vectors of a DataFrame
1636
+ # df = DaruLite::DataFrame.new({a: [1,2,3,4], b: [:a,:b,:c,:d], c: [11,22,33,44]})
1637
+ # df.vectors.to_a #=> [:a, :b, :c]
1638
+ #
1639
+ # df.vectors = DaruLite::Index.new([:foo, :bar, :baz])
1640
+ # df.vectors.to_a #=> [:foo, :bar, :baz]
1641
def vectors=(new_index)
  raise ArgumentError, 'Can only reindex with Index and its subclasses' unless new_index.is_a?(DaruLite::Index)

  if new_index.size != ncols
    # The two message fragments used to be joined without a space
    # ("equal todataframe").
    raise ArgumentError, "Specified index length #{new_index.size} not equal to " \
                         "dataframe size #{ncols}"
  end

  @vectors = new_index
  # Rename each stored vector after its new label, position by position.
  @data.zip(new_index.to_a).each do |vect, name|
    vect.name = name
  end
  self
end
1655
+
1656
+ # Renames the vectors
1657
+ #
1658
+ # == Arguments
1659
+ #
1660
+ # * name_map - A hash where the keys are the existing vector names and
1661
+ # the values are the new names. If a vector is renamed
1662
+ # to a vector name that is already in use, the existing
1663
+ # one is overwritten.
1664
+ #
1665
+ # == Usage
1666
+ #
1667
+ # df = DaruLite::DataFrame.new({ a: [1,2,3,4], b: [:a,:b,:c,:d], c: [11,22,33,44] })
1668
+ # df.rename_vectors :a => :alpha, :c => :gamma
1669
+ # df.vectors.to_a #=> [:alpha, :b, :gamma]
1670
def rename_vectors(name_map)
  # Existing vectors whose name is the target of a real rename are deleted
  # first, so the renamed vector takes their place.
  real_renames = name_map.reject { |from, to| from == to }
  clobbered = real_renames.values & vectors.to_a
  delete_vectors(*clobbered)

  renamed = vectors.to_a.map { |name| name_map[name] || name }
  self.vectors = DaruLite::Index.new(renamed)
end
1677
+
1678
+ # Renames the vectors and returns itself
1679
+ #
1680
+ # == Arguments
1681
+ #
1682
+ # * name_map - A hash where the keys are the existing vector names and
1683
+ # the values are the new names. If a vector is renamed
1684
+ # to a vector name that is already in use, the existing
1685
+ # one is overwritten.
1686
+ #
1687
+ # == Usage
1688
+ #
1689
+ # df = DaruLite::DataFrame.new({ a: [1,2,3,4], b: [:a,:b,:c,:d], c: [11,22,33,44] })
1690
+ # df.rename_vectors! :a => :alpha, :c => :gamma # df
1691
def rename_vectors!(name_map)
  # Same renaming as #rename_vectors, but the frame itself is returned.
  tap { rename_vectors(name_map) }
end
1695
+
1696
+ # Converts the vectors to a DaruLite::MultiIndex.
1697
+ # The argument passed is used as the MultiIndex's top level
1698
def add_level_to_vectors(top_level_label)
  # Prefix every label (splatted, so Array labels are flattened one level).
  prefixed = vectors.map { |label| [top_level_label, *label] }
  self.vectors = DaruLite::MultiIndex.from_tuples(prefixed)
end
1702
+
1703
+ # Return the indexes of all the numeric vectors. Will include vectors with nils
1704
+ # along with numbers.
1705
def numeric_vectors
  # FIXME: Why _with_index ?..
  names = []
  each_vector_with_index.each do |vector, name|
    names << name if vector.numeric?
  end
  names
end
1711
+
1712
# Names of the vectors that report themselves as numeric, as selected from @vectors.
def numeric_vector_names
  @vectors.select do |name|
    self[name].numeric?
  end
end
1715
+
1716
+ # Return a DataFrame of only the numerical Vectors. If clone: false
1717
+ # is specified as option, only a *view* of the Vectors will be
1718
+ # returned. Defaults to clone: true.
1719
def only_numerics(opts = {})
  # Anything other than an explicit clone: false means "clone".
  cln = opts[:clone] != false
  # Determine the numeric vectors once and reuse the list for both the
  # data and the order (it used to be computed twice).
  names = numeric_vectors
  arry = names.map { |v| self[v] }

  order = Index.new(names)
  DaruLite::DataFrame.new(arry, clone: cln, order: order, index: @index)
end
1726
+
1727
+ # Generate a summary of this DataFrame based on individual vectors in the DataFrame
1728
+ # @return [String] String containing the summary of the DataFrame
1729
def summary
  report = "= #{name}"
  report << "\n Number of rows: #{nrows}"

  # One section per vector, each followed by that vector's own summary(1).
  @vectors.each_with_object(report) do |v, text|
    text << "\n Element:[#{v}]\n"
    text << self[v].summary(1)
  end
end
1738
+
1739
+ # Sorts a dataframe (ascending/descending) in the given priority sequence of
1740
+ # vectors, with or without a block.
1741
+ #
1742
+ # @param vector_order [Array] The order of vector names in which the DataFrame
1743
+ # should be sorted.
1744
+ # @param opts [Hash] opts The options to sort with.
1745
+ # @option opts [TrueClass,FalseClass,Array] :ascending (true) Sort in ascending
1746
+ # or descending order. Specify Array corresponding to *order* for multiple
1747
+ # sort orders.
1748
+ # @option opts [Hash] :by (lambda{|a| a }) Specify attributes of objects to
1749
+ # to be used for sorting, for each vector name in *order* as a hash of
1750
+ # vector name and lambda expressions. In case a lambda for a vector is not
1751
+ # specified, the default will be used.
1752
+ # @option opts [TrueClass,FalseClass,Array] :handle_nils (false) Handle nils
1753
+ # automatically or not when a block is provided.
1754
+ # If set to True, nils will appear at top after sorting.
1755
+ #
1756
+ # @example Sort a dataframe with a vector sequence.
1757
+ #
1758
+ #
1759
+ # df = DaruLite::DataFrame.new({a: [1,2,1,2,3], b: [5,4,3,2,1]})
1760
+ #
1761
+ # df.sort [:a, :b]
1762
+ # # =>
1763
+ # # <DaruLite::DataFrame:30604000 @name = d6a9294e-2c09-418f-b646-aa9244653444 @size = 5>
1764
+ # # a b
1765
+ # # 2 1 3
1766
+ # # 0 1 5
1767
+ # # 3 2 2
1768
+ # # 1 2 4
1769
+ # # 4 3 1
1770
+ #
1771
+ # @example Sort a dataframe without a block. Here nils will be handled automatically.
1772
+ #
1773
+ # df = DaruLite::DataFrame.new({a: [-3,nil,-1,nil,5], b: [4,3,2,1,4]})
1774
+ #
1775
+ # df.sort([:a])
1776
+ # # =>
1777
+ # # <DaruLite::DataFrame:14810920 @name = c07fb5c7-2201-458d-b679-6a1f7ebfe49f @size = 5>
1778
+ # # a b
1779
+ # # 1 nil 3
1780
+ # # 3 nil 1
1781
+ # # 0 -3 4
1782
+ # # 2 -1 2
1783
+ # # 4 5 4
1784
+ #
1785
+ # @example Sort a dataframe with a block with nils handled automatically.
1786
+ #
1787
+ # df = DaruLite::DataFrame.new({a: [nil,-1,1,nil,-1,1], b: ['aaa','aa',nil,'baaa','x',nil] })
1788
+ #
1789
+ # df.sort [:b], by: {b: lambda { |a| a.length } }
1790
+ # # NoMethodError: undefined method `length' for nil:NilClass
1791
+ # # from (pry):8:in `block in __pry__'
1792
+ #
1793
+ # df.sort [:b], by: {b: lambda { |a| a.length } }, handle_nils: true
1794
+ #
1795
+ # # =>
1796
+ # # <DaruLite::DataFrame:28469540 @name = 5f986508-556f-468b-be0c-88cc3534445c @size = 6>
1797
+ # # a b
1798
+ # # 2 1 nil
1799
+ # # 5 1 nil
1800
+ # # 4 -1 x
1801
+ # # 1 -1 aa
1802
+ # # 0 nil aaa
1803
+ # # 3 nil baaa
1804
+ #
1805
+ # @example Sort a dataframe with a block with nils handled manually.
1806
+ #
1807
+ # df = DaruLite::DataFrame.new({a: [nil,-1,1,nil,-1,1], b: ['aaa','aa',nil,'baaa','x',nil] })
1808
+ #
1809
+ # # To print nils at the bottom one can use lambda { |a| (a.nil?)?[1]:[0,a.length] }
1810
+ # df.sort [:b], by: {b: lambda { |a| (a.nil?)?[1]:[0,a.length] } }, handle_nils: true
1811
+ #
1812
+ # # =>
1813
+ # #<DaruLite::DataFrame:22214180 @name = cd7703c7-1dca-4560-840b-5ea51a852ef9 @size = 6>
1814
+ # # a b
1815
+ # # 4 -1 x
1816
+ # # 1 -1 aa
1817
+ # # 0 nil aaa
1818
+ # # 3 nil baaa
1819
+ # # 2 1 nil
1820
+ # # 5 1 nil
1821
+
1822
# Sort this DataFrame in place by the given vectors.
#
# @param vector_order [Array] vector names, in sort priority order
# @param opts [Hash] sort options, forwarded to sort_prepare_block
# @return [self]
# @raise [ArgumentError] when vector_order is empty
def sort!(vector_order, opts = {})
  raise ArgumentError, 'Required atleast one vector name' if vector_order.empty?

  # To enable sorting with categorical data,
  # map categories to integers preserving their order
  old = convert_categorical_vectors vector_order
  begin
    block = sort_prepare_block vector_order, opts

    order = @index.size.times.sort(&block)
    new_index = @index.reorder order
  ensure
    # Put the original categorical vectors back even if preparing the block
    # or comparing rows raised, so a failed sort does not leave the frame
    # with integer-coded columns.
    restore_categorical_vectors old
  end

  @data.each { |vector| vector.reorder! order }

  self.index = new_index

  self
end
1844
+
1845
+ # Non-destructive version of #sort!
1846
# Return a sorted copy; the receiver is left untouched.
# Accepts the same arguments as #sort!.
def sort(vector_order, opts = {})
  copy = dup
  copy.sort!(vector_order, opts)
end
1849
+
1850
+ # Pivots a data frame on specified vectors and applies an aggregate function
1851
+ # to quickly generate a summary.
1852
+ #
1853
+ # == Options
1854
+ #
1855
+ # +:index+ - Keys to group by on the pivot table row index. Pass vector names
1856
+ # contained in an Array.
1857
+ #
1858
+ # +:vectors+ - Keys to group by on the pivot table column index. Pass vector
1859
+ # names contained in an Array.
1860
+ #
1861
+ # +:agg+ - Function to aggregate the grouped values. Default to *:mean*. Can
1862
+ # use any of the statistics functions applicable on Vectors that can be found in
1863
+ # the DaruLite::Statistics::Vector module.
1864
+ #
1865
+ # +:values+ - Columns to aggregate. Will consider all numeric columns not
1866
+ # specified in *:index* or *:vectors*. Optional.
1867
+ #
1868
+ # == Usage
1869
+ #
1870
+ # df = DaruLite::DataFrame.new({
1871
+ # a: ['foo' , 'foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar'],
1872
+ # b: ['one' , 'one', 'one', 'two', 'two', 'one', 'one', 'two', 'two'],
1873
+ # c: ['small','large','large','small','small','large','small','large','small'],
1874
+ # d: [1,2,2,3,3,4,5,6,7],
1875
+ # e: [2,4,4,6,6,8,10,12,14]
1876
+ # })
1877
+ # df.pivot_table(index: [:a], vectors: [:b], agg: :sum, values: :e)
1878
+ #
1879
+ # #=>
1880
+ # # #<DaruLite::DataFrame:88342020 @name = 08cdaf4e-b154-4186-9084-e76dd191b2c9 @size = 2>
1881
+ # # [:e, :one] [:e, :two]
1882
+ # # [:bar] 18 26
1883
+ # # [:foo] 10 12
1884
# Build a pivot table from the `:index`, `:vectors`, `:agg` and `:values`
# options.
#
# @raise [ArgumentError] when no `:index` is given
# @raise [IndexError] when there is nothing to aggregate
def pivot_table(opts = {})
  raise ArgumentError, 'Specify grouping index' if Array(opts[:index]).empty?

  row_keys = opts[:index]
  column_keys = opts[:vectors] || []
  aggregator = opts[:agg] || :mean
  value_columns = prepare_pivot_values row_keys, column_keys, opts
  raise IndexError, 'No numeric vectors to aggregate' if value_columns.empty?

  groups = group_by(row_keys)
  if column_keys.empty?
    # Without column keys the grouped aggregate is already the answer.
    groups.send(aggregator)
  else
    pivot_dataframe(make_pivot_hash(groups, column_keys, value_columns, aggregator))
  end
end
1900
+
1901
+ # Merge vectors from two DataFrames. In case of name collision,
1902
+ # the vectors names are changed to x_1, x_2 ....
1903
+ #
1904
+ # @return {DaruLite::DataFrame}
1905
# Place the vectors of `other_df` next to the vectors of this DataFrame.
# Repeated vector names are recoded by ArrayHelper.recode_repeated.
#
# @param other_df [DaruLite::DataFrame] must have the same number of rows
# @return [DaruLite::DataFrame]
# @raise [ArgumentError] when the row counts differ
def merge(other_df)
  if nrows != other_df.nrows
    raise ArgumentError,
          "Number of rows must be equal in this: #{nrows} and other: #{other_df.nrows}"
  end

  combined_names = ArrayHelper.recode_repeated(@vectors.to_a + other_df.vectors.to_a)
  merged = DataFrame.new({}, order: combined_names)
  nrows.times do |i|
    merged.add_row row[i].to_a + other_df.row[i].to_a
  end
  # The shared index is carried over only when both frames agree on it.
  merged.index = @index if @index == other_df.index
  merged.update
  merged
end
1921
+
1922
+ # Join 2 DataFrames with SQL style joins. Currently supports inner, left
1923
+ # outer, right outer and full outer joins.
1924
+ #
1925
+ # @param [DaruLite::DataFrame] other_df Another DataFrame on which the join is
1926
+ # to be performed.
1927
+ # @param [Hash] opts Options Hash
1928
+ # @option :how [Symbol] Can be one of :inner, :left, :right or :outer.
1929
+ # @option :on [Array] The columns on which the join is to be performed.
1930
+ # Column names specified here must be common to both DataFrames.
1931
+ # @option :indicator [Symbol] The name of a vector to add to the resultant
1932
+ # dataframe that indicates whether the record was in the left (:left_only),
1933
+ # right (:right_only), or both (:both) joining dataframes.
1934
+ # @return [DaruLite::DataFrame]
1935
+ # @example Inner Join
1936
+ # left = DaruLite::DataFrame.new({
1937
+ # :id => [1,2,3,4],
1938
+ # :name => ['Pirate', 'Monkey', 'Ninja', 'Spaghetti']
1939
+ # })
1940
+ # right = DaruLite::DataFrame.new({
1941
+ # :id => [1,2,3,4],
1942
+ # :name => ['Rutabaga', 'Pirate', 'Darth Vader', 'Ninja']
1943
+ # })
1944
+ # left.join(right, how: :inner, on: [:name])
1945
+ # #=>
1946
+ # ##<DaruLite::DataFrame:82416700 @name = 74c0811b-76c6-4c42-ac93-e6458e82afb0 @size = 2>
1947
+ # # id_1 name id_2
1948
+ # # 0 1 Pirate 2
1949
+ # # 1 3 Ninja 4
1950
# SQL-style join; the work is done by DaruLite::Core::Merge.join, which
# receives this DataFrame, `other_df` and the options hash unchanged.
def join(other_df, opts = {})
  DaruLite::Core::Merge.join self, other_df, opts
end
1953
+
1954
+ # Creates a new dataset for one to many relations
1955
+ # on a dataset, based on pattern of field names.
1956
+ #
1957
+ # for example, you have a survey for number of children
1958
+ # with this structure:
1959
+ # id, name, child_name_1, child_age_1, child_name_2, child_age_2
1960
+ # with
1961
+ # ds.one_to_many([:id], "child_%v_%n")
1962
+ # the field of first parameters will be copied verbatim
1963
+ # to new dataset, and fields which responds to second
1964
+ # pattern will be added one case for each different %n.
1965
+ #
1966
+ # @example
1967
+ # cases=[
1968
+ # ['1','george','red',10,'blue',20,nil,nil],
1969
+ # ['2','fred','green',15,'orange',30,'white',20],
1970
+ # ['3','alfred',nil,nil,nil,nil,nil,nil]
1971
+ # ]
1972
+ # ds=DaruLite::DataFrame.rows(cases, order:
1973
+ # [:id, :name,
1974
+ # :car_color1, :car_value1,
1975
+ # :car_color2, :car_value2,
1976
+ # :car_color3, :car_value3])
1977
+ # ds.one_to_many([:id],'car_%v%n').to_matrix
1978
+ # #=> Matrix[
1979
+ # # ["red", "1", 10],
1980
+ # # ["blue", "1", 20],
1981
+ # # ["green", "2", 15],
1982
+ # # ["orange", "2", 30],
1983
+ # # ["white", "2", 20]
1984
+ # # ]
1985
# Expand numbered field groups into one row per group.
#
# @param parent_fields [Array] fields copied verbatim into every new row
# @param pattern [String] field-name pattern handed to one_to_many_components
# @return [DataFrame] ordered as parent fields, '_col_id', then pattern variables
def one_to_many(parent_fields, pattern)
  vars, numbers = one_to_many_components(pattern)
  result = DataFrame.new([], order: [*parent_fields, '_col_id', *vars])

  each_row do |row|
    verbatim = parent_fields.map { |field| [field, row[field]] }.to_h
    numbers.each do |n|
      generated = one_to_many_row row, n, vars, pattern
      # A group whose generated values are all nil contributes no row.
      next if generated.values.all?(&:nil?)

      result.add_row(verbatim.merge(generated).merge('_col_id' => n))
    end
  end

  result.update
  result
end
2001
+
2002
# Split vector `nm` with `split_by_separator(sep)` and store every resulting
# piece as a new vector named "<nm><join><position>", counting from 1.
# Each piece is first renamed to "<nm>:<key>".
def add_vectors_by_split_recode(nm, join = '-', sep = DaruLite::SPLIT_TOKEN)
  pieces = self[nm].split_by_separator(sep)
  pieces.each_with_index do |(key, piece), position|
    piece.rename "#{nm}:#{key}"
    self[:"#{nm}#{join}#{position + 1}"] = piece
  end
end
2010
+
2011
+ # Create a SQL CREATE TABLE statement based on a given Dataset
2012
+ #
2013
+ # == Arguments
2014
+ #
2015
+ # * table - String specifying name of the table that will be created in SQL.
2016
+ # * charset - Character set. Default is "UTF8".
2017
+ #
2018
+ # @example
2019
+ #
2020
+ # ds = DaruLite::DataFrame.new({
2021
+ # :id => DaruLite::Vector.new([1,2,3,4,5]),
2022
+ # :name => DaruLite::Vector.new(%w{Alex Peter Susan Mary John})
2023
+ # })
2024
+ # ds.create_sql('names')
2025
+ # #=>"CREATE TABLE names (id INTEGER,\n name VARCHAR (255)) CHARACTER SET=UTF8;"
2026
+ #
2027
# Produce a CREATE TABLE statement with one column per vector, each typed by
# that vector's `db_type`.
#
# @param table [String] table name, interpolated as-is
# @param charset [String] character set name, interpolated as-is
# @return [String]
def create_sql(table, charset = 'UTF8')
  column_defs = vectors.to_a.map { |column| "#{column} #{self[column].db_type}" }
  "CREATE TABLE #{table} (#{column_defs.join(",\n ")}) CHARACTER SET=#{charset};"
end
2036
+
2037
+ # Returns the dataframe. This can be convenient when the user does not
2038
+ # know whether the object is a vector or a dataframe.
2039
+ # @return [self] the dataframe
2040
# Identity conversion: hands back the receiver itself.
# @return [self]
def to_df
  itself
end
2043
+
2044
+ # Convert all vectors of type *:numeric* into a Matrix.
2045
# Build a Matrix whose columns are the vectors answering true to `numeric?`.
# @return [Matrix]
def to_matrix
  numeric_columns = each_vector.select(&:numeric?)
  Matrix.columns(numeric_columns.map(&:to_a))
end
2048
+
2049
+ # Converts the DataFrame into an array of hashes where key is vector name
2050
+ # and value is the corresponding element. The 0th index of the array contains
2051
+ # the array of hashes while the 1st index contains the indexes of each row
2052
+ # of the dataframe. Each element in the index array corresponds to its row
2053
+ # in the array of hashes, which has the same index.
2054
# Two-element array: the rows converted with `to_h`, then the index labels.
# @return [Array]
def to_a
  row_hashes = each_row.map(&:to_h)
  [row_hashes, @index.to_a]
end
2057
+
2058
+ # Convert to json. If no_index is true (the default) then the index will NOT be included
2059
+ # in the JSON thus created.
2060
# Serialise to JSON.
#
# @param no_index [Boolean] when truthy only the row hashes are emitted;
#   when falsy the full `to_a` pair (rows and index) is emitted.
# @return [String]
def to_json(no_index = true)
  payload = no_index ? to_a.first : to_a
  payload.to_json
end
2067
+
2068
+ # Converts DataFrame to a hash (explicit) with keys as vector names and values as
2069
+ # the corresponding vectors.
2070
# Map each vector name to the vector stored at the same position in @data.
# @return [Hash]
def to_h
  @vectors.each_with_index.each_with_object({}) do |(vec_name, position), table|
    table[vec_name] = @data[position]
  end
end
2075
+
2076
+ # Convert to html for IRuby.
2077
# Render the DataFrame as HTML (for IRuby) from an ERB template.
#
# The template is evaluated with this method's binding, so the local
# variable names are kept as-is in case a template refers to them.
def to_html(threshold = DaruLite.max_rows)
  table_thead = to_html_thead
  table_tbody = to_html_tbody(threshold)
  # MultiIndex frames use their own template.
  path = File.expand_path(
    index.is_a?(MultiIndex) ? 'iruby/templates/dataframe_mi.html.erb' : 'iruby/templates/dataframe.html.erb',
    __dir__
  )
  ERB.new(File.read(path).strip).result(binding)
end
2087
+
2088
# Render the table head from its ERB template; MultiIndex frames use a
# dedicated template. The template is evaluated with this method's binding.
def to_html_thead
  table_thead_path = File.expand_path(
    index.is_a?(MultiIndex) ? 'iruby/templates/dataframe_mi_thead.html.erb' : 'iruby/templates/dataframe_thead.html.erb',
    __dir__
  )
  ERB.new(File.read(table_thead_path).strip).result(binding)
end
2097
+
2098
# Render the table body from its ERB template; MultiIndex frames use a
# dedicated template. The template is evaluated with this method's binding,
# so `threshold` keeps its name.
#
# @param threshold [Integer, nil] falls back to @size when nil/false
def to_html_tbody(threshold = DaruLite.max_rows)
  threshold ||= @size
  table_tbody_path = File.expand_path(
    index.is_a?(MultiIndex) ? 'iruby/templates/dataframe_mi_tbody.html.erb' : 'iruby/templates/dataframe_tbody.html.erb',
    __dir__
  )
  ERB.new(File.read(table_tbody_path).strip).result(binding)
end
2108
+
2109
# One-line description: class, optional ": name", and "(rows x cols)".
# @return [String]
def to_s
  label = @name ? ": #{@name}" : ''
  "#<#{self.class}#{label}(#{nrows}x#{ncols})>"
end
2112
+
2113
+ # Method for updating the metadata (i.e. missing value positions) of the
2114
+ # vectors after assignment/deletion etc. are complete. This is provided so that
2115
+ # time is not wasted in creating the metadata for the vector each time
2116
+ # assignment/deletion of elements is done. Updating data this way is called
2117
+ # lazy loading. To set or unset lazy loading, see the .lazy_update= method.
2118
# Call `update` on every vector, but only while DaruLite.lazy_update is on.
def update
  return unless DaruLite.lazy_update

  @data.each(&:update)
end
2121
+
2122
+ # Rename the DataFrame.
2123
# Give the DataFrame a new name.
# @param new_name [Object] stored in @name as given
# @return [self]
def rename(new_name)
  tap { @name = new_name }
end

alias name= rename
2129
+
2130
+ # Write this DataFrame to a CSV file.
2131
+ #
2132
+ # == Arguments
2133
+ #
2134
+ # * filename - Path of CSV file where the DataFrame is to be saved.
2135
+ #
2136
+ # == Options
2137
+ #
2138
+ # * convert_comma - If set to *true*, will convert any commas in any
2139
+ # of the data to full stops ('.').
2140
+ # All the options accepted by CSV.read() can also be passed into this
2141
+ # function.
2142
# Hand this DataFrame, the target path and the options to
# DaruLite::IO.dataframe_write_csv.
def write_csv(filename, opts = {})
  DaruLite::IO.dataframe_write_csv(self, filename, opts)
end
2145
+
2146
+ # Write this dataframe to an Excel Spreadsheet
2147
+ #
2148
+ # == Arguments
2149
+ #
2150
+ # * filename - The path of the file where the DataFrame should be written.
2151
# Hand this DataFrame, the target path and the options to
# DaruLite::IO.dataframe_write_excel.
def write_excel(filename, opts = {})
  DaruLite::IO.dataframe_write_excel(self, filename, opts)
end
2154
+
2155
+ # Insert each case of the Dataset on the selected table
2156
+ #
2157
+ # == Arguments
2158
+ #
2159
+ # * dbh - DBI database connection object.
2160
+ # * query - Query string.
2161
+ #
2162
+ # == Usage
2163
+ #
2164
+ # ds = DaruLite::DataFrame.new({:id=>DaruLite::Vector.new([1,2,3]), :name=>DaruLite::Vector.new(["a","b","c"])})
2165
+ # dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
2166
+ # ds.write_sql(dbh,"test")
2167
# Hand this DataFrame, the database handle and the table name to
# DaruLite::IO.dataframe_write_sql.
def write_sql(dbh, table)
  DaruLite::IO.dataframe_write_sql(self, dbh, table)
end
2170
+
2171
+ # Use marshalling to save dataframe to a file.
2172
# Persist this DataFrame to `filename` through DaruLite::IO.save.
def save(filename)
  DaruLite::IO.save(self, filename)
end
2175
+
2176
# Marshal hook: serialise the vectors, the index labels, the vector names
# and the name as a single Hash.
#
# @param _depth [Integer] unused
# @return [String] marshalled payload
def _dump(_depth)
  payload = {
    data: @data,
    index: @index.to_a,
    order: @vectors.to_a,
    name: @name
  }
  Marshal.dump(payload)
end
2184
+
2185
# Marshal hook: rebuild a DataFrame from the Hash written by #_dump.
# NOTE(review): this unmarshals `data` as-is; only feed it trusted input.
def self._load(data)
  payload = Marshal.load data
  DaruLite::DataFrame.new(
    payload[:data],
    index: payload[:index],
    order: payload[:order],
    name: payload[:name]
  )
end
2192
+
2193
+ # Transpose a DataFrame, transposing elements and row, column indexing.
2194
# New DataFrame with rows and columns swapped: the vector names become the
# index and the index becomes the vector order.
def transpose
  flipped = each_vector.map(&:to_a).transpose
  DaruLite::DataFrame.new(flipped, index: @vectors, order: @index, dtype: @dtype, name: @name)
end
2203
+
2204
+ # Pretty print in a nice table format for the command line (irb/pry/iruby)
2205
# Table-style representation for the console.
#
# @param spacing [Integer] minimum column width; widened to the longest header
# @param threshold [Integer] forwarded to the table formatter
# @return [String]
def inspect(spacing = DaruLite.spacing, threshold = DaruLite.max_rows)
  title = @name ? ": #{@name} " : ''
  column_width = [headers.to_a.map(&:length).max, spacing].max
  banner = "#<#{self.class}#{title}(#{nrows}x#{ncols})>#{$INPUT_RECORD_SEPARATOR}"

  table = Formatters::Table.format(
    each_row.lazy,
    row_headers: row_headers,
    headers: headers,
    threshold: threshold,
    spacing: column_width
  )
  banner + table
end
2218
+
2219
+ # Query a DataFrame by passing a DaruLite::Core::Query::BoolArray object.
2220
# Filter through DaruLite::Core::Query.df_where using the given boolean array.
def where(bool_array)
  DaruLite::Core::Query.df_where(self, bool_array)
end
2223
+
2224
# Equal when class, size, index and vector names match and every vector
# compares equal to its counterpart in `other`.
def ==(other)
  same_labels = self.class == other.class &&
                @size == other.size &&
                @index == other.index &&
                @vectors == other.vectors
  same_labels && @vectors.to_a.all? { |vec_name| self[vec_name] == other[vec_name] }
end
2231
+
2232
+ # Converts the specified non category type vectors to category type vectors
2233
+ # @param [Array] names of non category type vectors to be converted
2234
+ # @return [DaruLite::DataFrame] data frame in which specified vectors have been
2235
+ # converted to category type
2236
+ # @example
2237
+ # df = DaruLite::DataFrame.new({
2238
+ # a: [1, 2, 3],
2239
+ # b: ['a', 'a', 'b']
2240
+ # })
2241
+ # df.to_category :b
2242
+ # df[:b].type
2243
+ # # => :category
2244
# Replace each named vector with its `to_category` conversion, in place.
# @param names [Array] vector names
# @return [self]
def to_category(*names)
  names.each do |vec_name|
    self[vec_name] = self[vec_name].to_category
  end
  self
end
2248
+
2249
# Dynamic vector access:
# * a name matching /(.+)=/ assigns args[0] to the vector named by the part
#   before the "=" (as a Symbol unless that String already names a vector);
# * a name that is an existing vector (as given, or as a String) reads it;
# * anything else goes to `super`.
def method_missing(name, *args, &block)
  setter_pattern = /(.+)=/
  if setter_pattern.match?(name)
    target = name[setter_pattern].delete('=')
    target = target.to_sym unless has_vector?(target)
    return insert_or_modify_vector [target], args[0]
  end

  return self[name] if has_vector?(name)
  return self[name.to_s] if has_vector?(name.to_s)

  super
end
2262
+
2263
+ def respond_to_missing?(name, include_private = false)
2264
+ name.to_s.end_with?('=') || has_vector?(name) || super
2265
+ end
2266
+
2267
# Contrast-code each named vector (pairing it with the matching `full` flag)
# and combine the coded vectors through recursive_product into a DataFrame.
def interact_code(vector_names, full)
  coded = vector_names.zip(full).map do |vec_name, full_flag|
    self[vec_name].contrast_code(full: full_flag).each.to_a
  end

  products = recursive_product(coded)
  DaruLite::DataFrame.new(products, order: products.map(&:name))
end
2276
+
2277
+ # Split the dataframe into many dataframes based on category vector
2278
+ # @param [object] cat_name name of category vector to split the dataframe
2279
+ # @return [Array] array of dataframes split by category with category vector
2280
+ # used to split not included
2281
+ # @example
2282
+ # df = DaruLite::DataFrame.new({
2283
+ # a: [1, 2, 3],
2284
+ # b: ['a', 'a', 'b']
2285
+ # })
2286
+ # df.to_category :b
2287
+ # df.split_by_category :b
2288
+ # # => [#<DaruLite::DataFrame: a (2x1)>
2289
+ # # a
2290
+ # # 0 1
2291
+ # # 1 2,
2292
+ # # #<DaruLite::DataFrame: b (1x1)>
2293
+ # # a
2294
+ # # 2 3]
2295
# One DataFrame per category of `cat_name`: the matching rows, renamed to
# the category, with the category vector removed.
#
# @raise [ArgumentError] when the vector is not a category vector
def split_by_category(cat_name)
  cat_dv = self[cat_name]
  unless cat_dv.category?
    raise ArgumentError, "#{cat_name} is not a category vector"
  end

  cat_dv.categories.map do |category|
    subset = where(cat_dv.eq(category))
    subset.rename(category).delete_vector(cat_name)
  end
end
2306
+
2307
+ # @param indexes [Array] index(s) at which row tuples are retrieved
2308
+ # @return [Array] returns array of row tuples at given index(s)
2309
+ # @example Using DaruLite::Index
2310
+ # df = DaruLite::DataFrame.new({
2311
+ # a: [1, 2, 3],
2312
+ # b: ['a', 'a', 'b']
2313
+ # })
2314
+ #
2315
+ # df.access_row_tuples_by_indexs(1,2)
2316
+ # # => [[2, "a"], [3, "b"]]
2317
+ #
2318
+ # df.index = DaruLite::Index.new([:one,:two,:three])
2319
+ # df.access_row_tuples_by_indexs(:one,:three)
2320
+ # # => [[1, "a"], [3, "b"]]
2321
+ #
2322
+ # @example Using DaruLite::MultiIndex
2323
+ # mi_idx = DaruLite::MultiIndex.from_tuples [
2324
+ # [:a,:one,:bar],
2325
+ # [:a,:one,:baz],
2326
+ # [:b,:two,:bar],
2327
+ # [:a,:two,:baz],
2328
+ # ]
2329
+ # df_mi = DaruLite::DataFrame.new({
2330
+ # a: 1..4,
2331
+ # b: 'a'..'d'
2332
+ # }, index: mi_idx )
2333
+ #
2334
+ # df_mi.access_row_tuples_by_indexs(:b, :two, :bar)
2335
+ # # => [[3, "c"]]
2336
+ # df_mi.access_row_tuples_by_indexs(:a)
2337
+ # # => [[1, "a"], [2, "b"], [4, "d"]]
2338
# Fetch rows as plain arrays for the given index labels.
#
# @param indexes [Array] index label(s)
# @return [Array] array of row tuples
def access_row_tuples_by_indexs(*indexes)
  if @index.is_a?(DaruLite::MultiIndex)
    return get_sub_dataframe(indexes, by_position: false).map_rows(&:to_a)
  end

  located = @index.pos(*indexes)
  unless located.is_a? Numeric
    columns = get_rows_for(indexes, by_position: false)
    return indexes.map { |label| columns.map { |column| column[label] } }
  end

  # Single position: make sure the result is an array of tuples.
  tuple = get_rows_for([located])
  tuple.first.is_a?(Array) ? tuple : [tuple]
end
2351
+
2352
+ # Function to use for aggregating the data.
2353
+ #
2354
+ # @param options [Hash] options for column, you want in resultant dataframe
2355
+ #
2356
+ # @return [DaruLite::DataFrame]
2357
+ #
2358
+ # @example
2359
+ # df = DaruLite::DataFrame.new(
2360
+ # {col: [:a, :b, :c, :d, :e], num: [52,12,07,17,01]})
2361
+ # => #<DaruLite::DataFrame(5x2)>
2362
+ # col num
2363
+ # 0 a 52
2364
+ # 1 b 12
2365
+ # 2 c 7
2366
+ # 3 d 17
2367
+ # 4 e 1
2368
+ #
2369
+ # df.aggregate(num_100_times: ->(df) { (df.num*100).first })
2370
+ # => #<DaruLite::DataFrame(5x1)>
2371
+ # num_100_ti
2372
+ # 0 5200
2373
+ # 1 1200
2374
+ # 2 700
2375
+ # 3 1700
2376
+ # 4 100
2377
+ #
2378
+ # When we have duplicate index :
2379
+ #
2380
+ # idx = DaruLite::CategoricalIndex.new [:a, :b, :a, :a, :c]
2381
+ # df = DaruLite::DataFrame.new({num: [52,12,07,17,01]}, index: idx)
2382
+ # => #<DaruLite::DataFrame(5x1)>
2383
+ # num
2384
+ # a 52
2385
+ # b 12
2386
+ # a 7
2387
+ # a 17
2388
+ # c 1
2389
+ #
2390
+ # df.aggregate(num: :mean)
2391
+ # => #<DaruLite::DataFrame(3x1)>
2392
+ # num
2393
+ # a 25.3333333
2394
+ # b 12
2395
+ # c 1
2396
+ #
2397
+ # Note: `GroupBy` class `aggregate` method uses this `aggregate` method
2398
+ # internally.
2399
# Aggregate rows into a new DataFrame with one vector per key of `options`.
#
# The row grouping comes from the block when one is given (it receives
# @index), otherwise from group_index_for_aggregation.
def aggregate(options = {}, multi_index_level = -1)
  positions_tuples, new_index =
    if block_given?
      yield(@index) # NOTE: use of yield is private for now
    else
      group_index_for_aggregation(@index, multi_index_level)
    end

  aggregated = aggregate_by_positions_tuples(options, positions_tuples)
  DaruLite::DataFrame.new(aggregated, index: new_index, order: options.keys)
end
2410
+
2411
# Group by the positional keys, then aggregate with the keyword arguments
# collected into a Hash.
def group_by_and_aggregate(*group_by_keys, **aggregation_map)
  grouped = group_by(*group_by_keys)
  grouped.aggregate(aggregation_map)
end
2414
+
2415
+ private
2416
+
2417
# Column headers: the index name (when present) followed by the vector names.
def headers
  labels = Array(index.name) + @vectors.to_a
  DaruLite::Index.new(labels)
end
2420
+
2421
# Row labels: sparse tuples for a MultiIndex, plain labels otherwise.
def row_headers
  if index.is_a?(MultiIndex)
    index.sparse_tuples
  else
    index.to_a
  end
end
2424
+
2425
# Swap each categorical vector among `names` for an integer-coded Vector
# (built from `to_ints`).
#
# @return [Array<Array>] [name, original vector] pairs for the swapped vectors
def convert_categorical_vectors(names)
  names.each_with_object([]) do |vec_name, saved|
    original = self[vec_name]
    next unless original.category?

    saved << [vec_name, original]
    self[vec_name] = DaruLite::Vector.new(original.to_ints)
  end
end
2434
+
2435
# Assign every saved [name, vector] pair back onto the DataFrame.
def restore_categorical_vectors(old)
  old.each do |vec_name, original|
    self[vec_name] = original
  end
end
2438
+
2439
# Combine lists of vectors into every cross-list product.
#
# Each resulting vector is the `*` of one vector from every list and is
# renamed to the participating names joined with ":".
#
# @param dfs [Array<Array>] lists of vectors; not modified
# @return [Array] the product vectors (or the single list when only one is given)
def recursive_product(dfs)
  return dfs.first if dfs.size == 1

  # Destructure instead of shifting so the caller's array stays intact.
  left, *rest = dfs
  right = recursive_product rest
  left.product(right).map do |dv1, dv2|
    (dv1 * dv2).rename "#{dv1.name}:#{dv2.name}"
  end
end
2449
+
2450
# Pass `val` through when it is a DaruLite::Vector.
# @raise [TypeError] for any other class
def should_be_vector!(val)
  unless val.is_a?(DaruLite::Vector)
    raise TypeError, "Every iteration must return DaruLite::Vector not #{val.class}"
  end

  val
end
2455
+
2456
# Call "<method>_vector" or "<method>_row" depending on `axis`.
#
# @param axis [Symbol] :vector / :column, or :row
# @raise [ArgumentError] for any other axis
def dispatch_to_axis(axis, method, *args, &block)
  case axis
  when :vector, :column
    send(:"#{method}_vector", *args, &block)
  when :row
    send(:"#{method}_row", *args, &block)
  else
    raise ArgumentError, "Unknown axis #{axis}"
  end
end
2465
+
2466
# Plural counterpart of the axis dispatcher: calls "<method>_vectors" or
# "<method>_rows" depending on `axis`.
#
# @param axis [Symbol] :vector / :column, or :row
# @raise [ArgumentError] for any other axis
def dispatch_to_axis_pl(axis, method, *args, &block)
  case axis
  when :vector, :column
    send(:"#{method}_vectors", *args, &block)
  when :row
    send(:"#{method}_rows", *args, &block)
  else
    raise ArgumentError, "Unknown axis #{axis}"
  end
end
2475
+
2476
# Axis names recognised as a trailing argument.
AXES = %i[row vector].freeze

# Pop a trailing axis name off `names` (mutating it) and return it; when the
# last element is not an axis, leave `names` alone and return `default`.
def extract_axis(names, default = :vector)
  return default unless AXES.include?(names.last)

  names.pop
end
2485
+
2486
# Route vector access: a Range selects a sub-frame of vectors, a MultiIndex
# goes to the multi-index accessor, everything else to the single-index one.
def access_vector(*names)
  return dup(@vectors.subset(names.first)) if names.first.is_a?(Range)
  return access_vector_multi_index(*names) if @vectors.is_a?(MultiIndex)

  access_vector_single_index(*names)
end
2495
+
2496
# Vector access for MultiIndex-ed vectors: an Integer lookup result yields
# that single vector, otherwise a DataFrame of all matching vectors.
def access_vector_multi_index(*names)
  matched = @vectors[names]
  return @data[matched] if matched.is_a?(Integer)

  selected = matched.map { |tuple| @data[@vectors[tuple]] }

  # Drop the levels already fixed by `names` when the key is partial.
  matched = matched.drop_left_level(names.size) if names.size < @vectors.width

  DaruLite::DataFrame.new(selected, index: @index, order: matched)
end
2507
+
2508
# Vector access for a plain vector index: one name that resolves to a
# numeric position yields that vector, otherwise a DataFrame of the
# requested vectors.
#
# @raise [IndexError] when a single requested vector does not exist
def access_vector_single_index(*names)
  if names.count < 2
    wanted = names.first
    begin
      pos =
        if @vectors.is_a?(DaruLite::DateTimeIndex)
          @vectors[wanted]
        else
          @vectors.pos(wanted)
        end
    rescue IndexError
      raise IndexError, "Specified vector #{wanted} does not exist"
    end
    return @data[pos] if pos.is_a?(Numeric)

    # The lookup matched several vectors; continue with those.
    names = pos
  end

  selected = names.map { |vec_name| [vec_name, @data[@vectors.pos(vec_name)]] }.to_h
  order = names.is_a?(Array) ? DaruLite::Index.new(names) : names

  DaruLite::DataFrame.new(selected, order: order, index: @index, name: @name)
end
2525
+
2526
# Row access by index label(s): a single numeric position yields a Vector
# (named after the first label), otherwise a DataFrame of the selected rows.
def access_row(*indexes)
  located = @index.pos(*indexes)

  unless located.is_a? Numeric
    selected = get_rows_for(indexes, by_position: false)
    return DaruLite::DataFrame.new selected, index: @index.subset(*indexes), order: @vectors
  end

  values = get_rows_for([located])
  DaruLite::Vector.new values, index: @vectors, name: indexes.first
end
2537
+
2538
+ # @param keys [Array] can be an array of positions (if by_position is true) or indexes (if by_position is false)
2539
+ # because of coercion by DaruLite::Vector#at and DaruLite::Vector#[], can return either an Array of
2540
+ # values (representing a row) or an array of Vectors (that can be seen as rows)
2541
# Collect, from every vector, the value(s) found at `keys`.
#
# @param keys [Array] positions when by_position is true, index labels otherwise
# @param by_position [Boolean] selects `Vector#at` (true) or `Vector#[]` (false)
# @return [Array] one entry per vector
# @raise [RuntimeError] when keys is not an Array
def get_rows_for(keys, by_position: true)
  # A bare `raise` here used to surface as a message-less "unhandled
  # exception"; same exception class, but now it says what went wrong.
  raise "Expected keys to be an Array, got #{keys.class}" unless keys.is_a?(Array)

  if by_position
    pos = keys
    @data.map { |vector| vector.at(*pos) }
  else
    # TODO: for now (2018-07-27), it is different than using
    # get_rows_for(@index.pos(*keys))
    # because DaruLite::Vector#at and DaruLite::Vector#[] don't handle DaruLite::MultiIndex the same way
    indexes = keys
    @data.map { |vec| vec[*indexes] }
  end
end
2555
+
2556
# Store `vector` under `name`. `name` arrives wrapped in an Array and is
# unwrapped unless the vectors are MultiIndex-ed. An empty DataFrame takes
# its index from the new vector.
def insert_or_modify_vector(name, vector)
  name = name[0] unless @vectors.is_a?(MultiIndex)
  return insert_vector_in_empty(name, vector) if @index.empty?

  prepared = prepare_for_insert name, vector
  assign_or_add_vector name, prepared
end
2567
+
2568
# Put `v` into @data under `name`, overwriting existing vector(s) or
# appending a new one.
def assign_or_add_vector(name, v)
  # FIXME: fix this jugaad. need to make changes in Indexing itself.
  # A failed lookup falls back to the name itself.
  pos =
    begin
      @vectors[name]
    rescue IndexError
      name
    end

  return assign_multiple_vectors(pos, v) if pos.is_a?(DaruLite::Index)

  direct_slot = pos == name &&
                (@vectors.include?(name) || (pos.is_a?(Integer) && pos < @data.size))
  if direct_slot
    @data[pos] = v
  else
    assign_or_add_vector_rough name, v
  end
end
2586
+
2587
# Write the same vector `v` into every slot named in `pos`.
def assign_multiple_vectors(pos, v)
  pos.each { |vec_name| @data[@vectors[vec_name]] = v }
end
2592
+
2593
# Register `name` in the vector index when it is missing, then store `v`
# at the position the index reports for it.
def assign_or_add_vector_rough(name, v)
  @vectors = @vectors | [name] unless @vectors.include?(name)
  slot = @vectors[name]
  @data[slot] = v
end
2597
+
2598
# First vector of an empty DataFrame: adopt its index, store it, refresh
# the size, and reindex any still-empty vectors to the new index.
def insert_vector_in_empty(name, vector)
  first_vector = Vector.coerce(vector.to_a, name: coerce_name(name))

  @index = first_vector.index
  assign_or_add_vector name, first_vector
  set_size

  @data.map! do |existing|
    existing.empty? ? existing.reindex(@index) : existing
  end
end
2607
+
2608
# Turn `arg` into a Vector ready for insertion, choosing the preparation
# by kind: a Vector, anything responding to `to_a`, or a single value.
def prepare_for_insert(name, arg)
  return prepare_vector_for_insert(name, arg) if arg.is_a? DaruLite::Vector
  return prepare_enum_for_insert(name, arg) if arg.respond_to?(:to_a)

  prepare_value_for_insert name, arg
end
2617
+
2618
# Align `vector` with this DataFrame's index. Labels missing from the
# vector are filled with nil.
def prepare_vector_for_insert(name, vector)
  # so that index-by-index assignment is avoided when possible.
  return vector.dup if vector.index == @index

  aligned = DaruLite::Vector.new([], name: coerce_name(name), index: @index)
  @index.each do |label|
    aligned[label] = vector.index.include?(label) ? vector[label] : nil
  end
  aligned
end
2628
+
2629
# Wrap an enumerable in a Vector on this DataFrame's index.
# @raise [RuntimeError] when its size differs from the DataFrame size
def prepare_enum_for_insert(name, enum)
  unless @size == enum.size
    raise "Specified vector of length #{enum.size} cannot be inserted in DataFrame of size #{@size}"
  end

  DaruLite::Vector.new(enum, name: coerce_name(name), index: @index)
end
2636
+
2637
# Repeat a single value @size times and wrap it in a Vector on this
# DataFrame's index.
def prepare_value_for_insert(name, value)
  repeated = Array(value) * @size
  DaruLite::Vector.new(repeated, name: coerce_name(name), index: @index)
end
2640
+
2641
# Write one row: element `pos` of the coerced vector goes into the vector
# at position `pos`. The index is then taken from the first vector and the
# size refreshed.
#
# @raise [SizeError] when the row length differs from the number of vectors
def insert_or_modify_row(indexes, vector)
  row_values = coerce_vector vector

  unless row_values.size == @vectors.size
    raise SizeError, 'Vector length should match row length'
  end

  @data.each_with_index do |column, pos|
    column.send(:set, indexes, row_values.at(pos))
  end
  @index = @data[0].index

  set_size
end
2654
+
2655
# Set up @vectors and @index from the arguments and fill @data with one
# empty Vector per vector name.
def create_empty_vectors(vectors, index)
  @vectors = Index.coerce(vectors)
  @index = Index.coerce(index)

  @data = @vectors.map { |vec_name| DaruLite::Vector.new([], name: coerce_name(vec_name), index: @index) }
end
2663
+
2664
# Check that the labels fit the data: as many vector names as vectors, and
# as many index entries as the first vector has elements.
#
# @raise [IndexError] on either mismatch
def validate_labels
  names_mismatch = @vectors && @vectors.size != @data.size
  if names_mismatch
    raise IndexError, "Expected equal number of vector names (#{@vectors.size}) " \
                      "for number of vectors (#{@data.size})."
  end

  rows_mismatch = @index && @data[0] && @index.size != @data[0].size
  raise IndexError, 'Expected number of indexes same as number of rows' if rows_mismatch
end
2674
+
2675
# Every vector must have exactly @size elements.
# @raise [IndexError] on the first vector of a different length
def validate_vector_sizes
  @data.each do |vector|
    next if vector.size == @size

    raise IndexError, 'Expected vectors with equal length'
  end
end
2680
+
2681
# Run both consistency checks: labels first, then vector lengths.
def validate
  validate_labels
  validate_vector_sizes
end
2685
+
2686
# Refresh the cached row count from the index.
# @return [Integer] the new @size
def set_size
  @size = @index.size
end
2689
+
2690
# Resolve `index` to an index label.
#
# A value that is already a label is returned as-is; otherwise it is
# treated as something `@index.key` can translate into a label.
#
# @return [Object] the label
# @raise [IndexError] when neither interpretation matches
def named_index_for(index)
  return index if @index.include? index

  # Look the key up once and reuse the result instead of querying twice.
  label = @index.key index
  raise IndexError, "Specified index #{index} does not exist." unless label

  label
end
2699
+
2700
# Set @vectors from the requested order. A ready-made Index/MultiIndex is
# used as given; otherwise the requested names come first, followed by any
# remaining keys of `source`, without duplicates.
def create_vectors_index_with(vectors, source)
  vectors = source.keys if vectors.nil?

  @vectors =
    case vectors
    when Index, MultiIndex
      vectors
    else
      remaining = source.keys - vectors
      DaruLite::Index.new((vectors + remaining).uniq)
    end
end
2710
+
2711
# Whether every value of `source` reports the same index as the first one.
#
# @param source [Hash] values must respond to `index`
# @return [Boolean] true for an empty source (nothing can disagree)
def all_vectors_have_equal_indexes?(source)
  vectors = source.values
  # Guard the empty case, which would otherwise call `index` on nil.
  return true if vectors.empty?

  reference = vectors.first.index
  vectors.all? { |vector| reference == vector.index }
end
2716
+
2717
# Flatten an Array name into one String via `join`; any other name is
# returned untouched.
def coerce_name(potential_name)
  return potential_name unless potential_name.is_a?(Array)

  potential_name.join
end
2720
+
2721
# Build the DataFrame from an Array source, dispatching on the class of its
# elements (Array, Vector or Hash). Missing vector names default to
# 0...source.size for the Array and Vector cases.
#
# @raise [ArgumentError] when the elements are not all of one class, or are
#   of an unsupported class
def initialize_from_array(source, vectors, index, opts)
  unless source.map(&:class).uniq.size == 1
    raise ArgumentError, 'All objects in data source should be same class'
  end

  case source.first
  when Array
    vectors ||= source.each_index.to_a
    initialize_from_array_of_arrays source, vectors, index, opts
  when Vector
    vectors ||= source.each_index.to_a
    initialize_from_array_of_vectors source, vectors, index, opts
  when Hash
    initialize_from_array_of_hashes source, vectors, index, opts
  else
    raise ArgumentError, "Can't create DataFrame from #{source}"
  end
end
2738
+
2739
# Builds the frame from an array of arrays, one inner array per vector.
#
# The row index is coerced from +index+, or from the size of the first
# inner array when +index+ is nil. The data is then filled by update_data.
#
# Raises ArgumentError when the number of names differs from the number of
# inner arrays.
def initialize_from_array_of_arrays(source, vectors, index, _opts)
  unless source.size == vectors.size
    raise ArgumentError,
          "Number of vectors (#{vectors.size}) should equal order size (#{source.size})"
  end

  @index = Index.coerce(index || source[0].size)
  @vectors = Index.coerce(vectors)

  update_data(source, vectors)
end
2750
+
2751
# Builds the frame from an array of vectors by pairing each name in
# +vectors+ with the element of +source+ at the same position, then
# re-running #initialize with the resulting hash.
#
# Cloning is requested unless opts[:clone] is exactly false.
def initialize_from_array_of_vectors(source, vectors, index, opts)
  named_vectors = {}
  vectors.each_with_index { |name, position| named_vectors[name] = source[position] }

  should_clone = opts[:clone] != false
  initialize(named_vectors, index: index, order: vectors, name: @name, clone: should_clone)
end
2758
+
2759
# Builds the frame from an array of hashes, one hash per row.
#
# Vector names are the keys of the FIRST hash only, preceded by any names
# given in +vectors+. Each value is read with the name as key, falling
# back to the name's string form when that key is absent.
def initialize_from_array_of_hashes(source, vectors, index, _opts)
  first_keys = source[0].keys
  names = vectors.nil? ? first_keys : (vectors + first_keys).uniq

  @vectors = DaruLite::Index.new(names)
  @index = DaruLite::Index.new(index || source.size)

  @data = @vectors.map do |name|
    column = source.map { |row| row.fetch(name) { row[name.to_s] } }
    DaruLite::Vector.new(column, name: coerce_name(name), index: @index)
  end
end
2774
+
2775
# Builds the frame from a hash source: sets up the vector-name index, then
# delegates to the vector-based or array-based builder depending on whether
# ArrayHelper.array_of? reports the hash values as Vectors.
def initialize_from_hash(source, vectors, index, opts)
  create_vectors_index_with(vectors, source)

  unless ArrayHelper.array_of?(source.values, Vector)
    return initialize_from_hash_with_arrays(source, index, opts)
  end

  initialize_from_hash_with_vectors(source, index, opts)
end
2784
+
2785
# Builds @index and @data from a hash of vectors.
#
# Vectors are cloned unless opts[:clone] is exactly false. Cloning is
# forced when no +index+ is given and the vectors do not all share the
# same index, because they must then be re-aligned.
#
# On the non-cloning path the supplied vectors are appended to @data in
# the order of @vectors rather than in hash order, so each data column
# lines up with its name even when the requested order differs from the
# hash's key order. Names in @vectors that are absent from +source+ are
# skipped.
def initialize_from_hash_with_vectors(source, index, opts)
  vectors_have_same_index = all_vectors_have_equal_indexes?(source)

  clone = opts[:clone] != false
  clone = true unless index || vectors_have_same_index

  @index = deduce_index index, source, vectors_have_same_index

  if clone
    @data = clone_vectors source, vectors_have_same_index
  else
    present_names = @vectors.select { |name| source.key?(name) }
    @data.concat source.values_at(*present_names)
  end
end
2799
+
2800
# Works out the row index for a hash-of-vectors source.
#
# - an explicit +index+ is coerced with Index.coerce;
# - otherwise, when all vectors share one index, a dup of the first
#   vector's index is used;
# - otherwise a new index is built from the sorted, de-duplicated union of
#   all vector index labels.
def deduce_index(index, source, vectors_have_same_index)
  return Index.coerce(index) unless index.nil?
  return source.values[0].index.dup if vectors_have_same_index

  # Sorting only happens on this path, i.e. when the indexes differ.
  labels = source.values.map { |vec| vec.index.to_a }.flatten.uniq.sort
  DaruLite::Index.new(labels)
end
2813
+
2814
# Returns one copied vector per name in @vectors.
#
# When all supplied vectors share the same index they are simply dup'ed,
# which avoids matching indexes label by label. Otherwise a fresh vector
# over @index is filled label by label, using nil for labels the source
# vector does not have.
def clone_vectors(source, vectors_have_same_index)
  return @vectors.map { |name| source[name].dup } if vectors_have_same_index

  @vectors.map do |name|
    realigned = DaruLite::Vector.new([], name: name, index: @index)
    @index.each do |idx|
      realigned[idx] = source[name].index.include?(idx) ? source[name][idx] : nil
    end
    realigned
  end
end
2829
+
2830
# Builds @index and appends one vector per name in @vectors from a hash
# whose values are plain arrays.
#
# The index is coerced from +index+, or from the size of the first hash
# value when +index+ is nil. Each source array is dup'ed before being
# wrapped, so the new vector receives a copy rather than the caller's array.
def initialize_from_hash_with_arrays(source, index, _opts)
  @index = Index.coerce(index || source.values[0].size)

  @vectors.each do |name|
    column = DaruLite::Vector.new(source[name].dup, name: coerce_name(name), index: @index)
    @data.push(column)
  end
end
2837
+
2838
# Builds the array used to compare one row against another while sorting.
#
# For each sort vector the value is read at row r1 when that vector sorts
# ascending and at row r2 otherwise. An optional per-vector callable
# transforms the value; if it raises a StandardError the value becomes nil.
# Nil handling is applied when requested, and always when no callable is
# given.
def sort_build_row(vector_locs, by_blocks, ascending, handle_nils, r1, r2) # rubocop:disable Metrics/ParameterLists
  specs = vector_locs.zip(by_blocks, ascending, handle_nils)

  specs.map do |loc, transform, asc, handle_nil|
    raw = @data[loc].data[asc ? r1 : r2]
    value =
      if transform
        begin
          transform.call(raw)
        rescue StandardError
          nil
        end
      else
        raw
      end

    sort_handle_nils(value, asc, handle_nil || !transform)
  end
end
2856
+
2857
# Wraps +value+ for comparison so that nils can be ordered.
#
# Without nil handling the value is returned untouched. With it, a
# [rank, value] pair is returned: nil ranks 0 (others 1) when ascending,
# and 1 (others 0) when descending.
def sort_handle_nils(value, asc, handle_nil)
  return value unless handle_nil

  nil_rank, present_rank = asc ? [0, 1] : [1, 0]
  [value.nil? ? nil_rank : present_rank, value]
end
2866
+
2867
# Expands the boolean sort option opts[symbol] into one flag per vector.
#
# - nil        -> +size+ copies of +default+
# - true/false -> +size+ copies of that value
# - Array      -> returned as is, provided it has +size+ elements
#
# Raises ArgumentError for an Array of the wrong length or for any other
# kind of value.
def sort_coerce_boolean(opts, symbol, default, size)
  val = opts[symbol]
  return Array.new(size, default) if val.nil?
  return Array.new(size, val) if [true, false].include?(val)

  unless val.is_a?(Array)
    raise ArgumentError, "Can't coerce #{symbol} from #{val.class} to boolean option"
  end
  raise ArgumentError, "Specify same number of vector names and #{symbol}" if size != val.size

  val
end
2883
+
2884
# Returns a lambda that compares two row positions for sorting by the
# vectors named in +vector_order+.
#
# Options read from +opts+: :ascending (default true), :handle_nils
# (default false) and :by, a hash of per-vector callables.
def sort_prepare_block(vector_order, opts)
  count = vector_order.size
  ascending = sort_coerce_boolean(opts, :ascending, true, count)
  handle_nils = sort_coerce_boolean(opts, :handle_nils, false, count)

  transforms = opts[:by] || {}
  by_blocks = vector_order.map { |name| transforms[name] }
  vector_locs = vector_order.map { |name| @vectors[name] }

  lambda do |index1, index2|
    left = sort_build_row(vector_locs, by_blocks, ascending, handle_nils, index1, index2)
    right = sort_build_row(vector_locs, by_blocks, ascending, handle_nils, index2, index1)

    # The row positions are appended so that rows equal on every sort
    # attribute are still ordered deterministically.
    (left << index1) <=> (right << index2)
  end
end
2902
+
2903
# Formats one verification failure line.
#
# row  - the failing row (read by key)
# test - array whose first two elements are the description and the list
#        of fields to show
# id   - key of the row's identifier
# i    - zero-based row number (shown one-based)
def verify_error_message(row, test, id, i)
  description, fields, = test
  details = fields.map { |key| "#{key}=#{row[key]}" }.join(', ')
  suffix = fields.empty? ? '' : " (#{details})"

  "#{i + 1} [#{row[id]}]: #{description}#{suffix}"
end
2908
+
2909
# Determines which vectors supply the values of a pivot table.
#
# - opts[:values] nil: the numeric vectors that are used neither as
#   +index+ nor as +vectors+
# - opts[:values] an Array: that array
# - anything else: a single-element array holding it
def prepare_pivot_values(index, vectors, opts)
  requested = opts[:values]
  return (@vectors.to_a - (index | vectors)) & numeric_vector_names if requested.nil?
  return requested if requested.is_a?(Array)

  [requested]
end
2919
+
2920
# Collects the pivot cells into a nested hash and aggregates them.
#
# The result maps each group name of +grouped+ to a hash keyed by
# [value_name, *values of +vectors+ in that row]. Raw cell values are
# gathered per key first; setup_pivot_aggregates then replaces each list
# with its aggregate.
def make_pivot_hash(grouped, vectors, values, aggregate_function)
  super_hash = grouped.groups.transform_values { |_| {} }

  values.each do |value|
    grouped.groups.each do |group_name, row_numbers|
      row_numbers.each do |num|
        key = [value, *vectors.map { |v| self[v][num] }]
        bucket = super_hash[group_name]
        (bucket[key] ||= []) << self[value][num]
      end
    end
  end

  setup_pivot_aggregates(super_hash, aggregate_function)
  super_hash
end
2937
+
2938
# Replaces, in place, every list of collected values inside +super_hash+
# with the result of calling +aggregate_function+ on a DaruLite::Vector
# built from that list. Returns +super_hash+.
def setup_pivot_aggregates(super_hash, aggregate_function)
  super_hash.each_value do |sub_hash|
    sub_hash.transform_values! do |aggregates|
      DaruLite::Vector.new(aggregates).send(aggregate_function)
    end
  end
end
2945
+
2946
# Turns the nested pivot hash into a DataFrame.
#
# The outer keys become the row MultiIndex and the distinct inner keys
# become the vector MultiIndex. Each aggregate is written into its
# [vector][row] cell.
def pivot_dataframe(super_hash)
  row_index = DaruLite::MultiIndex.from_tuples(super_hash.keys)
  column_tuples = super_hash.values.flat_map(&:keys).uniq
  column_index = DaruLite::MultiIndex.from_tuples(column_tuples)

  pivoted = DaruLite::DataFrame.new({}, index: row_index, order: column_index)
  super_hash.each do |row, cells|
    cells.each { |column, val| pivoted[column][row] = val }
  end
  pivoted
end
2958
+
2959
# Extracts the variable names and numbers encoded in the vector names.
#
# pattern - template in which %v stands for a variable name and %n for a
#           number, e.g. '%v_%n'.
#
# Returns [unique variable names, sorted unique numbers as Integers].
# Both lists are empty when no vector name matches.
def one_to_many_components(pattern)
  # Literal characters of the pattern are escaped so that e.g. '.' matches
  # only a dot, mirroring the literal substitution done when names are
  # rebuilt from the same pattern.
  body = Regexp.escape(pattern).gsub('%v', '(.+?)').gsub('%n', '(\\d+?)')
  # Anchoring forces the lazy groups to cover the whole name; without it
  # (\d+?) stops after one digit and 'a_10' would be read as number 1.
  re = Regexp.new("\\A#{body}\\z")

  vars, numbers =
    @vectors
    .map { |v| v.scan(re) }
    .reject(&:empty?).flatten(1).transpose

  [Array(vars).uniq, Array(numbers).map(&:to_i).sort.uniq]
end
2969
+
2970
# Builds a hash mapping each variable in +vars+ to the value that +row+
# holds under the name obtained by substituting the variable for %v and
# +number+ for %n in +pattern+.
def one_to_many_row(row, number, vars, pattern)
  suffix = number.to_s

  vars.each_with_object({}) do |var, out|
    out[var] = row[pattern.sub('%v', var).sub('%n', suffix)]
  end
end
2977
+
2978
# Raises IndexError when one of the positions is not a valid position.
#
# A position is rejected when it is >= +size+, or when it is a negative
# number below -size (which would not address any element when counting
# from the end). Returns the positions when all of them are acceptable.
def validate_positions(*positions, size)
  positions.each do |pos|
    raise IndexError, "#{pos} is not a valid position." if pos >= size || pos < -size
  end
end
2984
+
2985
# Accepts a vector, a hash or any other enumerable and returns a
# DaruLite::Vector ready to be added.
#
# Vectors and hashes are reindexed against @vectors (hashes are wrapped in
# a Vector first); anything else is only wrapped in a new Vector.
def coerce_vector(vector)
  if vector.is_a?(DaruLite::Vector)
    vector.reindex(@vectors)
  elsif vector.is_a?(Hash)
    DaruLite::Vector.new(vector).reindex(@vectors)
  else
    DaruLite::Vector.new(vector)
  end
end
2996
+
2997
# Rebuilds @data with one DaruLite::Vector per entry of @vectors, taking
# the values from +source+ and the name from +vectors+ at the same
# position. Every vector shares @index.
def update_data(source, vectors)
  @data = @vectors.each_with_index.map do |_name, position|
    values = source[position]
    DaruLite::Vector.new(values, index: @index, name: vectors[position])
  end
end
3002
+
3003
# Applies the aggregations in +options+ to each group of row positions.
#
# When every aggregation targets a vector, each vector is aggregated
# group by group. Otherwise all methods are applied to each sub-frame in
# one pass and the per-group rows are transposed, so both paths return one
# array per aggregation.
def aggregate_by_positions_tuples(options, positions_tuples)
  agg_over_vectors_only, options = cast_aggregation_options(options)

  unless agg_over_vectors_only
    methods = options.values

    # NOTE: because we aggregate over rows, the sub-frame of a group is
    # fetched once for all methods (re-fetching per method is expensive).
    rows = positions_tuples.map do |positions|
      apply_method_on_sub_df(methods, keys: positions)
    end
    return rows.transpose
  end

  options.map do |vect_name, method|
    column = self[vect_name]
    positions_tuples.map do |positions|
      column.apply_method_on_sub_vector(method, keys: positions)
    end
  end
end
3025
+
3026
# Converts operations over sub-vectors into operations over sub-dataframes
# when that improves performance.
#
# We do not always "cast": aggregating over a single vector (or a few) is
# faster than aggregating over sub-dataframes, so the options are only
# rewritten when at least one key is not a vector name.
#
# Returns [over_vectors, options]. +over_vectors+ is true when every key
# names a vector, in which case +options+ is returned untouched. Otherwise
# a copy is returned in which each vector aggregation is wrapped in a
# lambda that takes a sub-dataframe. The caller's hash is never modified,
# and may be frozen.
def cast_aggregation_options(options)
  vects, non_vects = options.keys.partition { |k| @vectors.include?(k) }

  return [true, options] if non_vects.empty?

  # dup rather than clone: clone would carry over a frozen state and make
  # the assignments below raise FrozenError.
  options = options.dup

  vects.each do |name|
    proc_on_vect = options[name].to_proc
    options[name] = ->(sub_df) { proc_on_vect.call(sub_df[name]) }
  end

  [false, options]
end
3047
+
3048
# Groups the positions of +index+ for aggregation.
#
# For a MultiIndex the grouping is delegated to
# DaruLite::Core::GroupBy.get_positions_group_for_aggregation at
# +multi_index_level+. For an Index or CategoricalIndex each distinct
# label forms a group holding all positions of that label.
#
# Returns [pos_tuples, new_index].
# Raises RuntimeError for any other kind of index.
def group_index_for_aggregation(index, multi_index_level = -1)
  case index
  when DaruLite::MultiIndex
    groups_by_pos = DaruLite::Core::GroupBy.get_positions_group_for_aggregation(index, multi_index_level)

    new_index = DaruLite::MultiIndex.from_tuples(groups_by_pos.keys).coerce_index
    pos_tuples = groups_by_pos.values
  when DaruLite::Index, DaruLite::CategoricalIndex
    new_index = Array(index).uniq
    pos_tuples = new_index.map { |idx| [*index.pos(idx)] }
  else
    # A bare `raise` here produced a message-less RuntimeError (or re-raised
    # whatever exception happened to be current); say what went wrong.
    raise "Cannot group an index of class #{index.class} for aggregation"
  end

  [pos_tuples, new_index]
end
3063
+
3064
# Coerces ranges, integers and arrays in appropriate ways.
#
# - several positions: returned as an array
# - a single Integer: returned as is
# - a single Range: expanded to the positions it selects out of
#   0...size (negative bounds count from the end)
#
# Raises ArgumentError for a single position of any other type.
def coerce_positions(*positions, size)
  return positions unless positions.size == 1

  position = positions.first
  case position
  when Integer then position
  when Range then size.times.to_a[position]
  else raise ArgumentError, 'Unknown position type.'
  end
end
3079
+ end
3080
+ end