daru_lite 0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (149) hide show
  1. checksums.yaml +7 -0
  2. data/.github/ISSUE_TEMPLATE.md +18 -0
  3. data/.github/workflows/ci.yml +33 -0
  4. data/.gitignore +10 -0
  5. data/.rspec +2 -0
  6. data/.rubocop.yml +27 -0
  7. data/.rubocop_todo.yml +137 -0
  8. data/CONTRIBUTING.md +47 -0
  9. data/Gemfile +2 -0
  10. data/History.md +4 -0
  11. data/LICENSE +24 -0
  12. data/README.md +218 -0
  13. data/Rakefile +69 -0
  14. data/ReleasePolicy.md +20 -0
  15. data/benchmarks/TradeoffData.csv +65 -0
  16. data/benchmarks/csv_reading.rb +22 -0
  17. data/benchmarks/dataframe_creation.rb +39 -0
  18. data/benchmarks/db_loading.rb +34 -0
  19. data/benchmarks/duplicating.rb +45 -0
  20. data/benchmarks/group_by.rb +32 -0
  21. data/benchmarks/joining.rb +52 -0
  22. data/benchmarks/row_access.rb +41 -0
  23. data/benchmarks/row_assign.rb +36 -0
  24. data/benchmarks/sorting.rb +51 -0
  25. data/benchmarks/statistics.rb +28 -0
  26. data/benchmarks/vector_access.rb +31 -0
  27. data/benchmarks/vector_assign.rb +42 -0
  28. data/benchmarks/where_clause.rb +48 -0
  29. data/benchmarks/where_vs_filter.rb +28 -0
  30. data/daru_lite.gemspec +55 -0
  31. data/images/README.md +5 -0
  32. data/images/con0.png +0 -0
  33. data/images/con1.png +0 -0
  34. data/images/init0.png +0 -0
  35. data/images/init1.png +0 -0
  36. data/images/man0.png +0 -0
  37. data/images/man1.png +0 -0
  38. data/images/man2.png +0 -0
  39. data/images/man3.png +0 -0
  40. data/images/man4.png +0 -0
  41. data/images/man5.png +0 -0
  42. data/images/man6.png +0 -0
  43. data/lib/daru_lite/accessors/array_wrapper.rb +109 -0
  44. data/lib/daru_lite/accessors/dataframe_by_row.rb +25 -0
  45. data/lib/daru_lite/accessors/mdarray_wrapper.rb +7 -0
  46. data/lib/daru_lite/category.rb +929 -0
  47. data/lib/daru_lite/configuration.rb +34 -0
  48. data/lib/daru_lite/core/group_by.rb +403 -0
  49. data/lib/daru_lite/core/merge.rb +270 -0
  50. data/lib/daru_lite/core/query.rb +109 -0
  51. data/lib/daru_lite/dataframe.rb +3080 -0
  52. data/lib/daru_lite/date_time/index.rb +569 -0
  53. data/lib/daru_lite/date_time/offsets.rb +397 -0
  54. data/lib/daru_lite/exceptions.rb +2 -0
  55. data/lib/daru_lite/extensions/which_dsl.rb +53 -0
  56. data/lib/daru_lite/formatters/table.rb +52 -0
  57. data/lib/daru_lite/helpers/array.rb +53 -0
  58. data/lib/daru_lite/index/categorical_index.rb +201 -0
  59. data/lib/daru_lite/index/index.rb +374 -0
  60. data/lib/daru_lite/index/multi_index.rb +374 -0
  61. data/lib/daru_lite/io/csv/converters.rb +21 -0
  62. data/lib/daru_lite/io/io.rb +294 -0
  63. data/lib/daru_lite/io/sql_data_source.rb +97 -0
  64. data/lib/daru_lite/iruby/helpers.rb +38 -0
  65. data/lib/daru_lite/iruby/templates/dataframe.html.erb +5 -0
  66. data/lib/daru_lite/iruby/templates/dataframe_mi.html.erb +5 -0
  67. data/lib/daru_lite/iruby/templates/dataframe_mi_tbody.html.erb +35 -0
  68. data/lib/daru_lite/iruby/templates/dataframe_mi_thead.html.erb +21 -0
  69. data/lib/daru_lite/iruby/templates/dataframe_tbody.html.erb +28 -0
  70. data/lib/daru_lite/iruby/templates/dataframe_thead.html.erb +21 -0
  71. data/lib/daru_lite/iruby/templates/multi_index.html.erb +12 -0
  72. data/lib/daru_lite/iruby/templates/vector.html.erb +5 -0
  73. data/lib/daru_lite/iruby/templates/vector_mi.html.erb +5 -0
  74. data/lib/daru_lite/iruby/templates/vector_mi_tbody.html.erb +26 -0
  75. data/lib/daru_lite/iruby/templates/vector_mi_thead.html.erb +8 -0
  76. data/lib/daru_lite/iruby/templates/vector_tbody.html.erb +17 -0
  77. data/lib/daru_lite/iruby/templates/vector_thead.html.erb +8 -0
  78. data/lib/daru_lite/maths/arithmetic/dataframe.rb +91 -0
  79. data/lib/daru_lite/maths/arithmetic/vector.rb +117 -0
  80. data/lib/daru_lite/maths/statistics/dataframe.rb +202 -0
  81. data/lib/daru_lite/maths/statistics/vector.rb +1019 -0
  82. data/lib/daru_lite/monkeys.rb +56 -0
  83. data/lib/daru_lite/vector.rb +1678 -0
  84. data/lib/daru_lite/version.rb +3 -0
  85. data/lib/daru_lite.rb +99 -0
  86. data/profile/_base.rb +23 -0
  87. data/profile/df_to_a.rb +10 -0
  88. data/profile/filter.rb +13 -0
  89. data/profile/joining.rb +13 -0
  90. data/profile/sorting.rb +12 -0
  91. data/profile/vector_each_with_index.rb +9 -0
  92. data/profile/vector_new.rb +9 -0
  93. data/spec/accessors/array_wrapper_spec.rb +3 -0
  94. data/spec/category_spec.rb +1741 -0
  95. data/spec/core/group_by_spec.rb +655 -0
  96. data/spec/core/merge_spec.rb +179 -0
  97. data/spec/core/query_spec.rb +347 -0
  98. data/spec/daru_lite_spec.rb +22 -0
  99. data/spec/dataframe_spec.rb +4330 -0
  100. data/spec/date_time/data_spec.rb +197 -0
  101. data/spec/date_time/date_time_index_helper_spec.rb +72 -0
  102. data/spec/date_time/index_spec.rb +588 -0
  103. data/spec/date_time/offsets_spec.rb +465 -0
  104. data/spec/extensions/which_dsl_spec.rb +38 -0
  105. data/spec/fixtures/bank2.dat +200 -0
  106. data/spec/fixtures/boolean_converter_test.csv +5 -0
  107. data/spec/fixtures/countries.json +7794 -0
  108. data/spec/fixtures/duplicates.csv +32 -0
  109. data/spec/fixtures/eciresults.html +394 -0
  110. data/spec/fixtures/empties.dat +2 -0
  111. data/spec/fixtures/empty_rows_test.csv +17 -0
  112. data/spec/fixtures/macau.html +3691 -0
  113. data/spec/fixtures/macd_data.csv +150 -0
  114. data/spec/fixtures/matrix_test.csv +100 -0
  115. data/spec/fixtures/moneycontrol.html +6812 -0
  116. data/spec/fixtures/music_data.tsv +2501 -0
  117. data/spec/fixtures/repeated_fields.csv +7 -0
  118. data/spec/fixtures/sales-funnel.csv +18 -0
  119. data/spec/fixtures/scientific_notation.csv +4 -0
  120. data/spec/fixtures/string_converter_test.csv +5 -0
  121. data/spec/fixtures/strings.dat +2 -0
  122. data/spec/fixtures/test_xls.xls +0 -0
  123. data/spec/fixtures/test_xls_2.xls +0 -0
  124. data/spec/fixtures/url_test.txt~ +0 -0
  125. data/spec/fixtures/valid_markup.html +62 -0
  126. data/spec/fixtures/wiki_climate.html +1243 -0
  127. data/spec/fixtures/wiki_table_info.html +631 -0
  128. data/spec/formatters/table_formatter_spec.rb +137 -0
  129. data/spec/helpers_spec.rb +8 -0
  130. data/spec/index/categorical_index_spec.rb +170 -0
  131. data/spec/index/index_spec.rb +417 -0
  132. data/spec/index/multi_index_spec.rb +680 -0
  133. data/spec/io/io_spec.rb +373 -0
  134. data/spec/io/sql_data_source_spec.rb +56 -0
  135. data/spec/iruby/dataframe_spec.rb +170 -0
  136. data/spec/iruby/helpers_spec.rb +49 -0
  137. data/spec/iruby/multi_index_spec.rb +37 -0
  138. data/spec/iruby/vector_spec.rb +105 -0
  139. data/spec/maths/arithmetic/dataframe_spec.rb +148 -0
  140. data/spec/maths/arithmetic/vector_spec.rb +165 -0
  141. data/spec/maths/statistics/dataframe_spec.rb +178 -0
  142. data/spec/maths/statistics/vector_spec.rb +756 -0
  143. data/spec/monkeys_spec.rb +42 -0
  144. data/spec/shared/vector_display_spec.rb +213 -0
  145. data/spec/spec_helper.rb +87 -0
  146. data/spec/support/database_helper.rb +30 -0
  147. data/spec/support/matchers.rb +5 -0
  148. data/spec/vector_spec.rb +2293 -0
  149. metadata +571 -0
@@ -0,0 +1,3080 @@
1
+ require 'daru_lite/accessors/dataframe_by_row'
2
+ require 'daru_lite/maths/arithmetic/dataframe'
3
+ require 'daru_lite/maths/statistics/dataframe'
4
+ require 'daru_lite/io/io'
5
+
6
+ module DaruLite
7
+ class DataFrame # rubocop:disable Metrics/ClassLength
8
+ include DaruLite::Maths::Arithmetic::DataFrame
9
+ include DaruLite::Maths::Statistics::DataFrame
10
+
11
+ attr_accessor(*Configuration::INSPECT_OPTIONS_KEYS)
12
+
13
+ extend Gem::Deprecate
14
+
15
+ class << self
16
+ # Load data from a CSV file. Specify an optional block to grab the CSV
17
+ # object and pre-condition it (for example use the `convert` or
18
+ # `header_convert` methods).
19
+ #
20
+ # == Arguments
21
+ #
22
+ # * path - Local path / Remote URL of the file to load specified as a String.
23
+ #
24
+ # == Options
25
+ #
26
+ # Accepts the same options as the DaruLite::DataFrame constructor and CSV.open()
27
+ # and uses those to eventually construct the resulting DataFrame.
28
+ #
29
+ # == Verbose Description
30
+ #
31
+ # You can specify all the options to the `.from_csv` function that you
32
+ # do to the Ruby `CSV.read()` function, since this is what is used internally.
33
+ #
34
+ # For example, if the columns in your CSV file are separated by something
35
+ # other than commas, you can use the `:col_sep` option. If you want to
36
+ # convert numeric values to numbers and not keep them as strings, you can
37
+ # use the `:converters` option and set it to `:numeric`.
38
+ #
39
+ # The `.from_csv` function uses the following defaults for reading CSV files
40
+ # (that are passed into the `CSV.read()` function):
41
+ #
42
+ # {
43
+ # :col_sep => ',',
44
+ # :converters => :numeric
45
+ # }
46
def from_csv(path, opts = {}, &block)
  # Thin delegator: the actual CSV loading is done by DaruLite::IO.
  DaruLite::IO.from_csv(path, opts, &block)
end
49
+
50
+ # Read data from an Excel file into a DataFrame.
51
+ #
52
+ # == Arguments
53
+ #
54
+ # * path - Path of the file to be read.
55
+ #
56
+ # == Options
57
+ #
58
+ # *:worksheet_id - ID of the worksheet that is to be read.
59
def from_excel(path, opts = {}, &block)
  # Thin delegator: the actual Excel loading is done by DaruLite::IO.
  DaruLite::IO.from_excel(path, opts, &block)
end
62
+
63
+ # Executes a database query and returns the result as a DataFrame
64
+ #
65
+ # @param dbh [DBI::DatabaseHandle, String] A DBI connection OR Path to a SQlite3 database.
66
+ # @param query [String] The query to be executed
67
+ #
68
+ # @return A dataframe containing the data resulting from the query
69
+ #
70
+ # USE:
71
+ #
72
+ # dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
73
+ # DaruLite::DataFrame.from_sql(dbh, "SELECT * FROM test")
74
+ #
75
+ # #Alternatively
76
+ #
77
+ # require 'dbi'
78
+ # DaruLite::DataFrame.from_sql("path/to/sqlite.db", "SELECT * FROM test")
79
def from_sql(dbh, query)
  # Thin delegator: connection handling and querying live in DaruLite::IO.
  DaruLite::IO.from_sql(dbh, query)
end
82
+
83
+ # Read a dataframe from AR::Relation
84
+ #
85
+ # @param relation [ActiveRecord::Relation] An AR::Relation object from which data is loaded
86
+ # @param fields [Array] Field names to be loaded (optional)
87
+ #
88
+ # @return A dataframe containing the data loaded from the relation
89
+ #
90
+ # USE:
91
+ #
92
+ # # When Post model is defined as:
93
+ # class Post < ActiveRecord::Base
94
+ # scope :active, -> { where.not(published_at: nil) }
95
+ # end
96
+ #
97
+ # # You can load active posts into a dataframe by:
98
+ # DaruLite::DataFrame.from_activerecord(Post.active, :title, :published_at)
99
def from_activerecord(relation, *fields)
  # Thin delegator: the relation and the optional field list are passed through unchanged.
  DaruLite::IO.from_activerecord(relation, *fields)
end
102
+
103
+ # Read the database from a plaintext file. For this method to work,
104
+ # the data should be present in a plain text file in columns. See
105
+ # spec/fixtures/bank2.dat for an example.
106
+ #
107
+ # == Arguments
108
+ #
109
+ # * path - Path of the file to be read.
110
+ # * fields - Vector names of the resulting database.
111
+ #
112
+ # == Usage
113
+ #
114
+ # df = DaruLite::DataFrame.from_plaintext 'spec/fixtures/bank2.dat', [:v1,:v2,:v3,:v4,:v5,:v6]
115
def from_plaintext(path, fields)
  # Thin delegator: the plain-text parsing is done by DaruLite::IO.
  DaruLite::IO.from_plaintext(path, fields)
end
118
+
119
# Create a DataFrame by specifying its rows, given either as an Array of
# Arrays or as an Array of DaruLite::Vector objects.
#
# @param source [Array<Array>, Array<DaruLite::Vector>] the rows; every row
#   must have the same size
# @param opts [Hash] options passed on to the DataFrame being built. When
#   +:order+ is absent it is guessed from the first row. The Hash supplied by
#   the caller is never modified.
# @return [DaruLite::DataFrame]
# @raise [SizeError] if the rows do not all have the same size
# @raise [ArgumentError] if the rows are neither all Arrays nor all Vectors
def rows(source, opts = {})
  raise SizeError, 'All vectors must have same length' \
    unless source.all? { |v| v.size == source.first.size }

  # Work on a copy: writing the guessed order into the caller's Hash would
  # leak it into later calls that reuse the same options object.
  opts = opts.merge(order: guess_order(source)) unless opts[:order]

  if ArrayHelper.array_of?(source, Array) || source.empty?
    DataFrame.new(source.transpose, opts)
  elsif ArrayHelper.array_of?(source, Vector)
    from_vector_rows(source, opts)
  else
    raise ArgumentError, "Can't create DataFrame from #{source}"
  end
end
135
+
136
+ # Generates a new dataset, using three vectors
137
+ # - Rows
138
+ # - Columns
139
+ # - Values
140
+ #
141
+ # For example, you have these values
142
+ #
143
+ # x y v
144
+ # a a 0
145
+ # a b 1
146
+ # b a 1
147
+ # b b 0
148
+ #
149
+ # You obtain
150
+ # id a b
151
+ # a 0 1
152
+ # b 1 0
153
+ #
154
+ # Useful to process outputs from databases
155
def crosstab_by_assignation(rows, columns, values)
  raise 'Three vectors should be equal size' if
    rows.size != columns.size || rows.size != values.size

  # Ask for the row factors once instead of once per new column plus once
  # more for the :_id column (presumably these are the distinct row labels).
  row_labels = rows.factors

  # Each column starts out as { row_label => nil } and is filled in below.
  data = Hash.new do |table, col|
    table[col] = row_labels.map { |label| [label, nil] }.to_h
  end
  columns.zip(rows, values).each { |col, row, value| data[col][row] = value }

  # FIXME: in fact, WITHOUT this line you'll obtain more "right"
  # data: with vectors having "rows" as an index...
  data = data.transform_values(&:values)
  data[:_id] = row_labels

  DataFrame.new(data)
end
171
+
172
+ private
173
+
174
def guess_order(source)
  # Only the first row is inspected; the remaining rows are assumed to be
  # of the same kind. Returns nil when the first row is neither kind.
  head = source.first
  case head
  when Vector then head.index.to_a
  when Array then head.each_index.map(&:to_s)
  end
end
182
+
183
def from_vector_rows(source, opts)
  # A row is labelled by its vector name, or by its position when unnamed.
  labels = source.each_with_index.map { |vec, pos| vec.name || pos }
  labels = ArrayHelper.recode_repeated(labels)

  frame = DataFrame.new({}, opts)
  source.each_with_index do |vec, pos|
    frame[labels[pos] || pos, :row] = vec
  end
  frame
end
194
+ end
195
+
196
+ # The vectors (columns) index of the DataFrame
197
+ attr_reader :vectors
198
+ # TOREMOVE
199
+ attr_reader :data
200
+
201
+ # The index of the rows of the DataFrame
202
+ attr_reader :index
203
+
204
+ # The name of the DataFrame
205
+ attr_reader :name
206
+
207
+ # The number of rows present in the DataFrame
208
+ attr_reader :size
209
+
210
+ # DataFrame basically consists of an Array of Vector objects.
211
+ # These objects are indexed by row and column by vectors and index Index objects.
212
+ #
213
+ # == Arguments
214
+ #
215
+ # * source - Source from which the DataFrame is to be initialized. Can be a Hash
216
+ # of names and vectors (array or DaruLite::Vector), an array of arrays or
217
+ # array of DaruLite::Vectors.
218
+ #
219
+ # == Options
220
+ #
221
+ # +:order+ - An *Array*/*DaruLite::Index*/*DaruLite::MultiIndex* containing the order in
222
+ # which Vectors should appear in the DataFrame.
223
+ #
224
+ # +:index+ - An *Array*/*DaruLite::Index*/*DaruLite::MultiIndex* containing the order
225
+ # in which rows of the DataFrame will be named.
226
+ #
227
+ # +:name+ - A name for the DataFrame.
228
+ #
229
+ # +:clone+ - Specify as *true* or *false*. When set to false, and Vector
230
+ # objects are passed for the source, the Vector objects will not be duplicated
231
+ # when creating the DataFrame. Will have no effect if Array is passed in
232
+ # the source, or if the passed DaruLite::Vectors have different indexes.
233
+ # Default to *true*.
234
+ #
235
+ # == Usage
236
+ #
237
+ # df = DaruLite::DataFrame.new
238
+ # # =>
239
+ # # <DaruLite::DataFrame(0x0)>
240
+ # # Creates an empty DataFrame with no rows or columns.
241
+ #
242
+ # df = DaruLite::DataFrame.new({}, order: [:a, :b])
243
+ # #<DaruLite::DataFrame(0x2)>
244
+ # a b
245
+ # # Creates a DataFrame with no rows and columns :a and :b
246
+ #
247
+ # df = DaruLite::DataFrame.new({a: [1,2,3,4], b: [6,7,8,9]}, order: [:b, :a],
248
+ # index: [:a, :b, :c, :d], name: :spider_man)
249
+ #
250
+ # # =>
251
+ # # <DaruLite::DataFrame:80766980 @name = spider_man @size = 4>
252
+ # # b a
253
+ # # a 6 1
254
+ # # b 7 2
255
+ # # c 8 3
256
+ # # d 9 4
257
+ #
258
+ # df = DaruLite::DataFrame.new([[1,2,3,4],[6,7,8,9]], name: :bat_man)
259
+ #
260
+ # # =>
261
+ # # #<DaruLite::DataFrame: bat_man (4x2)>
262
+ # # 0 1
263
+ # # 0 1 6
264
+ # # 1 2 7
265
+ # # 2 3 8
266
+ # # 3 4 9
267
+ #
268
+ # # Dataframe having Index name
269
+ #
270
+ # df = DaruLite::DataFrame.new({a: [1,2,3,4], b: [6,7,8,9]}, order: [:b, :a],
271
+ # index: DaruLite::Index.new([:a, :b, :c, :d], name: 'idx_name'),
272
+ # name: :spider_man)
273
+ #
274
+ # # =>
275
+ # # <DaruLite::DataFrame:80766980 @name = spider_man @size = 4>
276
+ # # idx_name b a
277
+ # # a 6 1
278
+ # # b 7 2
279
+ # # c 8 3
280
+ # # d 9 4
281
+ #
282
+ #
283
+ # idx = DaruLite::Index.new [100, 99, 101, 1, 2], name: "s1"
284
+ # => #<DaruLite::Index(5): s1 {100, 99, 101, 1, 2}>
285
+ #
286
+ # df = DaruLite::DataFrame.new({b: [11,12,13,14,15], a: [1,2,3,4,5],
287
+ # c: [11,22,33,44,55]},
288
+ # order: [:a, :b, :c],
289
+ # index: idx)
290
+ # # =>
291
+ # #<DaruLite::DataFrame(5x3)>
292
+ # # s1 a b c
293
+ # # 100 1 11 11
294
+ # # 99 2 12 22
295
+ # # 101 3 13 33
296
+ # # 1 4 14 44
297
+ # # 2 5 15 55
298
+
299
def initialize(source = {}, opts = {})
  # FIXME: just keyword args after Ruby 2.1
  @name = opts[:name]
  @data = []
  order = opts[:order]
  row_index = opts[:index]

  # Empty containers are matched first so that they never reach the
  # Array/Hash specific initialisers. Any other kind of source matches no
  # branch and leaves @data empty.
  case source
  when [], {} then create_empty_vectors(order, row_index)
  when Array then initialize_from_array(source, order, row_index, opts)
  when Hash then initialize_from_hash(source, order, row_index, opts)
  end

  set_size
  validate
  update
end
318
+
319
+ # Access row or vector. Specify name of row/vector followed by axis(:row, :vector).
320
+ # Defaults to *:vector*. Use of this method is not recommended for accessing
321
+ # rows. Use df.row[:a] for accessing row with index ':a'.
322
def [](*names)
  # The axis is resolved before names is splatted into the dispatch call
  # (extract_axis presumably strips a trailing axis marker - confirm).
  target_axis = extract_axis(names, :vector)
  dispatch_to_axis(target_axis, :access, *names)
end
326
+
327
+ # Retrieve rows by positions
328
+ # @param [Array<Integer>] positions of rows to retrieve
329
+ # @return [DaruLite::Vector, DaruLite::DataFrame] vector for single position and dataframe for multiple positions
330
+ # @example
331
+ # df = DaruLite::DataFrame.new({
332
+ # a: [1, 2, 3],
333
+ # b: ['a', 'b', 'c']
334
+ # })
335
+ # df.row_at 1, 2
336
+ # # => #<DaruLite::DataFrame(2x2)>
337
+ # # a b
338
+ # # 1 2 b
339
+ # # 2 3 c
340
def row_at(*positions)
  requested = positions
  resolved = coerce_positions(*requested, nrows)
  validate_positions(*resolved, nrows)

  # Several positions: build a DataFrame from the positions as requested.
  unless resolved.is_a?(Integer)
    return DaruLite::DataFrame.new(
      get_rows_for(requested),
      index: @index.at(*requested),
      order: @vectors
    )
  end

  # Single position: the row comes back as a Vector indexed by vector names.
  DaruLite::Vector.new(get_rows_for([resolved]), index: @vectors)
end
353
+
354
+ # Set rows by positions
355
+ # @param [Array<Integer>] positions positions of rows to set
356
+ # @param [Array, DaruLite::Vector] vector vector to be assigned
357
+ # @example
358
+ # df = DaruLite::DataFrame.new({
359
+ # a: [1, 2, 3],
360
+ # b: ['a', 'b', 'c']
361
+ # })
362
+ # df.set_row_at [0, 1], ['x', 'x']
363
+ # df
364
+ # #=> #<DaruLite::DataFrame(3x2)>
365
+ # # a b
366
+ # # 0 x x
367
+ # # 1 x x
368
+ # # 2 3 c
369
def set_row_at(positions, vector)
  validate_positions(*positions, nrows)

  # A Vector is aligned with the frame's vector names; anything else is
  # wrapped as-is.
  new_row = vector.is_a?(DaruLite::Vector) ? vector.reindex(@vectors) : DaruLite::Vector.new(vector)
  raise SizeError, 'Vector length should match row length' unless new_row.size == @vectors.size

  # Write the col_pos-th value of the new row into every targeted position
  # of the col_pos-th column.
  @data.each_with_index do |column, col_pos|
    column.set_at(positions, new_row.at(col_pos))
  end
  @index = @data[0].index
  set_size
end
387
+
388
+ # Retrieve vectors by positions
389
+ # @param [Array<Integer>] positions of vectors to retrieve
390
+ # @return [DaruLite::Vector, DaruLite::DataFrame] vector for single position and dataframe for multiple positions
391
+ # @example
392
+ # df = DaruLite::DataFrame.new({
393
+ # a: [1, 2, 3],
394
+ # b: ['a', 'b', 'c']
395
+ # })
396
+ # df.at 0
397
+ # # => #<DaruLite::Vector(3)>
398
+ # # a
399
+ # # 0 1
400
+ # # 1 2
401
+ # # 2 3
402
def at(*positions)
  # A trailing axis marker is consumed here; only :row changes the target.
  if AXES.include?(positions.last)
    requested_axis = positions.pop
    return row_at(*positions) if requested_axis == :row
  end

  requested = positions
  resolved = coerce_positions(*requested, ncols)
  validate_positions(*resolved, ncols)

  # Single position: hand back a copy of that vector.
  return @data[resolved].dup if resolved.is_a?(Integer)

  columns = resolved.map { |col_pos| @data[col_pos].dup }
  DaruLite::DataFrame.new(
    columns,
    index: @index,
    order: @vectors.at(*requested),
    name: @name
  )
end
421
+
422
+ # Set vectors by positions
423
+ # @param [Array<Integer>] positions positions of vectors to set
424
+ # @param [Array, DaruLite::Vector] vector vector to be assigned
425
+ # @example
426
+ # df = DaruLite::DataFrame.new({
427
+ # a: [1, 2, 3],
428
+ # b: ['a', 'b', 'c']
429
+ # })
430
+ # df.set_at [0], ['x', 'y', 'z']
431
+ # df
432
+ # #=> #<DaruLite::DataFrame(3x2)>
433
+ # # a b
434
+ # # 0 x a
435
+ # # 1 y b
436
+ # # 2 z c
437
def set_at(positions, vector)
  # A trailing :row redirects to #set_row_at. The marker is dropped from a
  # copy so that the caller's array is left untouched.
  return set_row_at(positions[0...-1], vector) if positions.last == :row

  validate_positions(*positions, ncols)
  vector =
    if vector.is_a? DaruLite::Vector
      # Align the incoming Vector with this frame's row index.
      vector.reindex @index
    else
      DaruLite::Vector.new vector
    end

  raise SizeError, 'Vector length should match index length' if
    vector.size != @index.size

  # NOTE(review): every targeted position receives the very same Vector
  # object - confirm that sharing one object between columns is intended.
  positions.each { |pos| @data[pos] = vector }
end
456
+
457
+ # Insert a new row/vector of the specified name or modify a previous row.
458
+ # Instead of using this method directly, use df.row[:a] = [1,2,3] to set/create
459
+ # a row ':a' to [1,2,3], or df.vector[:vec] = [1,2,3] for vectors.
460
+ #
461
+ # In case a DaruLite::Vector is specified after the equality sign, the indexes
462
+ # of the vector will be matched against the row/vector indexes of the DataFrame
463
+ # before an insertion is performed. Unmatched indexes will be set to nil.
464
def []=(*args)
  # The last argument is the value being assigned; what remains names the
  # target (an optional axis marker is handled by extract_axis).
  value = args.pop
  target_axis = extract_axis(args)

  dispatch_to_axis(target_axis, :insert_or_modify, args, value)
end
471
+
472
def add_row(row, index = nil)
  # Without an explicit index the row is stored under the current size.
  label = index || @size
  self.row[*label] = row
end
475
+
476
# Store +vector+ under the name +n+. Equivalent to <tt>df[n] = vector</tt>.
def add_vector(n, vector)
  self[n] = vector
end
479
+
480
# Insert a new vector at a given position among the existing vectors.
#
# @param n [Integer] position the new vector should occupy in the order
# @param name [Object] name of the new vector
# @param source [Array] values of the new vector, aligned with the row index
# @raise [ArgumentError] if +source+ is not an Array
def insert_vector(n, name, source)
  raise ArgumentError, 'source must be an Array' unless source.is_a? Array

  # The vector carries its own name, not the name of the DataFrame.
  vector = DaruLite::Vector.new(source, index: @index, name: name)
  @data << vector
  @vectors = @vectors.add name

  # The vector was appended last; move its name to position n and reorder.
  ordr = @vectors.dup.to_a
  elmnt = ordr.pop
  ordr.insert n, elmnt
  self.order = ordr
end
491
+
492
+ # Access a row or set/create a row. Refer #[] and #[]= docs for details.
493
+ #
494
+ # == Usage
495
+ # df.row[:a] # access row named ':a'
496
+ # df.row[:b] = [1,2,3] # set row ':b' to [1,2,3]
497
def row
  # A new accessor object wrapping this frame is built on every call.
  DaruLite::Accessors::DataFrameByRow.new(self)
end
500
+
501
+ # Extract a dataframe given row indexes or positions
502
+ # @param keys [Array] can be positions (if by_position is true) or indexes (if by_position is false)
503
+ # @return [DaruLite::Dataframe]
504
def get_sub_dataframe(keys, by_position: true)
  return DaruLite::DataFrame.new({}) if keys == []

  # Index labels are translated to positions before the rows are fetched.
  positions = by_position ? keys : @index.pos(*keys)
  selection = row_at(*positions)

  # A single row arrives as a Vector; turn it back into a one-row frame.
  selection.is_a?(DaruLite::Vector) ? selection.to_df.transpose : selection
end
514
+
515
+ # Duplicate the DataFrame entirely.
516
+ #
517
+ # == Arguments
518
+ #
519
+ # * +vectors_to_dup+ - An Array specifying the names of Vectors to
520
+ # be duplicated. Will duplicate the entire DataFrame if not specified.
521
def dup(vectors_to_dup = nil)
  names = vectors_to_dup || @vectors.to_a

  # Copy each requested vector, looked up by its position in the frame.
  copies = names.map { |vec_name| @data[@vectors.pos(vec_name)].dup }

  DaruLite::DataFrame.new(
    copies,
    order: DaruLite::Index.new(names),
    index: @index.dup,
    name: @name,
    clone: true
  )
end
529
+
530
+ # Only clone the structure of the DataFrame.
531
def clone_structure
  # No data is copied; only duplicates of both indexes and the name.
  DaruLite::DataFrame.new(
    [],
    order: @vectors.dup,
    index: @index.dup,
    name: @name
  )
end
534
+
535
+ # Returns a 'view' of the DataFrame, i.e the object ID's of vectors are
536
+ # preserved.
537
+ #
538
+ # == Arguments
539
+ #
540
+ # +vectors_to_clone+ - Names of vectors to clone. Optional. Will return
541
+ # a view of the whole data frame otherwise.
542
def clone(*vectors_to_clone)
  # Accept both clone(:a, :b) and clone([:a, :b]).
  vectors_to_clone.flatten! if ArrayHelper.array_of?(vectors_to_clone, Array)
  vectors_to_clone = @vectors.to_a if vectors_to_clone.empty?

  view = vectors_to_clone.each_with_object({}) do |vec_name, acc|
    acc[vec_name] = self[vec_name]
  end
  # clone: false asks the constructor not to duplicate the vectors.
  DaruLite::DataFrame.new(view, clone: false, order: vectors_to_clone, name: @name)
end
549
+
550
+ # Returns a 'shallow' copy of DataFrame if missing data is not present,
551
+ # or a full copy of only valid data if missing data is present.
552
def clone_only_valid
  # Nothing missing: a plain clone is enough.
  return clone unless include_values?(*DaruLite::MISSING_VALUES)

  reject_values(*DaruLite::MISSING_VALUES)
end
559
+
560
# Creates a new duplicate dataframe containing only the rows that have no
# missing value in any vector. Deprecated in favour of #reject_values.
#
# @param vecs [Array, nil] names of the vectors to keep in the result; all
#   vectors are kept when nil
# @return the rows of the (optionally duplicated) frame whose index does not
#   appear among the rows holding a missing value
def dup_only_valid(vecs = nil)
  # flat_map copes with a frame that has no vectors (inject(&:concat) would
  # return nil there) and does not modify any of the per-vector arrays.
  rows_with_nil = @data.flat_map { |vec| vec.indexes(*DaruLite::MISSING_VALUES) }.uniq

  row_indexes = @index.to_a
  (vecs.nil? ? self : dup(vecs)).row[*(row_indexes - rows_with_nil)]
end
570
+ deprecate :dup_only_valid, :reject_values, 2016, 10
571
+
572
+ # Returns a dataframe in which rows with any of the mentioned values
573
+ # are ignored.
574
+ # @param [Array] values to reject to form the new dataframe
575
+ # @return [DaruLite::DataFrame] Data Frame with only rows which don't
576
+ # contain the mentioned values
577
+ # @example
578
+ # df = DaruLite::DataFrame.new({
579
+ # a: [1, 2, 3, nil, Float::NAN, nil, 1, 7],
580
+ # b: [:a, :b, nil, Float::NAN, nil, 3, 5, 8],
581
+ # c: ['a', Float::NAN, 3, 4, 3, 5, nil, 7]
582
+ # }, index: 11..18)
583
+ # df.reject_values nil, Float::NAN
584
+ # # => #<DaruLite::DataFrame(2x3)>
585
+ # # a b c
586
+ # # 11 1 a a
587
+ # # 18 7 8 7
588
def reject_values(*values)
  rejected = @data.flat_map { |vec| vec.positions(*values) }
  kept = size.times.to_a - rejected

  # With exactly one survivor #row_at would return a Vector, so ask for a
  # one-element range to get a DataFrame instead.
  return row_at(kept.first..kept.first) if kept.size == 1

  row_at(*kept)
end
599
+
600
+ # Replace specified values with given value
601
+ # @param [Array] old_values values to replace with new value
602
+ # @param [object] new_value new value to replace with
603
+ # @return [DaruLite::DataFrame] Data Frame itself with old values replaced
604
+ # with new value
605
+ # @example
606
+ # df = DaruLite::DataFrame.new({
607
+ # a: [1, 2, 3, nil, Float::NAN, nil, 1, 7],
608
+ # b: [:a, :b, nil, Float::NAN, nil, 3, 5, 8],
609
+ # c: ['a', Float::NAN, 3, 4, 3, 5, nil, 7]
610
+ # }, index: 11..18)
611
+ # df.replace_values nil, Float::NAN
612
+ # # => #<DaruLite::DataFrame(8x3)>
613
+ # # a b c
614
+ # # 11 1 a a
615
+ # # 12 2 b NaN
616
+ # # 13 3 NaN 3
617
+ # # 14 NaN NaN 4
618
+ # # 15 NaN NaN 3
619
+ # # 16 NaN 3 5
620
+ # # 17 1 5 NaN
621
+ # # 18 7 8 7
622
def replace_values(old_values, new_value)
  # The replacement is delegated to every vector; the frame itself is returned.
  @data.each do |column|
    column.replace_values(old_values, new_value)
  end
  self
end
626
+
627
+ # Rolling fillna
628
+ # replace all Float::NAN and NIL values with the preceding or following value
629
+ #
630
+ # @param direction [Symbol] (:forward, :backward) whether replacement value is preceding or following
631
+ #
632
+ # @example
633
+ # df = DaruLite::DataFrame.new({
634
+ # a: [1, 2, 3, nil, Float::NAN, nil, 1, 7],
635
+ # b: [:a, :b, nil, Float::NAN, nil, 3, 5, nil],
636
+ # c: ['a', Float::NAN, 3, 4, 3, 5, nil, 7]
637
+ # })
638
+ #
639
+ # => #<DaruLite::DataFrame(8x3)>
640
+ # a b c
641
+ # 0 1 a a
642
+ # 1 2 b NaN
643
+ # 2 3 nil 3
644
+ # 3 nil NaN 4
645
+ # 4 NaN nil 3
646
+ # 5 nil 3 5
647
+ # 6 1 5 nil
648
+ # 7 7 nil 7
649
+ #
650
+ # 2.3.3 :068 > df.rolling_fillna(:forward)
651
+ # => #<DaruLite::DataFrame(8x3)>
652
+ # a b c
653
+ # 0 1 a a
654
+ # 1 2 b a
655
+ # 2 3 b 3
656
+ # 3 3 b 4
657
+ # 4 3 b 3
658
+ # 5 3 3 5
659
+ # 6 1 5 5
660
+ # 7 7 5 7
661
+ #
662
def rolling_fillna!(direction = :forward)
  # Destructive variant: every vector is asked to fill itself.
  @data.each do |column|
    column.rolling_fillna!(direction)
  end
  self
end
666
+
667
def rolling_fillna(direction = :forward)
  # Non-destructive variant: fill a duplicate and return it.
  copy = dup
  copy.rolling_fillna!(direction)
end
670
+
671
+ # Return unique rows by vector specified or all vectors
672
+ #
673
+ # @param vtrs [String][Symbol] vector name(s) that should be considered
674
+ #
675
+ # @example
676
+ #
677
+ # => #<DaruLite::DataFrame(6x2)>
678
+ # a b
679
+ # 0 1 a
680
+ # 1 2 b
681
+ # 2 3 c
682
+ # 3 4 d
683
+ # 2 3 c
684
+ # 3 4 f
685
+ #
686
+ # 2.3.3 :> df.uniq
687
+ # => #<DaruLite::DataFrame(5x2)>
688
+ # a b
689
+ # 0 1 a
690
+ # 1 2 b
691
+ # 2 3 c
692
+ # 3 4 d
693
+ # 3 4 f
694
+ #
695
+ # 2.3.3 :> df.uniq(:a)
696
+ # => #<DaruLite::DataFrame(4x2)>
697
+ # a b
698
+ # 0 1 a
699
+ # 1 2 b
700
+ # 2 3 c
701
+ # 3 4 d
702
+ #
703
def uniq(*vtrs)
  keys = vtrs.empty? ? vectors.to_a : Array(vtrs)

  # Keep the first member of every group, in sorted order.
  first_members = group_by(keys).groups.values.map { |members| members[0] }.sort
  row[*first_members]
end
709
+
710
+ # Iterate over each index of the DataFrame.
711
def each_index(&block)
  # Without a block an Enumerator over the same method is returned.
  if block
    @index.each(&block)
    self
  else
    to_enum(:each_index)
  end
end
718
+
719
+ # Iterate over each vector
720
def each_vector(&block)
  # Without a block an Enumerator over the same method is returned.
  if block
    @data.each(&block)
    self
  else
    to_enum(:each_vector)
  end
end
727
+
728
+ alias each_column each_vector
729
+
730
+ # Iterate over each vector along with the name of the vector
731
def each_vector_with_index
  if block_given?
    # Each vector is looked up through its name and yielded with that name.
    @vectors.each { |vec_name| yield @data[@vectors[vec_name]], vec_name }
    self
  else
    to_enum(:each_vector_with_index)
  end
end
740
+
741
+ alias each_column_with_index each_vector_with_index
742
+
743
+ # Iterate over each row
744
def each_row
  if block_given?
    # Rows are fetched by position, one for every entry of the row index.
    (0...@index.size).each { |pos| yield row_at(pos) }
    self
  else
    to_enum(:each_row)
  end
end
753
+
754
def each_row_with_index
  if block_given?
    # Each row is fetched by its index label and yielded with that label.
    @index.each { |label| yield access_row(label), label }
    self
  else
    to_enum(:each_row_with_index)
  end
end
763
+
764
+ # Iterate over each row or vector of the DataFrame. Specify axis
765
+ # by passing :vector or :row as the argument. Default to :vector.
766
+ #
767
+ # == Description
768
+ #
769
+ # `#each` works exactly like Array#each. The default mode for `each`
770
+ # is to iterate over the columns of the DataFrame. To iterate over
771
+ # rows you must pass the axis, i.e `:row` as an argument.
772
+ #
773
+ # == Arguments
774
+ #
775
+ # * +axis+ - The axis to iterate over. Can be :vector (or :column)
776
+ # or :row. Default to :vector.
777
def each(axis = :vector, &block)
  # The axis-specific work is left to dispatch_to_axis.
  dispatch_to_axis(axis, :each, &block)
end
780
+
781
+ # Iterate over a row or vector and return results in a DaruLite::Vector.
782
+ # Specify axis with :vector or :row. Default to :vector.
783
+ #
784
+ # == Description
785
+ #
786
+ # The #collect iterator works similar to #map, the only difference
787
+ # being that it returns a DaruLite::Vector comprising of the results of
788
+ # each block run. The resultant Vector has the same index as that
789
+ # of the axis over which collect has iterated. It also accepts the
790
+ # optional axis argument.
791
+ #
792
+ # == Arguments
793
+ #
794
+ # * +axis+ - The axis to iterate over. Can be :vector (or :column)
795
+ # or :row. Default to :vector.
796
def collect(axis = :vector, &block)
  # The axis-specific work is left to dispatch_to_axis_pl.
  dispatch_to_axis_pl(axis, :collect, &block)
end
799
+
800
+ # Map over each vector or row of the data frame according to
801
+ # the argument specified. Will return an Array of the resulting
802
+ # elements. To map over each row/vector and get a DataFrame,
803
+ # see #recode.
804
+ #
805
+ # == Description
806
+ #
807
+ # The #map iterator works like Array#map. The value returned by
808
+ # each run of the block is added to an Array and the Array is
809
+ # returned. This method also accepts an axis argument, like #each.
810
+ # The default is :vector.
811
+ #
812
+ # == Arguments
813
+ #
814
+ # * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
815
+ # Default to :vector.
816
def map(axis = :vector, &block)
  # presumably dispatch_to_axis_pl resolves to #map_vectors or #map_rows -
  # it is defined outside this section; confirm there.
  dispatch_to_axis_pl(axis, :map, &block)
end
819
+
820
+ # Destructive map. Modifies the DataFrame. Each run of the block
821
+ # must return a DaruLite::Vector. You can specify the axis to map over
822
+ # as the argument. Default to :vector.
823
+ #
824
+ # == Arguments
825
+ #
826
+ # * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
827
+ # Default to :vector.
828
# Destructively maps over the given axis by delegating to #map_vectors!
# (for :vector / :column) or #map_rows! (for :row).
#
# @param axis [Symbol] :vector, :column or :row
# @raise [ArgumentError] if +axis+ is none of the above
def map!(axis = :vector, &block)
  case axis
  when :vector, :column then map_vectors!(&block)
  when :row then map_rows!(&block)
  else
    # An unknown axis used to be ignored silently (nil was returned).
    # Fail loudly instead, the same way #any? and #all? do.
    raise ArgumentError, "Unidentified axis #{axis}"
  end
end
835
+
836
+ # Maps over the DataFrame and returns a DataFrame. Each run of the
837
+ # block must return a DaruLite::Vector object. You can specify the axis
838
+ # to map over. Default to :vector.
839
+ #
840
+ # == Description
841
+ #
842
+ # Recode works similarly to #map, but an important difference between
843
+ # the two is that recode returns a modified DaruLite::DataFrame instead
844
+ # of an Array. For this reason, #recode expects that every run of the
845
+ # block to return a DaruLite::Vector.
846
+ #
847
+ # Just like map and each, recode also accepts an optional _axis_ argument.
848
+ #
849
+ # == Arguments
850
+ #
851
+ # * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
852
+ # Default to :vector.
853
def recode(axis = :vector, &block)
  # presumably dispatch_to_axis_pl resolves to #recode_vectors or
  # #recode_rows - it is defined outside this section; confirm there.
  dispatch_to_axis_pl(axis, :recode, &block)
end
856
+
857
+ # Retain vectors or rows if the block returns a truthy value.
858
+ #
859
+ # == Description
860
+ #
861
+ # For filtering out certain rows/vectors based on their values,
862
+ # use the #filter method. By default it iterates over vectors and
863
+ # keeps those vectors for which the block returns true. It accepts
864
+ # an optional axis argument which lets you specify whether you want
865
+ # to iterate over vectors or rows.
866
+ #
867
+ # == Arguments
868
+ #
869
+ # * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
870
+ # Default to :vector.
871
+ #
872
+ # == Usage
873
+ #
874
+ # # Filter vectors
875
+ #
876
+ # df.filter do |vector|
877
+ # vector.type == :numeric and vector.median < 50
878
+ # end
879
+ #
880
+ # # Filter rows
881
+ #
882
+ # df.filter(:row) do |row|
883
+ # row[:a] + row[:d] < 100
884
+ # end
885
def filter(axis = :vector, &block)
  # presumably dispatch_to_axis_pl resolves to #filter_vectors or
  # #filter_rows - it is defined outside this section; confirm there.
  dispatch_to_axis_pl(axis, :filter, &block)
end
888
+
889
# Returns a copy of the DataFrame in which every vector has been replaced
# by the block's result for it. Each result is passed through
# should_be_vector! before being stored. Returns an Enumerator without a block.
def recode_vectors
  return to_enum(:recode_vectors) unless block_given?

  copy = dup
  copy.each_vector_with_index do |vector, name|
    copy[*name] = should_be_vector!(yield(vector))
  end
  copy
end
898
+
899
# Returns a copy of the DataFrame in which every row has been replaced by
# the block's result for it. Each result is passed through
# should_be_vector! before being stored. Returns an Enumerator without a block.
def recode_rows
  return to_enum(:recode_rows) unless block_given?

  copy = dup
  copy.each_row_with_index do |row_vector, label|
    copy.row[label] = should_be_vector!(yield(row_vector))
  end
  copy
end
908
+
909
+ # Map each vector and return an Array.
910
def map_vectors(&block)
  # Without a block hand back an Enumerator; otherwise map over the
  # stored vectors (@data) and return the plain Array of results.
  if block
    @data.map(&block)
  else
    to_enum(:map_vectors)
  end
end
915
+
916
+ # Destructive form of #map_vectors
917
def map_vectors!
  return to_enum(:map_vectors!) unless block_given?

  # Iterate over a copy of the vector names because the assignments
  # below write back into this DataFrame.
  names = vectors.dup
  names.each { |name| self[name] = should_be_vector!(yield(self[name])) }

  self
end
926
+
927
+ # Map vectors along with the index.
928
def map_vectors_with_index(&block)
  # Returns an Enumerator without a block, otherwise the Array of block
  # results for every (vector, name) pair.
  if block
    each_vector_with_index.map(&block)
  else
    to_enum(:map_vectors_with_index)
  end
end
933
+
934
+ # Map each row
935
def map_rows(&block)
  # Returns an Enumerator without a block, otherwise the Array of block
  # results, one per row.
  if block
    each_row.map(&block)
  else
    to_enum(:map_rows)
  end
end
940
+
941
# Maps over every (row, index label) pair and returns the results as an
# Array. Returns an Enumerator when no block is given.
def map_rows_with_index(&block)
  if block
    each_row_with_index.map(&block)
  else
    to_enum(:map_rows_with_index)
  end
end
946
+
947
# Destructive form of #map_rows: every row is replaced by the block's
# result for it (checked with should_be_vector!). Returns self, or an
# Enumerator when no block is given.
def map_rows!
  return to_enum(:map_rows!) unless block_given?

  # Iterate over a copy of the index because the rows are rewritten in place.
  labels = index.dup
  labels.each { |label| row[label] = should_be_vector!(yield(row[label])) }

  self
end
956
+
957
# Applies +method+ to this DataFrame, or to the sub-DataFrame selected by
# +keys+ (via get_sub_dataframe) when keys are given.
#
# @param method [Symbol, Proc, Array<Symbol, Proc>] a Symbol is sent to the
#   frame, a Proc is called with the frame, an Array yields one result per
#   element
# @param keys passed to get_sub_dataframe; nil means "use self"
# @param by_position forwarded to get_sub_dataframe
# @return the result of the call, or an Array of results for an Array
# @raise [RuntimeError] if +method+ is none of the supported kinds
def apply_method(method, keys: nil, by_position: true)
  df = keys ? get_sub_dataframe(keys, by_position: by_position) : self

  case method
  when Symbol then df.send(method)
  when Proc then method.call(df)
  # to_proc lets an Array mix Symbols and Procs freely.
  when Array then method.map(&:to_proc).map { |callable| callable.call(df) }
  else
    # Still a RuntimeError, as the former bare `raise` produced, but now
    # with a message that says what went wrong.
    raise "Unsupported method #{method.inspect}: expected a Symbol, a Proc or an Array of them"
  end
end
967
+ alias apply_method_on_sub_df apply_method
968
+
969
+ # Retrieves a DaruLite::Vector, based on the result of calculation
970
+ # performed on each row.
971
def collect_rows(&block)
  return to_enum(:collect_rows) unless block

  # One result per row, indexed like the DataFrame's rows.
  results = each_row.map(&block)
  DaruLite::Vector.new(results, index: @index)
end
976
+
977
# Like #collect_rows, but the block also receives the row's index label.
# Returns a DaruLite::Vector indexed by @index, or an Enumerator when no
# block is given.
def collect_row_with_index(&block)
  return to_enum(:collect_row_with_index) unless block

  results = each_row_with_index.map(&block)
  DaruLite::Vector.new(results, index: @index)
end
982
+
983
+ # Retrieves a DaruLite::Vector, based on the result of calculation
984
+ # performed on each vector.
985
def collect_vectors(&block)
  return to_enum(:collect_vectors) unless block

  # One result per vector, indexed by the vector names.
  results = each_vector.map(&block)
  DaruLite::Vector.new(results, index: @vectors)
end
990
+
991
# Like #collect_vectors, but the block also receives the vector's name.
# Returns a DaruLite::Vector indexed by @vectors, or an Enumerator when no
# block is given.
def collect_vector_with_index(&block)
  return to_enum(:collect_vector_with_index) unless block

  results = each_vector_with_index.map(&block)
  DaruLite::Vector.new(results, index: @vectors)
end
996
+
997
+ # Generate a matrix, based on vector names of the DataFrame.
998
+ #
999
+ # @return {::Matrix}
1000
+ # :nocov:
1001
+ # FIXME: Even not trying to cover this: I can't get, how it is expected
1002
+ # to work.... -- zverok
1003
def collect_matrix
  return to_enum(:collect_matrix) unless block_given?

  # The block is called once for every ordered pair of vector names
  # (row name, column name); its results become the matrix cells.
  names = vectors.to_a
  cells = names.map do |row_name|
    names.map { |col_name| yield row_name, col_name }
  end

  Matrix.rows(cells)
end
1015
+ # :nocov:
1016
+
1017
+ # Delete a vector
1018
# Removes the named vector from @data and rebuilds @vectors without it.
# Raises IndexError when the name is unknown. Returns self.
def delete_vector(vector)
  unless @vectors.include?(vector)
    raise IndexError, "Vector #{vector} does not exist."
  end

  position = @vectors[vector]
  @data.delete_at(position)

  remaining = @vectors.to_a - [vector]
  @vectors = DaruLite::Index.new(remaining)

  self
end
1026
+
1027
+ # Deletes a list of vectors
1028
def delete_vectors(*vectors)
  # Each name is removed with #delete_vector, in the order given.
  vectors.each do |name|
    delete_vector(name)
  end

  self
end
1033
+
1034
+ # Delete a row
1035
# Deletes one row: the index is rebuilt without the row's label and
# delete_at is called on every vector. Raises IndexError when the label
# resolved by named_index_for is not in @index. Returns whatever set_size
# returns.
def delete_row(index)
  label = named_index_for(index)

  unless @index.include?(label)
    raise IndexError, "Index #{index} does not exist."
  end

  remaining = @index.to_a - [label]
  @index = DaruLite::Index.new(remaining)
  each_vector { |vector| vector.delete_at(label) }

  set_size
end
1047
+
1048
+ # Creates a DataFrame with the random data, of n size.
1049
+ # If n not given, uses original number of rows.
1050
+ #
1051
+ # @return {DaruLite::DataFrame}
1052
# Builds a new DataFrame of +n+ rows, each drawn at random (with
# replacement) from the rows of this DataFrame.
#
# @param n [Integer, nil] number of rows to draw; defaults to nrows
# @return [DaruLite::DataFrame] the resampled frame, with the same vectors
def bootstrap(n = nil)
  n ||= nrows
  DaruLite::DataFrame.new({}, order: @vectors).tap do |df_boot|
    n.times do
      # Sample a *position* among the existing rows. The previous
      # row[rand(n)] drew from 0...n, which points past the last row
      # whenever n > nrows, and it went through the label-based accessor.
      df_boot.add_row(row_at(rand(nrows)))
    end
    df_boot.update
  end
end
1061
+
1062
# Deletes, in place, every row for which the block returns a falsy value.
def keep_row_if
  # Collect the labels to drop first, then delete them one by one.
  rejected = @index.reject { |label| yield access_row(label) }
  rejected.each { |label| delete_row(label) }
end
1067
+
1068
# Deletes, in place, every vector for which the block (called with the
# vector and its name) returns a falsy value.
def keep_vector_if
  @vectors.each do |name|
    keep = yield(@data[@vectors[name]], name)
    delete_vector(name) unless keep
  end
end
1073
+
1074
+ # creates a new vector with the data of a given field which the block returns true
1075
def filter_vector(vec, &block)
  # Keep the rows the block accepts, then pull field +vec+ out of each.
  matching_rows = each_row.select(&block)
  values = matching_rows.map { |row| row[vec] }
  DaruLite::Vector.new(values)
end
1078
+
1079
+ # Iterates over each row and retains it in a new DataFrame if the block returns
1080
+ # true for that row.
1081
def filter_rows
  return to_enum(:filter_rows) unless block_given?

  # One block result per row, in index order, handed to #where as the mask.
  mask = @index.map { |label| yield access_row(label) }

  where(mask)
end
1088
+
1089
+ # Iterates over each vector and retains it in a new DataFrame if the block returns
1090
+ # true for that vector.
1091
def filter_vectors(&block)
  return to_enum(:filter_vectors) unless block

  # Work on a copy so the receiver keeps all of its vectors.
  copy = dup
  copy.keep_vector_if(&block)
  copy
end
1096
+
1097
+ # Test each row with one or more tests.
1098
+ # @param tests [Proc] Each test is a Proc with the form
1099
+ # *Proc.new {|row| row[:age] > 0}*
1100
+ # The function returns an array with all errors.
1101
+ #
1102
+ # FIXME: description here is too sparse. As far as I can get,
1103
+ # it should tell something about that each test is [descr, fields, block],
1104
+ # and that first value may be column name to output. - zverok, 2016-05-18
1105
def verify(*tests)
  # A leading Symbol names the column used to identify rows in the
  # messages; otherwise the first vector is used.
  id = tests.first.is_a?(Symbol) ? tests.shift : @vectors.first

  messages = each_row_with_index.map do |row, i|
    # The last element of each test is the callable that checks the row.
    failed = tests.reject { |*_, block| block.call(row) }
    failed.map { |test| verify_error_message(row, test, id, i) }
  end

  messages.flatten
end
1113
+
1114
+ # DSL for yielding each row and returning a DaruLite::Vector based on the
1115
+ # value each run of the block returns.
1116
+ #
1117
+ # == Usage
1118
+ #
1119
+ # a1 = DaruLite::Vector.new([1, 2, 3, 4, 5, 6, 7])
1120
+ # a2 = DaruLite::Vector.new([10, 20, 30, 40, 50, 60, 70])
1121
+ # a3 = DaruLite::Vector.new([100, 200, 300, 400, 500, 600, 700])
1122
+ # ds = DaruLite::DataFrame.new({ :a => a1, :b => a2, :c => a3 })
1123
+ # total = ds.vector_by_calculation { a + b + c }
1124
+ # # <DaruLite::Vector:82314050 @name = nil @size = 7 >
1125
+ # # nil
1126
+ # # 0 111
1127
+ # # 1 222
1128
+ # # 2 333
1129
+ # # 3 444
1130
+ # # 4 555
1131
+ # # 5 666
1132
+ # # 6 777
1133
def vector_by_calculation(&block)
  # The block runs with each row as self (instance_eval).
  results = each_row.map do |row|
    row.instance_eval(&block)
  end

  DaruLite::Vector.new(results, index: @index)
end
1138
+
1139
+ # Reorder the vectors in a dataframe
1140
+ # @param [Array] order_array new order of the vectors
1141
+ # @example
1142
+ # df = DaruLite::DataFrame.new({
1143
+ # a: [1, 2, 3],
1144
+ # b: [4, 5, 6]
1145
+ # }, order: [:a, :b])
1146
+ # df.order = [:b, :a]
1147
+ # df
1148
+ # # => #<DaruLite::DataFrame(3x2)>
1149
+ # # b a
1150
+ # # 0 4 1
1151
+ # # 1 5 2
1152
+ # # 2 6 3
1153
def order=(order_array)
  # The new order must contain exactly the current vector names.
  if order_array.sort != vectors.to_a.sort
    raise ArgumentError, 'Invalid order'
  end

  # Re-run the constructor on this object's own data with the new order.
  initialize(to_h, order: order_array)
end
1159
+
1160
+ # Return the dataframe with its vector positions rotated; the vector at position count is now
1161
+ # the first vector of the dataframe.
1162
+ # If the dataframe has only one vector, it is returned without any change.
1163
+ # @param count => Integer, the vector at position count will be the first vector of the dataframe.
1164
+ # @example
1165
+ # df = DaruLite::DataFrame.new({
1166
+ # a: [1, 2, 3],
1167
+ # b: [4, 5, 6],
1168
+ # total: [5, 7, 9],
1169
+ # })
1170
+ # df.rotate_vectors(-1)
1171
+ # df
1172
+ # # => #<DaruLite::DataFrame(3x3)>
1173
+ # # total b a
1174
+ # # 0 5 4 1
1175
+ # # 1 7 5 2
1176
+ # # 2 9 6 3
1177
def rotate_vectors(count = -1)
  # Only reorder when vectors.many? is true; otherwise leave self untouched.
  self.order = vectors.to_a.rotate(count) if vectors.many?
  self
end
1183
+
1184
+ # Returns a vector, based on a string with a calculation based
1185
+ # on vector.
1186
+ #
1187
+ # The calculation will be eval'ed, so you can put any variable
1188
+ # or expression valid on ruby.
1189
+ #
1190
+ # For example:
1191
+ # a = DaruLite::Vector.new [1,2]
1192
+ # b = DaruLite::Vector.new [3,4]
1193
+ # ds = DaruLite::DataFrame.new({:a => a,:b => b})
1194
+ # ds.compute("a+b")
1195
+ # => Vector [4,6]
1196
def compute(text, &block)
  # A block takes precedence over +text+; either one is evaluated with
  # this DataFrame as self.
  # NOTE(review): +text+ is eval'ed as Ruby code - never pass untrusted input.
  if block
    instance_eval(&block)
  else
    instance_eval(text)
  end
end
1201
+
1202
+ # Return a vector with the number of missing values in each row.
1203
+ #
1204
+ # == Arguments
1205
+ #
1206
+ # * +missing_values+ - An Array of the values that should be
1207
+ # treated as 'missing'. The default missing value is *nil*.
1208
def missing_values_rows(missing_values = [nil])
  # For every row, count the positions holding one of the missing values.
  counts = each_row.map { |row| row.indexes(*missing_values).size }

  DaruLite::Vector.new(counts, index: @index, name: "#{@name}_missing_rows")
end
1215
+
1216
+ # TODO: remove next version
1217
+ alias vector_missing_values missing_values_rows
1218
+
1219
# True when any vector contains one of DaruLite::MISSING_VALUES.
def has_missing_data?
  missing = DaruLite::MISSING_VALUES
  @data.any? { |vec| vec.include_values?(*missing) }
end
1222
+ alias flawed? has_missing_data?
1223
+ deprecate :has_missing_data?, :include_values?, 2016, 10
1224
+ deprecate :flawed?, :include_values?, 2016, 10
1225
+
1226
+ # Check if any of given values occur in the data frame
1227
+ # @param [Array] values to check for
1228
+ # @return [true, false] true if any of the given values occur in the
1229
+ # dataframe, false otherwise
1230
+ # @example
1231
+ # df = DaruLite::DataFrame.new({
1232
+ # a: [1, 2, 3, nil, Float::NAN, nil, 1, 7],
1233
+ # b: [:a, :b, nil, Float::NAN, nil, 3, 5, 8],
1234
+ # c: ['a', Float::NAN, 3, 4, 3, 5, nil, 7]
1235
+ # }, index: 11..18)
1236
+ # df.include_values? nil
1237
+ # # => true
1238
def include_values?(*values)
  # Ask each vector in turn; stop at the first one that reports a match.
  @data.any? do |vec|
    vec.include_values?(*values)
  end
end
1241
+
1242
+ # Return a nested hash using vector names as keys and an array constructed of
1243
+ # hashes with other values. If block provided, is used to provide the
1244
+ # values, with parameters +row+ of dataset, +current+ last hash on
1245
+ # hierarchy and +name+ of the key to include
1246
def nest(*tree_keys, &block)
  # Accept both nest(:a, :b) and nest([:a, :b]).
  tree_keys = tree_keys.first if tree_keys.first.is_a?(Array)
  # All keys but the last build the nested hashes; the last one names the leaf.
  *branch_keys, leaf_key = tree_keys

  each_row.with_object({}) do |row, tree|
    # Walk (creating as needed) down to the hash that holds this row's leaf.
    node = branch_keys.inject(tree) { |level, key| level[row[key]] ||= {} }
    name = row[leaf_key]

    if block
      node[name] = yield(row, node, name)
    else
      # Default leaf: an array of the row's remaining fields.
      (node[name] ||= []) << row.to_h.delete_if { |key, _value| tree_keys.include?(key) }
    end
  end
end
1263
+
1264
# Returns a vector holding, for every row, the total length of the string
# forms of the chosen fields (all vectors when +vecs+ is nil).
def vector_count_characters(vecs = nil)
  names = vecs || @vectors.to_a

  collect_rows do |row|
    names.map { |name| row[name].to_s.size }.sum
  end
end
1271
+
1272
# Splits vector +name+ with split_by_separator and stores every resulting
# part as a new vector named "<name><join><key>".
def add_vectors_by_split(name, join = '-', sep = DaruLite::SPLIT_TOKEN)
  parts = self[name].split_by_separator(sep)
  parts.each do |key, values|
    self["#{name}#{join}#{key}".to_sym] = values
  end
end
1277
+
1278
+ # Return the number of rows and columns of the DataFrame in an Array.
1279
def shape
  # [row count, column count]
  rows = nrows
  cols = ncols
  [rows, cols]
end
1282
+
1283
+ # The number of rows
1284
def nrows
  # The row count is the number of entries in the row index.
  @index.size
end
1287
+
1288
+ # The number of vectors
1289
def ncols
  # The column count is the number of entries in the vectors index.
  @vectors.size
end
1292
+
1293
+ # Check if a vector is present
1294
def has_vector?(vector)
  # Membership test against the vectors index.
  @vectors.include?(vector)
end
1297
+
1298
+ # Works like Array#any?.
1299
+ #
1300
+ # @param [Symbol] axis (:vector) The axis to iterate over. Can be :vector or
1301
+ # :row. A DaruLite::Vector object is yielded in the block.
1302
+ # @example Using any?
1303
+ # df = DaruLite::DataFrame.new({a: [1,2,3,4,5], b: ['a', 'b', 'c', 'd', 'e']})
1304
+ # df.any?(:row) do |row|
1305
+ # row[:a] < 3 and row[:b] == 'b'
1306
+ # end #=> true
1307
def any?(axis = :vector, &block)
  case axis
  when :vector, :column
    @data.any?(&block)
  when :row
    # Stop at the first row the block accepts.
    each_row { |row| return true if yield(row) }
    false
  else
    raise ArgumentError, "Unidentified axis #{axis}"
  end
end
1319
+
1320
+ # Works like Array#all?
1321
+ #
1322
+ # @param [Symbol] axis (:vector) The axis to iterate over. Can be :vector or
1323
+ # :row. A DaruLite::Vector object is yielded in the block.
1324
+ # @example Using all?
1325
+ # df = DaruLite::DataFrame.new({a: [1,2,3,4,5], b: ['a', 'b', 'c', 'd', 'e']})
1326
+ # df.all?(:row) do |row|
1327
+ # row[:a] < 10
1328
+ # end #=> true
1329
def all?(axis = :vector, &block)
  case axis
  when :vector, :column
    @data.all?(&block)
  when :row
    each_row.all?(&block)
  else
    raise ArgumentError, "Unidentified axis #{axis}"
  end
end
1338
+
1339
+ # The first ten elements of the DataFrame
1340
+ #
1341
+ # @param [Fixnum] quantity (10) The number of elements to display from the top.
1342
def head(quantity = 10)
  # Positions 0 through quantity - 1, fetched through the row accessor.
  # NOTE(review): quantity = 0 produces the range 0..-1 - confirm how
  # row.at treats that range.
  last_position = quantity - 1
  row.at(0..last_position)
end
1345
+
1346
+ alias first head
1347
+
1348
+ # The last ten elements of the DataFrame
1349
+ #
1350
+ # @param [Fixnum] quantity (10) The number of elements to display from the bottom.
1351
def tail(quantity = 10)
  # Clamp the negative start so it never reaches before the first row.
  first_position = [-quantity, -size].max
  row.at(first_position..-1)
end
1355
+
1356
+ alias last tail
1357
+
1358
+ # Sum all numeric/specified vectors in the DataFrame.
1359
+ #
1360
+ # Returns a new vector that's a containing a sum of all numeric
1361
+ # or specified vectors of the DataFrame. By default, if the vector
1362
+ # contains a nil, the sum is nil.
1363
+ # With :skipnil argument set to true, nil values are assumed to be
1364
+ # 0 (zero) and the sum vector is returned.
1365
+ #
1366
+ # @param args [Array] List of vectors to sum. Default is nil in which case
1367
+ # all numeric vectors are summed.
1368
+ #
1369
+ # @option opts [Boolean] :skipnil Consider nils as 0. Default is false.
1370
+ #
1371
+ # @return Vector with sum of all vectors specified in the argument.
1372
+ # If vecs parameter is empty, sum all numeric vector.
1373
+ #
1374
+ # @example
1375
+ # df = DaruLite::DataFrame.new({
1376
+ # a: [1, 2, nil],
1377
+ # b: [2, 1, 3],
1378
+ # c: [1, 1, 1]
1379
+ # })
1380
+ # => #<DaruLite::DataFrame(3x3)>
1381
+ # a b c
1382
+ # 0 1 2 1
1383
+ # 1 2 1 1
1384
+ # 2 nil 3 1
1385
+ # df.vector_sum [:a, :c]
1386
+ # => #<DaruLite::Vector(3)>
1387
+ # 0 2
1388
+ # 1 3
1389
+ # 2 nil
1390
+ # df.vector_sum
1391
+ # => #<DaruLite::Vector(3)>
1392
+ # 0 4
1393
+ # 1 4
1394
+ # 2 nil
1395
+ # df.vector_sum skipnil: true
1396
+ # => #<DaruLite::Vector(3)>
1397
+ # c
1398
+ # 0 4
1399
+ # 1 4
1400
+ # 2 4
1401
+ #
1402
def vector_sum(*args)
  # A trailing Hash carries the options; positional arguments, when
  # present, take precedence over the corresponding option.
  options = { vecs: nil, skipnil: false }
  options = options.merge(args.pop) if args.last.is_a?(::Hash)

  names = args[0] || options[:vecs]
  skipnil = args[1] || options[:skipnil]
  names ||= numeric_vectors

  # Start from a vector of zeros and add each requested vector to it.
  zeros = DaruLite::Vector.new([0] * @size, index: @index, name: @name, dtype: @dtype)
  names.inject(zeros) do |running, name|
    self[name].add(running, skipnil: skipnil)
  end
end
1413
+
1414
+ # Calculate mean of the rows of the dataframe.
1415
+ #
1416
+ # == Arguments
1417
+ #
1418
+ # * +max_missing+ - The maximum number of elements in the row that can be
1419
+ # missing for the mean calculation to happen. Default to 0.
1420
def vector_mean(max_missing = 0)
  # FIXME: in vector_sum we preserve created vector dtype, but
  # here we are not. Is this by design or ...? - zverok, 2016-05-18
  means = DaruLite::Vector.new([0] * @size, index: @index, name: "mean_#{@name}")

  each_row_with_index do |row, label|
    # A row with more than max_missing missing values gets nil, not a mean.
    missing_count = row.indexes(*DaruLite::MISSING_VALUES).size
    means[label] = missing_count > max_missing ? nil : row.mean
  end

  means
end
1429
+
1430
+ # Group elements by vector to perform operations on them. Returns a
1431
+ # DaruLite::Core::GroupBy object. See the DaruLite::Core::GroupBy docs for a detailed
1432
+ # list of possible operations.
1433
+ #
1434
+ # == Arguments
1435
+ #
1436
+ # * vectors - An Array containing names of vectors to group by.
1437
+ #
1438
+ # == Usage
1439
+ #
1440
+ # df = DaruLite::DataFrame.new({
1441
+ # a: %w{foo bar foo bar foo bar foo foo},
1442
+ # b: %w{one one two three two two one three},
1443
+ # c: [1 ,2 ,3 ,1 ,3 ,6 ,3 ,8],
1444
+ # d: [11 ,22 ,33 ,44 ,55 ,66 ,77 ,88]
1445
+ # })
1446
+ # df.group_by([:a,:b,:c]).groups
1447
+ # #=> {["bar", "one", 2]=>[1],
1448
+ # # ["bar", "three", 1]=>[3],
1449
+ # # ["bar", "two", 6]=>[5],
1450
+ # # ["foo", "one", 1]=>[0],
1451
+ # # ["foo", "one", 3]=>[6],
1452
+ # # ["foo", "three", 8]=>[7],
1453
+ # # ["foo", "two", 3]=>[2, 4]}
1454
def group_by(*vectors)
  # Accept both group_by(:a, :b) and group_by([:a, :b]).
  names = vectors.flatten

  unknown = names - @vectors.to_a
  unless unknown.empty?
    raise(ArgumentError, "Vector(s) missing: #{unknown.join(', ')}")
  end

  # With no names given, group by the first vector.
  names = [@vectors.first] if names.empty?

  DaruLite::Core::GroupBy.new(self, names)
end
1463
+
1464
# Builds a new DataFrame whose vectors follow +new_vectors+. Names that
# already exist keep their data; unknown names are filled with nils.
# Raises ArgumentError unless +new_vectors+ is a DaruLite::Index.
def reindex_vectors(new_vectors)
  unless new_vectors.is_a?(DaruLite::Index)
    raise ArgumentError, 'Must pass the new index of type Index or its ' \
                         "subclasses, not #{new_vectors.class}"
  end

  result = DaruLite::DataFrame.new({}, order: new_vectors, index: @index, name: @name)
  new_vectors.each do |name|
    result[name] = @vectors.include?(name) ? self[name] : Array.new(nrows)
  end
  result
end
1475
+
1476
# Returns vector +v+ as an Array, or an Array of nils of length #size when
# no such vector exists.
def get_vector_anyways(v)
  if @vectors.include?(v)
    self[v].to_a
  else
    Array.new(size)
  end
end
1479
+
1480
+ # Concatenate another DataFrame along corresponding columns.
1481
+ # If columns do not exist in both dataframes, they are filled with nils
1482
def concat(other_df)
  # Union of both frames' vector names, first-seen order, no duplicates.
  names = @vectors.to_a | other_df.vectors.to_a

  # Stack this frame's values on top of the other frame's; a vector
  # missing on either side contributes nils (see #get_vector_anyways).
  columns = names.map do |name|
    get_vector_anyways(name) + other_df.get_vector_anyways(name)
  end

  DaruLite::DataFrame.new(columns, order: names)
end
1491
+
1492
+ # Concatenates another DataFrame as #concat.
1493
+ # Additionally it tries to preserve the index. If the indices contain
1494
+ # common elements, #union will overwrite the according rows in the
1495
+ # first dataframe.
1496
def union(other_df)
  own_labels = @index.to_a
  other_labels = other_df.index.to_a

  # Keep only the rows the other frame does not also have, then append
  # the other frame and restore the combined labels.
  combined = row[*(own_labels - other_labels)].concat(other_df)
  combined.index = DaruLite::Index.new((own_labels + other_labels).uniq)
  combined
end
1504
+
1505
# Strategy used by #set_index when a single column becomes the index.
module SetSingleIndexStrategy
  class << self
    # Number of distinct values in the column.
    def uniq_size(df, col)
      df[col].uniq.size
    end

    # Index built from the column's values.
    def new_index(df, col)
      DaruLite::Index.new(df[col].to_a)
    end

    # Removes the column from the frame.
    def delete_vector(df, col)
      df.delete_vector(col)
    end
  end
end
1518
+
1519
# Strategy used by #set_index when the new index is categorical.
module SetCategoricalIndexStrategy
  class << self
    # CategoricalIndex built from the column's values.
    def new_index(df, col)
      DaruLite::CategoricalIndex.new(df[col].to_a)
    end

    # Removes the column from the frame.
    def delete_vector(df, col)
      df.delete_vector(col)
    end
  end
end
1528
+
1529
# Strategy used by #set_index when several columns become a MultiIndex.
module SetMultiIndexStrategy
  class << self
    # Number of distinct entries in the selection of the given columns.
    def uniq_size(df, cols)
      df[*cols].uniq.size
    end

    # MultiIndex built from the columns' values and named after the columns.
    def new_index(df, cols)
      arrays = df[*cols].map_vectors(&:to_a)
      multi_index = DaruLite::MultiIndex.from_arrays(arrays)
      multi_index.name = cols
      multi_index
    end

    # Removes all the given columns from the frame.
    def delete_vector(df, cols)
      df.delete_vectors(*cols)
    end
  end
end
1544
+
1545
+ # Set a particular column (or columns) as the new index of the DF
1546
def set_index(new_index_col, keep: false, categorical: false)
  # Pick the strategy: categorical wins, then anything array-like is
  # treated as a list of columns for a MultiIndex, else a single column.
  strategy =
    if categorical
      SetCategoricalIndexStrategy
    elsif new_index_col.respond_to?(:to_a)
      new_index_col = new_index_col.to_a
      SetMultiIndexStrategy
    else
      SetSingleIndexStrategy
    end

  # Uniqueness is only enforced for non-categorical indexes.
  if !categorical && @size != strategy.uniq_size(self, new_index_col)
    raise ArgumentError, 'All elements in new index must be unique.'
  end

  self.index = strategy.new_index(self, new_index_col)
  strategy.delete_vector(self, new_index_col) unless keep
  self
end
1565
+
1566
+ # Change the index of the DataFrame and preserve the labels of the previous
1567
+ # indexing. New index can be DaruLite::Index or any of its subclasses.
1568
+ #
1569
+ # @param [DaruLite::Index] new_index The new Index for reindexing the DataFrame.
1570
+ # @example Reindexing DataFrame
1571
+ # df = DaruLite::DataFrame.new({a: [1,2,3,4], b: [11,22,33,44]},
1572
+ # index: ['a','b','c','d'])
1573
+ # #=>
1574
+ # ##<DaruLite::DataFrame:83278130 @name = b19277b8-c548-41da-ad9a-2ad8c060e273 @size = 4>
1575
+ # # a b
1576
+ # # a 1 11
1577
+ # # b 2 22
1578
+ # # c 3 33
1579
+ # # d 4 44
1580
+ # df.reindex DaruLite::Index.new(['b', 0, 'a', 'g'])
1581
+ # #=>
1582
+ # ##<DaruLite::DataFrame:83177070 @name = b19277b8-c548-41da-ad9a-2ad8c060e273 @size = 4>
1583
+ # # a b
1584
+ # # b 2 22
1585
+ # # 0 nil nil
1586
+ # # a 1 11
1587
+ # # g nil nil
1588
def reindex(new_index)
  unless new_index.is_a?(DaruLite::Index)
    raise ArgumentError, 'Must pass the new index of type Index or its ' \
                         "subclasses, not #{new_index.class}"
  end

  result = DaruLite::DataFrame.new({}, order: @vectors, index: new_index, name: @name)
  new_index.each do |label|
    # Known labels keep their row; unknown labels get a row of nils.
    result.row[label] = @index.include?(label) ? row[label] : Array.new(ncols)
  end
  result
end
1599
+
1600
# Turns the current index back into ordinary vectors, placed before the
# existing ones, and replaces the index with the index of index.to_df.
# Returns self.
def reset_index
  index_df = index.to_df

  level_names = index.name
  level_names = [level_names] unless level_names.instance_of?(Array)
  final_order = level_names + vectors.to_a

  self.index = index_df.index
  level_names.each { |level| self[level] = index_df[level] }
  self.order = final_order

  self
end
1612
+
1613
+ # Reassign index with a new index of type DaruLite::Index or any of its subclasses.
1614
+ #
1615
+ # @param [DaruLite::Index] idx New index object on which the rows of the dataframe
1616
+ # are to be indexed.
1617
+ # @example Reassigning index of a DataFrame
1618
+ # df = DaruLite::DataFrame.new({a: [1,2,3,4], b: [11,22,33,44]})
1619
+ # df.index.to_a #=> [0,1,2,3]
1620
+ #
1621
+ # df.index = DaruLite::Index.new(['a','b','c','d'])
1622
+ # df.index.to_a #=> ['a','b','c','d']
1623
+ # df.row['a'].to_a #=> [1,11]
1624
def index=(idx)
  @index = Index.coerce(idx)

  # Every vector is given the same new index object.
  @data.each do |vec|
    vec.index = @index
  end

  self
end
1630
+
1631
+ # Reassign vectors with a new index of type DaruLite::Index or any of its subclasses.
1632
+ #
1633
+ # @param new_index [DaruLite::Index] idx The new index object on which the vectors are to
1634
+ # be indexed. Must be of the same size as ncols.
1635
+ # @example Reassigning vectors of a DataFrame
1636
+ # df = DaruLite::DataFrame.new({a: [1,2,3,4], b: [:a,:b,:c,:d], c: [11,22,33,44]})
1637
+ # df.vectors.to_a #=> [:a, :b, :c]
1638
+ #
1639
+ # df.vectors = DaruLite::Index.new([:foo, :bar, :baz])
1640
+ # df.vectors.to_a #=> [:foo, :bar, :baz]
1641
# Replaces the vectors index and renames every stored vector to match.
#
# @param new_index [DaruLite::Index] must contain exactly ncols entries
# @raise [ArgumentError] if +new_index+ is not a DaruLite::Index or its
#   size differs from ncols
def vectors=(new_index)
  raise ArgumentError, 'Can only reindex with Index and its subclasses' unless new_index.is_a?(DaruLite::Index)

  if new_index.size != ncols
    # The two message halves were previously joined without a space
    # ("...not equal todataframe size...").
    raise ArgumentError, "Specified index length #{new_index.size} not equal to " \
                         "dataframe size #{ncols}"
  end

  @vectors = new_index
  # Names are assigned positionally: the i-th vector gets the i-th name.
  @data.zip(new_index.to_a).each do |vect, name|
    vect.name = name
  end
  self
end
1655
+
1656
+ # Renames the vectors
1657
+ #
1658
+ # == Arguments
1659
+ #
1660
+ # * name_map - A hash where the keys are the existing vector names and
1661
+ # the values are the new names. If a vector is renamed
1662
+ # to a vector name that is already in use, the existing
1663
+ # one is overwritten.
1664
+ #
1665
+ # == Usage
1666
+ #
1667
+ # df = DaruLite::DataFrame.new({ a: [1,2,3,4], b: [:a,:b,:c,:d], c: [11,22,33,44] })
1668
+ # df.rename_vectors :a => :alpha, :c => :gamma
1669
+ # df.vectors.to_a #=> [:alpha, :b, :gamma]
1670
def rename_vectors(name_map)
  # Targets of real renames (old != new) that collide with an existing
  # vector are deleted first so the rename overwrites them.
  effective_renames = name_map.select { |old_name, new_name| old_name != new_name }
  colliding = effective_renames.values & vectors.to_a
  delete_vectors(*colliding)

  renamed = vectors.to_a.map { |current| name_map[current] || current }
  self.vectors = DaruLite::Index.new(renamed)
end
1677
+
1678
+ # Renames the vectors and returns itself
1679
+ #
1680
+ # == Arguments
1681
+ #
1682
+ # * name_map - A hash where the keys are the existing vector names and
1683
+ # the values are the new names. If a vector is renamed
1684
+ # to a vector name that is already in use, the existing
1685
+ # one is overwritten.
1686
+ #
1687
+ # == Usage
1688
+ #
1689
+ # df = DaruLite::DataFrame.new({ a: [1,2,3,4], b: [:a,:b,:c,:d], c: [11,22,33,44] })
1690
+ # df.rename_vectors! :a => :alpha, :c => :gamma # df
1691
def rename_vectors!(name_map)
  # Same renaming as #rename_vectors, but returns the DataFrame itself.
  tap { rename_vectors(name_map) }
end
1695
+
1696
+ # Converts the vectors to a DaruLite::MultiIndex.
1697
+ # The argument passed is used as the MultiIndex's top level
1698
def add_level_to_vectors(top_level_label)
  # Prefix every existing label (splatted, so tuple labels are extended)
  # with the new top-level label.
  self.vectors = DaruLite::MultiIndex.from_tuples(
    vectors.map { |label| [top_level_label, *label] }
  )
end
1702
+
1703
+ # Return the indexes of all the numeric vectors. Will include vectors with nils
1704
+ # along with numbers.
1705
def numeric_vectors
  # FIXME: Why _with_index ?..
  each_vector_with_index.each_with_object([]) do |(vec, name), names|
    names << name if vec.numeric?
  end
end
1711
+
1712
# Names of the vectors that report themselves as numeric.
def numeric_vector_names
  @vectors.select do |name|
    self[name].numeric?
  end
end
1715
+
1716
+ # Return a DataFrame of only the numerical Vectors. If clone: false
1717
+ # is specified as option, only a *view* of the Vectors will be
1718
+ # returned. Defaults to clone: true.
1719
# Builds a DataFrame containing only the numeric vectors.
#
# @param opts [Hash] pass clone: false to have the new frame built with
#   clone: false; any other value (or no option) means clone: true
# @return [DaruLite::DataFrame]
def only_numerics(opts = {})
  cln = opts[:clone] != false
  # Scan for numeric vectors once and reuse the names for both the data
  # and the order (the scan used to be performed twice).
  names = numeric_vectors
  arry = names.map { |v| self[v] }

  order = Index.new(names)
  DaruLite::DataFrame.new(arry, clone: cln, order: order, index: @index)
end
1726
+
1727
+ # Generate a summary of this DataFrame based on individual vectors in the DataFrame
1728
+ # @return [String] String containing the summary of the DataFrame
1729
def summary
  # Header lines first, then one section per vector.
  report = "= #{name}"
  report << "\n Number of rows: #{nrows}"

  @vectors.each do |vec_name|
    report << "\n Element:[#{vec_name}]\n" << self[vec_name].summary(1)
  end

  report
end
1738
+
1739
+ # Sorts a dataframe (ascending/descending) in the given priority sequence of
1740
+ # vectors, with or without a block.
1741
+ #
1742
+ # @param vector_order [Array] The order of vector names in which the DataFrame
1743
+ # should be sorted.
1744
+ # @param opts [Hash] opts The options to sort with.
1745
+ # @option opts [TrueClass,FalseClass,Array] :ascending (true) Sort in ascending
1746
+ # or descending order. Specify Array corresponding to *order* for multiple
1747
+ # sort orders.
1748
+ # @option opts [Hash] :by (lambda{|a| a }) Specify attributes of objects to
1749
+ # be used for sorting, for each vector name in *order* as a hash of
1750
+ # vector name and lambda expressions. In case a lambda for a vector is not
1751
+ # specified, the default will be used.
1752
+ # @option opts [TrueClass,FalseClass,Array] :handle_nils (false) Handle nils
1753
+ # automatically or not when a block is provided.
1754
+ # If set to True, nils will appear at top after sorting.
1755
+ #
1756
+ # @example Sort a dataframe with a vector sequence.
1757
+ #
1758
+ #
1759
+ # df = DaruLite::DataFrame.new({a: [1,2,1,2,3], b: [5,4,3,2,1]})
1760
+ #
1761
+ # df.sort [:a, :b]
1762
+ # # =>
1763
+ # # <DaruLite::DataFrame:30604000 @name = d6a9294e-2c09-418f-b646-aa9244653444 @size = 5>
1764
+ # # a b
1765
+ # # 2 1 3
1766
+ # # 0 1 5
1767
+ # # 3 2 2
1768
+ # # 1 2 4
1769
+ # # 4 3 1
1770
+ #
1771
+ # @example Sort a dataframe without a block. Here nils will be handled automatically.
1772
+ #
1773
+ # df = DaruLite::DataFrame.new({a: [-3,nil,-1,nil,5], b: [4,3,2,1,4]})
1774
+ #
1775
+ # df.sort([:a])
1776
+ # # =>
1777
+ # # <DaruLite::DataFrame:14810920 @name = c07fb5c7-2201-458d-b679-6a1f7ebfe49f @size = 5>
1778
+ # # a b
1779
+ # # 1 nil 3
1780
+ # # 3 nil 1
1781
+ # # 0 -3 4
1782
+ # # 2 -1 2
1783
+ # # 4 5 4
1784
+ #
1785
+ # @example Sort a dataframe with a block with nils handled automatically.
1786
+ #
1787
+ # df = DaruLite::DataFrame.new({a: [nil,-1,1,nil,-1,1], b: ['aaa','aa',nil,'baaa','x',nil] })
1788
+ #
1789
+ # df.sort [:b], by: {b: lambda { |a| a.length } }
1790
+ # # NoMethodError: undefined method `length' for nil:NilClass
1791
+ # # from (pry):8:in `block in __pry__'
1792
+ #
1793
+ # df.sort [:b], by: {b: lambda { |a| a.length } }, handle_nils: true
1794
+ #
1795
+ # # =>
1796
+ # # <DaruLite::DataFrame:28469540 @name = 5f986508-556f-468b-be0c-88cc3534445c @size = 6>
1797
+ # # a b
1798
+ # # 2 1 nil
1799
+ # # 5 1 nil
1800
+ # # 4 -1 x
1801
+ # # 1 -1 aa
1802
+ # # 0 nil aaa
1803
+ # # 3 nil baaa
1804
+ #
1805
+ # @example Sort a dataframe with a block with nils handled manually.
1806
+ #
1807
+ # df = DaruLite::DataFrame.new({a: [nil,-1,1,nil,-1,1], b: ['aaa','aa',nil,'baaa','x',nil] })
1808
+ #
1809
+ # # To print nils at the bottom one can use lambda { |a| (a.nil?)?[1]:[0,a.length] }
1810
+ # df.sort [:b], by: {b: lambda { |a| (a.nil?)?[1]:[0,a.length] } }, handle_nils: true
1811
+ #
1812
+ # # =>
1813
+ # #<DaruLite::DataFrame:22214180 @name = cd7703c7-1dca-4560-840b-5ea51a852ef9 @size = 6>
1814
+ # # a b
1815
+ # # 4 -1 x
1816
+ # # 1 -1 aa
1817
+ # # 0 nil aaa
1818
+ # # 3 nil baaa
1819
+ # # 2 1 nil
1820
+ # # 5 1 nil
1821
+
1822
def sort!(vector_order, opts = {})
  raise ArgumentError, 'Required atleast one vector name' if vector_order.empty?

  # To enable sorting with categorical data, categories are temporarily
  # mapped to integers that preserve their order.
  old = convert_categorical_vectors vector_order

  begin
    block = sort_prepare_block vector_order, opts
    order = @index.size.times.sort(&block)
    new_index = @index.reorder order
  ensure
    # Undo the integer mapping even when building the comparator or sorting
    # raises (e.g. a `:by` lambda hitting nil); otherwise the DataFrame would
    # be left with its categorical vectors replaced by plain integer vectors.
    restore_categorical_vectors old
  end

  # Only reached when the sort order was computed successfully.
  @data.each { |vector| vector.reorder! order }
  self.index = new_index

  self
end
1844
+
1845
+ # Non-destructive version of #sort!
1846
def sort(vector_order, opts = {})
  # Work on a copy so the receiver keeps its current row order.
  copy = dup
  copy.sort!(vector_order, opts)
end
1849
+
1850
+ # Pivots a data frame on specified vectors and applies an aggregate function
1851
+ # to quickly generate a summary.
1852
+ #
1853
+ # == Options
1854
+ #
1855
+ # +:index+ - Keys to group by on the pivot table row index. Pass vector names
1856
+ # contained in an Array.
1857
+ #
1858
+ # +:vectors+ - Keys to group by on the pivot table column index. Pass vector
1859
+ # names contained in an Array.
1860
+ #
1861
+ # +:agg+ - Function to aggregate the grouped values. Default to *:mean*. Can
1862
+ # use any of the statistics functions applicable on Vectors that can be found in
1863
+ # the DaruLite::Statistics::Vector module.
1864
+ #
1865
+ # +:values+ - Columns to aggregate. Will consider all numeric columns not
1866
+ # specified in *:index* or *:vectors*. Optional.
1867
+ #
1868
+ # == Usage
1869
+ #
1870
+ # df = DaruLite::DataFrame.new({
1871
+ # a: ['foo' , 'foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar'],
1872
+ # b: ['one' , 'one', 'one', 'two', 'two', 'one', 'one', 'two', 'two'],
1873
+ # c: ['small','large','large','small','small','large','small','large','small'],
1874
+ # d: [1,2,2,3,3,4,5,6,7],
1875
+ # e: [2,4,4,6,6,8,10,12,14]
1876
+ # })
1877
+ # df.pivot_table(index: [:a], vectors: [:b], agg: :sum, values: :e)
1878
+ #
1879
+ # #=>
1880
+ # # #<DaruLite::DataFrame:88342020 @name = 08cdaf4e-b154-4186-9084-e76dd191b2c9 @size = 2>
1881
+ # # [:e, :one] [:e, :two]
1882
+ # # [:bar] 18 26
1883
+ # # [:foo] 10 12
1884
def pivot_table(opts = {})
  row_keys = opts[:index]
  raise ArgumentError, 'Specify grouping index' if Array(row_keys).empty?

  column_keys = opts[:vectors] || []
  aggregator = opts[:agg] || :mean
  targets = prepare_pivot_values(row_keys, column_keys, opts)
  raise IndexError, 'No numeric vectors to aggregate' if targets.empty?

  groups = group_by(row_keys)
  if column_keys.empty?
    # Without column keys the result is simply the aggregated groups.
    groups.send(aggregator)
  else
    pivot_dataframe(make_pivot_hash(groups, column_keys, targets, aggregator))
  end
end
1900
+
1901
+ # Merge vectors from two DataFrames. In case of name collision,
1902
+ # the vectors names are changed to x_1, x_2 ....
1903
+ #
1904
+ # @return {DaruLite::DataFrame}
1905
def merge(other_df)
  if nrows != other_df.nrows
    raise ArgumentError,
          "Number of rows must be equal in this: #{nrows} and other: #{other_df.nrows}"
  end

  # Colliding vector names are recoded (x_1, x_2, ...) by the helper.
  combined_names = ArrayHelper.recode_repeated(@vectors.to_a + other_df.vectors.to_a)
  merged = DataFrame.new({}, order: combined_names)

  nrows.times do |i|
    merged.add_row(row[i].to_a + other_df.row[i].to_a)
  end

  # The original index is only carried over when both frames agree on it.
  merged.index = @index if @index == other_df.index
  merged.update
  merged
end
1921
+
1922
+ # Join 2 DataFrames with SQL style joins. Currently supports inner, left
1923
+ # outer, right outer and full outer joins.
1924
+ #
1925
+ # @param [DaruLite::DataFrame] other_df Another DataFrame on which the join is
1926
+ # to be performed.
1927
+ # @param [Hash] opts Options Hash
1928
+ # @option :how [Symbol] Can be one of :inner, :left, :right or :outer.
1929
+ # @option :on [Array] The columns on which the join is to be performed.
1930
+ # Column names specified here must be common to both DataFrames.
1931
+ # @option :indicator [Symbol] The name of a vector to add to the resultant
1932
+ # dataframe that indicates whether the record was in the left (:left_only),
1933
+ # right (:right_only), or both (:both) joining dataframes.
1934
+ # @return [DaruLite::DataFrame]
1935
+ # @example Inner Join
1936
+ # left = DaruLite::DataFrame.new({
1937
+ # :id => [1,2,3,4],
1938
+ # :name => ['Pirate', 'Monkey', 'Ninja', 'Spaghetti']
1939
+ # })
1940
+ # right = DaruLite::DataFrame.new({
1941
+ # :id => [1,2,3,4],
1942
+ # :name => ['Rutabaga', 'Pirate', 'Darth Vader', 'Ninja']
1943
+ # })
1944
+ # left.join(right, how: :inner, on: [:name])
1945
+ # #=>
1946
+ # ##<DaruLite::DataFrame:82416700 @name = 74c0811b-76c6-4c42-ac93-e6458e82afb0 @size = 2>
1947
+ # # id_1 name id_2
1948
+ # # 0 1 Pirate 2
1949
+ # # 1 3 Ninja 4
1950
def join(other_df, opts = {})
  # The join itself is implemented by Core::Merge; this is the public entry point.
  merge_engine = DaruLite::Core::Merge
  merge_engine.join(self, other_df, opts)
end
1953
+
1954
+ # Creates a new dataset for one to many relations
1955
+ # on a dataset, based on pattern of field names.
1956
+ #
1957
+ # for example, you have a survey for number of children
1958
+ # with this structure:
1959
+ # id, name, child_name_1, child_age_1, child_name_2, child_age_2
1960
+ # with
1961
+ # ds.one_to_many([:id], "child_%v_%n"
1962
+ # the field of first parameters will be copied verbatim
1963
+ # to new dataset, and fields which responds to second
1964
+ # pattern will be added one case for each different %n.
1965
+ #
1966
+ # @example
1967
+ # cases=[
1968
+ # ['1','george','red',10,'blue',20,nil,nil],
1969
+ # ['2','fred','green',15,'orange',30,'white',20],
1970
+ # ['3','alfred',nil,nil,nil,nil,nil,nil]
1971
+ # ]
1972
+ # ds=DaruLite::DataFrame.rows(cases, order:
1973
+ # [:id, :name,
1974
+ # :car_color1, :car_value1,
1975
+ # :car_color2, :car_value2,
1976
+ # :car_color3, :car_value3])
1977
+ # ds.one_to_many([:id],'car_%v%n').to_matrix
1978
+ # #=> Matrix[
1979
+ # # ["red", "1", 10],
1980
+ # # ["blue", "1", 20],
1981
+ # # ["green", "2", 15],
1982
+ # # ["orange", "2", 30],
1983
+ # # ["white", "2", 20]
1984
+ # # ]
1985
def one_to_many(parent_fields, pattern)
  vars, numbers = one_to_many_components(pattern)
  result = DataFrame.new([], order: [*parent_fields, '_col_id', *vars])

  each_row do |row|
    # Parent fields are copied verbatim into every generated row.
    copied = parent_fields.map { |field| [field, row[field]] }.to_h

    numbers.each do |number|
      extracted = one_to_many_row(row, number, vars, pattern)
      # Skip slots in which every extracted value is nil.
      next if extracted.values.all?(&:nil?)

      result.add_row(copied.merge(extracted, '_col_id' => number))
    end
  end

  result.update
  result
end
2001
+
2002
# Splits vector +nm+ with +split_by_separator(sep)+ and stores every resulting
# vector in this DataFrame under the name "<nm><join><n>", n counting from 1.
# Each stored vector is first renamed to "<nm>:<key>".
def add_vectors_by_split_recode(nm, join = '-', sep = DaruLite::SPLIT_TOKEN)
  pieces = self[nm].split_by_separator(sep)
  pieces.each.with_index(1) do |(token, piece), ordinal|
    piece.rename("#{nm}:#{token}")
    self[:"#{nm}#{join}#{ordinal}"] = piece
  end
end
2010
+
2011
+ # Create a SQL CREATE TABLE statement based on a given Dataset
2012
+ #
2013
+ # == Arguments
2014
+ #
2015
+ # * table - String specifying name of the table that will created in SQL.
2016
+ # * charset - Character set. Default is "UTF8".
2017
+ #
2018
+ # @example
2019
+ #
2020
+ # ds = DaruLite::DataFrame.new({
2021
+ # :id => DaruLite::Vector.new([1,2,3,4,5]),
2022
+ # :name => DaruLite::Vector.new(%w{Alex Peter Susan Mary John})
2023
+ # })
2024
+ # ds.create_sql('names')
2025
+ # #=>"CREATE TABLE names (id INTEGER,\n name VARCHAR (255)) CHARACTER SET=UTF8;"
2026
+ #
2027
def create_sql(table, charset = 'UTF8')
  # One "<name> <db type>" definition per vector, in vector order.
  column_defs = vectors.to_a.map { |col| "#{col} #{self[col].db_type}" }
  "CREATE TABLE #{table} (#{column_defs.join(",\n ")}) CHARACTER SET=#{charset};"
end
2036
+
2037
+ # Returns the dataframe. This can be convenient when the user does not
2038
+ # know whether the object is a vector or a dataframe.
2039
+ # @return [self] the dataframe
2040
def to_df
  # Already a DataFrame: nothing to convert.
  itself
end
2043
+
2044
+ # Convert all vectors of type *:numeric* into a Matrix.
2045
def to_matrix
  # Non-numeric vectors are left out of the matrix.
  numeric_columns = each_vector.select(&:numeric?)
  Matrix.columns(numeric_columns.map(&:to_a))
end
2048
+
2049
+ # Converts the DataFrame into an array of hashes where key is vector name
2050
+ # and value is the corresponding element. The 0th index of the array contains
2051
+ # the array of hashes while the 1st index contains the indexes of each row
2052
+ # of the dataframe. Each element in the index array corresponds to its row
2053
+ # in the array of hashes, which has the same index.
2054
def to_a
  row_hashes = each_row.map(&:to_h)
  [row_hashes, @index.to_a]
end
2057
+
2058
+ # Convert to json. If no_index is true (the default) then the index will NOT
2059
+ # be included in the JSON thus created; pass false to include it.
2060
def to_json(no_index = true)
  # to_a returns [rows, index]; drop the index part unless it was asked for.
  payload = no_index ? to_a[0] : to_a
  payload.to_json
end
2067
+
2068
+ # Converts DataFrame to a hash (explicit) with keys as vector names and values as
2069
+ # the corresponding vectors.
2070
def to_h
  # Vector names and @data are positionally aligned.
  @vectors.each_with_index.to_h { |vec_name, idx| [vec_name, @data[idx]] }
end
2075
+
2076
+ # Convert to html for IRuby.
2077
def to_html(threshold = DaruLite.max_rows)
  # The template is rendered against this method's binding, so the local
  # names below are kept unchanged in case the template refers to them.
  table_thead = to_html_thead
  table_tbody = to_html_tbody(threshold)
  erb_file = index.is_a?(MultiIndex) ? 'dataframe_mi.html.erb' : 'dataframe.html.erb'
  path = File.expand_path("iruby/templates/#{erb_file}", __dir__)
  ERB.new(File.read(path).strip).result(binding)
end
2087
+
2088
# Renders the table header; a MultiIndex row index selects its own template.
def to_html_thead
  erb_file = index.is_a?(MultiIndex) ? 'dataframe_mi_thead.html.erb' : 'dataframe_thead.html.erb'
  table_thead_path = File.expand_path("iruby/templates/#{erb_file}", __dir__)
  ERB.new(File.read(table_thead_path).strip).result(binding)
end
2097
+
2098
# Renders the table body. A nil/false threshold falls back to the frame size.
def to_html_tbody(threshold = DaruLite.max_rows)
  # `threshold` stays a local of this method because the template is rendered
  # against this binding.
  threshold ||= @size
  erb_file = index.is_a?(MultiIndex) ? 'dataframe_mi_tbody.html.erb' : 'dataframe_tbody.html.erb'
  table_tbody_path = File.expand_path("iruby/templates/#{erb_file}", __dir__)
  ERB.new(File.read(table_tbody_path).strip).result(binding)
end
2108
+
2109
+ def to_s
2110
+ "#<#{self.class}#{": #{@name}" if @name}(#{nrows}x#{ncols})>"
2111
+ end
2112
+
2113
+ # Method for updating the metadata (i.e. missing value positions) of the
2114
+ # vectors after assignment/deletion etc. are complete. This is provided so that
2115
+ # time is not wasted in creating the metadata for the vector each time
2116
+ # assignment/deletion of elements is done. Updating data this way is called
2117
+ # lazy loading. To set or unset lazy loading, see the .lazy_update= method.
2118
def update
  # Nothing to refresh unless lazy updating is switched on.
  return unless DaruLite.lazy_update

  @data.each(&:update)
end
2121
+
2122
+ # Rename the DataFrame.
2123
def rename(new_name)
  # Returns the receiver so calls can be chained.
  tap { @name = new_name }
end

alias name= rename
2129
+
2130
+ # Write this DataFrame to a CSV file.
2131
+ #
2132
+ # == Arguments
2133
+ #
2134
+ # * filename - Path of CSV file where the DataFrame is to be saved.
2135
+ #
2136
+ # == Options
2137
+ #
2138
+ # * convert_comma - If set to *true*, will convert any commas in any
2139
+ # of the data to full stops ('.').
2140
+ # All the options accepted by CSV.read() can also be passed into this
2141
+ # function.
2142
def write_csv(filename, opts = {})
  # CSV serialisation is implemented by the IO module.
  DaruLite::IO.dataframe_write_csv(self, filename, opts)
end
2145
+
2146
+ # Write this dataframe to an Excel Spreadsheet
2147
+ #
2148
+ # == Arguments
2149
+ #
2150
+ # * filename - The path of the file where the DataFrame should be written.
2151
def write_excel(filename, opts = {})
  # Spreadsheet serialisation is implemented by the IO module.
  DaruLite::IO.dataframe_write_excel(self, filename, opts)
end
2154
+
2155
+ # Insert each case of the Dataset on the selected table
2156
+ #
2157
+ # == Arguments
2158
+ #
2159
+ # * dbh - DBI database connection object.
2160
+ # * query - Query string.
2161
+ #
2162
+ # == Usage
2163
+ #
2164
+ # ds = DaruLite::DataFrame.new({:id=>DaruLite::Vector.new([1,2,3]), :name=>DaruLite::Vector.new(["a","b","c"])})
2165
+ # dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
2166
+ # ds.write_sql(dbh,"test")
2167
def write_sql(dbh, table)
  # Row insertion is implemented by the IO module.
  DaruLite::IO.dataframe_write_sql(self, dbh, table)
end
2170
+
2171
+ # Use marshalling to save dataframe to a file.
2172
def save(filename)
  # Persisting to disk is implemented by the IO module.
  DaruLite::IO.save(self, filename)
end
2175
+
2176
+ def _dump(_depth)
2177
+ Marshal.dump(
2178
+ data: @data,
2179
+ index: @index.to_a,
2180
+ order: @vectors.to_a,
2181
+ name: @name
2182
+ )
2183
+ end
2184
+
2185
# Marshal hook: rebuilds a DataFrame from the hash written by #_dump.
# NOTE(review): Marshal.load must only ever be fed trusted input.
def self._load(data)
  payload, index, order, name = Marshal.load(data).values_at(:data, :index, :order, :name)
  DaruLite::DataFrame.new(payload, index: index, order: order, name: name)
end
2192
+
2193
+ # Transpose a DataFrame, transposing elements and row, column indexing.
2194
def transpose
  # Rows become vectors, so index and vector order swap roles.
  swapped = each_vector.map(&:to_a).transpose
  DaruLite::DataFrame.new(swapped, index: @vectors, order: @index, dtype: @dtype, name: @name)
end
2203
+
2204
+ # Pretty print in a nice table format for the command line (irb/pry/iruby)
2205
+ def inspect(spacing = DaruLite.spacing, threshold = DaruLite.max_rows)
2206
+ name_part = @name ? ": #{@name} " : ''
2207
+ spacing = [headers.to_a.map(&:length).max, spacing].max
2208
+
2209
+ "#<#{self.class}#{name_part}(#{nrows}x#{ncols})>#{$INPUT_RECORD_SEPARATOR}" +
2210
+ Formatters::Table.format(
2211
+ each_row.lazy,
2212
+ row_headers: row_headers,
2213
+ headers: headers,
2214
+ threshold: threshold,
2215
+ spacing: spacing
2216
+ )
2217
+ end
2218
+
2219
+ # Query a DataFrame by passing a DaruLite::Core::Query::BoolArray object.
2220
def where(bool_array)
  # Row filtering is implemented by Core::Query.
  DaruLite::Core::Query.df_where(self, bool_array)
end
2223
+
2224
# Two DataFrames are equal when class, size, index, vector names and every
# vector's contents match.
def ==(other)
  same_layout =
    self.class == other.class &&
    @size == other.size &&
    @index == other.index &&
    @vectors == other.vectors

  # Contents are only compared once the cheaper layout checks have passed.
  same_layout && @vectors.to_a.all? { |vec_name| self[vec_name] == other[vec_name] }
end
2231
+
2232
+ # Converts the specified non category type vectors to category type vectors
2233
+ # @param [Array] names of non category type vectors to be converted
2234
+ # @return [DaruLite::DataFrame] data frame in which specified vectors have been
2235
+ # converted to category type
2236
+ # @example
2237
+ # df = DaruLite::DataFrame.new({
2238
+ # a: [1, 2, 3],
2239
+ # b: ['a', 'a', 'b']
2240
+ # })
2241
+ # df.to_category :b
2242
+ # df[:b].type
2243
+ # # => :category
2244
def to_category(*names)
  # Converts in place and returns the receiver.
  names.each_with_object(self) do |vec_name, frame|
    frame[vec_name] = frame[vec_name].to_category
  end
end
2248
+
2249
# Gives attribute-style access to vectors:
#   df.foo        # => df[:foo] (or df['foo'] when only the String name exists)
#   df.foo = vec  # inserts or replaces the vector
# Anything else is passed on to the default handler, which raises NoMethodError.
def method_missing(name, *args, &block)
  # Comparison operators also contain "=" after another character; without
  # this exclusion `df <= x` would silently create a vector named :<.
  if /(.+)=/.match?(name) && !%i[<= >= <=> == === !=].include?(name)
    name = name[/(.+)=/].delete('=')
    # Prefer an existing String-named vector, otherwise use a Symbol name.
    name = name.to_sym unless has_vector?(name)
    insert_or_modify_vector [name], args[0]
  elsif has_vector?(name)
    self[name]
  elsif has_vector?(name.to_s)
    self[name.to_s]
  else
    super
  end
end
2262
+
2263
+ def respond_to_missing?(name, include_private = false)
2264
+ name.to_s.end_with?('=') || has_vector?(name) || super
2265
+ end
2266
+
2267
# Contrast-codes each named vector (+full+ supplies the per-vector `full:`
# flag) and builds a DataFrame from the product of the coded vectors.
def interact_code(vector_names, full)
  coded = vector_names.zip(full).map do |vec_name, is_full|
    self[vec_name].contrast_code(full: is_full).each.to_a
  end

  product = recursive_product(coded)
  DaruLite::DataFrame.new(product, order: product.map(&:name))
end
2276
+
2277
+ # Split the dataframe into many dataframes based on category vector
2278
+ # @param [object] cat_name name of category vector to split the dataframe
2279
+ # @return [Array] array of dataframes split by category with category vector
2280
+ # used to split not included
2281
+ # @example
2282
+ # df = DaruLite::DataFrame.new({
2283
+ # a: [1, 2, 3],
2284
+ # b: ['a', 'a', 'b']
2285
+ # })
2286
+ # df.to_category :b
2287
+ # df.split_by_category :b
2288
+ # # => [#<DaruLite::DataFrame: a (2x1)>
2289
+ # # a
2290
+ # # 0 1
2291
+ # # 1 2,
2292
+ # # #<DaruLite::DataFrame: b (1x1)>
2293
+ # # a
2294
+ # # 2 3]
2295
def split_by_category(cat_name)
  category_vector = self[cat_name]
  unless category_vector.category?
    raise ArgumentError, "#{cat_name} is not a category vector"
  end

  category_vector.categories.map do |category|
    # Keep the rows of this category, name the frame after it and drop the
    # splitting vector itself.
    subset = where(category_vector.eq(category))
    subset.rename(category).delete_vector(cat_name)
  end
end
2306
+
2307
+ # @param indexes [Array] index(s) at which row tuples are retrieved
2308
+ # @return [Array] returns array of row tuples at given index(s)
2309
+ # @example Using DaruLite::Index
2310
+ # df = DaruLite::DataFrame.new({
2311
+ # a: [1, 2, 3],
2312
+ # b: ['a', 'a', 'b']
2313
+ # })
2314
+ #
2315
+ # df.access_row_tuples_by_indexs(1,2)
2316
+ # # => [[2, "a"], [3, "b"]]
2317
+ #
2318
+ # df.index = DaruLite::Index.new([:one,:two,:three])
2319
+ # df.access_row_tuples_by_indexs(:one,:three)
2320
+ # # => [[1, "a"], [3, "b"]]
2321
+ #
2322
+ # @example Using DaruLite::MultiIndex
2323
+ # mi_idx = DaruLite::MultiIndex.from_tuples [
2324
+ # [:a,:one,:bar],
2325
+ # [:a,:one,:baz],
2326
+ # [:b,:two,:bar],
2327
+ # [:a,:two,:baz],
2328
+ # ]
2329
+ # df_mi = DaruLite::DataFrame.new({
2330
+ # a: 1..4,
2331
+ # b: 'a'..'d'
2332
+ # }, index: mi_idx )
2333
+ #
2334
+ # df_mi.access_row_tuples_by_indexs(:b, :two, :bar)
2335
+ # # => [[3, "c"]]
2336
+ # df_mi.access_row_tuples_by_indexs(:a)
2337
+ # # => [[1, "a"], [2, "b"], [4, "d"]]
2338
def access_row_tuples_by_indexs(*indexes)
  if @index.is_a?(DaruLite::MultiIndex)
    return get_sub_dataframe(indexes, by_position: false).map_rows(&:to_a)
  end

  positions = @index.pos(*indexes)
  unless positions.is_a?(Numeric)
    # Several rows: fetch per-vector slices, then regroup them row by row.
    slices = get_rows_for(indexes, by_position: false)
    return indexes.map { |idx| slices.map { |slice| slice[idx] } }
  end

  # Single position: always hand back an array of tuples.
  tuple = get_rows_for([positions])
  tuple.first.is_a?(Array) ? tuple : [tuple]
end
2351
+
2352
+ # Function to use for aggregating the data.
2353
+ #
2354
+ # @param options [Hash] options for column, you want in resultant dataframe
2355
+ #
2356
+ # @return [DaruLite::DataFrame]
2357
+ #
2358
+ # @example
2359
+ # df = DaruLite::DataFrame.new(
2360
+ # {col: [:a, :b, :c, :d, :e], num: [52,12,07,17,01]})
2361
+ # => #<DaruLite::DataFrame(5x2)>
2362
+ # col num
2363
+ # 0 a 52
2364
+ # 1 b 12
2365
+ # 2 c 7
2366
+ # 3 d 17
2367
+ # 4 e 1
2368
+ #
2369
+ # df.aggregate(num_100_times: ->(df) { (df.num*100).first })
2370
+ # => #<DaruLite::DataFrame(5x1)>
2371
+ # num_100_ti
2372
+ # 0 5200
2373
+ # 1 1200
2374
+ # 2 700
2375
+ # 3 1700
2376
+ # 4 100
2377
+ #
2378
+ # When we have duplicate index :
2379
+ #
2380
+ # idx = DaruLite::CategoricalIndex.new [:a, :b, :a, :a, :c]
2381
+ # df = DaruLite::DataFrame.new({num: [52,12,07,17,01]}, index: idx)
2382
+ # => #<DaruLite::DataFrame(5x1)>
2383
+ # num
2384
+ # a 52
2385
+ # b 12
2386
+ # a 7
2387
+ # a 17
2388
+ # c 1
2389
+ #
2390
+ # df.aggregate(num: :mean)
2391
+ # => #<DaruLite::DataFrame(3x1)>
2392
+ # num
2393
+ # a 25.3333333
2394
+ # b 12
2395
+ # c 1
2396
+ #
2397
+ # Note: `GroupBy` class `aggregate` method uses this `aggregate` method
2398
+ # internally.
2399
def aggregate(options = {}, multi_index_level = -1)
  positions_tuples, new_index =
    if block_given?
      yield(@index) # NOTE: use of yield is private for now
    else
      group_index_for_aggregation(@index, multi_index_level)
    end

  aggregated = aggregate_by_positions_tuples(options, positions_tuples)
  DaruLite::DataFrame.new(aggregated, index: new_index, order: options.keys)
end
2410
+
2411
# Shortcut for `group_by(*keys).aggregate(map)`.
def group_by_and_aggregate(*group_by_keys, **aggregation_map)
  groups = group_by(*group_by_keys)
  groups.aggregate(aggregation_map)
end
2414
+
2415
+ private
2416
+
2417
# Column headers for table output: the index name (if any) followed by the
# vector names.
def headers
  labels = Array(index.name) + @vectors.to_a
  DaruLite::Index.new(labels)
end
2420
+
2421
# Row labels for table output; a MultiIndex contributes its sparse tuples.
def row_headers
  return index.sparse_tuples if index.is_a?(MultiIndex)

  index.to_a
end
2424
+
2425
# Replaces every categorical vector among +names+ by a plain vector of its
# integer codes and returns [name, original_vector] pairs so the originals
# can be put back with #restore_categorical_vectors.
def convert_categorical_vectors(names)
  names.each_with_object([]) do |vec_name, originals|
    vector = self[vec_name]
    next unless vector.category?

    originals << [vec_name, vector]
    self[vec_name] = DaruLite::Vector.new(vector.to_ints)
  end
end
2434
+
2435
# Puts back the vectors saved by #convert_categorical_vectors.
def restore_categorical_vectors(old)
  old.each do |(vec_name, original)|
    self[vec_name] = original
  end
end
2438
+
2439
# Combines lists of vectors into the list of all element-wise products, taking
# one vector from each list. Every product is renamed to the ':'-joined names
# of its factors.
#
# @param dfs [Array<Array>] one array of vectors per factor
# @return [Array] the product vectors; the single list itself when only one
#   is given, and an empty array for empty input
def recursive_product(dfs)
  return [] if dfs.empty?

  # Destructure instead of shifting so the caller's array is left untouched.
  left, *rest = dfs
  return left if rest.empty?

  right = recursive_product(rest)
  left.product(right).map do |dv1, dv2|
    (dv1 * dv2).rename "#{dv1.name}:#{dv2.name}"
  end
end
2449
+
2450
# Returns +val+ unchanged when it is a DaruLite::Vector, raises TypeError otherwise.
def should_be_vector!(val)
  unless val.is_a?(DaruLite::Vector)
    raise TypeError, "Every iteration must return DaruLite::Vector not #{val.class}"
  end

  val
end
2455
+
2456
# Calls "<method>_vector" for the :vector/:column axis or "<method>_row" for
# the :row axis, forwarding arguments and block.
def dispatch_to_axis(axis, method, *args, &block)
  suffix =
    case axis
    when :vector, :column then 'vector'
    when :row then 'row'
    else raise ArgumentError, "Unknown axis #{axis}"
    end

  send(:"#{method}_#{suffix}", *args, &block)
end
2465
+
2466
# Plural variant of #dispatch_to_axis: calls "<method>_vectors" or
# "<method>_rows".
def dispatch_to_axis_pl(axis, method, *args, &block)
  plural = { vector: 'vectors', column: 'vectors', row: 'rows' }[axis]
  raise ArgumentError, "Unknown axis #{axis}" unless plural

  send(:"#{method}_#{plural}", *args, &block)
end
2475
+
2476
AXES = %i[row vector].freeze

# Removes and returns a trailing axis marker (:row or :vector) from +names+;
# when there is none, +names+ is left alone and +default+ is returned.
def extract_axis(names, default = :vector)
  return default unless AXES.include?(names.last)

  names.pop
end
2485
+
2486
# Vector lookup entry point: a Range selects a sub-frame, otherwise the lookup
# depends on whether the vector index is a MultiIndex.
def access_vector(*names)
  first = names.first
  return dup(@vectors.subset(first)) if first.is_a?(Range)
  return access_vector_multi_index(*names) if @vectors.is_a?(MultiIndex)

  access_vector_single_index(*names)
end
2495
+
2496
# Vector lookup for a MultiIndex of vector names. An Integer lookup result
# returns that single vector; otherwise a DataFrame of the matches is built.
def access_vector_multi_index(*names)
  located = @vectors[names]
  return @data[located] if located.is_a?(Integer)

  columns = located.map { |tuple| @data[@vectors[tuple]] }

  # For a partial key the levels already given are dropped from the new order.
  order = names.size < @vectors.width ? located.drop_left_level(names.size) : located

  DaruLite::DataFrame.new(columns, index: @index, order: order)
end
2507
+
2508
# Vector lookup for a single-level vector index. One name resolving to a
# numeric position returns that vector; anything else yields a DataFrame.
def access_vector_single_index(*names)
  if names.count < 2
    requested = names.first
    located =
      begin
        if @vectors.is_a?(DaruLite::DateTimeIndex)
          @vectors[requested]
        else
          @vectors.pos(requested)
        end
      rescue IndexError
        raise IndexError, "Specified vector #{requested} does not exist"
      end
    return @data[located] if located.is_a?(Numeric)

    # A non-numeric lookup result is used as the list of names from here on.
    names = located
  end

  columns = names.map { |vec_name| [vec_name, @data[@vectors.pos(vec_name)]] }.to_h
  order = names.is_a?(Array) ? DaruLite::Index.new(names) : names

  DaruLite::DataFrame.new(columns, order: order, index: @index, name: @name)
end
2525
+
2526
# Row lookup: one resolved position gives a Vector (named after the first
# requested index), several rows give a DataFrame.
def access_row(*indexes)
  positions = @index.pos(*indexes)

  unless positions.is_a?(Numeric)
    rows = get_rows_for(indexes, by_position: false)
    return DaruLite::DataFrame.new(rows, index: @index.subset(*indexes), order: @vectors)
  end

  DaruLite::Vector.new(get_rows_for([positions]), index: @vectors, name: indexes.first)
end
2537
+
2538
+ # @param keys [Array] can be an array of positions (if by_position is true) or indexes (if by_position is false)
2539
+ # because of coercion by DaruLite::Vector#at and DaruLite::Vector#[], can return either an Array of
2540
+ # values (representing a row) or an array of Vectors (that can be seen as rows)
2541
def get_rows_for(keys, by_position: true)
  # Still a RuntimeError, as the previous bare `raise` produced, but now with
  # a message that names the actual problem.
  raise "keys must be an Array, got #{keys.class}" unless keys.is_a?(Array)

  if by_position
    @data.map { |vector| vector.at(*keys) }
  else
    # TODO: for now (2018-07-27), it is different than using
    # get_rows_for(@index.pos(*keys))
    # because DaruLite::Vector#at and DaruLite::Vector#[] don't handle DaruLite::MultiIndex the same way
    @data.map { |vector| vector[*keys] }
  end
end
2555
+
2556
# Inserts or replaces the vector identified by +name+ (an Array; only its
# first element is used unless the vector index is a MultiIndex).
def insert_or_modify_vector(name, vector)
  key = @vectors.is_a?(MultiIndex) ? name : name[0]

  # An empty frame takes its index from the first inserted vector.
  return insert_vector_in_empty(key, vector) if @index.empty?

  assign_or_add_vector(key, prepare_for_insert(key, vector))
end
2567
+
2568
# Stores +v+ under +name+: overwrites the matching vector(s) when the name
# resolves, otherwise adds a new vector.
def assign_or_add_vector(name, v)
  # FIXME: fix this jugaad. need to make changes in Indexing itself.
  pos =
    begin
      @vectors[name]
    rescue IndexError
      name
    end

  return assign_multiple_vectors(pos, v) if pos.is_a?(DaruLite::Index)

  overwrite_in_place =
    pos == name && (@vectors.include?(name) || (pos.is_a?(Integer) && pos < @data.size))

  if overwrite_in_place
    @data[pos] = v
  else
    assign_or_add_vector_rough(name, v)
  end
end
2586
+
2587
# Stores the same vector +v+ at every label contained in +pos+.
def assign_multiple_vectors(pos, v)
  pos.each { |label| @data[@vectors[label]] = v }
end
2592
+
2593
# Registers +name+ in the vector index when it is missing, then stores +v+ at
# the position the index reports for it.
def assign_or_add_vector_rough(name, v)
  @vectors = @vectors | [name] unless @vectors.include?(name)

  slot = @vectors[name]
  @data[slot] = v
end
2597
+
2598
# Inserts the first vector into a frame whose index is empty: the frame takes
# over the new vector's index and empty vectors are reindexed to it.
def insert_vector_in_empty(name, vector)
  coerced = Vector.coerce(vector.to_a, name: coerce_name(name))

  @index = coerced.index
  assign_or_add_vector(name, coerced)
  set_size

  @data.map! do |existing|
    existing.empty? ? existing.reindex(@index) : existing
  end
end
2607
+
2608
# Turns +arg+ into a vector ready for insertion: Vectors, anything responding
# to #to_a, and everything else each take their own path.
def prepare_for_insert(name, arg)
  return prepare_vector_for_insert(name, arg) if arg.is_a?(DaruLite::Vector)
  return prepare_enum_for_insert(name, arg) if arg.respond_to?(:to_a)

  prepare_value_for_insert(name, arg)
end
2617
+
2618
# Aligns +vector+ with this frame's index. Labels missing from +vector+ are
# filled with nil.
def prepare_vector_for_insert(name, vector)
  # so that index-by-index assignment is avoided when possible.
  return vector.dup if vector.index == @index

  aligned = DaruLite::Vector.new([], name: coerce_name(name), index: @index)
  @index.each do |label|
    aligned[label] = vector.index.include?(label) ? vector[label] : nil
  end
  aligned
end
2628
+
2629
# Wraps an enumerable of exactly @size elements in a Vector on this frame's
# index; any other length raises.
def prepare_enum_for_insert(name, enum)
  unless @size == enum.size
    raise "Specified vector of length #{enum.size} cannot be inserted in DataFrame of size #{@size}"
  end

  DaruLite::Vector.new(enum, name: coerce_name(name), index: @index)
end
2636
+
2637
# Repeats a single value @size times to form a Vector on this frame's index.
def prepare_value_for_insert(name, value)
  repeated = Array(value) * @size
  DaruLite::Vector.new(repeated, name: coerce_name(name), index: @index)
end
2640
+
2641
# Writes +vector+ as the row at +indexes+, one element per vector, then
# refreshes the frame's index and size from the first vector.
def insert_or_modify_row(indexes, vector)
  row = coerce_vector(vector)

  unless row.size == @vectors.size
    raise SizeError, 'Vector length should match row length'
  end

  @data.each.with_index do |column, pos|
    column.send(:set, indexes, row.at(pos))
  end
  @index = @data[0].index

  set_size
end
2654
+
2655
# Sets up @vectors and @index and creates one empty Vector per vector name.
def create_empty_vectors(vectors, index)
  @vectors = Index.coerce(vectors)
  @index = Index.coerce(index)

  @data = @vectors.map do |vec_name|
    DaruLite::Vector.new([], name: coerce_name(vec_name), index: @index)
  end
end
2663
+
2664
# Checks that the number of vector names matches the number of vectors and
# that the index is as long as the first vector.
def validate_labels
  if @vectors && @vectors.size != @data.size
    raise IndexError,
          "Expected equal number of vector names (#{@vectors.size}) for number of vectors (#{@data.size})."
  end

  first_vector = @data[0]
  rows_mismatch = @index && first_vector && @index.size != first_vector.size
  raise IndexError, 'Expected number of indexes same as number of rows' if rows_mismatch
end
2674
+
2675
# Raises IndexError when any vector's length differs from @size.
def validate_vector_sizes
  return @data unless @data.any? { |vector| vector.size != @size }

  raise IndexError, 'Expected vectors with equal length'
end
2680
+
2681
# Runs both consistency checks: labels first, then vector lengths.
def validate
  validate_labels
  validate_vector_sizes
end
2685
+
2686
# Caches the number of rows, taken from the index.
def set_size
  @index.size.tap { |row_count| @size = row_count }
end
2689
+
2690
# Resolves +index+ to an index label: returned as is when the index includes
# it, otherwise looked up through @index.key; raises when neither matches.
def named_index_for(index)
  return index if @index.include?(index)

  @index.key(index) || raise(IndexError, "Specified index #{index} does not exist.")
end
2699
+
2700
# Builds @vectors from the requested order. Source keys missing from the
# order are appended; a ready-made Index/MultiIndex is used untouched.
def create_vectors_index_with(vectors, source)
  names = vectors.nil? ? source.keys : vectors

  @vectors =
    case names
    when Index, MultiIndex
      names
    else
      DaruLite::Index.new((names + (source.keys - names)).uniq)
    end
end
2710
+
2711
# True when every vector in +source+ has an index equal to the first one's.
def all_vectors_have_equal_indexes?(source)
  vectors = source.values
  reference = vectors.first.index

  vectors.all? { |vector| reference == vector.index }
end
2716
+
2717
# Array names (e.g. MultiIndex tuples) are joined into one String; any other
# name is returned unchanged.
def coerce_name(potential_name)
  return potential_name unless potential_name.is_a?(Array)

  potential_name.join
end
2720
+
2721
# Chooses the array initializer from the class of the elements (Array, Vector
# or Hash). All elements must share one class.
def initialize_from_array(source, vectors, index, opts)
  if source.map(&:class).uniq.size != 1
    raise ArgumentError, 'All objects in data source should be same class'
  end

  # Positional names 0..n-1, used when no order was supplied.
  positional = -> { (0...source.size).to_a }

  case source.first
  when Array
    initialize_from_array_of_arrays(source, vectors || positional.call, index, opts)
  when Vector
    initialize_from_array_of_vectors(source, vectors || positional.call, index, opts)
  when Hash
    initialize_from_array_of_hashes(source, vectors, index, opts)
  else
    raise ArgumentError, "Can't create DataFrame from #{source}"
  end
end
2738
+
2739
# Initializes the frame from an array of arrays, one inner array per vector.
#
# @param source [Array<Array>] the vector data
# @param vectors [Array] vector names (the order); one per inner array
# @param index [Object, nil] row index; defaults to the length of the first array
# @raise [ArgumentError] when the number of names differs from the number of arrays
def initialize_from_array_of_arrays(source, vectors, index, _opts)
  if source.size != vectors.size
    # +source+ holds the vectors and +vectors+ holds the order; the message
    # used to report the two sizes under each other's label.
    raise ArgumentError, "Number of vectors (#{source.size}) should " \
                         "equal order size (#{vectors.size})"
  end

  @index = Index.coerce(index || source[0].size)
  @vectors = Index.coerce(vectors)

  update_data source, vectors
end
2750
+
2751
# Builds the frame from an Array of vectors by pairing each name in
# +vectors+ with the element of +source+ at the same position and
# re-running #initialize with the resulting Hash.
#
# Cloning is requested unless opts[:clone] is explicitly false.
def initialize_from_array_of_vectors(source, vectors, index, opts)
  should_clone = opts[:clone] != false

  columns = {}
  vectors.each_with_index { |name, pos| columns[name] = source[pos] }

  initialize(columns, index: index, order: vectors, name: @name, clone: should_clone)
end
2758
+
2759
# Builds the frame from an Array of Hashes, one Hash per row.
#
# Vector names are the keys of the first Hash, preceded by any names in
# +vectors+. A value is read with the name itself and, if that key is
# missing, with the name converted to a String.
def initialize_from_array_of_hashes(source, vectors, index, _opts)
  names = vectors.nil? ? source[0].keys : (vectors + source[0].keys).uniq

  @vectors = DaruLite::Index.new(names)
  @index = DaruLite::Index.new(index || source.size)

  @data = @vectors.map do |name|
    column = source.map do |row|
      row.fetch(name) { row[name.to_s] }
    end

    DaruLite::Vector.new(column, name: coerce_name(name), index: @index)
  end
end
2774
+
2775
# Builds the frame from a Hash source: sets up @vectors, then delegates to
# the vector-based builder when every value is a Vector (as decided by
# ArrayHelper.array_of?) and to the array-based builder otherwise.
def initialize_from_hash(source, vectors, index, opts)
  create_vectors_index_with(vectors, source)

  only_vectors = ArrayHelper.array_of?(source.values, Vector)
  return initialize_from_hash_with_vectors(source, index, opts) if only_vectors

  initialize_from_hash_with_arrays(source, index, opts)
end
2784
+
2785
# Fills @index and @data from a Hash whose values are vectors.
#
# Vectors are cloned unless opts[:clone] is explicitly false. Cloning is
# forced when no +index+ is given and the vectors do not share one index.
#
# @param source [Hash] vector name => vector
# @param index requested row index, or nil to deduce it from the vectors
# @param opts [Hash] only :clone is read here
def initialize_from_hash_with_vectors(source, index, opts)
  vectors_have_same_index = all_vectors_have_equal_indexes?(source)

  clone = opts[:clone] != false
  clone = true unless index || vectors_have_same_index

  @index = deduce_index index, source, vectors_have_same_index

  if clone
    @data = clone_vectors source, vectors_have_same_index
  else
    # Append in @vectors order (as clone_vectors does), not in the Hash's
    # insertion order, so each column stays aligned with its name when a
    # custom order was requested.
    @data.concat(@vectors.map { |name| source[name] })
  end
end
2799
+
2800
# Picks the row index for a Hash-of-vectors source.
#
# @param index explicit index; coerced and returned when not nil
# @param source [Hash] vector name => vector
# @param vectors_have_same_index [Boolean] whether all vectors share one index
# @return a coerced +index+, a copy of the first vector's index, or a new
#   index made of the sorted, de-duplicated labels of all vectors
def deduce_index(index, source, vectors_have_same_index)
  return Index.coerce(index) unless index.nil?
  return source.values[0].index.dup if vectors_have_same_index

  # sort only if missing indexes detected
  merged_labels = source.values.map { |v| v.index.to_a }.flatten.uniq.sort
  DaruLite::Index.new(merged_labels)
end
2813
+
2814
# Produces one vector per name in @vectors from +source+.
#
# When all supplied vectors share one index each vector is simply
# duplicated, which avoids matching indexes label by label. Otherwise a new
# vector over @index is filled label by label, using nil for labels that
# the original vector's index does not include.
def clone_vectors(source, vectors_have_same_index)
  @vectors.map do |name|
    next source[name].dup if vectors_have_same_index

    aligned = DaruLite::Vector.new([], name: name, index: @index)
    @index.each do |label|
      original = source[name]
      aligned[label] = original.index.include?(label) ? original[label] : nil
    end
    aligned
  end
end
2829
+
2830
# Fills @index and @data from a Hash whose values are plain collections.
#
# The row index is coerced from +index+ or, when that is nil, from the size
# of the first value of +source+. Every value is duplicated before being
# wrapped in a vector, and vectors are appended in @vectors order.
def initialize_from_hash_with_arrays(source, index, _opts)
  row_labels = index || source.values[0].size
  @index = Index.coerce(row_labels)

  @vectors.each do |name|
    column = DaruLite::Vector.new(source[name].dup, name: coerce_name(name), index: @index)
    @data.push(column)
  end
end
2837
+
2838
# Builds the array used to compare two rows while sorting.
#
# For each sort criterion the value is read from row +r1+ when that
# criterion is ascending and from row +r2+ otherwise. An optional
# per-vector block transforms the value; if the block raises a
# StandardError the value becomes nil. Nil handling is always applied when
# no block is given.
def sort_build_row(vector_locs, by_blocks, ascending, handle_nils, r1, r2) # rubocop:disable Metrics/ParameterLists
  Array.new(vector_locs.size) do |pos|
    transform = by_blocks[pos]
    asc = ascending[pos]

    row = asc ? r1 : r2
    value = @data[vector_locs[pos]].data[row]

    if transform
      value =
        begin
          transform.call(value)
        rescue StandardError
          nil
        end
    end

    sort_handle_nils(value, asc, handle_nils[pos] || !transform)
  end
end
2856
+
2857
# Wraps +value+ so that nils can be compared with other values.
#
# Without nil handling the value is returned as-is. Otherwise a
# [rank, value] pair is returned: for ascending order nil gets rank 0 and
# everything else rank 1; for descending order the ranks are swapped.
def sort_handle_nils(value, asc, handle_nil)
  return value unless handle_nil

  rank_for_nil, rank_for_value = asc ? [0, 1] : [1, 0]
  [value.nil? ? rank_for_nil : rank_for_value, value]
end
2866
+
2867
# Expands a boolean sort option into one flag per sorted vector.
#
# @param opts [Hash] sort options
# @param symbol [Symbol] option to read from +opts+
# @param default value used for every vector when the option is nil
# @param size [Integer] number of sorted vectors
# @return [Array] +size+ flags, or the given Array when it already has +size+ entries
# @raise [ArgumentError] when an Array of the wrong size or an unsupported type is given
def sort_coerce_boolean(opts, symbol, default, size)
  val = opts[symbol]

  return Array.new(size, default) if val.nil?
  return Array.new(size, val) if val.equal?(true) || val.equal?(false)

  unless val.is_a?(Array)
    raise ArgumentError, "Can't coerce #{symbol} from #{val.class} to boolean option"
  end
  raise ArgumentError, "Specify same number of vector names and #{symbol}" if size != val.size

  val
end
2883
+
2884
# Prepares the comparator used to sort rows by several vectors.
#
# @param vector_order names of the vectors to sort by, most significant first
# @param opts [Hash] reads :ascending, :handle_nils and :by
# @return [Proc] lambda taking two row positions and returning their <=> result
def sort_prepare_block(vector_order, opts)
  count = vector_order.size
  ascending = sort_coerce_boolean(opts, :ascending, true, count)
  handle_nils = sort_coerce_boolean(opts, :handle_nils, false, count)

  transforms = opts[:by] || {}
  by_blocks = vector_order.map { |name| transforms[name] }
  vector_locs = vector_order.map { |name| @vectors[name] }

  lambda do |index1, index2|
    # Build left and right array to compare two rows
    left = sort_build_row(vector_locs, by_blocks, ascending, handle_nils, index1, index2)
    right = sort_build_row(vector_locs, by_blocks, ascending, handle_nils, index2, index1)

    # Resolve conflict by Index if all attributes are same
    left.push(index1) <=> right.push(index2)
  end
end
2902
+
2903
# Formats one verification failure.
#
# @param row row being checked; read with the field names and with +id+
# @param test [Array] description first, then the list of field names
# @param id key of the identifying value in +row+
# @param i [Integer] zero-based row number (reported one-based)
# @return [String] "<n> [<id value>]: <description>" plus " (k=v, ...)" when fields are listed
def verify_error_message(row, test, id, i)
  description, fields, = test

  suffix = ''
  unless fields.empty?
    pairs = fields.map { |k| "#{k}=#{row[k]}" }
    suffix = " (#{pairs.join(', ')})"
  end

  "#{i + 1} [#{row[id]}]: #{description}#{suffix}"
end
2908
+
2909
# Determines which vectors supply the values of a pivot table.
#
# @param index vector names used for the pivot rows
# @param vectors vector names used for the pivot columns
# @param opts [Hash] reads :values
# @return [Array] opts[:values] as an Array; when it is nil, the numeric
#   vectors that are used neither as +index+ nor as +vectors+
def prepare_pivot_values(index, vectors, opts)
  requested = opts[:values]

  # values not specified at all.
  return (@vectors.to_a - (index | vectors)) & numeric_vector_names if requested.nil?

  # multiple values specified, or a single one that needs wrapping.
  requested.is_a?(Array) ? requested : [requested]
end
2919
+
2920
# Collects the raw pivot data and aggregates it.
#
# The result maps each group name to a Hash keyed by
# [value name, *values of the column vectors at that row]. Entries are
# collected as Arrays of the value vector's elements and then reduced in
# place by setup_pivot_aggregates.
#
# @param grouped object whose #groups maps group names to row numbers
# @param vectors names of the vectors that form the pivot columns
# @param values names of the vectors whose elements are aggregated
# @param aggregate_function passed on to setup_pivot_aggregates
# @return [Hash] the aggregated pivot Hash
def make_pivot_hash(grouped, vectors, values, aggregate_function)
  super_hash = grouped.groups.transform_values { |_| {} }

  values.each do |value|
    grouped.groups.each do |group_name, row_numbers|
      bucket = super_hash[group_name]

      row_numbers.each do |num|
        key = [value, *vectors.map { |v| self[v][num] }]
        (bucket[key] ||= []) << self[value][num]
      end
    end
  end

  setup_pivot_aggregates(super_hash, aggregate_function)
  super_hash
end
2937
+
2938
# Replaces, in place, every collected Array in the nested pivot Hash with
# the result of sending +aggregate_function+ to a vector built from it.
#
# @param super_hash [Hash] group name => { key => Array of collected values }
# @param aggregate_function [Symbol, String] method name sent to the vector
# @return [Hash] +super_hash+
def setup_pivot_aggregates(super_hash, aggregate_function)
  super_hash.each_value do |sub_hash|
    sub_hash.transform_values! do |aggregates|
      DaruLite::Vector.new(aggregates).send(aggregate_function)
    end
  end
end
2945
+
2946
# Turns the aggregated pivot Hash into a DataFrame.
#
# Row labels come from the outer keys and vector labels from the distinct
# inner keys, both converted with MultiIndex.from_tuples. Each aggregated
# value is then written into its cell.
#
# @param super_hash [Hash] row tuple => { vector tuple => value }
# @return [DaruLite::DataFrame]
def pivot_dataframe(super_hash)
  df_index = DaruLite::MultiIndex.from_tuples(super_hash.keys)
  column_tuples = super_hash.values.flat_map(&:keys).uniq
  df_vectors = DaruLite::MultiIndex.from_tuples(column_tuples)

  pivoted = DaruLite::DataFrame.new({}, index: df_index, order: df_vectors)
  super_hash.each do |row_index, cells|
    cells.each { |vector_index, val| pivoted[vector_index][row_index] = val }
  end
  pivoted
end
2958
+
2959
# Extracts the variable names and numbers encoded in the vector names.
#
# +pattern+ describes a vector name: '%v' stands for the variable part and
# '%n' for the number part (for example 'car_%v%n' for 'car_color1').
# Vectors whose name does not match the pattern are ignored.
#
# @param pattern [String] name pattern containing '%v' followed by '%n'
# @return [Array(Array, Array<Integer>)] distinct variable parts, and the
#   distinct numbers in ascending order; both empty when nothing matches
def one_to_many_components(pattern)
  body = pattern.gsub('%v', '(.+?)').gsub('%n', '(\\d+?)')
  # The pattern describes a whole name, so anchor it. Unanchored, the lazy
  # \d+? stops after one digit and 'car_color10' would yield number 1.
  re = Regexp.new("\\A(?:#{body})\\z")

  vars, numbers =
    @vectors
    .map { |v| v.scan(re) }
    .reject(&:empty?).flatten(1).transpose

  # Array() covers the case where no vector matched and transpose returned [].
  [Array(vars).uniq, Array(numbers).map(&:to_i).sort.uniq]
end
2969
+
2970
# Reads, for one number, the value of every variable from +row+.
#
# The vector name of each variable is obtained by substituting the variable
# for '%v' and the number for '%n' in +pattern+.
#
# @return [Hash] variable => value read from +row+ under the built name
def one_to_many_row(row, number, vars, pattern)
  vars.each_with_object({}) do |var, values|
    column = pattern.sub('%v', var).sub('%n', number.to_s)
    values[var] = row[column]
  end
end
2977
+
2978
# Raises IndexError when one of the positions is not a valid position.
#
# A position is valid when it lies in -size...size, so negative positions
# (counted from the end) are accepted only while they stay in range.
#
# @param positions [Array<Integer>] positions to check
# @param size [Integer] number of addressable elements
# @raise [IndexError] for the first position outside -size...size
def validate_positions(*positions, size)
  positions.each do |pos|
    raise IndexError, "#{pos} is not a valid position." if pos >= size || pos < -size
  end
end
2984
+
2985
# Accepts a vector, a Hash or any other collection and turns it into a
# vector that can be added. Vectors and Hashes are reindexed by @vectors;
# everything else is only wrapped in a new vector.
def coerce_vector(vector)
  return vector.reindex(@vectors) if vector.is_a?(DaruLite::Vector)

  wrapped = DaruLite::Vector.new(vector)
  vector.is_a?(Hash) ? wrapped.reindex(@vectors) : wrapped
end
2996
+
2997
# Rebuilds @data with one vector per entry of @vectors. The vector at
# position n wraps source[n], is named vectors[n] and uses @index.
def update_data(source, vectors)
  columns = []
  @vectors.each_with_index do |_label, pos|
    columns << DaruLite::Vector.new(source[pos], index: @index, name: vectors[pos])
  end
  @data = columns
end
3002
+
3003
# Applies the aggregation +options+ to every group of row positions.
#
# @param options [Hash] name => aggregation method
# @param positions_tuples [Array<Array>] one list of row positions per group
# @return [Array<Array>] one Array per option, each holding one result per group
def aggregate_by_positions_tuples(options, positions_tuples)
  agg_over_vectors_only, options = cast_aggregation_options(options)

  unless agg_over_vectors_only
    methods = options.values

    # NOTE: because we aggregate over rows, we don't have to re-get sub-dfs for each method (which is expensive)
    rows = positions_tuples.map { |positions| apply_method_on_sub_df(methods, keys: positions) }
    return rows.transpose
  end

  options.map do |vect_name, method|
    vect = self[vect_name]
    positions_tuples.map { |positions| vect.apply_method_on_sub_vector(method, keys: positions) }
  end
end
3025
+
3026
# Converts operations over sub-vectors to operations over sub-dfs when it improves perf.
# We don't always "cast" because aggregation over a single vector / a few vectors is
# faster than aggregation over (sub-)dfs.
#
# @param options [Hash] name => aggregation method
# @return [Array(Boolean, Hash)] true with the untouched options when every
#   key names a vector; otherwise false with a copy in which each
#   vector-keyed method is wrapped in a lambda that takes a sub-df
def cast_aggregation_options(options)
  vects, non_vects = options.keys.partition { |k| @vectors.include?(k) }
  return [true, options] unless non_vects.any?

  # Work on a copy so the caller's Hash keeps its original methods.
  wrapped = options.clone
  vects.each do |name|
    on_vector = wrapped[name].to_proc
    wrapped[name] = ->(sub_df) { on_vector.call(sub_df[name]) }
  end

  [false, wrapped]
end
3047
+
3048
# Groups the positions of +index+ by label for aggregation.
#
# @param index [DaruLite::MultiIndex, DaruLite::Index, DaruLite::CategoricalIndex]
# @param multi_index_level [Integer] level passed to the MultiIndex grouping
#   helper; not used for the other index types
# @return [Array(Array<Array>, Object)] the positions of every group, and
#   the index (or list of distinct labels) naming the groups
# @raise [RuntimeError] when +index+ is none of the supported types
def group_index_for_aggregation(index, multi_index_level = -1)
  case index
  when DaruLite::MultiIndex
    groups_by_pos = DaruLite::Core::GroupBy.get_positions_group_for_aggregation(index, multi_index_level)

    new_index = DaruLite::MultiIndex.from_tuples(groups_by_pos.keys).coerce_index
    pos_tuples = groups_by_pos.values
  when DaruLite::Index, DaruLite::CategoricalIndex
    new_index = Array(index).uniq
    # pos may return one position or several; normalise to an Array.
    pos_tuples = new_index.map { |idx| [*index.pos(idx)] }
  else
    # Still a RuntimeError, as with the former bare `raise`, but now with a message.
    raise "Cannot group an index of type #{index.class} for aggregation"
  end

  [pos_tuples, new_index]
end
3063
+
3064
# Coerces ranges, integers and arrays in appropriate ways.
#
# @param positions one Integer, one Range, or several positions
# @param size [Integer] number of addressable elements, used to expand a Range
# @return the Integer itself, the positions selected by the Range out of
#   0...size, or the Array of positions when more than one was given
# @raise [ArgumentError] when the single position is neither Integer nor Range
def coerce_positions(*positions, size)
  return positions unless positions.size == 1

  position = positions.first
  case position
  when Integer then position
  when Range then size.times.to_a[position]
  else raise ArgumentError, 'Unknown position type.'
  end
end
3079
+ end
3080
+ end