polars-df 0.1.0 → 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,8 +1,22 @@
1
1
  module Polars
2
+ # Two-dimensional data structure representing data as a table with rows and columns.
2
3
  class DataFrame
4
+ # @private
3
5
  attr_accessor :_df
4
6
 
5
- def initialize(data = nil)
7
+ # Create a new DataFrame.
8
+ #
9
+ # @param data [Hash, Array, Series, nil]
10
+ # Two-dimensional data in various forms. Hash must contain Arrays.
11
+ # Array may contain Series.
12
+ # @param columns [Array, Hash, nil]
13
+ # Column labels to use for resulting DataFrame. If specified, overrides any
14
+ # labels already present in the data. Must match data dimensions.
15
+ # @param orient ["col", "row", nil]
16
+ # Whether to interpret two-dimensional data as columns or as rows. If `nil`,
17
+ # the orientation is inferred by matching the columns and data dimensions. If
18
+ # this does not yield conclusive results, column orientation is used.
19
+ def initialize(data = nil, columns: nil, orient: nil)
6
20
  if defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || data.is_a?(ActiveRecord::Result))
7
21
  result = data.is_a?(ActiveRecord::Result) ? data : data.connection.select_all(data.to_sql)
8
22
  data = {}
@@ -12,33 +26,135 @@ module Polars
12
26
  end
13
27
 
14
28
  if data.nil?
15
- self._df = hash_to_rbdf({})
29
+ self._df = hash_to_rbdf({}, columns: columns)
16
30
  elsif data.is_a?(Hash)
17
31
  data = data.transform_keys { |v| v.is_a?(Symbol) ? v.to_s : v }
18
- self._df = hash_to_rbdf(data)
32
+ self._df = hash_to_rbdf(data, columns: columns)
19
33
  elsif data.is_a?(Array)
20
- self._df = sequence_to_rbdf(data)
34
+ self._df = sequence_to_rbdf(data, columns: columns, orient: orient)
21
35
  elsif data.is_a?(Series)
22
- self._df = series_to_rbdf(data)
36
+ self._df = series_to_rbdf(data, columns: columns)
23
37
  else
24
38
  raise ArgumentError, "DataFrame constructor called with unsupported type; got #{data.class.name}"
25
39
  end
26
40
  end
27
41
 
42
+ # @private
28
43
  def self._from_rbdf(rb_df)
29
44
  df = DataFrame.allocate
30
45
  df._df = rb_df
31
46
  df
32
47
  end
33
48
 
34
- def self._read_csv(file, has_header: true)
49
+ # def self._from_hashes
50
+ # end
51
+
52
+ # def self._from_hash
53
+ # end
54
+
55
+ # def self._from_records
56
+ # end
57
+
58
+ # def self._from_numo
59
+ # end
60
+
61
+ # no self._from_arrow
62
+
63
+ # no self._from_pandas
64
+
65
+ # @private
66
# Read a CSV source into a DataFrame through the native reader.
#
# The keyword arguments are lightly normalized and then forwarded
# positionally to `RbDataFrame.read_csv`.
#
# @param file [Object]
#   Source to read. For a String or Pathname the result of
#   `Utils.format_path` is additionally passed as the path argument;
#   otherwise the path argument is nil.
# @param columns [Object, nil]
#   A single String is wrapped in an Array before being handed to
#   `Utils.handle_projection_columns`.
# @param dtypes [Hash, Array, nil]
#   A Hash is turned into `[name, dtype]` pairs (values converted with
#   `Utils.rb_type_to_dtype`); an Array is forwarded unchanged.
#
# @raise [ArgumentError] if `dtypes` is neither nil, a Hash nor an Array.
# @raise [Todo] if `file` is a String containing "*".
def self._read_csv(
  file,
  has_header: true,
  columns: nil,
  sep: ",",
  comment_char: nil,
  quote_char: '"',
  skip_rows: 0,
  dtypes: nil,
  null_values: nil,
  ignore_errors: false,
  parse_dates: false,
  n_threads: nil,
  infer_schema_length: 100,
  batch_size: 8192,
  n_rows: nil,
  encoding: "utf8",
  low_memory: false,
  rechunk: true,
  skip_rows_after_header: 0,
  row_count_name: nil,
  row_count_offset: 0,
  sample_size: 1024,
  eol_char: "\n"
)
  if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
    path = Utils.format_path(file)
  else
    path = nil
    # if defined?(StringIO) && file.is_a?(StringIO)
    #   file = file.string
    # end
  end

  # Exactly one of dtype_list / dtype_slice is set, depending on the shape
  # of dtypes; both stay nil when no dtypes were given.
  dtype_list = nil
  dtype_slice = nil
  unless dtypes.nil?
    if dtypes.is_a?(Hash)
      dtype_list = dtypes.map { |k, v| [k, Utils.rb_type_to_dtype(v)] }
    elsif dtypes.is_a?(Array)
      dtype_slice = dtypes
    else
      raise ArgumentError, "dtype arg should be list or dict"
    end
  end

  processed_null_values = Utils._process_null_values(null_values)

  columns = [columns] if columns.is_a?(String)
  # Glob patterns are not supported yet.
  raise Todo if file.is_a?(String) && file.include?("*")

  projection, columns = Utils.handle_projection_columns(columns)

  _from_rbdf(
    RbDataFrame.read_csv(
      file,
      infer_schema_length,
      batch_size,
      has_header,
      ignore_errors,
      n_rows,
      skip_rows,
      projection,
      sep,
      rechunk,
      columns,
      encoding,
      n_threads,
      path,
      dtype_list,
      dtype_slice,
      low_memory,
      comment_char,
      quote_char,
      processed_null_values,
      parse_dates,
      skip_rows_after_header,
      Utils._prepare_row_count_args(row_count_name, row_count_offset),
      sample_size,
      eol_char
    )
  )
end
41
156
 
157
+ # @private
42
158
  def self._read_parquet(file)
43
159
  if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
44
160
  file = Utils.format_path(file)
@@ -47,6 +163,44 @@ module Polars
47
163
  _from_rbdf(RbDataFrame.read_parquet(file))
48
164
  end
49
165
 
166
+ # def self._read_avro
167
+ # end
168
+
169
+ # @private
170
+ def self._read_ipc(
171
+ file,
172
+ columns: nil,
173
+ n_rows: nil,
174
+ row_count_name: nil,
175
+ row_count_offset: 0,
176
+ rechunk: true,
177
+ memory_map: true
178
+ )
179
+ if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
180
+ file = Utils.format_path(file)
181
+ end
182
+ if columns.is_a?(String)
183
+ columns = [columns]
184
+ end
185
+
186
+ if file.is_a?(String) && file.include?("*")
187
+ raise Todo
188
+ end
189
+
190
+ projection, columns = Utils.handle_projection_columns(columns)
191
+ _from_rbdf(
192
+ RbDataFrame.read_ipc(
193
+ file,
194
+ columns,
195
+ projection,
196
+ n_rows,
197
+ Utils._prepare_row_count_args(row_count_name, row_count_offset),
198
+ memory_map
199
+ )
200
+ )
201
+ end
202
+
203
+ # @private
50
204
  def self._read_json(file)
51
205
  if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
52
206
  file = Utils.format_path(file)
@@ -55,6 +209,7 @@ module Polars
55
209
  _from_rbdf(RbDataFrame.read_json(file))
56
210
  end
57
211
 
212
+ # @private
58
213
  def self._read_ndjson(file)
59
214
  if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
60
215
  file = Utils.format_path(file)
@@ -63,26 +218,157 @@ module Polars
63
218
  _from_rbdf(RbDataFrame.read_ndjson(file))
64
219
  end
65
220
 
221
+ # Get the shape of the DataFrame.
222
+ #
223
+ # @return [Array]
224
+ #
225
+ # @example
226
+ # df = Polars::DataFrame.new({"foo" => [1, 2, 3, 4, 5]})
227
+ # df.shape
228
+ # # => [5, 1]
66
229
  def shape
67
230
  _df.shape
68
231
  end
69
232
 
233
+ # Get the height of the DataFrame.
234
+ #
235
+ # @return [Integer]
236
+ #
237
+ # @example
238
+ # df = Polars::DataFrame.new({"foo" => [1, 2, 3, 4, 5]})
239
+ # df.height
240
+ # # => 5
70
241
  def height
71
242
  _df.height
72
243
  end
73
244
 
245
+ # Get the width of the DataFrame.
246
+ #
247
+ # @return [Integer]
248
+ #
249
+ # @example
250
+ # df = Polars::DataFrame.new({"foo" => [1, 2, 3, 4, 5]})
251
+ # df.width
252
+ # # => 1
74
253
  def width
75
254
  _df.width
76
255
  end
77
256
 
257
+ # Get column names.
258
+ #
259
+ # @return [Array]
260
+ #
261
+ # @example
262
+ # df = Polars::DataFrame.new({
263
+ # "foo" => [1, 2, 3],
264
+ # "bar" => [6, 7, 8],
265
+ # "ham" => ["a", "b", "c"]
266
+ # })
267
+ # df.columns
268
+ # # => ["foo", "bar", "ham"]
78
269
  def columns
79
270
  _df.columns
80
271
  end
81
272
 
273
+ # Change the column names of the DataFrame.
274
+ #
275
+ # @param columns [Array]
276
+ # A list with new names for the DataFrame.
277
+ # The length of the list should be equal to the width of the DataFrame.
278
+ #
279
+ # @return [Object]
280
+ #
281
+ # @example
282
+ # df = Polars::DataFrame.new({
283
+ # "foo" => [1, 2, 3],
284
+ # "bar" => [6, 7, 8],
285
+ # "ham" => ["a", "b", "c"]
286
+ # })
287
+ # df.columns = ["apple", "banana", "orange"]
288
+ # df
289
+ # # =>
290
+ # # shape: (3, 3)
291
+ # # ┌───────┬────────┬────────┐
292
+ # # │ apple ┆ banana ┆ orange │
293
+ # # │ --- ┆ --- ┆ --- │
294
+ # # │ i64 ┆ i64 ┆ str │
295
+ # # ╞═══════╪════════╪════════╡
296
+ # # │ 1 ┆ 6 ┆ a │
297
+ # # ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
298
+ # # │ 2 ┆ 7 ┆ b │
299
+ # # ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
300
+ # # │ 3 ┆ 8 ┆ c │
301
+ # # └───────┴────────┴────────┘
302
+ def columns=(columns)
303
+ _df.set_column_names(columns)
304
+ end
305
+
306
+ # Get dtypes of columns in DataFrame. Dtypes can also be found in column headers when printing the DataFrame.
307
+ #
308
+ # @return [Array]
309
+ #
310
+ # @example
311
+ # df = Polars::DataFrame.new({
312
+ # "foo" => [1, 2, 3],
313
+ # "bar" => [6.0, 7.0, 8.0],
314
+ # "ham" => ["a", "b", "c"]
315
+ # })
316
+ # df.dtypes
317
+ # # => [:i64, :f64, :str]
82
318
  def dtypes
83
- _df.dtypes.map(&:to_sym)
319
+ _df.dtypes
320
+ end
321
+
322
+ # Get the schema.
323
+ #
324
+ # @return [Hash]
325
+ #
326
+ # @example
327
+ # df = Polars::DataFrame.new({
328
+ # "foo" => [1, 2, 3],
329
+ # "bar" => [6.0, 7.0, 8.0],
330
+ # "ham" => ["a", "b", "c"]
331
+ # })
332
+ # df.schema
333
+ # # => {"foo"=>:i64, "bar"=>:f64, "ham"=>:str}
334
+ def schema
335
+ columns.zip(dtypes).to_h
84
336
  end
85
337
 
338
+ # def ==(other)
339
+ # end
340
+
341
+ # def !=(other)
342
+ # end
343
+
344
+ # def >(other)
345
+ # end
346
+
347
+ # def <(other)
348
+ # end
349
+
350
+ # def >=(other)
351
+ # end
352
+
353
+ # def <=(other)
354
+ # end
355
+
356
+ # def *(other)
357
+ # end
358
+
359
+ # def /(other)
360
+ # end
361
+
362
+ # def +(other)
363
+ # end
364
+
365
+ # def -(other)
366
+ # end
367
+
368
+ # def %(other)
369
+ # end
370
+
371
+ #
86
372
  def to_s
87
373
  _df.to_s
88
374
  end
@@ -92,10 +378,64 @@ module Polars
92
378
  columns.include?(name)
93
379
  end
94
380
 
381
+ # def each
382
+ # end
383
+
384
+ # def _pos_idx
385
+ # end
386
+
387
+ # def _pos_idxs
388
+ # end
389
+
390
+ #
95
391
  def [](name)
96
392
  Utils.wrap_s(_df.column(name))
97
393
  end
98
394
 
395
+ # def []=(key, value)
396
+ # end
397
+
398
+ # no to_arrow
399
+
400
+ #
401
+ def to_h(as_series: true)
402
+ if as_series
403
+ get_columns.to_h { |s| [s.name, s] }
404
+ else
405
+ get_columns.to_h { |s| [s.name, s.to_a] }
406
+ end
407
+ end
408
+
409
+ # def to_hashes / to_a
410
+ # end
411
+
412
+ # def to_numo
413
+ # end
414
+
415
+ # no to_pandas
416
+
417
+ # Select column as Series at index location.
418
+ #
419
+ # @param index [Integer]
420
+ # Location of selection.
421
+ #
422
+ # @return [Series]
423
+ #
424
+ # @example
425
+ # df = Polars::DataFrame.new({
426
+ # "foo" => [1, 2, 3],
427
+ # "bar" => [6, 7, 8],
428
+ # "ham" => ["a", "b", "c"]
429
+ # })
430
+ # df.to_series(1)
431
+ # # =>
432
+ # # shape: (3,)
433
+ # # Series: 'bar' [i64]
434
+ # # [
435
+ # # 6
436
+ # # 7
437
+ # # 8
438
+ # # ]
99
439
  def to_series(index = 0)
100
440
  if index < 0
101
441
  index = columns.length + index
@@ -103,6 +443,18 @@ module Polars
103
443
  Utils.wrap_s(_df.select_at_idx(index))
104
444
  end
105
445
 
446
+ # Serialize to JSON representation.
447
+ #
448
+ # @return [nil]
449
+ #
450
+ # @param file [String]
451
+ # File path to which the result should be written.
452
+ # @param pretty [Boolean]
453
+ # Pretty serialize json.
454
+ # @param row_oriented [Boolean]
455
+ # Write to row oriented json. This is slower, but more common.
456
+ #
457
+ # @see #write_ndjson
106
458
  def write_json(
107
459
  file,
108
460
  pretty: false,
@@ -116,6 +468,12 @@ module Polars
116
468
  nil
117
469
  end
118
470
 
471
+ # Serialize to newline delimited JSON representation.
472
+ #
473
+ # @param file [String]
474
+ # File path to which the result should be written.
475
+ #
476
+ # @return [nil]
119
477
  def write_ndjson(file)
120
478
  if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
121
479
  file = Utils.format_path(file)
@@ -125,6 +483,48 @@ module Polars
125
483
  nil
126
484
  end
127
485
 
486
+ # Write to comma-separated values (CSV) file.
487
+ #
488
+ # @param file [String, nil]
489
+ # File path to which the result should be written. If set to `nil`
490
+ # (default), the output is returned as a string instead.
491
+ # @param has_header [Boolean]
492
+ # Whether to include header in the CSV output.
493
+ # @param sep [String]
494
+ # Separate CSV fields with this symbol.
495
+ # @param quote [String]
496
+ # Byte to use as quoting character.
497
+ # @param batch_size [Integer]
498
+ # Number of rows that will be processed per thread.
499
+ # @param datetime_format [String, nil]
500
+ # A format string, with the specifiers defined by the
501
+ # [chrono](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
502
+ # Rust crate. If no format specified, the default fractional-second
503
+ # precision is inferred from the maximum timeunit found in the frame's
504
+ # Datetime cols (if any).
505
+ # @param date_format [String, nil]
506
+ # A format string, with the specifiers defined by the
507
+ # [chrono](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
508
+ # Rust crate.
509
+ # @param time_format [String, nil]
510
+ # A format string, with the specifiers defined by the
511
+ # [chrono](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
512
+ # Rust crate.
513
+ # @param float_precision [Integer, nil]
514
+ # Number of decimal places to write, applied to both `:f32` and
515
+ # `:f64` datatypes.
516
+ # @param null_value [String, nil]
517
+ # A string representing null values (defaulting to the empty string).
518
+ #
519
+ # @return [String, nil]
520
+ #
521
+ # @example
522
+ # df = Polars::DataFrame.new({
523
+ # "foo" => [1, 2, 3, 4, 5],
524
+ # "bar" => [6, 7, 8, 9, 10],
525
+ # "ham" => ["a", "b", "c", "d", "e"]
526
+ # })
527
+ # df.write_csv("file.csv")
128
528
  def write_csv(
129
529
  file = nil,
130
530
  has_header: true,
@@ -160,8 +560,7 @@ module Polars
160
560
  float_precision,
161
561
  null_value
162
562
  )
163
- buffer.rewind
164
- return buffer.read.force_encoding(Encoding::UTF_8)
563
+ return buffer.string.force_encoding(Encoding::UTF_8)
165
564
  end
166
565
 
167
566
  if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
@@ -183,6 +582,53 @@ module Polars
183
582
  nil
184
583
  end
185
584
 
585
+ # def write_avro
586
+ # end
587
+
588
+ # Write to Arrow IPC binary stream or Feather file.
589
+ #
590
+ # @param file [String]
591
+ # File path to which the file should be written.
592
+ # @param compression ["uncompressed", "lz4", "zstd"]
593
+ # Compression method. Defaults to "uncompressed".
594
+ #
595
+ # @return [nil]
596
def write_ipc(file, compression: "uncompressed")
  # An explicit nil is treated the same as the default.
  compression = "uncompressed" if compression.nil?

  # Only String/Pathname targets are run through format_path; anything else
  # is handed to the native writer untouched.
  if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
    file = Utils.format_path(file)
  end

  _df.write_ipc(file, compression)
  # Return nil as documented, matching write_json, write_ndjson and
  # write_csv, instead of leaking the native call's return value.
  nil
end
606
+
607
+ # Write to Apache Parquet file.
608
+ #
609
+ # @param file [String]
610
+ # File path to which the file should be written.
611
+ # @param compression ["lz4", "uncompressed", "snappy", "gzip", "lzo", "brotli", "zstd"]
612
+ # Choose "zstd" for good compression performance.
613
+ # Choose "lz4" for fast compression/decompression.
614
+ # Choose "snappy" for more backwards compatibility guarantees
615
+ # when you deal with older parquet readers.
616
+ # @param compression_level [Integer, nil]
617
+ # The level of compression to use. Higher compression means smaller files on
618
+ # disk.
619
+ #
620
+ # - "gzip" : min-level: 0, max-level: 10.
621
+ # - "brotli" : min-level: 0, max-level: 11.
622
+ # - "zstd" : min-level: 1, max-level: 22.
623
+ # @param statistics [Boolean]
624
+ # Write statistics to the parquet headers. This requires extra compute.
625
+ # @param row_group_size [Integer, nil]
626
+ # Size of the row groups in number of rows.
627
+ # If `nil` (default), the chunks of the DataFrame are
628
+ # used. Writing in smaller chunks may reduce memory pressure and improve
629
+ # writing speeds.
630
+ #
631
+ # @return [nil]
186
632
  def write_parquet(
187
633
  file,
188
634
  compression: "zstd",
@@ -202,10 +648,177 @@ module Polars
202
648
  )
203
649
  end
204
650
 
651
+ # Return an estimation of the total (heap) allocated size of the DataFrame.
652
+ #
653
+ # Estimated size is given in the specified unit (bytes by default).
654
+ #
655
+ # This estimation is the sum of the size of its buffers, validity, including
656
+ # nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the
657
+ # size of 2 arrays is not the sum of the sizes computed from this function. In
658
+ # particular, StructArray's size is an upper bound.
659
+ #
660
+ # When an array is sliced, its allocated size remains constant because the buffer
661
+ # unchanged. However, this function will yield a smaller number. This is because
662
+ # this function returns the visible size of the buffer, not its total capacity.
663
+ #
664
+ # FFI buffers are included in this estimation.
665
+ #
666
+ # @param unit ["b", "kb", "mb", "gb", "tb"]
667
+ # Scale the returned size to the given unit.
668
+ #
669
+ # @return [Numeric]
670
+ #
671
+ # @example
672
+ # df = Polars::DataFrame.new(
673
+ # {
674
+ # "x" => 1_000_000.times.to_a.reverse,
675
+ # "y" => 1_000_000.times.map { |v| v / 1000.0 },
676
+ # "z" => 1_000_000.times.map(&:to_s)
677
+ # },
678
+ # columns: {"x" => :u32, "y" => :f64, "z" => :str}
679
+ # )
680
+ # df.estimated_size
681
+ # # => 25888898
682
+ # df.estimated_size("mb")
683
+ # # => 24.689577102661133
684
+ def estimated_size(unit = "b")
685
+ sz = _df.estimated_size
686
+ Utils.scale_bytes(sz, to: unit)
687
+ end
688
+
689
+ # def transpose
690
+ # end
691
+
692
+ # Reverse the DataFrame.
693
+ #
694
+ # @return [DataFrame]
695
+ #
696
+ # @example
697
+ # df = Polars::DataFrame.new({
698
+ # "key" => ["a", "b", "c"],
699
+ # "val" => [1, 2, 3]
700
+ # })
701
+ # df.reverse()
702
+ # # =>
703
+ # # shape: (3, 2)
704
+ # # ┌─────┬─────┐
705
+ # # │ key ┆ val │
706
+ # # │ --- ┆ --- │
707
+ # # │ str ┆ i64 │
708
+ # # ╞═════╪═════╡
709
+ # # │ c ┆ 3 │
710
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
711
+ # # │ b ┆ 2 │
712
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
713
+ # # │ a ┆ 1 │
714
+ # # └─────┴─────┘
715
+ def reverse
716
+ select(Polars.col("*").reverse)
717
+ end
718
+
719
+ # Rename column names.
720
+ #
721
+ # @param mapping [Hash]
722
+ # Key value pairs that map from old name to new name.
723
+ #
724
+ # @return [DataFrame]
725
+ #
726
+ # @example
727
+ # df = Polars::DataFrame.new({
728
+ # "foo" => [1, 2, 3],
729
+ # "bar" => [6, 7, 8],
730
+ # "ham" => ["a", "b", "c"]
731
+ # })
732
+ # df.rename({"foo" => "apple"})
733
+ # # =>
734
+ # # shape: (3, 3)
735
+ # # ┌───────┬─────┬─────┐
736
+ # # │ apple ┆ bar ┆ ham │
737
+ # # │ --- ┆ --- ┆ --- │
738
+ # # │ i64 ┆ i64 ┆ str │
739
+ # # ╞═══════╪═════╪═════╡
740
+ # # │ 1 ┆ 6 ┆ a │
741
+ # # ├╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
742
+ # # │ 2 ┆ 7 ┆ b │
743
+ # # ├╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
744
+ # # │ 3 ┆ 8 ┆ c │
745
+ # # └───────┴─────┴─────┘
746
+ def rename(mapping)
747
+ lazy.rename(mapping).collect(no_optimization: true)
748
+ end
749
+
750
+ # Insert a Series at a certain column index. This operation is in place.
751
+ #
752
+ # @param index [Integer]
753
+ # Column index at which to insert the new `Series`.
754
+ # @param series [Series]
755
+ # `Series` to insert.
756
+ #
757
+ # @return [DataFrame]
758
+ #
759
+ # @example
760
+ # df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
761
+ # s = Polars::Series.new("baz", [97, 98, 99])
762
+ # df.insert_at_idx(1, s)
763
+ # # =>
764
+ # # shape: (3, 3)
765
+ # # ┌─────┬─────┬─────┐
766
+ # # │ foo ┆ baz ┆ bar │
767
+ # # │ --- ┆ --- ┆ --- │
768
+ # # │ i64 ┆ i64 ┆ i64 │
769
+ # # ╞═════╪═════╪═════╡
770
+ # # │ 1 ┆ 97 ┆ 4 │
771
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
772
+ # # │ 2 ┆ 98 ┆ 5 │
773
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
774
+ # # │ 3 ┆ 99 ┆ 6 │
775
+ # # └─────┴─────┴─────┘
776
+ #
777
+ # @example
778
+ # df = Polars::DataFrame.new({
779
+ # "a" => [1, 2, 3, 4],
780
+ # "b" => [0.5, 4, 10, 13],
781
+ # "c" => [true, true, false, true]
782
+ # })
783
+ # s = Polars::Series.new("d", [-2.5, 15, 20.5, 0])
784
+ # df.insert_at_idx(3, s)
785
+ # # =>
786
+ # # shape: (4, 4)
787
+ # # ┌─────┬──────┬───────┬──────┐
788
+ # # │ a ┆ b ┆ c ┆ d │
789
+ # # │ --- ┆ --- ┆ --- ┆ --- │
790
+ # # │ i64 ┆ f64 ┆ bool ┆ f64 │
791
+ # # ╞═════╪══════╪═══════╪══════╡
792
+ # # │ 1 ┆ 0.5 ┆ true ┆ -2.5 │
793
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
794
+ # # │ 2 ┆ 4.0 ┆ true ┆ 15.0 │
795
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
796
+ # # │ 3 ┆ 10.0 ┆ false ┆ 20.5 │
797
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
798
+ # # │ 4 ┆ 13.0 ┆ true ┆ 0.0 │
799
+ # # └─────┴──────┴───────┴──────┘
800
+ def insert_at_idx(index, series)
801
+ if index < 0
802
+ index = columns.length + index
803
+ end
804
+ _df.insert_at_idx(index, series._s)
805
+ self
806
+ end
807
+
205
808
  def filter(predicate)
206
809
  lazy.filter(predicate).collect
207
810
  end
208
811
 
812
+ # def describe
813
+ # end
814
+
815
+ # def find_idx_by_name
816
+ # end
817
+
818
+ # def replace_at_idx
819
+ # end
820
+
821
+ #
209
822
  def sort(by, reverse: false, nulls_last: false)
210
823
  _from_rbdf(_df.sort(by, reverse, nulls_last))
211
824
  end
@@ -214,6 +827,17 @@ module Polars
214
827
  _df.frame_equal(other._df, null_equal)
215
828
  end
216
829
 
830
+ # def replace
831
+ # end
832
+
833
+ #
834
+ def slice(offset, length = nil)
835
+ if !length.nil? && length < 0
836
+ length = height - offset + length
837
+ end
838
+ _from_rbdf(_df.slice(offset, length))
839
+ end
840
+
217
841
  def limit(n = 5)
218
842
  head(n)
219
843
  end
@@ -226,10 +850,33 @@ module Polars
226
850
  _from_rbdf(_df.tail(n))
227
851
  end
228
852
 
853
+ # def drop_nulls
854
+ # end
855
+
856
+ # def pipe
857
+ # end
858
+
859
+ # def with_row_count
860
+ # end
861
+
862
+ #
229
863
  def groupby(by, maintain_order: false)
230
864
  lazy.groupby(by, maintain_order: maintain_order)
231
865
  end
232
866
 
867
+ # def groupby_rolling
868
+ # end
869
+
870
+ # def groupby_dynamic
871
+ # end
872
+
873
+ # def upsample
874
+ # end
875
+
876
+ # def join_asof
877
+ # end
878
+
879
+ #
233
880
  def join(other, left_on: nil, right_on: nil, on: nil, how: "inner", suffix: "_right")
234
881
  lazy
235
882
  .join(
@@ -243,12 +890,86 @@ module Polars
243
890
  .collect(no_optimization: true)
244
891
  end
245
892
 
893
+ # def apply
894
+ # end
895
+
896
+ #
246
897
  def with_column(column)
247
898
  lazy
248
899
  .with_column(column)
249
900
  .collect(no_optimization: true, string_cache: false)
250
901
  end
251
902
 
903
+ # def hstack
904
+ # end
905
+
906
+ # def vstack
907
+ # end
908
+
909
+ #
910
+ def extend(other)
911
+ _df.extend(other._df)
912
+ self
913
+ end
914
+
915
+ # def drop
916
+ # end
917
+
918
+ # def drop_in_place
919
+ # end
920
+
921
+ # def cleared
922
+ # end
923
+
924
+ # clone handled by initialize_copy
925
+
926
+ #
927
+ def get_columns
928
+ _df.get_columns.map { |s| Utils.wrap_s(s) }
929
+ end
930
+
931
+ def get_column(name)
932
+ self[name]
933
+ end
934
+
935
+ # def fill_null
936
+ # end
937
+
938
+ #
939
+ def fill_nan(fill_value)
940
+ lazy.fill_nan(fill_value).collect(no_optimization: true)
941
+ end
942
+
943
+ # def explode
944
+ # end
945
+
946
+ # def pivot
947
+ # end
948
+
949
+ # def melt
950
+ # end
951
+
952
+ # def unstack
953
+ # end
954
+
955
+ # def partition_by
956
+ # end
957
+
958
+ # def shift
959
+ # end
960
+
961
+ # def shift_and_fill
962
+ # end
963
+
964
+ #
965
+ def is_duplicated
966
+ Utils.wrap_s(_df.is_duplicated)
967
+ end
968
+
969
+ def is_unique
970
+ Utils.wrap_s(_df.is_unique)
971
+ end
972
+
252
973
  def lazy
253
974
  wrap_ldf(_df.lazy)
254
975
  end
@@ -262,6 +983,56 @@ module Polars
262
983
  )
263
984
  end
264
985
 
986
+ def with_columns(exprs)
987
+ if !exprs.nil? && !exprs.is_a?(Array)
988
+ exprs = [exprs]
989
+ end
990
+ lazy
991
+ .with_columns(exprs)
992
+ .collect(no_optimization: true, string_cache: false)
993
+ end
994
+
995
# Get the number of chunks used by this DataFrame.
#
# @param strategy ["first", "all"]
#   With "first", the value reported by the underlying frame's `n_chunks`
#   is returned; with "all", `n_chunks` is collected from every column.
#
# @return [Object]
#   The result of `_df.n_chunks` for "first", or an Array with one entry
#   per column for "all".
#
# @raise [ArgumentError] if `strategy` is neither "first" nor "all".
def n_chunks(strategy: "first")
  if strategy == "first"
    _df.n_chunks
  elsif strategy == "all"
    get_columns.map(&:n_chunks)
  else
    # Ruby interpolates with #{...}; the earlier Python-style '{strategy}'
    # placeholder was emitted literally.
    raise ArgumentError, "Strategy: '#{strategy}' not understood. Choose one of {'first', 'all'}"
  end
end
1004
+
1005
+ def max(axis: 0)
1006
+ if axis == 0
1007
+ _from_rbdf(_df.max)
1008
+ elsif axis == 1
1009
+ Utils.wrap_s(_df.hmax)
1010
+ else
1011
+ raise ArgumentError, "Axis should be 0 or 1."
1012
+ end
1013
+ end
1014
+
1015
+ def min(axis: 0)
1016
+ if axis == 0
1017
+ _from_rbdf(_df.min)
1018
+ elsif axis == 1
1019
+ Utils.wrap_s(_df.hmin)
1020
+ else
1021
+ raise ArgumentError, "Axis should be 0 or 1."
1022
+ end
1023
+ end
1024
+
1025
+ def sum(axis: 0, null_strategy: "ignore")
1026
+ case axis
1027
+ when 0
1028
+ _from_rbdf(_df.sum)
1029
+ when 1
1030
+ Utils.wrap_s(_df.hsum(null_strategy))
1031
+ else
1032
+ raise ArgumentError, "Axis should be 0 or 1."
1033
+ end
1034
+ end
1035
+
265
1036
  def mean(axis: 0, null_strategy: "ignore")
266
1037
  case axis
267
1038
  when 0
@@ -273,15 +1044,34 @@ module Polars
273
1044
  end
274
1045
  end
275
1046
 
276
- def with_columns(exprs)
277
- if !exprs.nil? && !exprs.is_a?(Array)
278
- exprs = [exprs]
279
- end
280
- lazy
281
- .with_columns(exprs)
282
- .collect(no_optimization: true, string_cache: false)
1047
+ def std(ddof: 1)
1048
+ _from_rbdf(_df.std(ddof))
1049
+ end
1050
+
1051
+ def var(ddof: 1)
1052
+ _from_rbdf(_df.var(ddof))
283
1053
  end
284
1054
 
1055
+ def median
1056
+ _from_rbdf(_df.median)
1057
+ end
1058
+
1059
+ # def product
1060
+ # end
1061
+
1062
+ # def quantile(quantile, interpolation: "nearest")
1063
+ # end
1064
+
1065
+ # def to_dummies
1066
+ # end
1067
+
1068
+ # def unique
1069
+ # end
1070
+
1071
+ # def n_unique
1072
+ # end
1073
+
1074
+ #
285
1075
  def rechunk
286
1076
  _from_rbdf(_df.rechunk)
287
1077
  end
@@ -290,17 +1080,98 @@ module Polars
290
1080
  _from_rbdf(_df.null_count)
291
1081
  end
292
1082
 
1083
+ # def sample
1084
+ # end
1085
+
1086
+ # def fold
1087
+ # end
1088
+
1089
+ # def row
1090
+ # end
1091
+
1092
+ # def rows
1093
+ # end
1094
+
1095
+ # def shrink_to_fit
1096
+ # end
1097
+
1098
+ # def take_every
1099
+ # end
1100
+
1101
+ # def hash_rows
1102
+ # end
1103
+
1104
+ # def interpolate
1105
+ # end
1106
+
1107
+ #
1108
+ def is_empty
1109
+ height == 0
1110
+ end
1111
+ alias_method :empty?, :is_empty
1112
+
1113
+ # def to_struct(name)
1114
+ # end
1115
+
1116
+ # def unnest
1117
+ # end
1118
+
293
1119
  private
294
1120
 
295
- def hash_to_rbdf(data)
1121
+ def initialize_copy(other)
1122
+ super
1123
+ self._df = _df._clone
1124
+ end
1125
+
1126
+ def hash_to_rbdf(data, columns: nil)
1127
+ if !columns.nil?
1128
+ columns, dtypes = _unpack_columns(columns, lookup_names: data.keys)
1129
+
1130
+ if !data && dtypes
1131
+ data_series = columns.map { |name| Series.new(name, [], dtype: dtypes[name])._s }
1132
+ else
1133
+ data_series = data.map { |name, values| Series.new(name, values, dtype: dtypes[name])._s }
1134
+ end
1135
+ data_series = _handle_columns_arg(data_series, columns: columns)
1136
+ return RbDataFrame.new(data_series)
1137
+ end
1138
+
296
1139
  RbDataFrame.read_hash(data)
297
1140
  end
298
1141
 
299
- def sequence_to_rbdf(data)
1142
+ def _unpack_columns(columns, lookup_names: nil)
1143
+ [columns.keys, columns]
1144
+ end
1145
+
1146
+ def _handle_columns_arg(data, columns: nil)
1147
+ if columns.nil?
1148
+ data
1149
+ else
1150
+ if !data
1151
+ columns.map { |c| Series.new(c, nil)._s }
1152
+ elsif data.length == columns.length
1153
+ columns.each_with_index do |c, i|
1154
+ # not in-place?
1155
+ data[i].rename(c)
1156
+ end
1157
+ data
1158
+ else
1159
+ raise ArgumentError, "Dimensions of columns arg must match data dimensions."
1160
+ end
1161
+ end
1162
+ end
1163
+
1164
+ def sequence_to_rbdf(data, columns: nil, orient: nil)
1165
+ if columns || orient
1166
+ raise Todo
1167
+ end
300
1168
  RbDataFrame.new(data.map(&:_s))
301
1169
  end
302
1170
 
303
- def series_to_rbdf(data)
1171
+ def series_to_rbdf(data, columns: nil)
1172
+ if columns
1173
+ raise Todo
1174
+ end
304
1175
  RbDataFrame.new([data._s])
305
1176
  end
306
1177