polars-df 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,8 +1,22 @@
1
1
  module Polars
2
+ # Two-dimensional data structure representing data as a table with rows and columns.
2
3
  class DataFrame
4
+ # @private
3
5
  attr_accessor :_df
4
6
 
5
- def initialize(data = nil)
7
+ # Create a new DataFrame.
8
+ #
9
+ # @param data [Hash, Array, Series, nil]
10
+ # Two-dimensional data in various forms. Hash must contain Arrays.
11
+ # Array may contain Series.
12
+ # @param columns [Array, Hash, nil]
13
+ # Column labels to use for resulting DataFrame. If specified, overrides any
14
+ # labels already present in the data. Must match data dimensions.
15
+ # @param orient ["col", "row", nil]
16
+ # Whether to interpret two-dimensional data as columns or as rows. If `nil`,
17
+ # the orientation is inferred by matching the columns and data dimensions. If
18
+ # this does not yield conclusive results, column orientation is used.
19
+ def initialize(data = nil, columns: nil, orient: nil)
6
20
  if defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || data.is_a?(ActiveRecord::Result))
7
21
  result = data.is_a?(ActiveRecord::Result) ? data : data.connection.select_all(data.to_sql)
8
22
  data = {}
@@ -12,33 +26,135 @@ module Polars
12
26
  end
13
27
 
14
28
  if data.nil?
15
- self._df = hash_to_rbdf({})
29
+ self._df = hash_to_rbdf({}, columns: columns)
16
30
  elsif data.is_a?(Hash)
17
31
  data = data.transform_keys { |v| v.is_a?(Symbol) ? v.to_s : v }
18
- self._df = hash_to_rbdf(data)
32
+ self._df = hash_to_rbdf(data, columns: columns)
19
33
  elsif data.is_a?(Array)
20
- self._df = sequence_to_rbdf(data)
34
+ self._df = sequence_to_rbdf(data, columns: columns, orient: orient)
21
35
  elsif data.is_a?(Series)
22
- self._df = series_to_rbdf(data)
36
+ self._df = series_to_rbdf(data, columns: columns)
23
37
  else
24
38
  raise ArgumentError, "DataFrame constructor called with unsupported type; got #{data.class.name}"
25
39
  end
26
40
  end
27
41
 
42
# @private
#
# Build a DataFrame around an already-constructed native frame without
# running `initialize`.
#
# @param rb_df [Object]
#   Native frame to store in `_df` (for example the value returned by
#   `RbDataFrame.read_csv`).
#
# @return [DataFrame]
def self._from_rbdf(rb_df)
  # `allocate` skips the constructor, so no input conversion happens here.
  DataFrame.allocate.tap { |frame| frame._df = rb_df }
end
33
48
 
34
- def self._read_csv(file, has_header: true)
49
+ # def self._from_hashes
50
+ # end
51
+
52
+ # def self._from_hash
53
+ # end
54
+
55
+ # def self._from_records
56
+ # end
57
+
58
+ # def self._from_numo
59
+ # end
60
+
61
+ # no self._from_arrow
62
+
63
+ # no self._from_pandas
64
+
65
# @private
#
# Low-level CSV reader. Normalizes the arguments and forwards them,
# positionally, to `RbDataFrame.read_csv`; the result is wrapped with
# `_from_rbdf`.
#
# What this method itself does before forwarding:
# - a String/Pathname `file` is additionally passed through
#   `Utils.format_path` and handed over as `path` (otherwise `path` is nil);
# - a Hash `dtypes` becomes a list of `[name, Utils.rb_type_to_dtype(type)]`
#   pairs, an Array `dtypes` is forwarded as-is;
# - a single String `columns` is wrapped in an Array;
# - `null_values` goes through `Utils._process_null_values`;
# - `row_count_name`/`row_count_offset` go through
#   `Utils._prepare_row_count_args`.
#
# All remaining keywords are forwarded unchanged.
#
# @raise [ArgumentError] if `dtypes` is neither nil, a Hash, nor an Array
# @raise [Todo] if `file` is a String containing `*` (globs not implemented)
def self._read_csv(
  file,
  has_header: true,
  columns: nil,
  sep: ",",
  comment_char: nil,
  quote_char: '"',
  skip_rows: 0,
  dtypes: nil,
  null_values: nil,
  ignore_errors: false,
  parse_dates: false,
  n_threads: nil,
  infer_schema_length: 100,
  batch_size: 8192,
  n_rows: nil,
  encoding: "utf8",
  low_memory: false,
  rechunk: true,
  skip_rows_after_header: 0,
  row_count_name: nil,
  row_count_offset: 0,
  sample_size: 1024,
  eol_char: "\n"
)
  if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
    path = Utils.format_path(file)
  else
    path = nil
    # if defined?(StringIO) && file.is_a?(StringIO)
    #   file = file.string
    # end
  end

  # Exactly one of these is set, depending on the shape of `dtypes`.
  dtype_list = nil
  dtype_slice = nil
  unless dtypes.nil?
    if dtypes.is_a?(Hash)
      dtype_list = dtypes.map { |name, type| [name, Utils.rb_type_to_dtype(type)] }
    elsif dtypes.is_a?(Array)
      dtype_slice = dtypes
    else
      raise ArgumentError, "dtype arg should be list or dict"
    end
  end

  processed_null_values = Utils._process_null_values(null_values)

  columns = [columns] if columns.is_a?(String)
  raise Todo if file.is_a?(String) && file.include?("*")

  projection, columns = Utils.handle_projection_columns(columns)

  # NOTE: the original `file` is forwarded alongside the formatted `path`.
  _from_rbdf(
    RbDataFrame.read_csv(
      file,
      infer_schema_length,
      batch_size,
      has_header,
      ignore_errors,
      n_rows,
      skip_rows,
      projection,
      sep,
      rechunk,
      columns,
      encoding,
      n_threads,
      path,
      dtype_list,
      dtype_slice,
      low_memory,
      comment_char,
      quote_char,
      processed_null_values,
      parse_dates,
      skip_rows_after_header,
      Utils._prepare_row_count_args(row_count_name, row_count_offset),
      sample_size,
      eol_char
    )
  )
end
41
156
 
157
+ # @private
42
158
  def self._read_parquet(file)
43
159
  if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
44
160
  file = Utils.format_path(file)
@@ -47,6 +163,44 @@ module Polars
47
163
  _from_rbdf(RbDataFrame.read_parquet(file))
48
164
  end
49
165
 
166
+ # def self._read_avro
167
+ # end
168
+
169
# @private
#
# Low-level Arrow IPC reader. Normalizes the arguments and forwards them,
# positionally, to `RbDataFrame.read_ipc`; the result is wrapped with
# `_from_rbdf`.
#
# A String/Pathname `file` is passed through `Utils.format_path`, and a
# single String `columns` is wrapped in an Array.
#
# NOTE(review): `rechunk` is accepted but is not forwarded to
# `RbDataFrame.read_ipc` by this method.
#
# @raise [Todo] if `file` is a String containing `*` (globs not implemented)
def self._read_ipc(
  file,
  columns: nil,
  n_rows: nil,
  row_count_name: nil,
  row_count_offset: 0,
  rechunk: true,
  memory_map: true
)
  path_like = file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
  file = Utils.format_path(file) if path_like
  columns = [columns] if columns.is_a?(String)

  raise Todo if file.is_a?(String) && file.include?("*")

  projection, columns = Utils.handle_projection_columns(columns)
  row_count = Utils._prepare_row_count_args(row_count_name, row_count_offset)
  native = RbDataFrame.read_ipc(file, columns, projection, n_rows, row_count, memory_map)
  _from_rbdf(native)
end
202
+
203
+ # @private
50
204
  def self._read_json(file)
51
205
  if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
52
206
  file = Utils.format_path(file)
@@ -55,6 +209,7 @@ module Polars
55
209
  _from_rbdf(RbDataFrame.read_json(file))
56
210
  end
57
211
 
212
+ # @private
58
213
  def self._read_ndjson(file)
59
214
  if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
60
215
  file = Utils.format_path(file)
@@ -63,30 +218,119 @@ module Polars
63
218
  _from_rbdf(RbDataFrame.read_ndjson(file))
64
219
  end
65
220
 
221
+ # Get the shape of the DataFrame.
222
+ #
223
+ # @return [Array] two-element array; the example below yields `[5, 1]` for five rows and one column
224
+ #
225
+ # @example
226
+ # df = Polars::DataFrame.new({"foo" => [1, 2, 3, 4, 5]})
227
+ # df.shape
228
+ # # => [5, 1]
66
229
  def shape
67
230
  _df.shape
68
231
  end
69
232
 
233
+ # Get the height of the DataFrame (5 in the example below, where the single column holds five values).
234
+ #
235
+ # @return [Integer]
236
+ #
237
+ # @example
238
+ # df = Polars::DataFrame.new({"foo" => [1, 2, 3, 4, 5]})
239
+ # df.height
240
+ # # => 5
70
241
  def height
71
242
  _df.height
72
243
  end
73
244
 
245
+ # Get the width of the DataFrame (1 in the example below, which has a single column).
246
+ #
247
+ # @return [Integer]
248
+ #
249
+ # @example
250
+ # df = Polars::DataFrame.new({"foo" => [1, 2, 3, 4, 5]})
251
+ # df.width
252
+ # # => 1
74
253
  def width
75
254
  _df.width
76
255
  end
77
256
 
257
+ # Get column names, in the order shown by the example below.
258
+ #
259
+ # @return [Array<String>]
260
+ #
261
+ # @example
262
+ # df = Polars::DataFrame.new({
263
+ # "foo" => [1, 2, 3],
264
+ # "bar" => [6, 7, 8],
265
+ # "ham" => ["a", "b", "c"]
266
+ # })
267
+ # df.columns
268
+ # # => ["foo", "bar", "ham"]
78
269
  def columns
79
270
  _df.columns
80
271
  end
81
272
 
273
+ # Change the column names of the DataFrame.
274
+ #
275
+ # @param columns [Array]
276
+ # A list with new names for the DataFrame.
277
+ # The length of the list should be equal to the width of the DataFrame.
278
+ #
279
+ # @return [Object]
280
+ #
281
+ # @example
282
+ # df = Polars::DataFrame.new({
283
+ # "foo" => [1, 2, 3],
284
+ # "bar" => [6, 7, 8],
285
+ # "ham" => ["a", "b", "c"]
286
+ # })
287
+ # df.columns = ["apple", "banana", "orange"]
288
+ # df
289
+ # # =>
290
+ # # shape: (3, 3)
291
+ # # ┌───────┬────────┬────────┐
292
+ # # │ apple ┆ banana ┆ orange │
293
+ # # │ --- ┆ --- ┆ --- │
294
+ # # │ i64 ┆ i64 ┆ str │
295
+ # # ╞═══════╪════════╪════════╡
296
+ # # │ 1 ┆ 6 ┆ a │
297
+ # # ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
298
+ # # │ 2 ┆ 7 ┆ b │
299
+ # # ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
300
+ # # │ 3 ┆ 8 ┆ c │
301
+ # # └───────┴────────┴────────┘
82
302
  def columns=(columns)
83
303
  _df.set_column_names(columns)
84
304
  end
85
305
 
306
+ # Get dtypes of columns in DataFrame. Dtypes can also be found in column headers when printing the DataFrame.
307
+ #
308
+ # @return [Array]
309
+ #
310
+ # @example
311
+ # df = Polars::DataFrame.new({
312
+ # "foo" => [1, 2, 3],
313
+ # "bar" => [6.0, 7.0, 8.0],
314
+ # "ham" => ["a", "b", "c"]
315
+ # })
316
+ # df.dtypes
317
+ # # => [:i64, :f64, :str]
86
318
  def dtypes
87
- _df.dtypes.map(&:to_sym)
319
+ _df.dtypes
88
320
  end
89
321
 
322
+ # Get the schema.
323
+ #
324
+ # @return [Hash]
325
+ #
326
+ # @example
327
+ # df = Polars::DataFrame.new({
328
+ # "foo" => [1, 2, 3],
329
+ # "bar" => [6.0, 7.0, 8.0],
330
+ # "ham" => ["a", "b", "c"]
331
+ # })
332
+ # df.schema
333
+ # # => {"foo"=>:i64, "bar"=>:f64, "ham"=>:str}
90
334
  def schema
91
335
  columns.zip(dtypes).to_h
92
336
  end
@@ -124,6 +368,7 @@ module Polars
124
368
  # def %(other)
125
369
  # end
126
370
 
371
+ #
127
372
  def to_s
128
373
  _df.to_s
129
374
  end
@@ -133,6 +378,16 @@ module Polars
133
378
  columns.include?(name)
134
379
  end
135
380
 
381
+ # def each
382
+ # end
383
+
384
+ # def _pos_idx
385
+ # end
386
+
387
+ # def _pos_idxs
388
+ # end
389
+
390
# Fetch a single column by name.
#
# @param name [String]
#   Column name handed to `_df.column`.
#
# @return [Series]
#   The native column wrapped by `Utils.wrap_s`.
def [](name)
  native_column = _df.column(name)
  Utils.wrap_s(native_column)
end
@@ -140,6 +395,9 @@ module Polars
140
395
  # def []=(key, value)
141
396
  # end
142
397
 
398
+ # no to_arrow
399
+
400
+ #
143
401
  def to_h(as_series: true)
144
402
  if as_series
145
403
  get_columns.to_h { |s| [s.name, s] }
@@ -148,7 +406,7 @@ module Polars
148
406
  end
149
407
  end
150
408
 
151
- # def to_hs / to_a
409
+ # def to_hashes / to_a
152
410
  # end
153
411
 
154
412
  # def to_numo
@@ -156,6 +414,28 @@ module Polars
156
414
 
157
415
  # no to_pandas
158
416
 
417
+ # Select column as Series at index location.
418
+ #
419
+ # @param index [Integer]
420
+ # Location of selection.
421
+ #
422
+ # @return [Series]
423
+ #
424
+ # @example
425
+ # df = Polars::DataFrame.new({
426
+ # "foo" => [1, 2, 3],
427
+ # "bar" => [6, 7, 8],
428
+ # "ham" => ["a", "b", "c"]
429
+ # })
430
+ # df.to_series(1)
431
+ # # =>
432
+ # # shape: (3,)
433
+ # # Series: 'bar' [i64]
434
+ # # [
435
+ # # 6
436
+ # # 7
437
+ # # 8
438
+ # # ]
159
439
  def to_series(index = 0)
160
440
  if index < 0
161
441
  index = columns.length + index
@@ -163,6 +443,18 @@ module Polars
163
443
  Utils.wrap_s(_df.select_at_idx(index))
164
444
  end
165
445
 
446
+ # Serialize to JSON representation.
447
+ #
448
+ # @return [nil]
449
+ #
450
+ # @param file [String]
451
+ # File path to which the result should be written.
452
+ # @param pretty [Boolean]
453
+ # Pretty serialize json.
454
+ # @param row_oriented [Boolean]
455
+ # Write to row oriented json. This is slower, but more common.
456
+ #
457
+ # @see #write_ndjson
166
458
  def write_json(
167
459
  file,
168
460
  pretty: false,
@@ -176,6 +468,12 @@ module Polars
176
468
  nil
177
469
  end
178
470
 
471
+ # Serialize to newline delimited JSON representation.
472
+ #
473
+ # @param file [String]
474
+ # File path to which the result should be written.
475
+ #
476
+ # @return [nil]
179
477
  def write_ndjson(file)
180
478
  if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
181
479
  file = Utils.format_path(file)
@@ -185,6 +483,48 @@ module Polars
185
483
  nil
186
484
  end
187
485
 
486
+ # Write to comma-separated values (CSV) file.
487
+ #
488
+ # @param file [String, nil]
489
+ # File path to which the result should be written. If set to `nil`
490
+ # (default), the output is returned as a string instead.
491
+ # @param has_header [Boolean]
492
+ # Whether to include header in the CSV output.
493
+ # @param sep [String]
494
+ # Separate CSV fields with this symbol.
495
+ # @param quote [String]
496
+ # Byte to use as quoting character.
497
+ # @param batch_size [Integer]
498
+ # Number of rows that will be processed per thread.
499
+ # @param datetime_format [String, nil]
500
+ # A format string, with the specifiers defined by the
501
+ # [chrono](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
502
+ # Rust crate. If no format specified, the default fractional-second
503
+ # precision is inferred from the maximum timeunit found in the frame's
504
+ # Datetime cols (if any).
505
+ # @param date_format [String, nil]
506
+ # A format string, with the specifiers defined by the
507
+ # [chrono](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
508
+ # Rust crate.
509
+ # @param time_format [String, nil]
510
+ # A format string, with the specifiers defined by the
511
+ # [chrono](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
512
+ # Rust crate.
513
+ # @param float_precision [Integer, nil]
514
+ # Number of decimal places to write, applied to both `:f32` and
515
+ # `:f64` datatypes.
516
+ # @param null_value [String, nil]
517
+ # A string representing null values (defaulting to the empty string).
518
+ #
519
+ # @return [String, nil]
520
+ #
521
+ # @example
522
+ # df = Polars::DataFrame.new({
523
+ # "foo" => [1, 2, 3, 4, 5],
524
+ # "bar" => [6, 7, 8, 9, 10],
525
+ # "ham" => ["a", "b", "c", "d", "e"]
526
+ # })
527
+ # df.write_csv("file.csv")
188
528
  def write_csv(
189
529
  file = nil,
190
530
  has_header: true,
@@ -220,8 +560,7 @@ module Polars
220
560
  float_precision,
221
561
  null_value
222
562
  )
223
- buffer.rewind
224
- return buffer.read.force_encoding(Encoding::UTF_8)
563
+ return buffer.string.force_encoding(Encoding::UTF_8)
225
564
  end
226
565
 
227
566
  if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
@@ -246,9 +585,50 @@ module Polars
246
585
  # def write_avro
247
586
  # end
248
587
 
249
- # def write_ipc
250
- # end
588
# Write to Arrow IPC binary stream or Feather file.
#
# @param file [String]
#   File path to which the file should be written. String and Pathname
#   values are passed through `Utils.format_path` first.
# @param compression ["uncompressed", "lz4", "zstd"]
#   Compression method. Defaults to "uncompressed"; `nil` is treated the
#   same way.
#
# @return [nil]
def write_ipc(file, compression: "uncompressed")
  compression = "uncompressed" if compression.nil?
  if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
    file = Utils.format_path(file)
  end

  _df.write_ipc(file, compression)
  # Return nil explicitly, as documented, instead of leaking the native
  # call's return value.
  nil
end
251
606
 
607
+ # Write to Apache Parquet file.
608
+ #
609
+ # @param file [String]
610
+ # File path to which the file should be written.
611
+ # @param compression ["lz4", "uncompressed", "snappy", "gzip", "lzo", "brotli", "zstd"]
612
+ # Choose "zstd" for good compression performance.
613
+ # Choose "lz4" for fast compression/decompression.
614
+ # Choose "snappy" for more backwards compatibility guarantees
615
+ # when you deal with older parquet readers.
616
+ # @param compression_level [Integer, nil]
617
+ # The level of compression to use. Higher compression means smaller files on
618
+ # disk.
619
+ #
620
+ # - "gzip" : min-level: 0, max-level: 10.
621
+ # - "brotli" : min-level: 0, max-level: 11.
622
+ # - "zstd" : min-level: 1, max-level: 22.
623
+ # @param statistics [Boolean]
624
+ # Write statistics to the parquet headers. This requires extra compute.
625
+ # @param row_group_size [Integer, nil]
626
+ # Size of the row groups in number of rows.
627
+ # If `nil` (default), the chunks of the DataFrame are
628
+ # used. Writing in smaller chunks may reduce memory pressure and improve
629
+ # writing speeds.
630
+ #
631
+ # @return [nil]
252
632
  def write_parquet(
253
633
  file,
254
634
  compression: "zstd",
@@ -268,6 +648,39 @@ module Polars
268
648
  )
269
649
  end
270
650
 
651
+ # Return an estimation of the total (heap) allocated size of the DataFrame.
652
+ #
653
+ # Estimated size is given in the specified unit (bytes by default).
654
+ #
655
+ # This estimation is the sum of the size of its buffers, validity, including
656
+ # nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the
657
+ # size of 2 arrays is not the sum of the sizes computed from this function. In
658
+ # particular, StructArray's size is an upper bound.
659
+ #
660
+ # When an array is sliced, its allocated size remains constant because the buffer
661
+ # is unchanged. However, this function will yield a smaller number. This is because
662
+ # this function returns the visible size of the buffer, not its total capacity.
663
+ #
664
+ # FFI buffers are included in this estimation.
665
+ #
666
+ # @param unit ["b", "kb", "mb", "gb", "tb"]
667
+ # Scale the returned size to the given unit.
668
+ #
669
+ # @return [Numeric]
670
+ #
671
+ # @example
672
+ # df = Polars::DataFrame.new(
673
+ # {
674
+ # "x" => 1_000_000.times.to_a.reverse,
675
+ # "y" => 1_000_000.times.map { |v| v / 1000.0 },
676
+ # "z" => 1_000_000.times.map(&:to_s)
677
+ # },
678
+ # columns: {"x" => :u32, "y" => :f64, "z" => :str}
679
+ # )
680
+ # df.estimated_size
681
+ # # => 25888898
682
+ # df.estimated_size("mb")
683
+ # # => 24.689577102661133
271
684
  def estimated_size(unit = "b")
272
685
  sz = _df.estimated_size
273
686
  Utils.scale_bytes(sz, to: unit)
@@ -276,14 +689,114 @@ module Polars
276
689
  # def transpose
277
690
  # end
278
691
 
692
+ # Reverse the DataFrame.
693
+ #
694
+ # @return [DataFrame]
695
+ #
696
+ # @example
697
+ # df = Polars::DataFrame.new({
698
+ # "key" => ["a", "b", "c"],
699
+ # "val" => [1, 2, 3]
700
+ # })
701
+ # df.reverse()
702
+ # # =>
703
+ # # shape: (3, 2)
704
+ # # ┌─────┬─────┐
705
+ # # │ key ┆ val │
706
+ # # │ --- ┆ --- │
707
+ # # │ str ┆ i64 │
708
+ # # ╞═════╪═════╡
709
+ # # │ c ┆ 3 │
710
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
711
+ # # │ b ┆ 2 │
712
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
713
+ # # │ a ┆ 1 │
714
+ # # └─────┴─────┘
279
715
  def reverse
280
716
  select(Polars.col("*").reverse)
281
717
  end
282
718
 
719
+ # Rename column names.
720
+ #
721
+ # @param mapping [Hash]
722
+ # Key value pairs that map from old name to new name.
723
+ #
724
+ # @return [DataFrame]
725
+ #
726
+ # @example
727
+ # df = Polars::DataFrame.new({
728
+ # "foo" => [1, 2, 3],
729
+ # "bar" => [6, 7, 8],
730
+ # "ham" => ["a", "b", "c"]
731
+ # })
732
+ # df.rename({"foo" => "apple"})
733
+ # # =>
734
+ # # shape: (3, 3)
735
+ # # ┌───────┬─────┬─────┐
736
+ # # │ apple ┆ bar ┆ ham │
737
+ # # │ --- ┆ --- ┆ --- │
738
+ # # │ i64 ┆ i64 ┆ str │
739
+ # # ╞═══════╪═════╪═════╡
740
+ # # │ 1 ┆ 6 ┆ a │
741
+ # # ├╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
742
+ # # │ 2 ┆ 7 ┆ b │
743
+ # # ├╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
744
+ # # │ 3 ┆ 8 ┆ c │
745
+ # # └───────┴─────┴─────┘
283
746
  def rename(mapping)
284
747
  lazy.rename(mapping).collect(no_optimization: true)
285
748
  end
286
749
 
750
+ # Insert a Series at a certain column index. This operation is in place.
751
+ #
752
+ # @param index [Integer]
753
+ # Column to insert the new `Series` column.
754
+ # @param series [Series]
755
+ # `Series` to insert.
756
+ #
757
+ # @return [DataFrame]
758
+ #
759
+ # @example
760
+ # df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
761
+ # s = Polars::Series.new("baz", [97, 98, 99])
762
+ # df.insert_at_idx(1, s)
763
+ # # =>
764
+ # # shape: (3, 3)
765
+ # # ┌─────┬─────┬─────┐
766
+ # # │ foo ┆ baz ┆ bar │
767
+ # # │ --- ┆ --- ┆ --- │
768
+ # # │ i64 ┆ i64 ┆ i64 │
769
+ # # ╞═════╪═════╪═════╡
770
+ # # │ 1 ┆ 97 ┆ 4 │
771
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
772
+ # # │ 2 ┆ 98 ┆ 5 │
773
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
774
+ # # │ 3 ┆ 99 ┆ 6 │
775
+ # # └─────┴─────┴─────┘
776
+ #
777
+ # @example
778
+ # df = Polars::DataFrame.new({
779
+ # "a" => [1, 2, 3, 4],
780
+ # "b" => [0.5, 4, 10, 13],
781
+ # "c" => [true, true, false, true]
782
+ # })
783
+ # s = Polars::Series.new("d", [-2.5, 15, 20.5, 0])
784
+ # df.insert_at_idx(3, s)
785
+ # # =>
786
+ # # shape: (4, 4)
787
+ # # ┌─────┬──────┬───────┬──────┐
788
+ # # │ a ┆ b ┆ c ┆ d │
789
+ # # │ --- ┆ --- ┆ --- ┆ --- │
790
+ # # │ i64 ┆ f64 ┆ bool ┆ f64 │
791
+ # # ╞═════╪══════╪═══════╪══════╡
792
+ # # │ 1 ┆ 0.5 ┆ true ┆ -2.5 │
793
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
794
+ # # │ 2 ┆ 4.0 ┆ true ┆ 15.0 │
795
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
796
+ # # │ 3 ┆ 10.0 ┆ false ┆ 20.5 │
797
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
798
+ # # │ 4 ┆ 13.0 ┆ true ┆ 0.0 │
799
+ # # └─────┴──────┴───────┴──────┘
287
800
  def insert_at_idx(index, series)
288
801
  if index < 0
289
802
  index = columns.length + index
@@ -305,6 +818,7 @@ module Polars
305
818
  # def replace_at_idx
306
819
  # end
307
820
 
821
# Sort the frame by delegating to `_df.sort` and wrapping the result with
# `_from_rbdf`.
#
# @param by [Object]
#   Sort key, forwarded unchanged as the first argument.
# @param reverse [Boolean]
#   Forwarded as the second positional argument.
# @param nulls_last [Boolean]
#   Forwarded as the third positional argument.
#
# @return [DataFrame]
def sort(by, reverse: false, nulls_last: false)
  sorted = _df.sort(by, reverse, nulls_last)
  _from_rbdf(sorted)
end
@@ -316,6 +830,7 @@ module Polars
316
830
  # def replace
317
831
  # end
318
832
 
833
+ #
319
834
  def slice(offset, length = nil)
320
835
  if !length.nil? && length < 0
321
836
  length = height - offset + length
@@ -344,6 +859,7 @@ module Polars
344
859
  # def with_row_count
345
860
  # end
346
861
 
862
# Start a group-by on the lazy form of this frame.
#
# @param by [Object]
#   Grouping key(s), forwarded unchanged to `lazy.groupby`.
# @param maintain_order [Boolean]
#   Forwarded as the `maintain_order:` keyword.
#
# @return [Object]
#   Whatever `lazy.groupby` returns.
def groupby(by, maintain_order: false)
  lazy_frame = lazy
  lazy_frame.groupby(by, maintain_order: maintain_order)
end
@@ -360,6 +876,7 @@ module Polars
360
876
  # def join_asof
361
877
  # end
362
878
 
879
+ #
363
880
  def join(other, left_on: nil, right_on: nil, on: nil, how: "inner", suffix: "_right")
364
881
  lazy
365
882
  .join(
@@ -376,6 +893,7 @@ module Polars
376
893
  # def apply
377
894
  # end
378
895
 
896
+ #
379
897
  def with_column(column)
380
898
  lazy
381
899
  .with_column(column)
@@ -388,8 +906,11 @@ module Polars
388
906
  # def vstack
389
907
  # end
390
908
 
391
- # def extend
392
- # end
909
# Extend this frame with the native data of another frame by calling
# `_df.extend(other._df)`, then return the receiver.
#
# @note This definition shadows `Object#extend` on DataFrame instances.
#
# @param other [DataFrame]
#   Frame whose `_df` is passed to the native `extend`.
#
# @return [DataFrame] self
def extend(other)
  tap { _df.extend(other._df) }
end
393
914
 
394
915
  # def drop
395
916
  # end
@@ -402,6 +923,7 @@ module Polars
402
923
 
403
924
  # clone handled by initialize_copy
404
925
 
926
# Return every column of the frame, each wrapped by `Utils.wrap_s`.
#
# @return [Array<Series>]
def get_columns
  native_columns = _df.get_columns
  native_columns.map { |column| Utils.wrap_s(column) }
end
@@ -413,6 +935,7 @@ module Polars
413
935
  # def fill_null
414
936
  # end
415
937
 
938
# Run `fill_nan` on the lazy form of this frame and collect the result with
# `no_optimization: true`.
#
# @param fill_value [Object]
#   Replacement value, forwarded unchanged to `lazy.fill_nan`.
#
# @return [DataFrame]
def fill_nan(fill_value)
  filled = lazy.fill_nan(fill_value)
  filled.collect(no_optimization: true)
end
@@ -438,6 +961,7 @@ module Polars
438
961
  # def shift_and_fill
439
962
  # end
440
963
 
964
# Wrap the result of the native `_df.is_duplicated` call.
#
# @return [Series]
def is_duplicated
  native_mask = _df.is_duplicated
  Utils.wrap_s(native_mask)
end
@@ -547,6 +1071,7 @@ module Polars
547
1071
  # def n_unique
548
1072
  # end
549
1073
 
1074
# Call the native `_df.rechunk` and wrap its result with `_from_rbdf`.
#
# @return [DataFrame]
def rechunk
  rechunked = _df.rechunk
  _from_rbdf(rechunked)
end
@@ -579,6 +1104,7 @@ module Polars
579
1104
  # def interpolate
580
1105
  # end
581
1106
 
1107
# Check whether the frame has a height of zero.
#
# @return [Boolean]
def is_empty
  height.zero?
end
@@ -597,15 +1123,55 @@ module Polars
597
1123
  self._df = _df._clone
598
1124
  end
599
1125
 
600
# Convert a Hash of column name => values into a native frame.
#
# Without `columns`, the hash goes straight to `RbDataFrame.read_hash`.
# With `columns`, one Series is built per column (using the dtype found
# for that name, if any), the result is passed through
# `_handle_columns_arg`, and a native frame is built from the Series.
#
# @param data [Hash]
#   Column name => values.
# @param columns [Hash, nil]
#   Column specification, unpacked by `_unpack_columns`.
#
# @return [RbDataFrame]
def hash_to_rbdf(data, columns: nil)
  return RbDataFrame.read_hash(data) if columns.nil?

  columns, dtypes = _unpack_columns(columns, lookup_names: data.keys)

  data_series =
    # `!data` is never true for a Hash in Ruby; `empty?` is the real
    # "no data" test, so a schema alone yields typed, empty columns.
    if data.empty? && dtypes
      columns.map { |name| Series.new(name, [], dtype: dtypes[name])._s }
    else
      data.map { |name, values| Series.new(name, values, dtype: dtypes[name])._s }
    end

  RbDataFrame.new(_handle_columns_arg(data_series, columns: columns))
end
603
1141
 
604
- def sequence_to_rbdf(data)
1142
# Split a `columns` specification into column names and a dtype lookup.
#
# @param columns [Hash, Array]
#   Either a Hash of name => dtype, or an Array whose entries are a bare
#   name or a `[name, dtype]` pair.
# @param lookup_names [Array, nil]
#   Not used by this implementation; kept so existing callers keep working.
#
# @return [Array]
#   `[names, dtypes]`, where `dtypes` responds to `[]` with a name.
def _unpack_columns(columns, lookup_names: nil)
  # Hash input: the hash itself already is the name => dtype lookup.
  return [columns.keys, columns] if columns.is_a?(Hash)

  # Array input previously crashed on `columns.keys`.
  names = []
  dtypes = {}
  columns.each do |entry|
    name, dtype = entry.is_a?(Array) ? entry : [entry, nil]
    names << name
    dtypes[name] = dtype unless dtype.nil?
  end
  [names, dtypes]
end
1145
+
1146
# Reconcile a list of native series with the requested column names.
#
# - `columns` nil: `data` is returned untouched.
# - `data` nil or empty: one value-less series is created per column name.
# - same length: each series is renamed to the column at the same
#   position and `data` is returned.
#
# @param data [Array, nil]
#   Native series objects.
# @param columns [Array, nil]
#   Column names.
#
# @return [Array, nil]
#
# @raise [ArgumentError] if `data` and `columns` differ in length
def _handle_columns_arg(data, columns: nil)
  return data if columns.nil?

  # `!data` only caught nil; an empty Array is also "no data" here.
  if data.nil? || data.empty?
    return columns.map { |name| Series.new(name, nil)._s }
  end

  unless data.length == columns.length
    raise ArgumentError, "Dimensions of columns arg must match data dimensions."
  end

  # NOTE(review): this relies on the native `rename` mutating in place;
  # its return value is discarded.
  data.zip(columns) { |series, name| series.rename(name) }
  data
end
1163
+
1164
# Build a native frame from an Array whose elements each expose `_s`.
#
# @param data [Array]
#   Elements responding to `_s`.
# @param columns [Object, nil]
#   Not supported yet; any truthy value raises.
# @param orient [Object, nil]
#   Not supported yet; any truthy value raises.
#
# @return [RbDataFrame]
#
# @raise [Todo] if `columns` or `orient` is given
def sequence_to_rbdf(data, columns: nil, orient: nil)
  raise Todo if columns || orient

  native_series = data.map { |series| series._s }
  RbDataFrame.new(native_series)
end
607
1170
 
608
# Build a single-column native frame from one Series.
#
# @param data [Series]
#   Series whose `_s` becomes the only column.
# @param columns [Object, nil]
#   Not supported yet; any truthy value raises.
#
# @return [RbDataFrame]
#
# @raise [Todo] if `columns` is given
def series_to_rbdf(data, columns: nil)
  raise Todo if columns

  RbDataFrame.new([data._s])
end
611
1177