polars-df 0.1.1 → 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,8 +1,22 @@
1
1
  module Polars
2
+ # Two-dimensional data structure representing data as a table with rows and columns.
2
3
  class DataFrame
4
+ # @private
3
5
  attr_accessor :_df
4
6
 
5
- def initialize(data = nil)
7
+ # Create a new DataFrame.
8
+ #
9
+ # @param data [Hash, Array, Series, nil]
10
+ # Two-dimensional data in various forms. Hash must contain Arrays.
11
+ # Array may contain Series.
12
+ # @param columns [Array, Hash, nil]
13
+ # Column labels to use for resulting DataFrame. If specified, overrides any
14
+ # labels already present in the data. Must match data dimensions.
15
+ # @param orient ["col", "row", nil]
16
+ # Whether to interpret two-dimensional data as columns or as rows. If `nil`,
17
+ # the orientation is inferred by matching the columns and data dimensions. If
18
+ # this does not yield conclusive results, column orientation is used.
19
+ def initialize(data = nil, columns: nil, orient: nil)
6
20
  if defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || data.is_a?(ActiveRecord::Result))
7
21
  result = data.is_a?(ActiveRecord::Result) ? data : data.connection.select_all(data.to_sql)
8
22
  data = {}
@@ -12,33 +26,135 @@ module Polars
12
26
  end
13
27
 
14
28
  if data.nil?
15
- self._df = hash_to_rbdf({})
29
+ self._df = hash_to_rbdf({}, columns: columns)
16
30
  elsif data.is_a?(Hash)
17
31
  data = data.transform_keys { |v| v.is_a?(Symbol) ? v.to_s : v }
18
- self._df = hash_to_rbdf(data)
32
+ self._df = hash_to_rbdf(data, columns: columns)
19
33
  elsif data.is_a?(Array)
20
- self._df = sequence_to_rbdf(data)
34
+ self._df = sequence_to_rbdf(data, columns: columns, orient: orient)
21
35
  elsif data.is_a?(Series)
22
- self._df = series_to_rbdf(data)
36
+ self._df = series_to_rbdf(data, columns: columns)
23
37
  else
24
38
  raise ArgumentError, "DataFrame constructor called with unsupported type; got #{data.class.name}"
25
39
  end
26
40
  end
27
41
 
42
+ # @private
28
43
  def self._from_rbdf(rb_df)
29
44
  df = DataFrame.allocate
30
45
  df._df = rb_df
31
46
  df
32
47
  end
33
48
 
34
- def self._read_csv(file, has_header: true)
49
+ # def self._from_hashes
50
+ # end
51
+
52
+ # def self._from_hash
53
+ # end
54
+
55
+ # def self._from_records
56
+ # end
57
+
58
+ # def self._from_numo
59
+ # end
60
+
61
+ # no self._from_arrow
62
+
63
+ # no self._from_pandas
64
+
65
+ # @private
66
+ def self._read_csv(
67
+ file,
68
+ has_header: true,
69
+ columns: nil,
70
+ sep: str = ",",
71
+ comment_char: nil,
72
+ quote_char: '"',
73
+ skip_rows: 0,
74
+ dtypes: nil,
75
+ null_values: nil,
76
+ ignore_errors: false,
77
+ parse_dates: false,
78
+ n_threads: nil,
79
+ infer_schema_length: 100,
80
+ batch_size: 8192,
81
+ n_rows: nil,
82
+ encoding: "utf8",
83
+ low_memory: false,
84
+ rechunk: true,
85
+ skip_rows_after_header: 0,
86
+ row_count_name: nil,
87
+ row_count_offset: 0,
88
+ sample_size: 1024,
89
+ eol_char: "\n"
90
+ )
35
91
  if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
36
- file = Utils.format_path(file)
92
+ path = Utils.format_path(file)
93
+ else
94
+ path = nil
95
+ # if defined?(StringIO) && file.is_a?(StringIO)
96
+ # file = file.string
97
+ # end
98
+ end
99
+
100
+ dtype_list = nil
101
+ dtype_slice = nil
102
+ if !dtypes.nil?
103
+ if dtypes.is_a?(Hash)
104
+ dtype_list = []
105
+ dtypes.each do|k, v|
106
+ dtype_list << [k, Utils.rb_type_to_dtype(v)]
107
+ end
108
+ elsif dtypes.is_a?(Array)
109
+ dtype_slice = dtypes
110
+ else
111
+ raise ArgumentError, "dtype arg should be list or dict"
112
+ end
113
+ end
114
+
115
+ processed_null_values = Utils._process_null_values(null_values)
116
+
117
+ if columns.is_a?(String)
118
+ columns = [columns]
119
+ end
120
+ if file.is_a?(String) && file.include?("*")
121
+ raise Todo
37
122
  end
38
123
 
39
- _from_rbdf(RbDataFrame.read_csv(file, has_header))
124
+ projection, columns = Utils.handle_projection_columns(columns)
125
+
126
+ _from_rbdf(
127
+ RbDataFrame.read_csv(
128
+ file,
129
+ infer_schema_length,
130
+ batch_size,
131
+ has_header,
132
+ ignore_errors,
133
+ n_rows,
134
+ skip_rows,
135
+ projection,
136
+ sep,
137
+ rechunk,
138
+ columns,
139
+ encoding,
140
+ n_threads,
141
+ path,
142
+ dtype_list,
143
+ dtype_slice,
144
+ low_memory,
145
+ comment_char,
146
+ quote_char,
147
+ processed_null_values,
148
+ parse_dates,
149
+ skip_rows_after_header,
150
+ Utils._prepare_row_count_args(row_count_name, row_count_offset),
151
+ sample_size,
152
+ eol_char
153
+ )
154
+ )
40
155
  end
41
156
 
157
+ # @private
42
158
  def self._read_parquet(file)
43
159
  if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
44
160
  file = Utils.format_path(file)
@@ -47,6 +163,44 @@ module Polars
47
163
  _from_rbdf(RbDataFrame.read_parquet(file))
48
164
  end
49
165
 
166
+ # def self._read_avro
167
+ # end
168
+
169
+ # @private
170
+ def self._read_ipc(
171
+ file,
172
+ columns: nil,
173
+ n_rows: nil,
174
+ row_count_name: nil,
175
+ row_count_offset: 0,
176
+ rechunk: true,
177
+ memory_map: true
178
+ )
179
+ if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
180
+ file = Utils.format_path(file)
181
+ end
182
+ if columns.is_a?(String)
183
+ columns = [columns]
184
+ end
185
+
186
+ if file.is_a?(String) && file.include?("*")
187
+ raise Todo
188
+ end
189
+
190
+ projection, columns = Utils.handle_projection_columns(columns)
191
+ _from_rbdf(
192
+ RbDataFrame.read_ipc(
193
+ file,
194
+ columns,
195
+ projection,
196
+ n_rows,
197
+ Utils._prepare_row_count_args(row_count_name, row_count_offset),
198
+ memory_map
199
+ )
200
+ )
201
+ end
202
+
203
+ # @private
50
204
  def self._read_json(file)
51
205
  if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
52
206
  file = Utils.format_path(file)
@@ -55,6 +209,7 @@ module Polars
55
209
  _from_rbdf(RbDataFrame.read_json(file))
56
210
  end
57
211
 
212
+ # @private
58
213
  def self._read_ndjson(file)
59
214
  if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
60
215
  file = Utils.format_path(file)
@@ -63,30 +218,119 @@ module Polars
63
218
  _from_rbdf(RbDataFrame.read_ndjson(file))
64
219
  end
65
220
 
221
+ # Get the shape of the DataFrame.
222
+ #
223
+ # @return [Array]
224
+ #
225
+ # @example
226
+ # df = Polars::DataFrame.new({"foo" => [1, 2, 3, 4, 5]})
227
+ # df.shape
228
+ # # => [5, 1]
66
229
  def shape
67
230
  _df.shape
68
231
  end
69
232
 
233
+ # Get the height of the DataFrame.
234
+ #
235
+ # @return [Integer]
236
+ #
237
+ # @example
238
+ # df = Polars::DataFrame.new({"foo" => [1, 2, 3, 4, 5]})
239
+ # df.height
240
+ # # => 5
70
241
  def height
71
242
  _df.height
72
243
  end
73
244
 
245
+ # Get the width of the DataFrame.
246
+ #
247
+ # @return [Integer]
248
+ #
249
+ # @example
250
+ # df = Polars::DataFrame.new({"foo" => [1, 2, 3, 4, 5]})
251
+ # df.width
252
+ # # => 1
74
253
  def width
75
254
  _df.width
76
255
  end
77
256
 
257
+ # Get column names.
258
+ #
259
+ # @return [Array]
260
+ #
261
+ # @example
262
+ # df = Polars::DataFrame.new({
263
+ # "foo" => [1, 2, 3],
264
+ # "bar" => [6, 7, 8],
265
+ # "ham" => ["a", "b", "c"]
266
+ # })
267
+ # df.columns
268
+ # # => ["foo", "bar", "ham"]
78
269
  def columns
79
270
  _df.columns
80
271
  end
81
272
 
273
+ # Change the column names of the DataFrame.
274
+ #
275
+ # @param columns [Array]
276
+ # A list with new names for the DataFrame.
277
+ # The length of the list should be equal to the width of the DataFrame.
278
+ #
279
+ # @return [Object]
280
+ #
281
+ # @example
282
+ # df = Polars::DataFrame.new({
283
+ # "foo" => [1, 2, 3],
284
+ # "bar" => [6, 7, 8],
285
+ # "ham" => ["a", "b", "c"]
286
+ # })
287
+ # df.columns = ["apple", "banana", "orange"]
288
+ # df
289
+ # # =>
290
+ # # shape: (3, 3)
291
+ # # ┌───────┬────────┬────────┐
292
+ # # │ apple ┆ banana ┆ orange │
293
+ # # │ --- ┆ --- ┆ --- │
294
+ # # │ i64 ┆ i64 ┆ str │
295
+ # # ╞═══════╪════════╪════════╡
296
+ # # │ 1 ┆ 6 ┆ a │
297
+ # # ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
298
+ # # │ 2 ┆ 7 ┆ b │
299
+ # # ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
300
+ # # │ 3 ┆ 8 ┆ c │
301
+ # # └───────┴────────┴────────┘
82
302
  def columns=(columns)
83
303
  _df.set_column_names(columns)
84
304
  end
85
305
 
306
+ # Get dtypes of columns in DataFrame. Dtypes can also be found in column headers when printing the DataFrame.
307
+ #
308
+ # @return [Array]
309
+ #
310
+ # @example
311
+ # df = Polars::DataFrame.new({
312
+ # "foo" => [1, 2, 3],
313
+ # "bar" => [6.0, 7.0, 8.0],
314
+ # "ham" => ["a", "b", "c"]
315
+ # })
316
+ # df.dtypes
317
+ # # => [:i64, :f64, :str]
86
318
  def dtypes
87
- _df.dtypes.map(&:to_sym)
319
+ _df.dtypes
88
320
  end
89
321
 
322
+ # Get the schema.
323
+ #
324
+ # @return [Hash]
325
+ #
326
+ # @example
327
+ # df = Polars::DataFrame.new({
328
+ # "foo" => [1, 2, 3],
329
+ # "bar" => [6.0, 7.0, 8.0],
330
+ # "ham" => ["a", "b", "c"]
331
+ # })
332
+ # df.schema
333
+ # # => {"foo"=>:i64, "bar"=>:f64, "ham"=>:str}
90
334
  def schema
91
335
  columns.zip(dtypes).to_h
92
336
  end
@@ -124,6 +368,7 @@ module Polars
124
368
  # def %(other)
125
369
  # end
126
370
 
371
+ #
127
372
  def to_s
128
373
  _df.to_s
129
374
  end
@@ -133,6 +378,16 @@ module Polars
133
378
  columns.include?(name)
134
379
  end
135
380
 
381
+ # def each
382
+ # end
383
+
384
+ # def _pos_idx
385
+ # end
386
+
387
+ # def _pos_idxs
388
+ # end
389
+
390
+ #
136
391
  def [](name)
137
392
  Utils.wrap_s(_df.column(name))
138
393
  end
@@ -140,6 +395,9 @@ module Polars
140
395
  # def []=(key, value)
141
396
  # end
142
397
 
398
+ # no to_arrow
399
+
400
+ #
143
401
  def to_h(as_series: true)
144
402
  if as_series
145
403
  get_columns.to_h { |s| [s.name, s] }
@@ -148,7 +406,7 @@ module Polars
148
406
  end
149
407
  end
150
408
 
151
- # def to_hs / to_a
409
+ # def to_hashes / to_a
152
410
  # end
153
411
 
154
412
  # def to_numo
@@ -156,6 +414,28 @@ module Polars
156
414
 
157
415
  # no to_pandas
158
416
 
417
+ # Select column as Series at index location.
418
+ #
419
+ # @param index [Integer]
420
+ # Location of selection.
421
+ #
422
+ # @return [Series]
423
+ #
424
+ # @example
425
+ # df = Polars::DataFrame.new({
426
+ # "foo" => [1, 2, 3],
427
+ # "bar" => [6, 7, 8],
428
+ # "ham" => ["a", "b", "c"]
429
+ # })
430
+ # df.to_series(1)
431
+ # # =>
432
+ # # shape: (3,)
433
+ # # Series: 'bar' [i64]
434
+ # # [
435
+ # # 6
436
+ # # 7
437
+ # # 8
438
+ # # ]
159
439
  def to_series(index = 0)
160
440
  if index < 0
161
441
  index = columns.length + index
@@ -163,6 +443,18 @@ module Polars
163
443
  Utils.wrap_s(_df.select_at_idx(index))
164
444
  end
165
445
 
446
+ # Serialize to JSON representation.
447
+ #
448
+ # @return [nil]
449
+ #
450
+ # @param file [String]
451
+ # File path to which the result should be written.
452
+ # @param pretty [Boolean]
453
+ # Pretty serialize json.
454
+ # @param row_oriented [Boolean]
455
+ # Write to row oriented json. This is slower, but more common.
456
+ #
457
+ # @see #write_ndjson
166
458
  def write_json(
167
459
  file,
168
460
  pretty: false,
@@ -176,6 +468,12 @@ module Polars
176
468
  nil
177
469
  end
178
470
 
471
+ # Serialize to newline delimited JSON representation.
472
+ #
473
+ # @param file [String]
474
+ # File path to which the result should be written.
475
+ #
476
+ # @return [nil]
179
477
  def write_ndjson(file)
180
478
  if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
181
479
  file = Utils.format_path(file)
@@ -185,6 +483,48 @@ module Polars
185
483
  nil
186
484
  end
187
485
 
486
+ # Write to comma-separated values (CSV) file.
487
+ #
488
+ # @param file [String, nil]
489
+ # File path to which the result should be written. If set to `nil`
490
+ # (default), the output is returned as a string instead.
491
+ # @param has_header [Boolean]
492
+ # Whether to include header in the CSV output.
493
+ # @param sep [String]
494
+ # Separate CSV fields with this symbol.
495
+ # @param quote [String]
496
+ # Byte to use as quoting character.
497
+ # @param batch_size [Integer]
498
+ # Number of rows that will be processed per thread.
499
+ # @param datetime_format [String, nil]
500
+ # A format string, with the specifiers defined by the
501
+ # [chrono](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
502
+ # Rust crate. If no format is specified, the default fractional-second
503
+ # precision is inferred from the maximum timeunit found in the frame's
504
+ # Datetime cols (if any).
505
+ # @param date_format [String, nil]
506
+ # A format string, with the specifiers defined by the
507
+ # [chrono](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
508
+ # Rust crate.
509
+ # @param time_format [String, nil]
510
+ # A format string, with the specifiers defined by the
511
+ # [chrono](https://docs.rs/chrono/latest/chrono/format/strftime/index.html)
512
+ # Rust crate.
513
+ # @param float_precision [Integer, nil]
514
+ # Number of decimal places to write, applied to both `:f32` and
515
+ # `:f64` datatypes.
516
+ # @param null_value [String, nil]
517
+ # A string representing null values (defaulting to the empty string).
518
+ #
519
+ # @return [String, nil]
520
+ #
521
+ # @example
522
+ # df = Polars::DataFrame.new({
523
+ # "foo" => [1, 2, 3, 4, 5],
524
+ # "bar" => [6, 7, 8, 9, 10],
525
+ # "ham" => ["a", "b", "c", "d", "e"]
526
+ # })
527
+ # df.write_csv("file.csv")
188
528
  def write_csv(
189
529
  file = nil,
190
530
  has_header: true,
@@ -220,8 +560,7 @@ module Polars
220
560
  float_precision,
221
561
  null_value
222
562
  )
223
- buffer.rewind
224
- return buffer.read.force_encoding(Encoding::UTF_8)
563
+ return buffer.string.force_encoding(Encoding::UTF_8)
225
564
  end
226
565
 
227
566
  if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
@@ -246,9 +585,50 @@ module Polars
246
585
  # def write_avro
247
586
  # end
248
587
 
249
- # def write_ipc
250
- # end
588
+ # Write to Arrow IPC binary stream or Feather file.
589
+ #
590
+ # @param file [String]
591
+ # File path to which the file should be written.
592
+ # @param compression ["uncompressed", "lz4", "zstd"]
593
+ # Compression method. Defaults to "uncompressed".
594
+ #
595
+ # @return [nil]
596
+ def write_ipc(file, compression: "uncompressed")
597
+ if compression.nil?
598
+ compression = "uncompressed"
599
+ end
600
+ if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
601
+ file = Utils.format_path(file)
602
+ end
603
+
604
+ _df.write_ipc(file, compression)
605
+ end
251
606
 
607
+ # Write to Apache Parquet file.
608
+ #
609
+ # @param file [String]
610
+ # File path to which the file should be written.
611
+ # @param compression ["lz4", "uncompressed", "snappy", "gzip", "lzo", "brotli", "zstd"]
612
+ # Choose "zstd" for good compression performance.
613
+ # Choose "lz4" for fast compression/decompression.
614
+ # Choose "snappy" for more backwards compatibility guarantees
615
+ # when you deal with older parquet readers.
616
+ # @param compression_level [Integer, nil]
617
+ # The level of compression to use. Higher compression means smaller files on
618
+ # disk.
619
+ #
620
+ # - "gzip" : min-level: 0, max-level: 10.
621
+ # - "brotli" : min-level: 0, max-level: 11.
622
+ # - "zstd" : min-level: 1, max-level: 22.
623
+ # @param statistics [Boolean]
624
+ # Write statistics to the parquet headers. This requires extra compute.
625
+ # @param row_group_size [Integer, nil]
626
+ # Size of the row groups in number of rows.
627
+ # If `nil` (default), the chunks of the DataFrame are
628
+ # used. Writing in smaller chunks may reduce memory pressure and improve
629
+ # writing speeds.
630
+ #
631
+ # @return [nil]
252
632
  def write_parquet(
253
633
  file,
254
634
  compression: "zstd",
@@ -268,6 +648,39 @@ module Polars
268
648
  )
269
649
  end
270
650
 
651
+ # Return an estimation of the total (heap) allocated size of the DataFrame.
652
+ #
653
+ # Estimated size is given in the specified unit (bytes by default).
654
+ #
655
+ # This estimation is the sum of the size of its buffers, validity, including
656
+ # nested arrays. Multiple arrays may share buffers and bitmaps. Therefore, the
657
+ # size of 2 arrays is not the sum of the sizes computed from this function. In
658
+ # particular, StructArray's size is an upper bound.
659
+ #
660
+ # When an array is sliced, its allocated size remains constant because the buffer is
661
+ # unchanged. However, this function will yield a smaller number. This is because
662
+ # this function returns the visible size of the buffer, not its total capacity.
663
+ #
664
+ # FFI buffers are included in this estimation.
665
+ #
666
+ # @param unit ["b", "kb", "mb", "gb", "tb"]
667
+ # Scale the returned size to the given unit.
668
+ #
669
+ # @return [Numeric]
670
+ #
671
+ # @example
672
+ # df = Polars::DataFrame.new(
673
+ # {
674
+ # "x" => 1_000_000.times.to_a.reverse,
675
+ # "y" => 1_000_000.times.map { |v| v / 1000.0 },
676
+ # "z" => 1_000_000.times.map(&:to_s)
677
+ # },
678
+ # columns: {"x" => :u32, "y" => :f64, "z" => :str}
679
+ # )
680
+ # df.estimated_size
681
+ # # => 25888898
682
+ # df.estimated_size("mb")
683
+ # # => 24.689577102661133
271
684
  def estimated_size(unit = "b")
272
685
  sz = _df.estimated_size
273
686
  Utils.scale_bytes(sz, to: unit)
@@ -276,14 +689,114 @@ module Polars
276
689
  # def transpose
277
690
  # end
278
691
 
692
+ # Reverse the DataFrame.
693
+ #
694
+ # @return [DataFrame]
695
+ #
696
+ # @example
697
+ # df = Polars::DataFrame.new({
698
+ # "key" => ["a", "b", "c"],
699
+ # "val" => [1, 2, 3]
700
+ # })
701
+ # df.reverse()
702
+ # # =>
703
+ # # shape: (3, 2)
704
+ # # ┌─────┬─────┐
705
+ # # │ key ┆ val │
706
+ # # │ --- ┆ --- │
707
+ # # │ str ┆ i64 │
708
+ # # ╞═════╪═════╡
709
+ # # │ c ┆ 3 │
710
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
711
+ # # │ b ┆ 2 │
712
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┤
713
+ # # │ a ┆ 1 │
714
+ # # └─────┴─────┘
279
715
  def reverse
280
716
  select(Polars.col("*").reverse)
281
717
  end
282
718
 
719
+ # Rename column names.
720
+ #
721
+ # @param mapping [Hash]
722
+ # Key value pairs that map from old name to new name.
723
+ #
724
+ # @return [DataFrame]
725
+ #
726
+ # @example
727
+ # df = Polars::DataFrame.new({
728
+ # "foo" => [1, 2, 3],
729
+ # "bar" => [6, 7, 8],
730
+ # "ham" => ["a", "b", "c"]
731
+ # })
732
+ # df.rename({"foo" => "apple"})
733
+ # # =>
734
+ # # shape: (3, 3)
735
+ # # ┌───────┬─────┬─────┐
736
+ # # │ apple ┆ bar ┆ ham │
737
+ # # │ --- ┆ --- ┆ --- │
738
+ # # │ i64 ┆ i64 ┆ str │
739
+ # # ╞═══════╪═════╪═════╡
740
+ # # │ 1 ┆ 6 ┆ a │
741
+ # # ├╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
742
+ # # │ 2 ┆ 7 ┆ b │
743
+ # # ├╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
744
+ # # │ 3 ┆ 8 ┆ c │
745
+ # # └───────┴─────┴─────┘
283
746
  def rename(mapping)
284
747
  lazy.rename(mapping).collect(no_optimization: true)
285
748
  end
286
749
 
750
+ # Insert a Series at a certain column index. This operation is in place.
751
+ #
752
+ # @param index [Integer]
753
+ # Column to insert the new `Series` column.
754
+ # @param series [Series]
755
+ # `Series` to insert.
756
+ #
757
+ # @return [DataFrame]
758
+ #
759
+ # @example
760
+ # df = Polars::DataFrame.new({"foo" => [1, 2, 3], "bar" => [4, 5, 6]})
761
+ # s = Polars::Series.new("baz", [97, 98, 99])
762
+ # df.insert_at_idx(1, s)
763
+ # # =>
764
+ # # shape: (3, 3)
765
+ # # ┌─────┬─────┬─────┐
766
+ # # │ foo ┆ baz ┆ bar │
767
+ # # │ --- ┆ --- ┆ --- │
768
+ # # │ i64 ┆ i64 ┆ i64 │
769
+ # # ╞═════╪═════╪═════╡
770
+ # # │ 1 ┆ 97 ┆ 4 │
771
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
772
+ # # │ 2 ┆ 98 ┆ 5 │
773
+ # # ├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
774
+ # # │ 3 ┆ 99 ┆ 6 │
775
+ # # └─────┴─────┴─────┘
776
+ #
777
+ # @example
778
+ # df = Polars::DataFrame.new({
779
+ # "a" => [1, 2, 3, 4],
780
+ # "b" => [0.5, 4, 10, 13],
781
+ # "c" => [true, true, false, true]
782
+ # })
783
+ # s = Polars::Series.new("d", [-2.5, 15, 20.5, 0])
784
+ # df.insert_at_idx(3, s)
785
+ # # =>
786
+ # # shape: (4, 4)
787
+ # # ┌─────┬──────┬───────┬──────┐
788
+ # # │ a ┆ b ┆ c ┆ d │
789
+ # # │ --- ┆ --- ┆ --- ┆ --- │
790
+ # # │ i64 ┆ f64 ┆ bool ┆ f64 │
791
+ # # ╞═════╪══════╪═══════╪══════╡
792
+ # # │ 1 ┆ 0.5 ┆ true ┆ -2.5 │
793
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
794
+ # # │ 2 ┆ 4.0 ┆ true ┆ 15.0 │
795
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
796
+ # # │ 3 ┆ 10.0 ┆ false ┆ 20.5 │
797
+ # # ├╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
798
+ # # │ 4 ┆ 13.0 ┆ true ┆ 0.0 │
799
+ # # └─────┴──────┴───────┴──────┘
287
800
  def insert_at_idx(index, series)
288
801
  if index < 0
289
802
  index = columns.length + index
@@ -305,6 +818,7 @@ module Polars
305
818
  # def replace_at_idx
306
819
  # end
307
820
 
821
+ #
308
822
  def sort(by, reverse: false, nulls_last: false)
309
823
  _from_rbdf(_df.sort(by, reverse, nulls_last))
310
824
  end
@@ -316,6 +830,7 @@ module Polars
316
830
  # def replace
317
831
  # end
318
832
 
833
+ #
319
834
  def slice(offset, length = nil)
320
835
  if !length.nil? && length < 0
321
836
  length = height - offset + length
@@ -344,6 +859,7 @@ module Polars
344
859
  # def with_row_count
345
860
  # end
346
861
 
862
+ #
347
863
  def groupby(by, maintain_order: false)
348
864
  lazy.groupby(by, maintain_order: maintain_order)
349
865
  end
@@ -360,6 +876,7 @@ module Polars
360
876
  # def join_asof
361
877
  # end
362
878
 
879
+ #
363
880
  def join(other, left_on: nil, right_on: nil, on: nil, how: "inner", suffix: "_right")
364
881
  lazy
365
882
  .join(
@@ -376,6 +893,7 @@ module Polars
376
893
  # def apply
377
894
  # end
378
895
 
896
+ #
379
897
  def with_column(column)
380
898
  lazy
381
899
  .with_column(column)
@@ -388,8 +906,11 @@ module Polars
388
906
  # def vstack
389
907
  # end
390
908
 
391
- # def extend
392
- # end
909
+ #
910
+ def extend(other)
911
+ _df.extend(other._df)
912
+ self
913
+ end
393
914
 
394
915
  # def drop
395
916
  # end
@@ -402,6 +923,7 @@ module Polars
402
923
 
403
924
  # clone handled by initialize_copy
404
925
 
926
+ #
405
927
  def get_columns
406
928
  _df.get_columns.map { |s| Utils.wrap_s(s) }
407
929
  end
@@ -413,6 +935,7 @@ module Polars
413
935
  # def fill_null
414
936
  # end
415
937
 
938
+ #
416
939
  def fill_nan(fill_value)
417
940
  lazy.fill_nan(fill_value).collect(no_optimization: true)
418
941
  end
@@ -438,6 +961,7 @@ module Polars
438
961
  # def shift_and_fill
439
962
  # end
440
963
 
964
+ #
441
965
  def is_duplicated
442
966
  Utils.wrap_s(_df.is_duplicated)
443
967
  end
@@ -547,6 +1071,7 @@ module Polars
547
1071
  # def n_unique
548
1072
  # end
549
1073
 
1074
+ #
550
1075
  def rechunk
551
1076
  _from_rbdf(_df.rechunk)
552
1077
  end
@@ -579,6 +1104,7 @@ module Polars
579
1104
  # def interpolate
580
1105
  # end
581
1106
 
1107
+ #
582
1108
  def is_empty
583
1109
  height == 0
584
1110
  end
@@ -597,15 +1123,55 @@ module Polars
597
1123
  self._df = _df._clone
598
1124
  end
599
1125
 
600
- def hash_to_rbdf(data)
1126
+ def hash_to_rbdf(data, columns: nil)
1127
+ if !columns.nil?
1128
+ columns, dtypes = _unpack_columns(columns, lookup_names: data.keys)
1129
+
1130
+ if !data && dtypes
1131
+ data_series = columns.map { |name| Series.new(name, [], dtype: dtypes[name])._s }
1132
+ else
1133
+ data_series = data.map { |name, values| Series.new(name, values, dtype: dtypes[name])._s }
1134
+ end
1135
+ data_series = _handle_columns_arg(data_series, columns: columns)
1136
+ return RbDataFrame.new(data_series)
1137
+ end
1138
+
601
1139
  RbDataFrame.read_hash(data)
602
1140
  end
603
1141
 
604
- def sequence_to_rbdf(data)
1142
+ def _unpack_columns(columns, lookup_names: nil)
1143
+ [columns.keys, columns]
1144
+ end
1145
+
1146
+ def _handle_columns_arg(data, columns: nil)
1147
+ if columns.nil?
1148
+ data
1149
+ else
1150
+ if !data
1151
+ columns.map { |c| Series.new(c, nil)._s }
1152
+ elsif data.length == columns.length
1153
+ columns.each_with_index do |c, i|
1154
+ # not in-place?
1155
+ data[i].rename(c)
1156
+ end
1157
+ data
1158
+ else
1159
+ raise ArgumentError, "Dimensions of columns arg must match data dimensions."
1160
+ end
1161
+ end
1162
+ end
1163
+
1164
+ def sequence_to_rbdf(data, columns: nil, orient: nil)
1165
+ if columns || orient
1166
+ raise Todo
1167
+ end
605
1168
  RbDataFrame.new(data.map(&:_s))
606
1169
  end
607
1170
 
608
- def series_to_rbdf(data)
1171
+ def series_to_rbdf(data, columns: nil)
1172
+ if columns
1173
+ raise Todo
1174
+ end
609
1175
  RbDataFrame.new([data._s])
610
1176
  end
611
1177